In [4]:
import torch


def time_elapsed_pytorch(func, inp):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    for _ in range(5):
        func(inp)
    start.record()
    func(inp)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end)

In [5]:
b = torch.randn(10000, 10000).cuda()

In [6]:
def square_2(n):
    return n * n


def power_2(n):
    return n**2

In [7]:
time_elapsed_pytorch(torch.square, b)
time_elapsed_pytorch(square_2, b)
time_elapsed_pytorch(power_2, b)

4.676671981811523

In [8]:
print("=============")
print("Profiling torch.square")
print("=============")

with torch.autograd.profiler.profile(use_device=True) as prof:
    torch.square(b)

prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)

print("=============")
print("Profiling square_2(n*n)")
print("=============")

with torch.autograd.profiler.profile(use_device=True) as prof:
    square_2(b)

prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)

print("=============")
print("Profiling power_2(n**2)")
print("=============")

with torch.autograd.profiler.profile(use_device=True) as prof:
    power_2(b)

prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)

Profiling torch.square
Profiling square_2(n*n)
Profiling power_2(n**2)


  warn(f"The {self.use_device} is not a valid device option.")


'---------------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  \n---------------------  ------------  ------------  ------------  ------------  ------------  ------------  \n            aten::pow        93.33%     428.935us       100.00%     459.588us     459.588us             1  \n    aten::result_type         0.52%       2.396us         0.52%       2.396us       2.396us             1  \n             aten::to         0.20%       0.908us         0.20%       0.908us       0.908us             1  \n     cudaLaunchKernel         5.95%      27.349us         5.95%      27.349us      27.349us             1  \n---------------------  ------------  ------------  ------------  ------------  ------------  ------------  \nSelf CPU time total: 459.588us\n'

In [12]:
import torch

torch.cuda.is_available(), torch.cuda.is_initialized()

(True, True)