In [1]:
# Ref : https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html

In [2]:
import torch

In [3]:
# True when PyTorch can use a CUDA device. The later cells that pass
# device="cuda" need this to be True; the flag itself is not referenced
# again in the visible part of the notebook.
cuda_available = torch.cuda.is_available()

In [4]:
# Returns the compute capability of the current GPU as a (major, minor) tuple.
# On the author's GPU the result is (8, 9), i.e. compute capability 8.9.

torch.cuda.get_device_capability()

(8, 9)

In [5]:
# Any arbitrary function


def f1(x, y):
    """Return the element-wise product ``sin(x) * cos(y)``.

    Args:
        x: tensor passed to ``torch.sin``.
        y: tensor passed to ``torch.cos``.

    Returns:
        The product of the two results, following PyTorch's usual
        element-wise multiplication rules.
    """
    return torch.mul(torch.sin(x), torch.cos(y))

In [6]:
# Wrap f1 with torch.compile; the returned callable is what compile_bench
# times as the "with torch.compile" variant.
opt_foo1 = torch.compile(f1)

In [7]:
def compile_bench(dim, device):
    print("dim >>", dim)
    %timeit f1(torch.randn(dim, dim,device=device), torch.randn(dim, dim,device=device))
    print("with torch.compile")
    %timeit opt_foo1(torch.randn(dim, dim,device=device), torch.randn(dim, dim,device=device))
    print()

In [8]:
# Run the f1 benchmark on the CPU for increasing square sizes.
device = "cpu"
for size in (10, 100, 1000, 10000):
    compile_bench(size, device)

dim >> 10
4.24 µs ± 29.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
with torch.compile
The slowest run took 5.63 times longer than the fastest. This could mean that an intermediate result is being cached.
40 µs ± 32.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

dim >> 100
112 µs ± 2.84 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
with torch.compile
68.5 µs ± 480 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

dim >> 1000
7.38 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
with torch.compile
7.13 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

dim >> 10000
724 ms ± 8.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
with torch.compile
783 ms ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)



In [9]:
# Run the same f1 benchmark on the GPU for increasing square sizes.
device = "cuda"
for size in (10, 100, 1000, 10000):
    compile_bench(size, device)

dim >> 10
14.6 µs ± 235 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
with torch.compile
63 µs ± 38.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

dim >> 100
15.1 µs ± 450 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
with torch.compile
32.7 µs ± 120 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

dim >> 1000
51.2 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
with torch.compile
37.8 µs ± 968 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

dim >> 10000
9.6 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
with torch.compile
4.7 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [10]:
# we can also decorate the func with torch.compile

In [11]:
def timed(fn):
    """Call ``fn()`` once and measure it with a pair of CUDA events.

    Args:
        fn: zero-argument callable to run.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``fn()`` returned and the time
        elapsed between the two recorded events, in seconds.
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = fn()
    end.record()
    # Wait until the recorded events have completed before reading the elapsed time.
    torch.cuda.synchronize()
    return result, start.elapsed_time(end) / 1000  # elapsed_time is in milliseconds; convert to seconds


def foo(x, y):
    """Multiply ``sin(x)`` by ``cos(y)`` element-wise and return the result."""
    sin_x, cos_y = torch.sin(x), torch.cos(y)
    return sin_x * cos_y


# Time a single eager call of foo on two random 10000 x 10000 CUDA tensors.
# The lambda also creates the inputs, so torch.randn is part of the measured time.
dim = 10000
device = "cuda"
timed(lambda: foo(torch.randn(dim, dim, device=device), torch.randn(dim, dim, device=device)))

(tensor([[ 0.3345,  0.3350, -0.9634,  ..., -0.4105,  0.9161, -0.2657],
         [ 0.1466,  0.2699, -0.7055,  ..., -0.2511, -0.4539, -0.2040],
         [ 0.7115,  0.6667,  0.6049,  ..., -0.0721, -0.8734, -0.1940],
         ...,
         [-0.3434,  0.0315,  0.0081,  ..., -0.2987, -0.3092, -0.4870],
         [-0.9001,  0.1684, -0.2905,  ..., -0.7135,  0.7816, -0.5135],
         [-0.8590, -0.6099,  0.0650,  ..., -0.9240, -0.1360,  0.7181]],
        device='cuda:0'),
 0.008256511688232422)

In [12]:
# Non-PyTorch function

In [13]:
import scipy


def f2(x):
    """Double ``x``, apply SciPy's DCT on the host, then double the result.

    Args:
        x: input tensor; it is copied to the CPU before the SciPy call.

    Returns:
        A tensor built from the NumPy array returned by ``scipy.fft.dct``,
        multiplied by 2.
    """
    doubled = x * 2
    # SciPy works on NumPy arrays, so the tensor is moved to the CPU first.
    spectrum = scipy.fft.dct(doubled.cpu().numpy())
    return torch.from_numpy(spectrum) * 2

In [14]:
# Compiled wrapper around f2. Note that f2 leaves PyTorch in the middle
# (tensor -> NumPy -> scipy.fft.dct -> tensor).
opt_f2 = torch.compile(
    f2,
)

In [16]:
def compile_bench(dim, device):
    print("dim >>", dim)
    %timeit f2(torch.randn(dim, dim,device=device))
    print("with torch.compile")
    %timeit opt_f2(torch.randn(dim, dim,device=device))
    print()

In [17]:
# import torch._dynamo
# torch._dynamo.config.suppress_errors = True


# Benchmark f2 and its compiled wrapper on the GPU for two input sizes.
device = "cuda"
for size in (1000, 10000):
    compile_bench(size, device)

dim >> 1000
1.76 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
with torch.compile
11.8 ms ± 3.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

dim >> 10000
390 ms ± 110 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
with torch.compile
311 ms ± 24.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)



In [18]:
# For inductor you can see the full list of configs that it supports by calling
# torch._inductor.list_options()

### Demonstrating Speedups

In [19]:
import torchvision
from torchvision.models import densenet121

In [20]:
# Load DenseNet-121 with pretrained weights. `weights` expects a member of the
# DenseNet121_Weights enum (DEFAULT selects the recommended checkpoint), not
# the enum class itself.
model = densenet121(weights=torchvision.models.densenet.DenseNet121_Weights.DEFAULT)



In [21]:
# Move the model to the GPU, then build a compiled wrapper around it.
model = model.cuda()
# "reduce-overhead" is the torch.compile mode aimed at lowering per-call overhead.
opt_model = torch.compile(model, mode="reduce-overhead")

In [22]:
## Inference

In [23]:
inp = torch.rand(128, 3, 128, 128).cuda()
with torch.no_grad():
    %timeit model(inp)
    %timeit opt_model(inp)

38.4 ms ± 218 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)




The slowest run took 2290.62 times longer than the fastest. This could mean that an intermediate result is being cached.
51.3 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Notice that, averaged over these runs, torch.compile takes approximately the same time as eager, while its slowest run is far slower than its fastest. This is because torch.compile compiles the model into optimized kernels as it executes, so the first call pays the compilation cost. In our example, the structure of the model doesn’t change, and so recompilation is not needed. So if we run our optimized model several more times, we should see a significant improvement compared to eager.

In [24]:
## let's repeat this
for _ in range(10):
    inp = torch.rand(128, 3, 128, 128).cuda()
    # Eager mode
    with torch.no_grad():
        %time model(inp)

print("~" * 30)

print("Compile mode")

for _ in range(10):
    inp = torch.rand(128, 3, 128, 128).cuda()
    # Compile mode
    with torch.no_grad():
        %time opt_model(inp)

CPU times: user 5.69 ms, sys: 300 µs, total: 5.99 ms
Wall time: 6.01 ms
CPU times: user 4.63 ms, sys: 73 µs, total: 4.7 ms
Wall time: 4.7 ms
CPU times: user 4.47 ms, sys: 0 ns, total: 4.47 ms
Wall time: 4.48 ms
CPU times: user 4.49 ms, sys: 0 ns, total: 4.49 ms
Wall time: 4.5 ms
CPU times: user 4.53 ms, sys: 0 ns, total: 4.53 ms
Wall time: 4.54 ms
CPU times: user 8.93 ms, sys: 0 ns, total: 8.93 ms
Wall time: 8.77 ms
CPU times: user 16.9 ms, sys: 0 ns, total: 16.9 ms
Wall time: 17 ms
CPU times: user 9.81 ms, sys: 0 ns, total: 9.81 ms
Wall time: 9.82 ms
CPU times: user 4.39 ms, sys: 0 ns, total: 4.39 ms
Wall time: 4.4 ms
CPU times: user 5.56 ms, sys: 0 ns, total: 5.56 ms
Wall time: 5.46 ms
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Compile mode
CPU times: user 518 µs, sys: 0 ns, total: 518 µs
Wall time: 524 µs
CPU times: user 344 µs, sys: 0 ns, total: 344 µs
Wall time: 348 µs
CPU times: user 295 µs, sys: 52 µs, total: 347 µs
Wall time: 350 µs
CPU times: user 308 µs, sys: 54 µs, total: 362 µs
Wall ti

In [25]:
# We see a significant speed-up after the first iteration

In [26]:
### Training

In [27]:
# There are one 1000 image sample in the dataset
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fun = torch.nn.CrossEntropyLoss()

y = torch.randint(1000, (12,), device="cuda")
num_epoch = 20


for epoch in range(num_epoch):
    optimizer.zero_grad()
    x = torch.rand(12, 3, 128, 128, device="cuda")
    %time pred = model(x)
    loss = loss_fun(pred, y)
    loss.backward()
    optimizer.step()

CPU times: user 125 ms, sys: 0 ns, total: 125 ms
Wall time: 124 ms
CPU times: user 21.2 ms, sys: 203 µs, total: 21.4 ms
Wall time: 21.4 ms
CPU times: user 20.1 ms, sys: 13 µs, total: 20.1 ms
Wall time: 20.1 ms
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 20.1 ms
CPU times: user 19.9 ms, sys: 0 ns, total: 19.9 ms
Wall time: 20 ms
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 20 ms
CPU times: user 20.1 ms, sys: 0 ns, total: 20.1 ms
Wall time: 20.2 ms
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 20 ms
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 20.1 ms
CPU times: user 13.1 ms, sys: 0 ns, total: 13.1 ms
Wall time: 12.9 ms
CPU times: user 5.99 ms, sys: 0 ns, total: 5.99 ms
Wall time: 6 ms
CPU times: user 4.99 ms, sys: 0 ns, total: 4.99 ms
Wall time: 5 ms
CPU times: user 5.06 ms, sys: 0 ns, total: 5.06 ms
Wall time: 5.07 ms
CPU times: user 5 ms, sys: 0 ns, total: 5 ms
Wall time: 5.01 ms
CPU times: user 4.96 ms, sys: 0 ns, total: 4.96 ms
Wal

In [28]:
for epoch in range(num_epoch):
    optimizer.zero_grad()
    x = torch.rand(12, 3, 128, 128, device="cuda")
    %time pred = opt_model(x)
    loss = loss_fun(pred, y)
    loss.backward()
    optimizer.step()

CPU times: user 59 s, sys: 10.8 s, total: 1min 9s
Wall time: 1min 12s
CPU times: user 944 ms, sys: 4.03 ms, total: 948 ms
Wall time: 945 ms
CPU times: user 3.46 ms, sys: 0 ns, total: 3.46 ms
Wall time: 3.46 ms
CPU times: user 1.53 ms, sys: 0 ns, total: 1.53 ms
Wall time: 1.54 ms
CPU times: user 1.38 ms, sys: 0 ns, total: 1.38 ms
Wall time: 1.39 ms
CPU times: user 1.29 ms, sys: 0 ns, total: 1.29 ms
Wall time: 1.3 ms
CPU times: user 1.25 ms, sys: 0 ns, total: 1.25 ms
Wall time: 1.28 ms
CPU times: user 1.15 ms, sys: 117 µs, total: 1.26 ms
Wall time: 1.27 ms
CPU times: user 1.91 ms, sys: 0 ns, total: 1.91 ms
Wall time: 1.91 ms
CPU times: user 0 ns, sys: 1.9 ms, total: 1.9 ms
Wall time: 1.91 ms
CPU times: user 1.91 ms, sys: 0 ns, total: 1.91 ms
Wall time: 1.91 ms
CPU times: user 1.67 ms, sys: 185 µs, total: 1.85 ms
Wall time: 1.86 ms
CPU times: user 1.82 ms, sys: 0 ns, total: 1.82 ms
Wall time: 1.83 ms
CPU times: user 1.43 ms, sys: 247 µs, total: 1.68 ms
Wall time: 1.68 ms
CPU times: user 1