# Using `torch.compile`

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch._dynamo as dynamo
from torchviz import make_dot

In [2]:
# Preferred device: CUDA when available, otherwise CPU.
# NOTE(review): no cell visible in this notebook moves the model or inputs to DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class MyModel(nn.Module):
    """Two-layer perceptron: Linear(10 -> 10), ReLU, Linear(10 -> 1)."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the order fc1, relu, fc2.
        self.fc1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(10, 1)

    def forward(self, x):
        """Apply fc1, then ReLU, then fc2 to ``x`` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Create an instance of the model (plain eager nn.Module)
model = MyModel()

# Compile the model with torch.compile's default settings
compiled_model = torch.compile(model)

# Use the compiled model as you would normally: call it on a random
# (1, 10) input, matching fc1's 10 input features, and print the result
input_data = torch.randn(1, 10)
output = compiled_model(input_data)
print(output)

tensor([[0.2838]], grad_fn=<CompiledFunctionBackward>)


In [None]:
# Selecting a backend explicitly. 'nvfuser' is not a registered torch.compile
# backend name in current PyTorch 2.x releases (passing it raises an
# invalid-backend error); dynamo.list_backends() shows the names that are
# available. 'inductor' is the default backend and the one that generates
# fused kernels for NVIDIA GPUs.
compiled_model = torch.compile(model, backend='inductor')

Inspecting what Dynamo captured (graphs, graph breaks and guards) with `dynamo.explain`:

In [12]:
# Compile the model and explain the compilation process
compiled_model = torch.compile(model)
# explain(f, *args) is the deprecated calling convention; the supported form
# is explain(f)(*args): the first call wraps the callable, the second call
# runs it on the example input and returns the ExplainOutput report.
dynamo.explain(compiled_model)(torch.randn(1, 10))



ExplainOutput(graphs=[GraphModule(
  (L__self___fc1): Linear(in_features=10, out_features=10, bias=True)
  (L__self___relu): ReLU()
  (L__self___fc2): Linear(in_features=10, out_features=1, bias=True)
)], graph_count=1, graph_break_count=0, break_reasons=[], op_count=0, ops_per_graph=[[]], out_guards=[
        shape_env '' SHAPE_ENV
        {
            'guard_types': None,
            'code': None,
            'obj_weakref': None
            'guarded_class': None
        }
        , 
        global '' TORCH_FUNCTION_STATE
        {
            'guard_types': None,
            'code': None,
            'obj_weakref': None
            'guarded_class': None
        }
        , 
        local_nn_module "L['self'].fc1" NN_MODULE
        {
            'guard_types': None,
            'code': None,
            'obj_weakref': None
            'guarded_class': None
        }
        , 
        local_nn_module "L['self'].relu" NN_MODULE
        {
            'guard_types': None,
            'c

In [15]:
# Fresh random input, run through the eager (uncompiled) model
input_data = torch.randn(1, 10)
output = model(input_data)

# Names for the graph nodes: every model parameter plus the input tensor
graph_params = dict(model.named_parameters())
graph_params['input'] = input_data

# Visualize the computational graph and write it to model_graph.png
dot = make_dot(output, params=graph_params)
dot.render("model_graph", format="png")

'model_graph.png'

It is often useful to compile only part of a model, which reduces compilation overhead:

In [4]:
# Compiling only part of a model
class PartModel(nn.Module):
    """Wrapper that compiles a single sub-module and forwards inputs to it."""

    def __init__(self, model_part):
        """Store ``torch.compile(model_part)`` as ``self.model_part``."""
        super().__init__()
        self.model_part = torch.compile(model_part)

    def forward(self, x):
        """Return the compiled sub-module's output for ``x``."""
        compiled_part = self.model_part
        return compiled_part(x)

# Wrap only the first linear layer (model.fc1), so just that sub-module is compiled
compiled_model_part = PartModel(model.fc1)
# `input_data` is the tensor created in an earlier cell
output = compiled_model_part(input_data)
# Bare last expression so the notebook displays the resulting tensor
output

tensor([[ 0.2673, -0.0399,  0.2411, -0.4851,  0.2301,  0.1615,  0.4815,  0.4151,
         -0.3506,  0.2458]], grad_fn=<CompiledFunctionBackward>)

A more realistic example: compile only the convolutional layers and leave the LSTM and the linear head in eager mode:

In [9]:
# Example: Compiling only the convolutional layers
class MyModel(nn.Module):
    """Conv feature extractor (compiled) followed by an eager LSTM and linear head.

    Only ``conv_layers`` is wrapped in ``torch.compile``; ``lstm`` and ``fc``
    run as ordinary modules.

    Note: this class reuses the name ``MyModel`` and therefore shadows the
    fully-connected ``MyModel`` defined earlier in the notebook once this
    cell has run.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        # Two 3x3 convolutions (1 -> 32 -> 64 channels), each followed by ReLU.
        self.conv_layers = torch.compile(nn.Sequential(
            nn.Conv2d(1, 32, 3),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3),
            nn.ReLU(),
        ))
        self.lstm = nn.LSTM(64, 128)
        self.fc = nn.Linear(128, 10)

    def forward(self, x):
        """Run the conv stack, then the LSTM over spatial positions, then ``fc``.

        Args:
            x: image tensor of shape ``(N, 1, H, W)`` (or unbatched ``(1, H, W)``).

        Returns:
            Tensor of shape ``(H' * W', N, 10)`` (``(H' * W', 10)`` when
            unbatched), where ``H' = H - 4`` and ``W' = W - 4`` after the two
            unpadded 3x3 convolutions: one 10-way output per spatial position.
        """
        x = self.conv_layers(x)
        # nn.LSTM accepts only 2-D/3-D input and, without batch_first, expects
        # (seq, batch, features). The conv output is (N, 64, H', W'), so merge
        # the two spatial dims into one sequence axis and move it to the front;
        # the 64 channels become the LSTM's input features.
        x = x.flatten(-2).movedim(-1, 0).contiguous()
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x