In [1]:
import time
import os

import torch
import torch._dynamo
from transformers import BertTokenizer, BertModel
from triton.testing import do_bench

In [2]:
torch.__version__

'2.0.0'

In [3]:
torch.set_float32_matmul_precision('high')

In [4]:
def run_benchmark(fn):
    exec_time, prctl20, prctl80 = do_bench(fn, warmup=100, rep=1000)
    return exec_time

In [5]:
def cal_improved_percent(exec_time, opt_exec_time):
    avg_exec_time = (sum(exec_time) / len(exec_time))
    avg_opt_exec_time = (sum(opt_exec_time) / len(opt_exec_time))
    
    print('avg_exec_time', avg_exec_time)
    print('avg_opt_exec_time', avg_opt_exec_time)

    return f'{round((avg_exec_time - avg_opt_exec_time) / avg_opt_exec_time * 100, 2)}%'

# 1. Test ResNet

In [6]:
def run_batch(model, optimizer, device):
    x = torch.randn(32, 3, 64, 64).to(device)
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()

In [7]:
def run_only_forward(model, device):
    x = torch.randn(32, 3, 64, 64).to(device)
    out = model(x)

In [8]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

Using cache found in /home/nguyenhuuminh/.cache/torch/hub/pytorch_vision_v0.10.0


In [9]:
# device = 'cpu'
device = 'cuda:0'
NUM_LOOP = 1

In [10]:
exec_time = [
    run_benchmark(lambda: run_batch(model.to(device), optimizer, device))
    for _ in range(NUM_LOOP)
]

In [11]:
torch._dynamo.reset() # Reset before compile again
opt_model_default = torch.compile(model)

opt_exec_time_default = [
    run_benchmark(lambda: run_batch(opt_model_default.to(device), optimizer, device))
    for _ in range(NUM_LOOP)
]

cal_improved_percent(exec_time, opt_exec_time_default)

avg_exec_time 19.8604793548584
avg_opt_exec_time 22.9171199798584


'-13.34%'

In [12]:
torch._dynamo.reset() # Reset before compile again

opt_model_max = torch.compile(model, mode='max-autotune')

opt_exec_time_max = [
    run_benchmark(lambda: run_batch(opt_model_max.to(device), optimizer, device))
    for _ in range(NUM_LOOP)
]

cal_improved_percent(exec_time, opt_exec_time_max)

avg_exec_time 19.8604793548584
avg_opt_exec_time 13.406208038330078


'48.14%'

In [13]:
exec_time = [
    run_benchmark(lambda: run_only_forward(model.to(device), device))
    for _ in range(NUM_LOOP)
]

In [14]:
torch._dynamo.reset() # Reset before compile again

opt_model_default = torch.compile(model)

opt_exec_time_default = [
    run_benchmark(lambda: run_only_forward(opt_model_default.to(device), device))
    for _ in range(NUM_LOOP)
]

cal_improved_percent(exec_time, opt_exec_time_default)

avg_exec_time 5.532671928405762
avg_opt_exec_time 5.895167827606201


'-6.15%'

In [15]:
torch._dynamo.reset() # Reset before compile again

opt_model_max = torch.compile(model, mode='max-autotune')

opt_exec_time_max = [
    run_benchmark(lambda: run_only_forward(opt_model_max.to(device), device))
    for _ in range(NUM_LOOP)
]

cal_improved_percent(exec_time, opt_exec_time_max)

avg_exec_time 5.532671928405762
avg_opt_exec_time 3.7201919555664062


'48.72%'

# 2. Test Transformers

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ = tokenizer('This is a text to test the model', return_tensors='pt')

In [17]:
model = BertModel.from_pretrained("bert-base-uncased")

torch._dynamo.reset() # Reset before compile again

opt_model = torch.compile(model, mode='max-autotune')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
def run_foward_transformers(model, input_):
    output = model(**input_)

In [19]:
exec_time = [
    run_benchmark(lambda: run_foward_transformers(model.to(device), input_.to(device)))
    for _ in range(NUM_LOOP)
]

In [20]:
opt_exec_time_max = [
    run_benchmark(lambda: run_foward_transformers(opt_model.to(device), input_.to(device)))
    for _ in range(NUM_LOOP)
]

cal_improved_percent(exec_time, opt_exec_time_max)

avg_exec_time 9.825535774230957
avg_opt_exec_time 1.7269760370254517


'468.94%'