<a href="https://colab.research.google.com/github/LeSaUi/DLtest/blob/main/OPTIMIZING_VISION_TRANSFORMER_MODEL_FOR_DEPLOYMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install torch torchvision timm pandas requests


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
from PIL import Image
import torch
import timm
import requests
import torchvision.transforms as transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

print(torch.__version__)
# should be 1.8.0

# https://github.com/facebookresearch/deit
model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True)
model.eval()

transform = transforms.Compose([
    transforms.Resize(256, interpolation=3),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)
])

img = Image.open(requests.get("https://raw.githubusercontent.com/pytorch/ios-demo-app/master/HelloWorld/HelloWorld/HelloWorld/image.png", stream=True).raw)
img = transform(img)[None,]
out = model(img)
clsidx = torch.argmax(out)
print(clsidx.item()) # the output should be 269, which, according to the ImageNet list of class index to https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a, maps to timber wolf, grey wolf, gray wolf, Canis Iupus


2.0.0+cu118


Using cache found in /root/.cache/torch/hub/facebookresearch_deit_main


269


In [24]:
# To use the model on mobile, we first need to script the model. See the https://pytorch.org/tutorials/recipes/script_optimized.html
# Run the code below to convert the DeiT model used in the previous step to the TorchScript format that can run on mobile.
model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True)
model.eval()
scripted_model = torch.jit.script(model)
scripted_model.save('fbdeit_scripted.pt')


Using cache found in /root/.cache/torch/hub/facebookresearch_deit_main


In [25]:
# about quantization, refer to https://pytorch.org/docs/stable/quantization.html?highlight=quantization#dynamic-quantization

# Use 'x86' for server inference (the old 'fbgemm' is still availbable but 'x86' is the recommended default) and ''qnnpack'' for mobile inference.
backend = 'x86' # replaced with ''qnnpack'' causing much worse inference speed for quantized model on this nobtebook
model.qconfig = torch.quantization.get_default_qconfig(backend)
torch.backends.quantized.engine = backend

quantized_model = torch.quantization.quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
scripted_quantized_model = torch.jit.script(quantized_model)
scripted_quantized_model.save('fbdeit_scripted_quantized.pt')



In [26]:
out = scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item()) # The same output 269 should be printed

269


In [27]:
from torch.utils.mobile_optimizer import optimize_for_mobile
optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model)
optimized_scripted_quantized_model.save('fbdeit_optimized_scripted_quantized.pt')


In [28]:
out = optimized_scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item()) # Again, the same output 269 should be printed

269


In [29]:
optimized_scripted_quantized_model._save_for_lite_interpreter('fbdeit_optimized_scripted_quantized_lite.ptl')
ptl = torch.jit.load('fbdeit_optimized_scripted_quantized_lite.ptl')


In [39]:
# this version torch 1.8.0 profile
uc = False
with torch.autograd.profiler.profile(use_cuda=uc) as prof1:
  out = model(img)
with torch.autograd.profiler.profile(use_cuda=uc) as prof2:
  out = scripted_model(img)
with torch.autograd.profiler.profile(use_cuda=uc) as prof3:
  out = scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=uc) as prof4:
  out = optimized_scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=uc) as prof5:
  out = ptl(img)

print(f'original model: {prof1.self_cpu_time_total/1000:.2f}ms')
print(f'scripted model: {prof2.self_cpu_time_total/1000:.2f}ms')
print(f'scripted & quantized model: {prof3.self_cpu_time_total/1000:.2f}ms')
print(f'scripted & quantized & optimized model: {prof4.self_cpu_time_total/1000:.2f}.ms')
print(f'scripted & quantized & optimized & lite model: {prof5.self_cpu_time_total/1000:.2f}ms')



# The result running on a Google Colab are:
# original model: 1236.69ms
# scripted model: 1226.72ms
# scripted & quantized model: 593.19ms
# scripted & quantized & optimized model: 598.01ms
# lite model: 600.72ms

original model: 649.09ms
scripted model: 667.20ms
scripted & quantized model: 400.80ms
scripted & quantized & optimized model: 366.27.ms
scripted & quantized & optimized & lite model: 399.75ms


In [38]:
# this version torch 2.0.0 profile. refer https://tutorials.pytorch.kr/recipes/recipes/profiler_recipe.html

uc = [torch.profiler.ProfilerActivity.CPU,torch.profiler.ProfilerActivity.CUDA]
with torch.profiler.profile(activities=uc) as pf1:
  out = model(img)
with torch.profiler.profile(activities=uc) as pf2:
  out = scripted_model(img)
with torch.profiler.profile(activities=uc) as pf3:
  out = scripted_quantized_model(img)
with torch.profiler.profile(activities=uc) as pf4:
  out = optimized_scripted_quantized_model(img)
with torch.profiler.profile(activities=uc) as pf5:
  out = ptl(img)

# if you want to see cuda profile, convert model to cuda
# this example model maybe not cuda
print(pf1.key_averages().table(
    sort_by="cuda_time_total", row_limit=3)) # row_limit = -1 means print all NN layer
print(pf2.key_averages().table(
    sort_by="cpu_time_total", row_limit=3))

----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::conv2d         0.00%      11.000us         1.37%       8.929ms       8.929ms             1  
           aten::convolution         0.01%      54.000us         1.37%       8.918ms       8.918ms             1  
          aten::_convolution         0.00%      22.000us         1.36%       8.864ms       8.864ms             1  
----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 651.570ms

----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                        Name    Self CPU %      

In [41]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'Model': ['original model', 'scripted model', 'scripted & quantized model', 'scripted & quantized & optimized model', 'scripted & quantized & optimized & lite model']})
df = pd.concat([df, pd.DataFrame([
    [f'{prof1.self_cpu_time_total / 1000:.2f}ms', '0%'],
    [f'{prof2.self_cpu_time_total / 1000:.2f}ms',
     f'{(prof1.self_cpu_time_total - prof2.self_cpu_time_total) / prof1.self_cpu_time_total * 100}'],
    [f'{prof3.self_cpu_time_total / 1000:.2f}ms',
     f'{(prof1.self_cpu_time_total - prof3.self_cpu_time_total) / prof1.self_cpu_time_total * 100}'],
    [f'{prof4.self_cpu_time_total / 1000:.2f}ms',
     f'{(prof1.self_cpu_time_total - prof4.self_cpu_time_total) / prof1.self_cpu_time_total * 100}'],
    [f'{prof5.self_cpu_time_total / 1000:.2f}ms',
     f'{(prof1.self_cpu_time_total - prof5.self_cpu_time_total) / prof1.self_cpu_time_total * 100}']],
    columns=['Inference Time', 'Reduction'])], axis=1)
print(df)

                                           Model Inference Time  \
0                                 original model       649.09ms   
1                                 scripted model       667.20ms   
2                     scripted & quantized model       400.80ms   
3         scripted & quantized & optimized model       366.27ms   
4  scripted & quantized & optimized & lite model       399.75ms   

            Reduction  
0                  0%  
1  -2.789751806375079  
2   38.25201435856353  
3   43.57130752283967  
4  38.414549600209526  
