# Getting Dependencies
For this tutorial we will be needing the following depencies, we expreminted in the following 

In [284]:
%%capture
!pip install torch
!pip install torchvision
!pip install tqdm
!pip install onnxruntime-gpu
!pip install ipywidgets

In [285]:
import torch
from torch import nn
from torchvision import models
import torchvision 
from timeit import timeit

N = 100
example = torch.rand(1, 3, 224, 224).cuda()

## 1. Ordinary Torch Model
We begin by downloading the pre-trained ResNet50 pytorch model from the torchhub, then we setup the timing function, and last, we always add the model. cuda() was used since the data must transit from the ram to the gpu's vram while running infernce on cuda, and we wanted to compare it fairly to onnx runtime that uses a numpy Array.

In [286]:
# Download the Pytorch Resnet50
model = torchvision.models.resnet50(pretrained=True)
model.eval()
model.cuda()


def inf_torch(example):
  with torch.no_grad():
    output = model(example.cuda())
    torch.cuda.synchronize()
    return output
example = example.cpu()

torch_t = timeit(lambda : inf_torch(example), number=N)/N
torch_output = F.softmax(inf_torch(example), dim=1).topk(1).indices

# 2. Scripting the model 
Convert the model to an IR using scripting 

In [287]:
# convert to TorchScript
traced_script_module = torch.jit.trace(model, example.cuda())
traced_script_module.eval()
traced_script_module.cuda()

def inf_torch_scripted(example):
  with torch.no_grad():
    output = traced_script_module(example.cuda())
    torch.cuda.synchronize()
    return output

example = example.cpu()
# We count the time it takes to pas the data from cpu to gpu 
scripted_t = timeit(lambda : inf_torch_scripted(example), number=N)/N
scripted_output = F.softmax(inf_torch_scripted(example), dim=1).topk(1).indices

## 3. Convert To Onnx
We convert the pytorch model to onnx

In [288]:
from collections import OrderedDict
from torch.onnx import TrainingMode

# Export the model
def convert_to_onnx(
    model_pytorch , output_path: str, inputs_pytorch, opset: int = 12
) -> None:
    # dynamic axis == variable length axis
    dynamic_axis = OrderedDict()
    for k in inputs_pytorch.keys():
        dynamic_axis[k] = { 0: "batch_size"}
    dynamic_axis["output"] = { 0: "batch_size"}
    with torch.no_grad():
        torch.onnx.export(
            model_pytorch,  # model to optimize
            args=tuple(inputs_pytorch.values()),  # tuple of multiple inputs
            f=output_path,  # output path / file object
            opset_version=opset,  # the ONNX version to use, 13 if quantized model, 12 for not quantized ones
            do_constant_folding=True,  # simplify model (replace constant expressions)
            input_names=list(inputs_pytorch.keys()),  # input names
            output_names=["output"],  # output axis name
            dynamic_axes=dynamic_axis,  # declare dynamix axis for each input / output
            training=TrainingMode.EVAL,  # always put the model in evaluation mode
            verbose=False,
        )

In [289]:
#create this folder structure for Triton in the next tutorial

!mkdir -p models/onnx-model-gpu
!mkdir -p models/onnx-model-gpu/

convert_to_onnx(model.cuda(),'models/onnx-model-gpu/1/model.onnx',{"input":example.cuda()})

Now that we have model exported as onnx, we will be using onnx-runtime to run the onnx with different runtimes

# 4. Run the onnx with Cuda Backend

In [290]:
import numpy as np
data = example.cpu().detach().numpy()

In [291]:
import onnxruntime

onnx_model = onnxruntime.InferenceSession('models/onnx-model-gpu/1/model.onnx',
providers=[
    ('CUDAExecutionProvider',
    {
        'device_id': 0,
    })
])

def onnx_inf(data):
    return onnx_model.run(None,{
                onnx_model.get_inputs()[0].name: data
           })
           


In [292]:
#warmup
onnx_inf(data)
onnx_t = timeit(lambda : onnx_inf(data), number=N)/N
onnx_output = np.argpartition(onnx_inf(data)[0][0],-1)[-1:]


# Run the model with TensorRT Backend 

In [293]:
import onnxruntime

onnx_model = onnxruntime.InferenceSession('models/onnx-model-gpu/1/model.onnx',
providers=[
    ('TensorrtExecutionProvider',
    {
        'device_id': 0,
    })
])

data = example.cpu().detach().numpy()

def tensorrt_inf(data):
    return onnx_model.run(None,{
                onnx_model.get_inputs()[0].name: data
           })



In [294]:
#warmup
tensorrt_inf(data)
tensorrt_inf_t = timeit(lambda : tensorrt_inf(data), number=N)/N
tensorrt_inf_output  = np.argpartition(tensorrt_inf(data)[0][0],-1)[-1:]



# 5. Run the model with TensorRT FP16 Backend
In this example we ran the same TensoRT model, but in FP16 precision

In [295]:
import onnxruntime

onnx_model = onnxruntime.InferenceSession('models/onnx-model-gpu/1/model.onnx',
providers=[
    ('TensorrtExecutionProvider',
    {
        'device_id': 0,
        'trt_fp16_enable':True,
    })
])

data = example.cpu().detach().numpy()

def tensorrt_inf_fp16(data):
    return onnx_model.run(None,{
                onnx_model.get_inputs()[0].name: data
           })



In [296]:
#warmup phase 
timeit(lambda : tensorrt_inf_fp16(data), number=N)/N

tensorrt_inf_fp16_t = timeit(lambda : tensorrt_inf_fp16(data), number=N)/N
tensorrt_inf_fp16_t_output = np.argpartition(tensorrt_inf_fp16(data)[0][0],-1)[-1:]



In [297]:
print(f"Results compraion : \n \
        PyTorch-cuda {torch_t} \n \
        Pytorch-cuda-scripted {scripted_t} \n \
        onnx-cuda {onnx_t} \n \
        TensorRT {tensorrt_inf_t} \n \
        TensorRT-FP16 {tensorrt_inf_fp16_t}")


Results compraion : 
         PyTorch-cuda 0.005198871344327926 
         Pytorch-cuda-scripted 0.00447928112000227 
         onnx-cuda 0.0031806880980730055 
         TensorRT 0.0025665512308478355 
         TensorRT-FP16 0.0011617697402834892


In [298]:
print(f"SpeedUPs : \n \
        PyTorch-cuda {torch_t/torch_t} \n \
        Pytorch-cuda-scripted {torch_t/scripted_t} \n \
        onnx-cuda {torch_t/onnx_t} \n \
        TensorRT {torch_t/tensorrt_inf_t} \n \
        TensorRT-FP16 {torch_t/tensorrt_inf_fp16_t}")


SpeedUPs : 
         PyTorch-cuda 1.0 
         Pytorch-cuda-scripted 1.1606485962920075 
         onnx-cuda 1.6345115220438058 
         TensorRT 2.0256253924884753 
         TensorRT-FP16 4.474958474180369


# Quantization INT8 CPU

In [299]:
model_quantized = torchvision.models.quantization.resnet50(pretrained=True, quantize=True)
model_quantized.eval()

def inf_quantized(example):
  with torch.no_grad():
    output = model_quantized(example)
    torch.cuda.synchronize()
    return output
example = example.cpu()

quantized_inf_t = timeit(lambda : inf_quantized(example), number=N)/N
quantized_inf_output = F.softmax(inf_quantized(example), dim=1).topk(1).indices

In [300]:
model.cpu()

def inf_torch_cpu(example):
  with torch.no_grad():
    output = model(example)
    torch.cuda.synchronize()
    return output
example = example.cpu()

torch_cpu_t = timeit(lambda : inf_torch_cpu(example), number=N)/N
torch_cpu_output = F.softmax(inf_torch_cpu(example), dim=1).topk(1).indices

In [301]:
print(f"SpeedUPs : \n \
        PyTorch-CPU {torch_cpu_t/torch_cpu_t} \n \
        INT8-CPU {torch_cpu_t/quantized_inf_t}")


SpeedUPs : 
         PyTorch-CPU 1.0 
         INT8-CPU 1.9307451741422934


##  Comparing outputs

In [302]:
print(f"Outputs : \n \
        PyTorch-cuda {torch_output} \n \
        Pytorch-cuda-scripted {scripted_output} \n \
        onnx-cuda {onnx_output} \n \
        TensorRT {tensorrt_inf_output} \n \
        TensorRT-FP16 {tensorrt_inf_fp16_t_output} \n \
        INT8 {quantized_inf_output}       ")

Outputs : 
         PyTorch-cuda tensor([[610]], device='cuda:0') 
         Pytorch-cuda-scripted tensor([[610]], device='cuda:0') 
         onnx-cuda [610] 
         TensorRT [610] 
         TensorRT-FP16 [610] 
         INT8 tensor([[610]])       


In [303]:
def get_model_size(model):
    torch.save(model.state_dict(), 'temp_saved_model.pt')
    model_size_in_mb = os.path.getsize('temp_saved_model.pt') >> 20
    os.remove('temp_saved_model.pt')
    return model_size_in_mb

In [304]:
original_model_size = get_model_size(model)
quantized_model_size = get_model_size(model_quantized)

print(f"Model size : \n \
        original model size {original_model_size} mb \n \
        quantized_model_size {quantized_model_size} mb")

Model size : 
         original model size 97 mb 
         quantized_model_size 24 mb
