In [19]:
import copy

import numpy as np

import onnx
import onnxruntime as ort
import torch
import torch_tensorrt
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils import benchmark

import torchvision.models as models
import transformers

from matplotlib import pyplot as plt
from thop import profile as thop_profile


In [2]:
DEVICE = 'cuda:1'

# Seed the RNG so the random example inputs (and the randomly initialised
# torchvision weights) are the same on every Restart & Run All.
torch.manual_seed(0)

# Example inputs, created on the CPU and moved to DEVICE:
#   inputs - one sequence of 128 random token ids for the transformer models
#   pixels - one random 3x224x224 float32 image for the vision models
inputs = torch.randint(0, 1000, (1, 128)).to(DEVICE)
pixels = torch.randn((1,3,224,224)).to(DEVICE)

# Transformer models, loaded with torchscript=True, moved to DEVICE and put
# in inference mode.
bert = transformers.BertModel.from_pretrained('bert-base-uncased', torchscript=True).to(DEVICE).eval()
funnel = transformers.FunnelModel.from_pretrained('funnel-transformer/small', torchscript=True).to(DEVICE).eval()

# Vision models (constructed in the same order as before), moved to DEVICE
# and put in inference mode.
resnet18 = models.resnet18().to(DEVICE).eval()
resnet50 = models.resnet50().to(DEVICE).eval()
wide_resnet = models.wide_resnet50_2().to(DEVICE).eval()
maskrcnn = models.detection.maskrcnn_resnet50_fpn().to(DEVICE).eval()
resnet152 = models.resnet152().to(DEVICE).eval()
alexnet = models.alexnet().to(DEVICE).eval()

# Warm-up runs. They are inference-only, so autograd is disabled to avoid
# recording a graph for every forward pass.
# maskrcnn is not warmed up here (it was not in the original cell either).
with torch.no_grad():
    for text_model in (bert, funnel):
        text_model(inputs)
    for vision_model in (resnet18, resnet50, wide_resnet, resnet152, alexnet):
        vision_model(pixels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# Compile TorchScript versions of the vision models.
#
# torch.jit.script compiles from the module's source, so no example input is
# passed: its second positional parameter is the deprecated `optimize` flag,
# which is where `pixels` used to end up.
#
# The original cell scripted an undefined `resnet`. resnet50 is assumed here
# because the export cell writes that same name to 'resnet50-dynamic.onnx'.
# `jit_resnet` is kept as an alias so existing references keep working, and
# `jit_resnet18` is added because the TorchScript latency cell benchmarks it.
jit_resnet18 = torch.jit.script(resnet18)
jit_resnet50 = torch.jit.script(resnet50)
jit_resnet = jit_resnet50
jit_maskrcnn = torch.jit.script(maskrcnn)

# jit_bert = torch.jit.trace(bert, inputs)
# jit_funnel = torch.jit.trace(funnel, inputs)

In [79]:
# Build a half-precision MnasNet on DEVICE and export it to ONNX with a
# dynamic batch axis on both the input and the output.
mnasnet_fp16 = models.mnasnet1_0().eval().half().to(DEVICE)

# The example input must match the model's dtype. `pixels` is created as
# float32 in the setup cell, so cast it explicitly instead of relying on some
# other cell having rebound it to float16 first (.half() is a no-op on a
# tensor that is already float16).
torch.onnx.export(mnasnet_fp16, pixels.half(), 'mnasnet-fp16-dynamic.onnx',
                  export_params=True, opset_version=10, do_constant_folding=True,
                  input_names=['pixels'], output_names=['output'],
                  dynamic_axes={"pixels": {0: "batch"}, "output": {0: "batch"}})

In [54]:


# Half-precision ResNet18 export.
#
# nn.Module.half() casts the module's parameters in place and returns the
# same module, so calling it directly on `resnet18` would silently turn the
# shared float32 model into float16 for every other cell. Work on a deep copy.
resnet18_fp16 = copy.deepcopy(resnet18).half()

# Dedicated float16 example input; the global float32 `pixels` is left alone.
pixels_fp16 = torch.randn((1,3,224,224), dtype=torch.float16).to(DEVICE)

# The input is named 'pixels' (not 'inputs') so that create_and_bind_rgb,
# which binds 'pixels' and 'output', can drive this model. The dynamic_axes
# keys must be declared input/output names; the previous 'outputs' key
# matched nothing, which left the output batch axis undeclared.
torch.onnx.export(resnet18_fp16, pixels_fp16, 'resnet18-fp16-dynamic.onnx',
                  export_params=True, opset_version=10, do_constant_folding=True,
                  input_names=['pixels'], output_names=['output'],
                  dynamic_axes={"pixels": {0: "batch"}, "output": {0: "batch"}})

In [3]:
def export_dynamic_onnx(model, example_input, onnx_fpath, input_name, opset_version):
    """Export `model` to ONNX with a dynamic batch axis.

    Args:
        model: module to export.
        example_input: tensor passed to the exporter as the example input.
        onnx_fpath: destination path of the .onnx file.
        input_name: name given to the graph input ('pixels' or 'ids' below).
        opset_version: ONNX opset to target.

    The first graph output is named 'output'. Axis 0 of both the input and
    that output is declared dynamic under the label "batch".
    """
    torch.onnx.export(model, example_input, onnx_fpath,
                      export_params=True, opset_version=opset_version, do_constant_folding=True,
                      input_names=[input_name], output_names=['output'],
                      dynamic_axes={input_name: {0: "batch"}, "output": {0: "batch"}})


# (model, example input, output file, input name, opset), in the original
# export order. The second entry used to reference an undefined `resnet`;
# given its output file name it is the ResNet50 export.
# `pixels` is expected to still be the float32 image from the setup cell.
EXPORT_JOBS = [
    (resnet18, pixels, 'resnet18-dynamic.onnx', 'pixels', 10),
    (resnet50, pixels, 'resnet50-dynamic.onnx', 'pixels', 10),
    (resnet152, pixels, 'resnet152-dynamic.onnx', 'pixels', 10),
    (wide_resnet, pixels, 'wideresnet-dynamic.onnx', 'pixels', 10),
    (maskrcnn, pixels, 'maskrcnn-dynamic.onnx', 'pixels', 13),
    (bert, inputs, 'bert-dynamic.onnx', 'ids', 13),
    (funnel, inputs, 'funnel-dynamic.onnx', 'ids', 13),
]

for export_job in EXPORT_JOBS:
    export_dynamic_onnx(*export_job)

  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
  torch.empty((), dtype=torch.int64, device=device).fill_(image_size[0] // g[0]),
  torch.empty((), dtype=torch.int64, device=device).fill_(image_size[1] // g[1]),
  A = Ax4 // 4
  C = AxC // A
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
  return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)
  num_remove = shift * len(pooled_pos)
  pooling_flag = pooling_flag and block_index > 0
  shift = 2 if q_head.shape[1] != context_len else 1


In [10]:
# ONNX Runtime execution providers passed to the InferenceSession calls in
# this notebook. Only the CPU provider is active; the CUDA provider
# configuration is kept below, disabled, for reference:
#
#     ('CUDAExecutionProvider', {
#         'device_id': 1,
#         'arena_extend_strategy': 'kNextPowerOfTwo',
#         'gpu_mem_limit': 2 * 1024 * 1024 * 1024,
#         'cudnn_conv_algo_search': 'EXHAUSTIVE',
#         'do_copy_in_default_stream': True,
#     }),
providers = ['CPUExecutionProvider']

In [3]:
def create_and_bind_rgb(io_binding, bs, dtype, use_cuda=False, device_id=0):
    """Bind a random image batch and an output buffer to an ONNX Runtime IO binding.

    Args:
        io_binding: binding object exposing `bind_ortvalue_input` and
            `bind_ortvalue_output`.
        bs: batch size of the buffers to create.
        dtype: torch dtype of both the input and the output buffer.
        use_cuda: when True the OrtValues are created on 'cuda', otherwise
            on 'cpu'.
        device_id: device index handed to `OrtValue.ortvalue_from_numpy`.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        Tuple `(pixels_ortvalue, out_ortvalue)`: the OrtValue bound to the
        graph input 'pixels' (shape `(bs, 3, 224, 224)`) and the one bound to
        the graph output 'output' (shape `(bs, 1000)`).
    """
    device = 'cuda' if use_cuda else 'cpu'

    # Host-side buffers; torch creates them on the CPU by default.
    pixels_cpu = torch.randn((bs, 3, 224, 224), dtype=dtype).numpy()
    out = torch.empty((bs, 1000), dtype=dtype).numpy()

    pixels_ortvalue = ort.OrtValue.ortvalue_from_numpy(pixels_cpu, device, device_id)
    out_ortvalue = ort.OrtValue.ortvalue_from_numpy(out, device, device_id)

    io_binding.bind_ortvalue_input('pixels', pixels_ortvalue)
    io_binding.bind_ortvalue_output('output', out_ortvalue)
    return pixels_ortvalue, out_ortvalue

def create_and_bind_ids(io_binding, bs, use_cuda=True, device_id=0):
    """Bind a random batch of token ids to the 'ids' input of an IO binding.

    Args:
        io_binding: binding object exposing `bind_ortvalue_input`.
        bs: batch size. Previously ignored: the batch dimension was
            hard-coded to 1 regardless of this argument.
        use_cuda: when True (the default, matching the previously hard-coded
            'cuda') the OrtValue is created on 'cuda', otherwise on 'cpu'.
        device_id: device index handed to `OrtValue.ortvalue_from_numpy`.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        The OrtValue bound to 'ids'; it wraps integer ids drawn from
        [0, 1000) with shape `(bs, 128)`. No output is bound by this helper.
    """
    ids = torch.randint(0, 1000, (bs, 128)).numpy()
    device = 'cuda' if use_cuda else 'cpu'
    ids_ortvalue = ort.OrtValue.ortvalue_from_numpy(ids, device, device_id)
    io_binding.bind_ortvalue_input('ids', ids_ortvalue)
    return ids_ortvalue


In [4]:
def profile_model(onnx_fpath, providers, batch_size, dtype):
    """Profile one ONNX Runtime inference of an image model with torch.profiler.

    Args:
        onnx_fpath: path of the ONNX model; also used in the trace directory
            name.
        providers: execution providers for the InferenceSession.
        batch_size: batch size of the random input bound for each run.
        dtype: torch dtype of the bound input and output buffers.

    The trace goes to the TensorBoard handler under
    'pytorch/log/<onnx_fpath>-batch<batch_size>'. Nothing is returned.
    """
    session = ort.InferenceSession(onnx_fpath, providers=providers)
    io_binding = session.io_binding()

    # Un-profiled first run (memory allocation / graph capture warm-up).
    create_and_bind_rgb(io_binding, batch_size, dtype)
    session.run_with_iobinding(io_binding)

    # Bind fresh buffers, then record exactly one inference under the profiler.
    create_and_bind_rgb(io_binding, batch_size, dtype)
    trace_handler = torch.profiler.tensorboard_trace_handler(f'pytorch/log/{onnx_fpath}-batch{batch_size}')
    profiler = profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=trace_handler,
        record_shapes=True,
        with_flops=True,
        profile_memory=True,
    )
    with profiler, record_function('graph_inference'):
        session.run_with_iobinding(io_binding)
            
def eval_latency_and_macs(model, onnx_fpath, providers=None, dtype=torch.float32,
                          batch_sizes=(1, 2, 3, 4, 6, 8, 16, 32, 64)):
    """Measure ONNX Runtime latency of an image model over several batch sizes.

    Args:
        model: PyTorch model matching the ONNX file. Currently unused; it is
            only needed by the disabled MACs measurement.
        onnx_fpath: path of the ONNX model to benchmark.
        providers: execution providers for the InferenceSession. `None`
            (the default) means `["CUDAExecutionProvider"]`, the previous
            default value.
        dtype: torch dtype of the buffers bound by `create_and_bind_rgb`.
        batch_sizes: iterable of batch sizes to benchmark; defaults to the
            previously hard-coded list.

    Returns:
        Tuple `(latency, macs)`. `latency` holds, per batch size, the `.mean`
        of `benchmark.Timer.timeit(100)`. `macs` is always an empty list
        because the MACs measurement is disabled, so it cannot be used as an
        x-axis without re-enabling that code.
    """
    if providers is None:
        # Fresh list per call instead of a shared mutable default argument.
        providers = ["CUDAExecutionProvider"]

    latency, macs = [], []
    for bs in batch_sizes:
        print(f"Evaluating {onnx_fpath} with batch size {bs}")
        # A new session and binding per batch size, as before.
        session = ort.InferenceSession(onnx_fpath, providers=providers)
        io_binding = session.io_binding()
        create_and_bind_rgb(io_binding, bs, dtype)

        # Warm-up run, excluded from the measurement.
        session.run_with_iobinding(io_binding)

        # Measure ONNX latency.
        timer = benchmark.Timer(
            stmt="session.run_with_iobinding(io_binding)",
            globals={"session": session, "io_binding": io_binding}
        )
        latency.append(timer.timeit(100).mean)

        # Disabled MACs measurement, kept for reference:
        #     inputs = torch.randn((bs,3,224,224), dtype=dtype).to('cuda:1')
        #     macs.append(thop_profile(model, (inputs,))[0])
    return latency, macs


In [18]:
# Latency sweep of the ResNet18 ONNX model over a few batch sizes, using the
# module-level `providers` list. One mean latency is printed per batch size.
dtype = torch.float32
device = 'cpu'

for bs in [1,2,3,4,6,8,12]:
    session = ort.InferenceSession('resnet18-dynamic.onnx', providers=providers)
    io_binding = session.io_binding()

    # Reuse the shared helper instead of re-implementing the buffer creation
    # and binding inline. It binds a random (bs, 3, 224, 224) input to
    # 'pixels' and a (bs, 1000) buffer to 'output', with device index 0.
    pixels_ortvalue, out_ortvalue = create_and_bind_rgb(
        io_binding, bs, dtype, use_cuda=(device == 'cuda'))

    timer = benchmark.Timer(
        stmt="session.run_with_iobinding(io_binding)",
        globals={"session": session, "io_binding": io_binding}
    )
    print(timer.timeit(100).mean)

0.0034613483771681786
0.004808780588209629
0.006889117360115052
0.008522093817591668
0.011561025455594063
0.014963545054197312
0.02186247918754816


In [14]:
# Time whichever `session` / `io_binding` pair is currently in the notebook
# namespace and print the mean of 100 runs. This cell creates neither name;
# in this file the latency loop of the preceding cell assigns them.
timer = benchmark.Timer(
    stmt="session.run_with_iobinding(io_binding)",
    globals=dict(session=session, io_binding=io_binding),
)
print(timer.timeit(number=100).mean)

# Disabled alternatives, kept for reference:
# latency_resnet18_cpu, macs_resnet18_cpu = eval_latency_and_macs(resnet18.cpu(), "resnet18-dynamic.onnx", providers, dtype=torch.float32)
# profile_model("resnet18-fp16-dynamic.onnx", providers, 8, torch.half)

0.003409965671598911


In [1]:
# Benchmark the two half-precision ONNX exports.
# - Pass the fp16 models created by the export cells (`resnet18_fp16`,
#   `mnasnet_fp16`). The original passed `resnet18.half()`, which converts
#   the shared float32 model in place, and `mnasnet.half()`, where `mnasnet`
#   is not defined anywhere visible in this notebook.
# - Store the ResNet18 MACs under `macs_resnet18_fp16` so the float32 result
#   `macs_resnet18` is not overwritten.
latency_resnet18_fp16, macs_resnet18_fp16 = eval_latency_and_macs(resnet18_fp16, "resnet18-fp16-dynamic.onnx", providers, dtype=torch.float16)
latency_mnasnet_fp16, macs_mnasnet_fp16 = eval_latency_and_macs(mnasnet_fp16, "mnasnet-fp16-dynamic.onnx", providers, dtype=torch.float16)


NameError: name 'eval_latency_and_macs' is not defined

In [None]:


# Benchmark the float32 ONNX exports of the ResNet family.
# The ResNet50 line used to pass an undefined `resnet`; the model matching
# "resnet50-dynamic.onnx" in the setup cell is `resnet50`.
latency_resnet18, macs_resnet18 = eval_latency_and_macs(resnet18, "resnet18-dynamic.onnx", providers)
latency_resnet50, macs_resnet50 = eval_latency_and_macs(resnet50, "resnet50-dynamic.onnx", providers)
latency_resnet152, macs_resnet152 = eval_latency_and_macs(resnet152, "resnet152-dynamic.onnx", providers)
latency_wideresnet50, macs_wideresnet50 = eval_latency_and_macs(wide_resnet, "wideresnet-dynamic.onnx", providers)

In [90]:
# Display the fp16 latency lists returned by eval_latency_and_macs
# (ResNet18 first, MnasNet second), one entry per benchmarked batch size.
latency_resnet18_fp16, latency_mnasnet_fp16

([0.0013843486830592156,
  0.0012388412654399871,
  0.0012840042635798455,
  0.0013786252960562705,
  0.0014433470740914346,
  0.0017063410207629204,
  0.0030685196071863173,
  0.00543682936578989,
  0.013508461378514768],
 [0.0015029918029904365,
  0.0014550980925559997,
  0.0015874777734279633,
  0.001561770886182785,
  0.0017023448646068572,
  0.0018440712988376617,
  0.003331167474389076,
  0.00543928399682045,
  0.010700393170118332])

In [20]:
batch = [1, 2, 3, 4, 6, 8, 16, 32, 64]


def plot_latency_curves(curves, xlabel, title, log_axes=False):
    """Draw latency curves on a fresh figure.

    Args:
        curves: iterable of `(x_values, latencies, label)` triples.
        xlabel: label of the x axis.
        title: figure title.
        log_axes: when True both axes use a logarithmic scale.

    A series whose x-values are empty or differ in length from its latencies
    is reported and skipped, because matplotlib raises on such input. If no
    series can be drawn, no figure is shown.
    """
    fig, ax = plt.subplots()
    drawn = 0
    for x_values, latencies, label in curves:
        if len(x_values) == 0 or len(x_values) != len(latencies):
            print(f"Skipping {label!r}: {len(x_values)} x-values for {len(latencies)} latencies")
            continue
        ax.plot(x_values, latencies, label=label)
        drawn += 1

    if drawn == 0:
        plt.close(fig)
        return

    ax.set_xlabel(xlabel)
    ax.set_ylabel('Latency')
    ax.set_title(title)
    if log_axes:
        ax.set_xscale('log')
        ax.set_yscale('log')
    ax.legend()
    plt.show()


# NOTE(review): the title says "CUDA Backend", but the `providers` list visible
# in this notebook enables only CPUExecutionProvider - confirm which backend
# produced the plotted numbers.
TITLE = 'ONNX with CUDA Backend on RTX-8000'

# Latency against MACs. eval_latency_and_macs currently returns empty MACs
# lists (its MACs measurement is commented out), so these series are skipped
# until that code is re-enabled. The fp16 ResNet18 curve used to be plotted
# against batch size on this MACs axis; it now uses the ResNet18 MACs, since
# the multiply-accumulate count does not depend on the dtype.
plot_latency_curves(
    [
        (macs_mnasnet_fp16, latency_mnasnet_fp16, "Mnasnet"),
        (macs_resnet18, latency_resnet18, "ResNet18"),
        (macs_resnet18, latency_resnet18_fp16, "ResNet18 fp 16"),
        (macs_resnet50, latency_resnet50, "ResNet50"),
        (macs_resnet152, latency_resnet152, "ResNet152"),
        (macs_wideresnet50, latency_wideresnet50, "Wide ResNet50"),
    ],
    xlabel='MACs', title=TITLE, log_axes=True,
)

# Latency against batch size.
plot_latency_curves(
    [
        (batch, latency_resnet18_fp16, "ResNet18 fp 16"),
        (batch, latency_resnet18, "ResNet18"),
        (batch, latency_resnet50, "ResNet50"),
        (batch, latency_resnet152, "Resnet152"),
        (batch, latency_wideresnet50, "Wide ResNet50"),
    ],
    xlabel='Batch Size', title=TITLE,
)


NameError: name 'macs_mnasnet_fp16' is not defined

In [33]:
BATCH = [1, 2, 3, 4, 6, 8, 16, 32, 64]

# No visible cell defines `jit_resnet18` (the scripting cell binds
# `jit_resnet`), so compile it here to keep this cell runnable on its own.
# assumes `resnet18` still holds float32 weights - TODO confirm no earlier
# cell converted it in place with .half()
jit_resnet18 = torch.jit.script(resnet18)

# Measure TorchScript latency per batch size. This is inference only, so
# autograd is disabled for the timed calls.
jit_latency = []
with torch.no_grad():
    for b in BATCH:
        ins = torch.randn((b, 3,224,224)).to(DEVICE)
        timer = benchmark.Timer(
            stmt="model(ins)",
            globals={"model": jit_resnet18, "ins": ins}
        )
        jit_latency.append(timer.timeit(100).mean)

print(jit_latency)

# Profile one forward pass of the eager (non-scripted) ResNet18. `ins` is the
# tensor left over from the last loop iteration, i.e. the largest batch.
with torch.no_grad(), \
     profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(f'pytorch/log/resnet18-vanilla'),
             record_shapes=True, with_flops=True, profile_memory=True):
    with record_function('graph_inference'):
        resnet18(ins)

[0.0020879823341965675, 0.0019292379915714264, 0.0019974905997514726, 0.0019260652735829354, 0.0028885624557733534, 0.003465595655143261, 0.005266679674386978, 0.009869575463235379, 0.018852105885744094]


In [47]:
# Display the float32 ResNet18 ONNX latencies returned by
# eval_latency_and_macs, one entry per benchmarked batch size.
latency_resnet18

[0.0015636616200208663,
 0.0014715524017810822,
 0.0016967617347836495,
 0.0018857941031455994,
 0.002570772022008896,
 0.0030693775415420533,
 0.004890320859849453,
 0.010048705376684666,
 0.018493369854986667]

In [None]:
# Numerical check: run the same random image through the ResNet50 ONNX export
# and the eager PyTorch model, then show the largest absolute difference.

# Named CHECK_BATCH so the BATCH list used by the latency cell is not rebound
# to an int.
CHECK_BATCH = 1
# The original opened "resnet-dynamic.onnx" and called an undefined `resnet`.
# The export cell visible in this notebook writes 'resnet50-dynamic.onnx', so
# that file and `resnet50` are used here - confirm if another model was meant.
ONNX_FPATH = "resnet50-dynamic.onnx"


def bind_io(io_binding, ins, outs, device='cuda', device_id=0):
    """Bind `ins` to 'pixels' and `outs` to 'output' on an ORT IO binding.

    Args:
        io_binding: ONNX Runtime IO binding to populate.
        ins: input tensor; copied to the host and wrapped as an OrtValue.
        outs: tensor giving the shape and dtype of the output buffer.
        device: ORT device name for both OrtValues. Defaults to 'cuda', the
            previously hard-coded value.
        device_id: ORT device index. Defaults to 0, as before.
    """
    pixels_ortvalue = ort.OrtValue.ortvalue_from_numpy(ins.cpu().numpy(), device, device_id)
    out_ortvalue = ort.OrtValue.ortvalue_from_numpy(outs.cpu().numpy(), device, device_id)

    io_binding.bind_ortvalue_input('pixels', pixels_ortvalue)
    io_binding.bind_ortvalue_output('output', out_ortvalue)


ins = torch.randn((CHECK_BATCH, 3,224,224)).to(DEVICE)
# The output buffer only serves as a host-side template, so it stays on the CPU.
outs = torch.empty((CHECK_BATCH, 1000), dtype=torch.float32)

session = ort.InferenceSession(ONNX_FPATH, providers=providers)
# Bind on the GPU only when the session really runs on the CUDA provider;
# with a CPU-only provider list the buffers must live on the host.
ort_device = 'cuda' if 'CUDAExecutionProvider' in session.get_providers() else 'cpu'
io_binding = session.io_binding()
bind_io(io_binding, ins, outs, device=ort_device)

session.run_with_iobinding(io_binding)
onnx_out = io_binding.copy_outputs_to_cpu()[0]

# Reference output from the eager model; no gradients are needed.
with torch.no_grad():
    vanilla_out = resnet50(ins)

# Show one summary number instead of dumping the whole difference array.
diff = np.asarray(onnx_out) - vanilla_out.cpu().numpy()
np.abs(diff).max()

In [7]:
# Shell escape: print the nvidia-smi report for this machine.
!nvidia-smi

Sun Oct 16 14:41:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.00    Driver Version: 470.82.00    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 8000     Off  | 00000000:19:00.0 Off |                  Off |
| 33%   42C    P8    33W / 260W |  11197MiB / 48601MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000     Off  | 00000000:1A:00.0 Off |                  Off |
| 33%   49C    P8    34W / 260W |   4191MiB / 48601MiB |      0%      Default |
|       

In [19]:
# Capture one AlexNet forward pass in a CUDA graph and replay it.
#
# Changes from the original cell:
# - the warm-up ran an undefined `resnet` while the capture ran `alexnet`;
#   the warm-up now exercises the model that is captured,
# - the captured output is kept in `static_output`, which is where replay()
#   writes its results,
# - autograd is disabled, since this is inference only,
# - everything runs with the input's device as the current CUDA device, so
#   the side stream and the graph belong to the GPU that holds the model
#   (DEVICE is not the default device 0).

# Fixed input tensor for the graph; .float() guards against `pixels` having
# been rebound to float16 by another cell (no-op if it is already float32).
static_input = pixels.float()

with torch.cuda.device(static_input.device), torch.no_grad():
    # Warm up on a side stream before capturing.
    stream = torch.cuda.Stream()
    stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(stream):
        for _ in range(3):
            alexnet(static_input)
    torch.cuda.current_stream().wait_stream(stream)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = alexnet(static_input)

    # Re-run the captured kernels; results land in `static_output`.
    graph.replay()