In [1]:
import numpy as np

import torch
from torch.profiler import ProfilerActivity, profile, record_function
from torch.utils import benchmark

import torch_tensorrt as ttrt
import onnxruntime as ort
from torchvision import models
!pwd

/home/jaredfer/Projects/Year3/DeviceBenchmarking


In [2]:
def create_and_bind_rgb(io_binding, bs, dtype, use_cuda=False):
    """Bind a random RGB batch and an output buffer to an ONNX Runtime IO binding.

    Args:
        io_binding: Object exposing ``bind_ortvalue_input`` and
            ``bind_ortvalue_output`` (in this notebook, ``session.io_binding()``).
        bs: Batch size of the generated input.
        dtype: ``"fp16"`` or ``torch.half`` selects half precision; any other
            value (e.g. ``"fp32"`` or ``torch.float32``) selects float32.
        use_cuda: When true the buffers are created for ``'cuda'`` (device
            index 0), otherwise for ``'cpu'``.

    The input is bound under the name ``'inputs'`` with shape ``(bs, 3, 224, 224)``
    and the output under ``'outputs'`` with shape ``(bs, 1000)``.
    """
    device = 'cuda' if use_cuda else 'cpu'
    # Accept both the string tag and the torch dtype: callers in this notebook
    # pass either, and a torch.half argument used to fall through the string
    # comparison and silently produce float32 buffers.
    torch_dtype = torch.half if dtype in ("fp16", torch.half) else torch.float32

    # Sample in float32 and cast afterwards so the helper does not depend on a
    # half-precision random kernel being available on CPU.
    pixels_cpu = torch.randn((bs, 3, 224, 224), dtype=torch.float32).to(torch_dtype).numpy()
    # Uninitialised buffer that the session writes its result into.
    out = torch.empty((bs, 1000), dtype=torch_dtype).numpy()

    pixels_ortvalue = ort.OrtValue.ortvalue_from_numpy(pixels_cpu, device, 0)
    out_ortvalue = ort.OrtValue.ortvalue_from_numpy(out, device, 0)

    io_binding.bind_ortvalue_input('inputs', pixels_ortvalue)
    io_binding.bind_ortvalue_output('outputs', out_ortvalue)


In [3]:
def profile_onnx(onnx_fpath, batch_size, dtype, providers=['CUDAExecutionProvider']):
    """Profile a single ONNX Runtime inference and export a TensorBoard trace.

    Args:
        onnx_fpath: Path of the ONNX model to load.
        batch_size: Batch size of the random input bound to the session.
        dtype: Precision tag forwarded to ``create_and_bind_rgb`` and used in
            the trace directory name.
        providers: Execution providers handed to ``ort.InferenceSession``.
            The default list is never mutated here.

    The trace is written under ``gpu_util/log/onnx-batch{batch_size}-{dtype}``.
    """
    session = ort.InferenceSession(onnx_fpath, providers=providers)
    io_binding = session.io_binding()

    # Buffers are always requested on CUDA, matching the default provider.
    create_and_bind_rgb(io_binding, batch_size, dtype, True)
    # Warm-up run kept outside the profiler so one-time setup is not traced.
    session.run_with_iobinding(io_binding)

    # Tag the directory with the dtype, like the other profile_* helpers do;
    # otherwise fp32 and fp16 traces for the same batch size share one folder.
    trace_dir = f'gpu_util/log/onnx-batch{batch_size}-{dtype}'
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir),
                 record_shapes=True, with_flops=True, profile_memory=True):
        with record_function('graph_inference'):
            session.run_with_iobinding(io_binding)
    
def profile_pytorch(model, inputs, batch_size, device, dtype):
    """Profile one eager forward pass of ``model`` and export a TensorBoard trace.

    Args:
        model: Callable invoked as ``model(inputs)``.
        inputs: Argument passed to the model.
        batch_size: Used only to name the trace directory.
        device: Accepted for signature parity with the other helpers; not used here.
        dtype: Used only to name the trace directory.
    """
    # Untraced warm-up call before the profiler starts.
    model(inputs)

    write_trace = torch.profiler.tensorboard_trace_handler(
        f'gpu_util/log/pytorch-batch{batch_size}-{dtype}'
    )
    profiler = profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=write_trace,
        record_shapes=True,
        with_flops=True,
        profile_memory=True,
    )
    # The labelled region wraps the single profiled forward pass.
    with profiler, record_function('graph_inference'):
        model(inputs)

def profile_jit(model, inputs, batch_size, device, dtype):
    """Trace ``model`` with TorchScript, then profile one forward pass of the traced module.

    Args:
        model: Module handed to ``torch.jit.trace``.
        inputs: Example input used for tracing and for both forward calls.
        batch_size: Used only to name the trace directory.
        device: Target passed to ``.to(...)`` on the traced module.
        dtype: Used only to name the trace directory.
    """
    traced = torch.jit.trace(model, (inputs,))
    jit_model = traced.to(device)
    # Untraced warm-up call before the profiler starts.
    jit_model(inputs)

    write_trace = torch.profiler.tensorboard_trace_handler(
        f'gpu_util/log/torchscript-batch{batch_size}-{dtype}'
    )
    profiler = profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=write_trace,
        record_shapes=True,
        with_flops=True,
        profile_memory=True,
    )
    # The labelled region wraps the single profiled forward pass.
    with profiler, record_function('graph_inference'):
        jit_model(inputs)

def profile_trt(model, inputs, batch_size, device, dtype):
    """Compile ``model`` with Torch-TensorRT and profile one forward pass.

    Args:
        model: Module handed to ``ttrt.compile``.
        inputs: Example input; used for compilation and for the forward calls.
        batch_size: Used only to name the trace directory.
        device: Accepted for signature parity with the other helpers; not used here.
        dtype: ``"fp16"`` or ``torch.half`` enables half and float precision;
            any other value enables float only. Also used in the trace
            directory name.
    """
    # The driver cell may pass either the string tag or the torch dtype, so
    # recognise both instead of silently falling back to float.
    if dtype in ("fp16", torch.half):
        enabled_precisions = {ttrt.dtype.half, ttrt.dtype.float}
    else:
        enabled_precisions = {ttrt.dtype.float}

    # `inputs` is documented as a list of specs/tensors; wrap a bare tensor.
    compile_inputs = [inputs] if isinstance(inputs, torch.Tensor) else inputs
    trt_model = ttrt.compile(model, inputs=compile_inputs, enabled_precisions=enabled_precisions)

    # Untraced warm-up call, consistent with the other profile_* helpers.
    trt_model(inputs)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(f'gpu_util/log/trt-batch{batch_size}-{dtype}'),
                 record_shapes=True, with_flops=True, profile_memory=True):
        with record_function('graph_inference'):
            # Previously referenced the undefined name `ttrt_model` (NameError).
            trt_model(inputs)


In [4]:
device = "cuda:1"

for batch_size in [1]:
    for dtype in ["fp32", "fp16"]:
        print(f"Profiling ResNet for {dtype} at batch {batch_size}")
        profile_onnx(f'onnx/onnx_models/resnet50-{dtype}.onnx', batch_size, dtype)

        # Keep the string tag in `dtype` and use a separate name for the torch
        # dtype: rebinding the loop variable meant the helpers below would get
        # a torch dtype where they compare against the "fp16" tag.
        torch_dtype = torch.half if dtype == "fp16" else torch.float32
        inputs = torch.randn((batch_size, 3, 224, 224), device=device, dtype=torch_dtype)
        model = models.resnet50().eval().to(device, dtype=torch_dtype)

        # Framework comparisons are currently disabled; re-enable as needed.
        # profile_pytorch(model, inputs, batch_size, device, dtype)
        # profile_jit(model, inputs, batch_size, device, dtype)
        # profile_trt(model, inputs, batch_size, device, dtype)
        
    
    


Profiling ResNet for fp32 at batch 1
Profiling ResNet for fp16 at batch 1


In [5]:
# NOTE: CUDA_LAUNCH_BLOCKING is an environment variable. A Python assignment of
# that name has no effect; it has to be in the process environment before the
# first CUDA call (e.g. `%env CUDA_LAUNCH_BLOCKING=1` in the first cell after a
# kernel restart).
device = "cuda:1"
dtype = torch.float

# Make `device` the current CUDA device so that work which targets the current
# device, such as the TensorRT compilation below, runs on the same GPU as the
# model and inputs.
# NOTE(review): a device mismatch is a likely cause of the recorded "illegal
# memory access" -- confirm on the target machine.
torch.cuda.set_device(device)
torch.cuda.synchronize(device)

# The network does not depend on the batch size, so build it once.
model = models.resnet50().eval().to(device, dtype=dtype)

for batch_size in [1, 2, 4, 8, 16, 32, 64]:
    inputs = torch.randn((batch_size, 3, 224, 224), device=device, dtype=dtype)
    trt_model = ttrt.compile(
        model,
        inputs=[ttrt.Input((batch_size, 3, 224, 224), dtype=dtype)],
        enabled_precisions={dtype},  # must be a set of dtypes; FP32 only here
        workspace_size=1 << 22,
    )

    # Warm-up call, fenced by explicit synchronisation on the benchmark device.
    torch.cuda.synchronize(device)
    _ = trt_model(inputs)
    torch.cuda.synchronize(device)

    timer = benchmark.Timer(
        stmt="model(inputs)",
        globals={
            "model": trt_model,
            "inputs": inputs,
        },
    )
    # Mean seconds per call over 100 runs.
    print(timer.timeit(100).mean)



RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [12]:
for batch_size in [1, 2, 4, 8, 16, 32, 64]:
    # A fresh session and binding per batch size.
    session = ort.InferenceSession("onnx/onnx_models/resnet50-fp32.onnx", providers=["CUDAExecutionProvider"])
    io_binding = session.io_binding()

    # Pass the "fp32" tag: create_and_bind_rgb compares against the string
    # "fp16", so the torch dtype passed before only worked by falling through
    # to the float32 default.
    create_and_bind_rgb(io_binding, batch_size, "fp32", True)
    # Warm-up run before timing.
    session.run_with_iobinding(io_binding)

    timer = benchmark.Timer(
        stmt="session.run_with_iobinding(iobinds)",
        globals={
            "session": session,
            "iobinds": io_binding,
        },
    )
    # Mean seconds per call over 100 runs.
    print(timer.timeit(100).mean)

0.002735559493303299
0.0035587241873145103
0.005048961080610752
0.00730186615139246
0.01217206597328186
0.023698123916983605
0.04552473366260529
