In [1]:
import os
import cv2
import time
import onnx
import torch
import torchvision
from PIL import Image

# trt
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

from torch2trt import torch2trt
from torch2trt import TRTModule

### Export PyTorch model to ONNX

In [2]:
def torch_2_onnx(torch_model, savepath, batch_size=1, input_name=['input'], output_name=['output'], shape=[3, 224, 224]):
    dummy_input = torch.randn(batch_size, *shape).cuda()
    
    with torch.no_grad():
        torch.onnx.export(
            torch_model,
            dummy_input,
            savepath,
            input_names=input_name,
            output_names=output_name,
            verbose=False, # True for show INFO
            opset_version=11
        )
        
    print('ONNX model exported to {}\n'.format(onnx_model_path))

    # check
    test = onnx.load(savepath)
    onnx.checker.check_model(test)
    print("==> Passed")

In [3]:
def save_torch_model(model, pth_model_path):
    print('PyTorch model saved to {}\n'.format(pth_model_path))
    torch.save(model.state_dict(), pth_model_path)

In [4]:
onnx_model_path = 'model.onnx'
pth_model_path  = 'model.pth'

# Load the model
model = torchvision.models.resnet50(pretrained=True).eval().cuda()
save_torch_model(model, pth_model_path)

torch_2_onnx(model, onnx_model_path)

PyTorch model saved to model.pth

ONNX model exported to model.onnx

==> Passed


### Export ONNX model to TRT

In [5]:
def onnx_2_trt(onnx_model_path, trt_model_path='model.trt'):
    TRT_LOGGER = trt.Logger() # This logger is required to build an engine

    EXPLICIT_BATCH = []
    print('trt version', trt.__version__)
    if trt.__version__[0] >= '7':
        EXPLICIT_BATCH.append(
            1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    with trt.Builder(TRT_LOGGER) as builder,\
        builder.create_network(*EXPLICIT_BATCH) as network,\
        trt.OnnxParser(network, TRT_LOGGER) as parser:

        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1

        with open(onnx_model_path, 'rb') as f:
            if not parser.parse(f.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))

        # reshape input from 32 to 1
        shape = list(network.get_input(0).shape)
        engine = builder.build_cuda_engine(network)
        with open(trt_model_path, 'wb') as f:
            f.write(engine.serialize())
    
    print('TRT model exported to {}\n'.format(trt_model_path))
    
onnx_2_trt(onnx_model_path)

trt version 7.2.3.4
TRT model exported to model.trt



### Inference with TRT Engine

In [6]:
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem 
        self.device = device_mem
    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

In [7]:
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """do_inference (for TensorRT 6.x or lower)
    This function is generalized for multiple inputs/outputs.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

In [8]:
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """do_inference_v2 (for TensorRT 7.0+)
    This function is generalized for multiple inputs/outputs for full
    dimension networks.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings,
                             stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

In [14]:
def allocate_buffer(engine):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    output_idx = 0
    stream = cuda.Stream()
    
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            
    return inputs, outputs, bindings, stream

In [59]:
class TRT_Engine(object):
    """
    A TRT Engine Demo, include engine inintial and inference
    """ 
    def __init__(self, trt_bin, cuda_ctx=None):
        """
        param:
            trt_bin: 'path/to/model.trt'
            cuda_ctx: cuda.Device(0).make_context(), 0 is GPU number
        """
        self.trt_bin = trt_bin
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()
        
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self.engine = self._load_engine()
        self.inference_fn = do_inference if trt.__version__[0] < '7' else do_inference_v2
        
        try:
            self.context = self.engine.create_execution_context()
            self.inputs, self.outputs, self.bindings, self.stream = allocate_buffer(self.engine)
        except Exception as e:
            raise RuntimeError('Fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()
                
    def _load_engine(self):
        with open(self.trt_bin, 'rb')as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
        
    def __del__(self):
        """Free CUDA memories."""
        del self.outputs
        del self.inputs
        del self.stream
        
    def __call__(self, img):
        self.inputs[0].host = np.ascontiguousarray(img)
        
        if self.cuda_ctx:
            self.cuda_ctx.push()
        
        trt_outputs = self.inference_fn(
            context = self.context,
            bindings = self.bindings,
            inputs = self.inputs,
            outputs = self.outputs,
            stream = self.stream
        )
        
        if self.cuda_ctx:
            self.cuda_ctx.pop()
        
        return trt_outputs

In [60]:
engine = TRT_Engine('model.trt', False)

In [61]:
img = cv2.imread('1.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (224, 224))

mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

img_norm = (img / 255. - mean) / std
img_norm = img_norm.transpose((2, 0, 1)).astype(np.float32)
img_4dims = np.expand_dims(img_norm, axis=0)

img_norm.shape, img_4dims.shape

((3, 224, 224), (1, 3, 224, 224))

In [62]:
s = 0
for i in range(100):
    t1 = time.time()
    y_trt = engine(img_norm)
    t2 = time.time()
    s += t2 - t1
s / 100

0.003329808712005615

In [53]:
np.array(y_trt).shape

(1, 1000)

### Export PyTorch to TRT

In [23]:
def torch_2_trt(model, x, trt_model_path):
    assert isinstance(x, torch.cuda.FloatTensor), print('Invalid Input Type - {}'.format(type(x)))
    model_trt = torch2trt(model, [x])
    torch.save(model_trt.state_dict(), trt_model_path)
    print('TRT model exported to {}\n'.format(trt_model_path))

In [24]:
x = torch.from_numpy(img_4dims).float().cuda()
print(x.size())
torch_2_trt(model, x, 'model_trt.pth')

Exception ignored in: <function TRT_Engine.__del__ at 0x7f7fe63651f0>
Traceback (most recent call last):
  File "/tmp/ipykernel_28441/798862869.py", line 32, in __del__
AttributeError: outputs
Exception ignored in: <function TRT_Engine.__del__ at 0x7f7fe636bd30>
Traceback (most recent call last):
  File "/tmp/ipykernel_28441/3935465325.py", line 32, in __del__
AttributeError: outputs


torch.Size([1, 3, 224, 224])
TRT model exported to model_trt.pth



### Inference with torch_2_TRT

In [25]:
model_trt1 = torch2trt(model, [x])

In [63]:
s = 0
for i in range(100):
    t1 = time.time()
    y1 = model_trt1(x)
    t2 = time.time()
    s += t2 - t1
s / 100

0.003029191493988037

In [27]:
model_trt2 = TRTModule()
model_trt2.load_state_dict(torch.load('model_trt.pth'))

<All keys matched successfully>

In [64]:
s = 0
for i in range(100):
    t3 = time.time()
    y2 = model_trt2(x)
    t4 = time.time()
    s += t4 - t3
s / 100

0.0027715039253234865

In [65]:
s = 0
for i in range(100):
    t3 = time.time()
    y3 = model(x)
    t4 = time.time()
    s += t4 - t3
s / 100

0.006194736957550049

In [66]:
y1 = y1.detach().cpu()
y2 = y2.detach().cpu()
y3 = y3.detach().cpu()
y_trt = torch.from_numpy(np.array(y_trt))

In [67]:
torch.mean(torch.pow(y3 - y2, 2)), torch.mean(torch.pow(y3 - y1, 2)), torch.mean(torch.pow(y2 - y1, 2))

(tensor(1.0541e-11), tensor(1.0549e-11), tensor(1.3717e-13))

In [68]:
torch.mean(torch.pow(y3 - y_trt, 2)), torch.mean(torch.pow(y2 - y_trt, 2)), torch.mean(torch.pow(y1 - y_trt, 2))

(tensor(1.0541e-11), tensor(0.), tensor(1.3717e-13))