This notebook contains an example and a performance evaluation of running a PyTorch model with TensorRT (PyTorch -> ONNX -> TensorRT).

In [None]:
# Mount your Google Drive at /content/drive/ using google-drive-ocamlfuse (Colab only).
# Install the FUSE client and the packages needed to add its PPA.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): with `2>&1 > /dev/null` only stdout is discarded; stderr still
# goes to the cell output because it is duplicated before stdout is redirected.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# Print the authorization URL (only the line containing "URL" is shown).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it, then pipe it to the FUSE client.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Create the mount point and mount the drive on it.
!mkdir -p /content/drive
!google-drive-ocamlfuse /content/drive

In [None]:
# Check the Python and CUDA toolkit versions.
# The TensorRT packages installed in the next cell target CUDA 11.0 and a cp37 (Python 3.7) wheel.
!python --version
!nvcc --version

In [None]:
# Install TensorRT and the other requirements.
# First download the TensorRT DEB and TAR packages in the matching version
# (7.2.3.4 for CUDA 11.0) and upload them to the root of your Google Drive.
# Register the local TensorRT apt repository shipped in the DEB file.
!dpkg -i /content/drive/nv-tensorrt-repo-ubuntu1804-cuda11.0-trt7.2.3.4-ga-20210226_1-1_amd64.deb

# Trust the signing key of that repository.
!apt-key add /var/nv-tensorrt-repo-cuda11.0-trt7.2.3.4-ga-20210226/7fa2af80.pub

!apt-get update

# Install the TensorRT libraries pinned to 7.2.3-1+cuda11.0.
!apt-get install libnvinfer7=7.2.3-1+cuda11.0 libnvonnxparsers7=7.2.3-1+cuda11.0 libnvparsers7=7.2.3-1+cuda11.0 libnvinfer-plugin7=7.2.3-1+cuda11.0 libnvinfer-dev=7.2.3-1+cuda11.0 libnvonnxparsers-dev=7.2.3-1+cuda11.0 libnvparsers-dev=7.2.3-1+cuda11.0 libnvinfer-plugin-dev=7.2.3-1+cuda11.0 python-libnvinfer=7.2.3-1+cuda11.0 python3-libnvinfer=7.2.3-1+cuda11.0

# Hold the pinned packages so later apt operations do not upgrade them.
!apt-mark hold libnvinfer7 libnvonnxparsers7 libnvparsers7 libnvinfer-plugin7 libnvinfer-dev libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python-libnvinfer python3-libnvinfer

!apt-get install tensorrt
!pip install pycuda
!pip install tensorflow-gpu==2.4.1

# Unpack the TAR distribution and install the Python wheels it contains
# (tensorrt, graphsurgeon, uff).
%cd /content
!cp /content/drive/TensorRT-7.2.3.4.Ubuntu-18.04.x86_64-gnu.cuda-11.0.cudnn8.1.tar.gz TensorRT-7.2.3.4.Ubuntu-18.04.x86_64-gnu.cuda-11.0.cudnn8.1.tar.gz
!tar -zxvf TensorRT-7.2.3.4.Ubuntu-18.04.x86_64-gnu.cuda-11.0.cudnn8.1.tar.gz
%cd /content/TensorRT-7.2.3.4/python
!pip install tensorrt-7.2.3.4-cp37-none-linux_x86_64.whl
%cd /content/TensorRT-7.2.3.4/graphsurgeon
!pip install graphsurgeon-0.4.5-py2.py3-none-any.whl
%cd /content/TensorRT-7.2.3.4/uff
!pip install uff-0.6.9-py2.py3-none-any.whl

In [4]:
# Check that TensorRT was installed successfully: the import fails if it was not.
import tensorrt as trt

In [54]:
# Configuration shared by the example and benchmark cells.
# Relative paths; the cells that use them first `%cd /content`.
onnx_file_path = 'mobilenet.onnx'    # ONNX export of the model (generated if missing)
engine_file_path = 'mobilenet.trt'   # serialized TensorRT engine (built if missing)
verbose = False                      # forwarded to torch.onnx.export(verbose=...)
baseline = True  # whether to also run inference with the original PyTorch model
input_image_path = 'img0.JPG'        # image used by the single-image example

In [None]:
# Download the eight sample images (img0.JPG .. img7.JPG) into /content.
# img0.JPG is used by the example cell; the benchmark cell uses all eight.
%cd /content
!wget -O img0.JPG "https://thumbs-prod.si-cdn.com/ej9KRK9frB5AXD6W9LXKFnuRc-0=/fit-in/1600x0/https://public-media.si-cdn.com/filer/ad/7b/ad7b3860-ad5f-43dc-800e-af57830cd1d3/labrador.jpg"
!wget -O img1.JPG "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRHjtOYuK2n_CZoxQs9zxK96N1_qMiv3ZWSYQ&usqp=CAUg"
!wget -O img2.JPG "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRoEAt7d8PuZPBxWsjzvgQ_Y8Zfhgn1MvvA3Q&usqp=CAU"
!wget -O img3.JPG "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ9BZGaN2WhgsJJfLmEcEiwMRmgpSzJPjnacg&usqp=CAU"
!wget -O img4.JPG "https://media.nature.com/lw800/magazine-assets/d41586-020-01430-5/d41586-020-01430-5_17977552.jpg"
!wget -O img5.JPG "https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/golden-retriever-royalty-free-image-506756303-1560962726.jpg?crop=1.00xw:0.756xh;0,0.0756xh&resize=980:*"
!wget -O img6.JPG "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRH7_Z_Frxo_RbvJ6StY2TzQ0zFCgv6podjzw&usqp=CAU"
!wget -O img7.JPG "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR4X0fwAtbfiSwRvN3-Fk1pE1rKMsAgWjcpbA&usqp=CAU"

In [None]:
# Example code

import os
import torch
import time
import argparse
import tensorrt as trt
import torchvision
import torchvision.transforms as transforms
import pycuda
import pycuda.autoinit
import numpy as np
from PIL import Image
from sklearn.metrics import mean_squared_error

# Logger passed to the TensorRT runtime, builder and ONNX parser below;
# constructed with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def onnx(onnx_file_path, verbose):
    """Return the MobileNetV2 ONNX model as an open binary file handle.

    When ``onnx_file_path`` does not exist yet, a pretrained torchvision
    MobileNetV2 is exported to that path first, traced on the GPU with a
    random 1x3x224x224 input.

    Args:
        onnx_file_path: location of the ONNX file (created when missing).
        verbose: forwarded to ``torch.onnx.export``.

    Returns:
        The ONNX file opened in ``'rb'`` mode; closing it is up to the caller.
    """
    needs_export = not os.path.exists(onnx_file_path)
    if needs_export:
        print("Generating ONNX file for MobileNet_V2: ", onnx_file_path)
        sample = torch.randn(1, 3, 224, 224, device='cuda')
        net = torchvision.models.mobilenet_v2(pretrained=True).cuda()
        names_in = ["actual_input_1"]
        names_in.extend(["learned_%d" % idx for idx in range(16)])
        torch.onnx.export(
            net,
            sample,
            onnx_file_path,
            verbose=verbose,
            input_names=names_in,
            output_names=["output1"],
        )

    print("Loading ONNX file from: ", onnx_file_path)
    return open(onnx_file_path, 'rb')

def trt_engine(engine_file_path, onnx_model):
    """Load a serialized TensorRT engine, or build one from ONNX and cache it.

    Args:
        engine_file_path: path of the serialized engine. It is deserialized
            when it exists; otherwise the engine is built and written there.
        onnx_model: readable binary file object with the ONNX model. It is
            only read when the engine has to be built.

    Returns:
        The TensorRT engine.

    Raises:
        RuntimeError: if deserialization, ONNX parsing or the engine build
            fails.
    """
    if os.path.exists(engine_file_path):
        print("Reading engine from: ", engine_file_path)
        # deserialize the engine file
        with open(engine_file_path, "rb") as model, trt.Runtime(TRT_LOGGER) as runtime:
            model_engine = runtime.deserialize_cuda_engine(model.read())
        if model_engine is None:
            raise RuntimeError("Failed to deserialize TensorRT engine from: %s" % engine_file_path)
        return model_engine

    with trt.Builder(TRT_LOGGER) as builder:
        # Specify that the network should be created with an explicit batch dimension
        EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(EXPLICIT_BATCH)
        parser = trt.OnnxParser(network, TRT_LOGGER)
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        # parse() reports failure through its return value; surface the parser
        # errors instead of silently building from an incomplete network.
        if not parser.parse(onnx_model.read()):
            errors = [parser.get_error(i).desc() for i in range(parser.num_errors)]
            raise RuntimeError("Failed to parse the ONNX model: " + "; ".join(errors))
        model_engine = builder.build_cuda_engine(network)
        if model_engine is None:
            raise RuntimeError("Failed to build a TensorRT engine from the ONNX model.")
        # Serialize before opening the file so that a failure cannot leave an
        # empty engine file behind that a later run would try to deserialize.
        serialized_engine = model_engine.serialize()
        with open(engine_file_path, "wb") as f:
            f.write(serialized_engine)
    return model_engine

def get_image(input_image_path):
    """Load an image and preprocess it into the engine's input layout.

    The image is converted to RGB, resized to 256, center-cropped to 224,
    converted to a tensor and normalized with the mean/std values below.

    Args:
        input_image_path: path of the image file to load.

    Returns:
        A C-contiguous ``float32`` NumPy array holding the preprocessed image.
    """
    print("Get image: ", input_image_path)
    with Image.open(input_image_path) as source:
        print("Input image format {}, size {}, mode {}.".format(source.format, source.size, source.mode))
        # Normalize below is configured for three channels, so grayscale,
        # palette or RGBA inputs must be converted first.
        image = source.convert("RGB")
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = preprocess(image)
    print("Image size after preprocessing: ", image.shape)
    image_binary = np.array(image, dtype=np.float32, order='C')
    return image_binary

def allocate_buffers(model_engine):
    """Allocate one host/device buffer pair for every binding of the engine.

    Args:
        model_engine: engine whose bindings are iterated.

    Returns:
        ``(inputs, outputs, bindings)``: ``inputs`` and ``outputs`` are lists
        of ``{"host": ..., "device": ...}`` dicts, and ``bindings`` holds the
        device allocations converted to ``int``, in binding order.
    """
    inputs, outputs, bindings = [], [], []
    # each binding describes one input or output port of the engine
    for name in model_engine:
        shape = model_engine.get_binding_shape(name)
        element_count = trt.volume(shape) * model_engine.max_batch_size
        element_type = trt.nptype(model_engine.get_binding_dtype(name))
        host_buffer = pycuda.driver.pagelocked_empty(element_count, element_type)
        device_buffer = pycuda.driver.mem_alloc(host_buffer.nbytes)
        # the execution context receives the device allocations as integers
        bindings.append(int(device_buffer))
        pair = {"host": host_buffer, "device": device_buffer}
        if model_engine.binding_is_input(name):
            inputs.append(pair)
        else:
            outputs.append(pair)
    return inputs, outputs, bindings

def do_inference(context, bindings, inputs, outputs, stream):
    """Run one inference on ``stream`` and time the execution step.

    Args:
        context: execution context providing ``execute_async_v2``.
        bindings: device buffer addresses passed to the context.
        inputs: list of ``{"host", "device"}`` dicts copied host -> device.
        outputs: list of ``{"host", "device"}`` dicts copied device -> host.
        stream: stream on which copies and execution are enqueued.

    Returns:
        ``(host_outputs, elapsed_ms)``: the host buffers of ``outputs`` and
        the wall-clock time of the execution step in milliseconds. The
        host/device copies are not included in the timing.
    """
    # send inputs to device (GPU)
    for buffer in inputs:
        pycuda.driver.memcpy_htod_async(buffer["device"], buffer["host"], stream)
    # Make sure the uploads are finished so they are not counted as inference time.
    stream.synchronize()
    # do inference
    # time.clock() was removed in Python 3.8 and measured CPU time on Unix;
    # perf_counter() is the wall-clock timer intended for benchmarking.
    start = time.perf_counter()
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # execute_async_v2 only enqueues the work; wait for the GPU to finish
    # before stopping the timer, otherwise only the launch is measured.
    stream.synchronize()
    end = time.perf_counter()
    # send outputs to host (CPU)
    for buffer in outputs:
        pycuda.driver.memcpy_dtoh_async(buffer["host"], buffer["device"], stream)
    # wait for all activity on this stream to cease, then return.
    stream.synchronize()
    return [buffer["host"] for buffer in outputs], (end - start) * 1000

def post_process(outputs):
    """Return the index of the most probable class for the first output.

    Args:
        outputs: sequence whose first element holds the raw class scores.

    Returns:
        A zero-dimensional tensor with the arg-max of the softmax taken
        along dimension 0.
    """
    scores = torch.Tensor(outputs[0])
    probabilities = torch.nn.functional.softmax(scores, dim=0)
    return probabilities.argmax(dim=0)

def pytorch_baseline(input_image_path):
    """Classify one image with the original PyTorch MobileNetV2 and time it.

    Args:
        input_image_path: path of the image file to classify.

    Returns:
        ``(output, elapsed_ms)``: the raw model output for a batch of one
        image and the wall-clock time of the forward pass in milliseconds.
    """
    model = torch.hub.load('pytorch/vision:v0.9.0', 'mobilenet_v2', pretrained=True)
    model.eval()
    with Image.open(input_image_path) as source:
        print("Input image format {}, size {}, mode {}.".format(source.format, source.size, source.mode))
        # Normalize below is configured for three channels.
        input_image = source.convert("RGB")
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    input_tensor = preprocess(input_image)
    input_batch = input_tensor.unsqueeze(0)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        input_batch = input_batch.to('cuda')
        model.to('cuda')
        # Finish the transfers before the timer starts.
        torch.cuda.synchronize()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # wall-clock timer intended for benchmarking.
    start = time.perf_counter()
    with torch.no_grad():
        output = model(input_batch)
    if use_cuda:
        # CUDA kernels are launched asynchronously; wait for them so the
        # measurement covers the whole forward pass.
        torch.cuda.synchronize()
    end = time.perf_counter()

    return output, (end - start) * 1000

# Get model onnx file
onnx_model = onnx(onnx_file_path, verbose)
# Build cuda engine; the ONNX file handle is only needed while building,
# so close it afterwards instead of leaking it.
try:
    model_engine = trt_engine(engine_file_path, onnx_model)
finally:
    onnx_model.close()
# Prepare inputs
image_binary = get_image(input_image_path)

# Create cuda context
trt_output = None
trt_time = 0
with model_engine.create_execution_context() as context:
    inputs, outputs, bindings = allocate_buffers(model_engine)
    # A handle for a queue of operations that will be carried out in order.
    stream = pycuda.driver.Stream()
    # Copy the image into the page-locked host buffer instead of rebinding
    # the dict entry to a pageable array: the asynchronous host-to-device
    # copy is meant to read from page-locked memory.
    np.copyto(inputs[0]["host"], image_binary.ravel())
    # Do inference
    outputs, trt_time = do_inference(context, bindings, inputs, outputs, stream)
    # Process outputs
    trt_output = torch.nn.functional.softmax(torch.Tensor(outputs[0]), dim=0)
    print("trt_label:   ", trt_output.argmax(dim=0).numpy())
    print("trt_time:     %.3f ms." % trt_time)
    torch.cuda.empty_cache()

# Optionally run the same image through the original PyTorch model for comparison.
if baseline:
    pth_output, pth_time = pytorch_baseline(input_image_path)
    pth_output = torch.nn.functional.softmax(pth_output[0], dim=0).cpu()
    pth_label = pth_output.argmax(dim=0).item()
    print("pth_label:", pth_label)
    print("pth_time: %.3f ms." % pth_time)

In [None]:
# Modified code for benchmark

import os
import torch
import time
import argparse
import tensorrt as trt
import torchvision
import torchvision.transforms as transforms
import pycuda
import pycuda.autoinit
import numpy as np
from PIL import Image
from sklearn.metrics import mean_squared_error

# Logger passed to the TensorRT runtime, builder and ONNX parser below;
# constructed with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def onnx(onnx_file_path, verbose):
    """Return the MobileNetV2 ONNX model as an open binary file handle.

    When ``onnx_file_path`` does not exist yet, a pretrained torchvision
    MobileNetV2 is exported to that path first, traced on the GPU with a
    random 1x3x224x224 input.

    Args:
        onnx_file_path: location of the ONNX file (created when missing).
        verbose: forwarded to ``torch.onnx.export``.

    Returns:
        The ONNX file opened in ``'rb'`` mode; closing it is up to the caller.
    """
    needs_export = not os.path.exists(onnx_file_path)
    if needs_export:
        print("Generating ONNX file for MobileNet_V2: ", onnx_file_path)
        sample = torch.randn(1, 3, 224, 224, device='cuda')
        net = torchvision.models.mobilenet_v2(pretrained=True).cuda()
        names_in = ["actual_input_1"]
        names_in.extend(["learned_%d" % idx for idx in range(16)])
        torch.onnx.export(
            net,
            sample,
            onnx_file_path,
            verbose=verbose,
            input_names=names_in,
            output_names=["output1"],
        )

    print("Loading ONNX file from: ", onnx_file_path)
    return open(onnx_file_path, 'rb')

def trt_engine(engine_file_path, onnx_model):
    """Load a serialized TensorRT engine, or build one from ONNX and cache it.

    Args:
        engine_file_path: path of the serialized engine. It is deserialized
            when it exists; otherwise the engine is built and written there.
        onnx_model: readable binary file object with the ONNX model. It is
            only read when the engine has to be built.

    Returns:
        The TensorRT engine.

    Raises:
        RuntimeError: if deserialization, ONNX parsing or the engine build
            fails.
    """
    if os.path.exists(engine_file_path):
        print("Reading engine from: ", engine_file_path)
        # deserialize the engine file
        with open(engine_file_path, "rb") as model, trt.Runtime(TRT_LOGGER) as runtime:
            model_engine = runtime.deserialize_cuda_engine(model.read())
        if model_engine is None:
            raise RuntimeError("Failed to deserialize TensorRT engine from: %s" % engine_file_path)
        return model_engine

    with trt.Builder(TRT_LOGGER) as builder:
        # Specify that the network should be created with an explicit batch dimension
        EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(EXPLICIT_BATCH)
        parser = trt.OnnxParser(network, TRT_LOGGER)
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        # parse() reports failure through its return value; surface the parser
        # errors instead of silently building from an incomplete network.
        if not parser.parse(onnx_model.read()):
            errors = [parser.get_error(i).desc() for i in range(parser.num_errors)]
            raise RuntimeError("Failed to parse the ONNX model: " + "; ".join(errors))
        model_engine = builder.build_cuda_engine(network)
        if model_engine is None:
            raise RuntimeError("Failed to build a TensorRT engine from the ONNX model.")
        # Serialize before opening the file so that a failure cannot leave an
        # empty engine file behind that a later run would try to deserialize.
        serialized_engine = model_engine.serialize()
        with open(engine_file_path, "wb") as f:
            f.write(serialized_engine)
    return model_engine

def get_image(input_image_path):
    """Load an image and preprocess it into the engine's input layout.

    The image is converted to RGB, resized to 256, center-cropped to 224,
    converted to a tensor and normalized with the mean/std values below.

    Args:
        input_image_path: path of the image file to load.

    Returns:
        A C-contiguous ``float32`` NumPy array holding the preprocessed image.
    """
    print("Get image: ", input_image_path)
    with Image.open(input_image_path) as source:
        print("Input image format {}, size {}, mode {}.".format(source.format, source.size, source.mode))
        # Normalize below is configured for three channels, so grayscale,
        # palette or RGBA inputs must be converted first.
        image = source.convert("RGB")
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = preprocess(image)
    print("Image size after preprocessing: ", image.shape)
    image_binary = np.array(image, dtype=np.float32, order='C')
    return image_binary

def allocate_buffers(model_engine):
    """Allocate one host/device buffer pair for every binding of the engine.

    Args:
        model_engine: engine whose bindings are iterated.

    Returns:
        ``(inputs, outputs, bindings)``: ``inputs`` and ``outputs`` are lists
        of ``{"host": ..., "device": ...}`` dicts, and ``bindings`` holds the
        device allocations converted to ``int``, in binding order.
    """
    inputs, outputs, bindings = [], [], []
    # each binding describes one input or output port of the engine
    for name in model_engine:
        shape = model_engine.get_binding_shape(name)
        element_count = trt.volume(shape) * model_engine.max_batch_size
        element_type = trt.nptype(model_engine.get_binding_dtype(name))
        host_buffer = pycuda.driver.pagelocked_empty(element_count, element_type)
        device_buffer = pycuda.driver.mem_alloc(host_buffer.nbytes)
        # the execution context receives the device allocations as integers
        bindings.append(int(device_buffer))
        pair = {"host": host_buffer, "device": device_buffer}
        if model_engine.binding_is_input(name):
            inputs.append(pair)
        else:
            outputs.append(pair)
    return inputs, outputs, bindings

def do_inference(context, bindings, inputs, outputs, stream):
    """Run one inference on ``stream`` and time the execution step.

    Args:
        context: execution context providing ``execute_async_v2``.
        bindings: device buffer addresses passed to the context.
        inputs: list of ``{"host", "device"}`` dicts copied host -> device.
        outputs: list of ``{"host", "device"}`` dicts copied device -> host.
        stream: stream on which copies and execution are enqueued.

    Returns:
        ``(host_outputs, elapsed_ms)``: the host buffers of ``outputs`` and
        the wall-clock time of the execution step in milliseconds. The
        host/device copies are not included in the timing.
    """
    # send inputs to device (GPU)
    for buffer in inputs:
        pycuda.driver.memcpy_htod_async(buffer["device"], buffer["host"], stream)
    # Make sure the uploads are finished so they are not counted as inference time.
    stream.synchronize()
    # do inference
    # time.clock() was removed in Python 3.8 and measured CPU time on Unix;
    # perf_counter() is the wall-clock timer intended for benchmarking.
    start = time.perf_counter()
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # execute_async_v2 only enqueues the work; wait for the GPU to finish
    # before stopping the timer, otherwise only the launch is measured.
    stream.synchronize()
    end = time.perf_counter()
    # send outputs to host (CPU)
    for buffer in outputs:
        pycuda.driver.memcpy_dtoh_async(buffer["host"], buffer["device"], stream)
    # wait for all activity on this stream to cease, then return.
    stream.synchronize()
    return [buffer["host"] for buffer in outputs], (end - start) * 1000

def post_process(outputs):
    """Return the index of the most probable class for the first output.

    Args:
        outputs: sequence whose first element holds the raw class scores.

    Returns:
        A zero-dimensional tensor with the arg-max of the softmax taken
        along dimension 0.
    """
    scores = torch.Tensor(outputs[0])
    probabilities = torch.nn.functional.softmax(scores, dim=0)
    return probabilities.argmax(dim=0)

def pytorch_baseline(model, input_batch):
    """Run ``model`` once on every batch and accumulate the forward time.

    Args:
        model: callable PyTorch model, already placed on the target device.
        input_batch: sequence of input tensors; each one is passed to
            ``model`` separately.

    Returns:
        ``(output, batch_time)``: the output for the last element of
        ``input_batch`` (``None`` when it is empty) and the summed
        wall-clock time of all forward passes in milliseconds.
    """
    use_cuda = torch.cuda.is_available()
    batch_time = 0
    output = None
    # Iterate over the batches that were actually passed in rather than a
    # hard-coded count of eight.
    for batch in input_batch:
        if use_cuda:
            # Do not let previously queued GPU work leak into this measurement.
            torch.cuda.synchronize()
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # wall-clock timer intended for benchmarking.
        start = time.perf_counter()
        with torch.no_grad():
            output = model(batch)
        if use_cuda:
            # CUDA kernels are launched asynchronously; wait for them so the
            # measurement covers the whole forward pass.
            torch.cuda.synchronize()
        end = time.perf_counter()
        batch_time += (end - start) * 1000
    return output, batch_time


# Get model onnx file
onnx_model = onnx(onnx_file_path, verbose)
# Build cuda engine; the ONNX file handle is only needed while building,
# so close it afterwards instead of leaking it.
try:
    model_engine = trt_engine(engine_file_path, onnx_model)
finally:
    onnx_model.close()
# Prepare inputs
images_binary = []
for i in range(8):
    images_binary.append(get_image('img'+str(i)+'.JPG'))

# Create cuda context
trt_output = None
trt_time = 0
trt_time_total = 0
# The execution context, the buffers and the stream do not depend on the
# image, so create them once instead of once per inference.
with model_engine.create_execution_context() as context:
    inputs, outputs, bindings = allocate_buffers(model_engine)
    # A handle for a queue of operations that will be carried out in order.
    stream = pycuda.driver.Stream()
    for _ in range(1000):
        for i in range(8):
            # Copy into the page-locked host buffer instead of rebinding the
            # dict entry to a pageable array.
            np.copyto(inputs[0]["host"], images_binary[i].ravel())
            # Do inference
            host_outputs, trt_time = do_inference(context, bindings, inputs, outputs, stream)
            trt_time_total += trt_time
print("avg_trt_time: %.3f ms/image" % (trt_time_total/(8*1000)))
torch.cuda.empty_cache()

# Benchmark the original PyTorch model on the same eight images.
if baseline:
    pth_time_total = 0
    model = torch.hub.load('pytorch/vision:v0.9.0', 'mobilenet_v2', pretrained=True)
    model.eval()
    # The preprocessing pipeline is the same for every image; build it once.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    input_batch = []
    for i in range(8):
        with Image.open('img'+str(i)+'.JPG') as source:
            # Normalize above is configured for three channels.
            input_image = source.convert("RGB")
        input_tensor = preprocess(input_image)
        input_batch.append(input_tensor.unsqueeze(0))
    if torch.cuda.is_available():
        # Moving the model is independent of the image index; do it once.
        model.to('cuda')
        for i in range(8):
            input_batch[i] = input_batch[i].to('cuda')
    for _ in range(1000):
        pth_output, pth_time = pytorch_baseline(model, input_batch)
        pth_time_total += pth_time
    print("avg_pth_time: %.3f ms/image" % (pth_time_total/(1000*8)))
