In [None]:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

import common # See the folder.

In [None]:
# Logger instance handed to both the TensorRT builder and the ONNX parser
# inside build_engine().
TRT_LOGGER = trt.Logger()

# NOTE: this notebook runs the model in FP16: build_engine() turns on
# builder.fp16_mode and the input tensor is cast to np.float16 before
# inference. NOTE(review): presumably the ONNX file below was exported to
# match that; confirm before swapping in another model.
onnx_file_path = '../models/DFA_model_simple.onnx'
# Colour image used as the test input.
i_file = '../test/rs02_1564022347_05.jpg'
# Depth map paired with the colour image above.
d_file = '../test/rs02_1564022347_05.pgm'

def build_engine(onnx_file_path):
    '''Parse an ONNX model and build a TensorRT engine from it.

    The builder is configured with max_batch_size 1, a max_workspace_size of
    1 << 30 and FP16 mode enabled. After parsing, the first network input is
    forced to the shape [1, 4, 288, 512].

    Args:
        onnx_file_path: path of the ONNX model file to read.

    Returns:
        Whatever builder.build_cuda_engine() returns for the parsed network,
        or None when the ONNX parser rejects the file (the parser errors are
        printed in that case).
    '''
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(common.EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        # Builder configuration.
        builder.max_workspace_size = 2 ** 30
        builder.max_batch_size = 1
        # FP16 (strict_type_constraints is intentionally left at its default).
        builder.fp16_mode = True

        # Feed the raw file bytes to the ONNX parser.
        with open(onnx_file_path, 'rb') as onnx_fh:
            print('Beginning ONNX file parsing')
            parsed_ok = parser.parse(onnx_fh.read())

        if not parsed_ok:
            print('[ERROR]: Failed to parse the ONNX file.')
            for idx in range(parser.num_errors):
                print(parser.get_error(idx))
            return None

        # Pin the input to a fixed NCHW shape before building.
        network.get_input(0).shape = [1, 4, 288, 512]
        return builder.build_cuda_engine(network)

# Build the engine once and fail loudly if that did not work: build_engine()
# returns None when the ONNX file cannot be parsed, and calling
# create_execution_context() on None would only produce an opaque
# AttributeError.
engine = build_engine(onnx_file_path)
if engine is None:
    raise RuntimeError('Failed to build a TensorRT engine from %s' % onnx_file_path)
context = engine.create_execution_context()

# Host/device buffers, binding list and CUDA stream used by every inference
# call further down in the notebook.
inputs, outputs, bindings, stream = common.allocate_buffers(engine)

In [None]:
from PIL import Image

# Colour image. Forcing RGB guarantees three channels, so that image + depth
# add up to the 4 input channels the network is built for. The context
# manager closes the file as soon as the pixels have been read.
with Image.open(i_file) as rgb_image:
    # PIL's resize takes (width, height).
    rgb_resized = rgb_image.convert('RGB').resize((512, 288))
i_raw = np.asarray(rgb_resized, dtype=np.float32, order='C') # HWC

# Depth map.
with Image.open(d_file) as depth_image:
    depth_resized = depth_image.resize((512, 288))
# int32 instead of int16: 16-bit depth values above 32767 would wrap around to
# negative numbers in int16 and then slip past the "> 10000" mask applied in
# toTensor(). np.array (rather than np.asarray) always copies, so the result
# is a writable array.
d_raw = np.array(depth_resized, dtype=np.int32, order='C')

def toTensor(data):
    '''Scale a depth map or an image and return it laid out as [B, C, H, W].

    A 2-D array is treated as a depth map [H, W]: values above 10000 are set
    to 0 and the result is divided by 10000. A 3-D array is treated as an
    image [H, W, C] and divided by 255. For non-negative depth values and
    0..255 image values the output therefore lies in [0.0, 1.0].

    The input array is never modified, so read-only arrays are accepted too.

    Args:
        data: numpy array with 2 (depth) or 3 (image) dimensions.

    Returns:
        A new floating-point array of shape [1, C, H, W] (C == 1 for depth).

    Raises:
        AssertionError: if data is neither 2-D nor 3-D.
    '''
    assert data.ndim == 2 or data.ndim == 3
    if data.ndim == 2:
        # depth
        # data[..., np.newaxis] is only a view; copy it so that zeroing the
        # out-of-range pixels does not write into the caller's array.
        depth = data[..., np.newaxis].copy()
        depth[depth > 10000] = 0
        data = depth / 10000.
    else:
        # image
        data = data / 255.
    # HWC -> CHW, then add the leading batch axis.
    data = np.transpose(data, (2, 0, 1))
    data = data[np.newaxis, ...]
    return data

def normalize(data, mean, std):
    '''Standardise a [B, C, H, W] batch channel by channel: (data - mean) / std.

    The result is returned as a new array; the input is left untouched.
    Floating-point input keeps its dtype. Integer input yields a floating
    result instead of being truncated back to integers.

    Args:
        data: numpy array with 4 dimensions, channels on axis 1.
        mean: sequence with one mean per channel.
        std: sequence with one standard deviation per channel.

    Returns:
        Array with the same shape as data holding the standardised values.

    Raises:
        AssertionError: if data is not 4-D or the number of channels does not
            match len(mean) and len(std).
    '''
    assert data.ndim == 4
    assert data.shape[1] == len(mean) == len(std)
    # Shape the statistics as [1, C, 1, 1] so broadcasting covers the whole
    # batch in one expression (no per-sample loop needed).
    mean = np.asarray(mean)[np.newaxis, :, np.newaxis, np.newaxis]
    std = np.asarray(std)[np.newaxis, :, np.newaxis, np.newaxis]
    normalized = (data - mean) / std
    if np.issubdtype(data.dtype, np.floating):
        # Keep e.g. float32 input as float32, as the in-place version did.
        normalized = normalized.astype(data.dtype, copy=False)
    return normalized


# ---------------------------------------------------------------
# Pre-process the input data
# ---------------------------------------------------------------

# Depth map: rescale with toTensor(), then standardise its single channel.
data = normalize(toTensor(d_raw), [0.1864497], [0.07711394])
# Colour image: rescale with toTensor(), then standardise each of the three channels.
img_data = normalize(toTensor(i_raw),
                     [0.368, 0.393, 0.404],
                     [0.286, 0.290, 0.296])

# Stack image and depth along the channel axis -> [1, 4, 288, 512] (NCHW),
# then convert to FP16.
#
# The array handed over below should report C_CONTIGUOUS = True in its
# flags. If it does not, make it contiguous first:
#     test = np.ascontiguousarray(test)
test = np.concatenate((img_data, data), axis=1).astype(np.float16)

# Hand the tensor to the first input binding and run one inference.
inputs[0].host = test
trt_outputs = common.do_inference_v2(
    context,
    bindings=bindings,
    inputs=inputs,
    outputs=outputs,
    stream=stream,
)

In [None]:
import time

# Number of timed inference calls used for the FPS estimate.
N_TIMED_RUNS = 9

# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() is wall-clock time and can jump when the
# system clock is adjusted.
t0 = time.perf_counter()
for _run in range(N_TIMED_RUNS):
    trt_outputs = common.do_inference_v2(context,
                                         bindings=bindings,
                                         inputs=inputs,
                                         outputs=outputs,
                                         stream=stream)
t1 = time.perf_counter()
# Frames per second = number of runs / elapsed seconds.
print('[FPS]: %.3f' % (N_TIMED_RUNS / (t1 - t0)))

1. `DFA_model_simple.onnx`: measured inference speed
```
Jetson Nano: {
    'Inference FPS': {'FP32': ~11.52, 'FP16': ~14.25}
}
```