In [1]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import numpy as np

In [10]:
import onnx 

def load_onnx_model(onnx_model_path):
    with open(onnx_model_path, "rb") as f: 
        onnx_model = onnx.load_model(f) 
        return onnx_model 
    
onnx_model_path = "rtdetr.onnx" 
engine_path="detr_3rd.trt"
onnx_model = load_onnx_model(onnx_model_path)

def create_engine(onnx_model, max_batch_size, fp16_mode=False): 
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 
    builder = trt.Builder(TRT_LOGGER) 
    network = builder.create_network(flags=1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 
    parser = trt.OnnxParser(network, TRT_LOGGER) 
    parser.parse(onnx_model.SerializeToString()) 
    builder.max_batch_size = max_batch_size 
    config = builder.create_builder_config() 
    config.max_workspace_size = 1 << 30 
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16) 
    engine = builder.build_serialized_network(network, config) 
    return engine 

max_batch_size = 1 
fp16_mode = False 
engine = create_engine(onnx_model, max_batch_size, fp16_mode)

[08/11/2023-09:10:10] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[08/11/2023-09:10:10] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[08/11/2023-09:10:10] [TRT] [W] onnx2trt_utils.cpp:400: One or more weights outside the range of INT32 was clamped


  builder.max_batch_size = max_batch_size
  config.max_workspace_size = 1 << 30


In [11]:
with open(engine_path, "wb") as f:
    f.write(engine)
    print("generate file success!")

generate file success!


In [4]:
f1 = open(engine_path, "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) 

engine = runtime.deserialize_cuda_engine(f1.read())
context = engine.create_execution_context()

[08/11/2023-08:44:10] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [5]:
def preprocess(frame):
    [height, width, _] = frame.shape
    length = max((height, width))
    image = np.zeros((length, length, 3), np.uint8)
    image[0:height, 0:width] = frame
    scale = length / 640
    blob = cv2.dnn.blobFromImage(image, scalefactor=1 / 255, size=(640, 640), swapRB=True)
    return blob,scale

image=cv2.imread("120.jpg")
img,scale=preprocess(image)

In [6]:
output = np.empty([1,5,8400], dtype = np.float32) 

# allocate device memory
d_input = cuda.mem_alloc(1 * img.nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

bindings = [int(d_input), int(d_output)]

stream = cuda.Stream()

In [7]:
def predict(batch): # result gets copied into output
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, batch, stream)
    # execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # syncronize threads
    stream.synchronize()
    
    return output

In [8]:
outputs = predict(img)

In [9]:
%%timeit

pred = predict(img)

7.13 ms ± 190 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
outputs.shape

(1, 5, 8400)