In [21]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os

In [22]:
# ---- Conversion settings ----

# Source ONNX model to convert.
ONNX_MODEL_PATH = "../converted_models/ONNX/fp16_quantized_model.onnx"

# Destination file for the serialized TensorRT engine.
TRT_ENGINE_PATH = "../converted_models/ONNX/fp16_quantized_model.trt"

# Largest batch size the engine should accept.
MAX_BATCH_SIZE = 32

# Precision mode name ("fp16" requests FP16 optimization, if supported).
MODE = "fp16"

In [24]:
import onnx

# Load the ONNX model and list each declared graph input together with its
# shape message, to see which dimensions are fixed and which are symbolic.
model = onnx.load(ONNX_MODEL_PATH)
for graph_input in model.graph.input:
    declared_shape = graph_input.type.tensor_type.shape
    print(graph_input.name, declared_shape)


serving_default_keras_tensor_312:0 dim {
  dim_param: "unk__751"
}
dim {
  dim_value: 224
}
dim {
  dim_value: 224
}
dim {
  dim_value: 3
}



In [25]:
def build_engine(onnx_file_path, engine_file_path, max_batch_size=1, mode="fp16",
                 input_shape=(224, 224, 3)):
    """Convert an ONNX model into a serialized TensorRT engine file.

    The ONNX file is parsed into a TensorRT network, a single optimization
    profile with a dynamic batch dimension is registered for the first
    network input, and the resulting serialized engine is written to disk.

    Args:
        onnx_file_path: Path of the ONNX model to read.
        engine_file_path: Path the serialized engine is written to. Its parent
            directory is created if it does not exist.
        max_batch_size: Largest batch size covered by the optimization
            profile (must be >= 1). The profile spans batch sizes 1 up to this
            value and is tuned for ``max(1, max_batch_size // 2)``.
        mode: Precision mode, either ``"fp16"`` or ``"int8"``.
        input_shape: Per-sample input shape without the batch dimension.
            Defaults to ``(224, 224, 3)`` (NHWC layout, as in the original
            hard-coded profile).

    Raises:
        ValueError: If ``max_batch_size`` is smaller than 1.
        RuntimeError: If ``mode`` is unsupported, the ONNX file cannot be
            parsed, the network has no inputs, or the engine build fails.
        FileNotFoundError: If ``onnx_file_path`` does not exist.
    """
    # Validate all arguments up front so that bad input fails immediately
    # instead of after the (slow) parsing/building steps.
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")
    if mode not in ("fp16", "int8"):
        raise RuntimeError(f"Precision mode {mode} not supported")
    if not os.path.exists(onnx_file_path):
        raise FileNotFoundError(f"ONNX file not found: {onnx_file_path}")

    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    config = builder.create_builder_config()

    # Attach a fresh, empty timing cache. It is not loaded from or saved to
    # disk by this function, so it only lives for the duration of this build.
    cache = config.create_timing_cache(b"")
    config.set_timing_cache(cache, ignore_mismatch=False)

    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flag)
    parser = trt.OnnxParser(network, logger)

    # Parse the ONNX file; on failure print every parser error before raising.
    with open(onnx_file_path, "rb") as model_file:
        if not parser.parse(model_file.read()):
            for error_index in range(parser.num_errors):
                print(parser.get_error(error_index))
            raise RuntimeError(f"Failed to parse ONNX file: {onnx_file_path}")

    # Report the network's inputs and outputs.
    for i in range(network.num_inputs):
        tensor = network.get_input(i)
        print(f"Model input: {tensor.name}, shape: {tensor.shape}, dtype: {tensor.dtype}")
    for i in range(network.num_outputs):
        tensor = network.get_output(i)
        print(f"Model output: {tensor.name}, shape: {tensor.shape}, dtype: {tensor.dtype}")

    if network.num_inputs < 1:
        raise RuntimeError(f"ONNX model has no inputs: {onnx_file_path}")

    # Optimization profile for the dynamic batch dimension. Only the first
    # input gets a profile (single-input model assumed).
    input_tensor = network.get_input(0)
    profile = builder.create_optimization_profile()

    sample_dims = tuple(input_shape)
    # Clamp to 1: with max_batch_size == 1 the plain "// 2" would yield an
    # opt batch of 0, which lies below the profile's minimum batch of 1.
    opt_batch = max(1, max_batch_size // 2)
    min_shape = (1,) + sample_dims
    opt_shape = (opt_batch,) + sample_dims
    max_shape = (max_batch_size,) + sample_dims
    profile.set_shape(input_tensor.name, min=min_shape, opt=opt_shape, max=max_shape)
    config.add_optimization_profile(profile)

    # Precision mode (already validated above).
    if mode == "fp16":
        config.set_flag(trt.BuilderFlag.FP16)
    else:
        # NOTE(review): no INT8 calibrator or dynamic ranges are configured
        # here; presumably this only gives sensible results for models that
        # already carry quantization information - confirm before relying on it.
        config.set_flag(trt.BuilderFlag.INT8)

    # Make sure the output directory exists before spending time on the build.
    engine_dir = os.path.dirname(engine_file_path)
    if engine_dir:
        os.makedirs(engine_dir, exist_ok=True)

    print("Building TensorRT engine. This may take a while...")
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        raise RuntimeError("Failed to build TensorRT engine")

    # Persist the serialized engine.
    with open(engine_file_path, "wb") as engine_file:
        engine_file.write(serialized_engine)
    print(f"TensorRT engine saved to: {engine_file_path}")


# Convert ONNX to TensorRT, using the precision mode from the MODE setting
# instead of a separately hard-coded literal.
build_engine(ONNX_MODEL_PATH, TRT_ENGINE_PATH, MAX_BATCH_SIZE, mode=MODE)


Model input: serving_default_keras_tensor_312:0, shape: (-1, 224, 224, 3), dtype: DataType.FLOAT
Model output: StatefulPartitionedCall_1:0, shape: (-1, 1), dtype: DataType.FLOAT
Building TensorRT engine. This may take a while...
[12/07/2024-14:21:15] [TRT] [W] DLA requests all profiles have same min, max, and opt value. All dla layers are falling back to GPU
TensorRT engine saved to: ../converted_models/ONNX/fp16_quantized_model.trt
