In [7]:
import os
import numpy as np
import tensorflow as tf
import tensorrt as trt
from onnx import ModelProto
import onnxruntime as ort
import time
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import matplotlib.pyplot as plt
import cv2
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from tensorflow.keras.applications import MobileNet, VGG16
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from mtcnn import MTCNN
from matplotlib.patches import Rectangle, Circle
import pycuda.driver as cuda
import pycuda.autoinit
import tf2onnx
import onnx
import onnxruntime
import onnx_graphsurgeon as gs

2024-10-03 22:16:19.813167: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-03 22:16:19.813224: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-03 22:16:19.813256: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-03 22:16:19.821213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# MOBILENET

### conversion of tensorflow to onnx 

In [3]:
# Load your TensorFlow model
model = tf.keras.models.load_model('/home/gourav/Desktop/vss/capstone/best_model.h5')

# Specify the input shape with dynamic batch size
input_signature = [tf.TensorSpec([1, 224, 224, 3], tf.float32, name="input")]

# Convert the model to ONNX
onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=input_signature)

# Save the ONNX model
with open("model_mobilenet.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

2024-09-23 22:15:44.367273: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-23 22:15:44.372757: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-23 22:15:44.372968: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

### knowing tensorflow model throughput and accuracy

In [3]:

# Load the TensorFlow model
model = tf.keras.models.load_model('/home/gourav/Desktop/vss/capstone/best_model.h5')

# Set up the test data generator
test_data_generator = ImageDataGenerator(rescale=1.0/255.0)
test_generator = test_data_generator.flow_from_directory(
    '/home/gourav/Desktop/vss/capstone/dataset/test',
    target_size=(224, 224),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)

# Evaluate the TensorFlow model
start_time = time.time()
tf_result = model.evaluate(test_generator)
end_time = time.time()

# Calculate throughput
total_samples = len(test_generator.filenames)
total_time = end_time - start_time
throughput = total_samples / total_time

# Print the results
print("TensorFlow Test Loss:", tf_result[0])
print("TensorFlow Test Accuracy:", tf_result[1])
print("TensorFlow Test Accuracy (%):", tf_result[1]*100)
print("TensorFlow Throughput (samples/second):", throughput)


Found 1269 images belonging to 2 classes.


2024-09-23 22:06:11.463500: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700


TensorFlow Test Loss: 0.07977847754955292
TensorFlow Test Accuracy: 0.9826635122299194
TensorFlow Test Accuracy (%): 98.26635122299194
TensorFlow Throughput (samples/second): 141.78509883080616


## model accuracy and throughput for onnx

In [4]:
# Load the ONNX model
model_path = './model_mobilenet.onnx'
session = onnxruntime.InferenceSession(model_path)

# Get model input details
input_name = session.get_inputs()[0].name
input_shape = session.get_inputs()[0].shape
_, height, width, channels = input_shape

# Set up the test data generator
test_data_generator = ImageDataGenerator(rescale=1.0/255.0)
test_generator = test_data_generator.flow_from_directory(
    '/home/gourav/Desktop/vss/capstone/dataset/test',
    target_size=(height, width),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)

# Evaluate the ONNX model
correct_predictions = 0
total_samples = len(test_generator.filenames)

start_time = time.time()

for i in range(total_samples):
    image, label = test_generator.next()
    
    # Process the single sample
    output = session.run(None, {input_name: image})[0]
    predicted_class = (output > 0.5).astype(int).flatten()
    correct_predictions += int(predicted_class == label)

end_time = time.time()

# Calculate metrics
accuracy = correct_predictions / total_samples
total_time = end_time - start_time
throughput = total_samples / total_time

# Print the results
print("ONNX Test Accuracy:", accuracy)
print("ONNX Test Accuracy (%):", accuracy * 100)
print("ONNX Throughput (samples/second):", throughput)
print("ONNX Total inference time (seconds):", total_time)


Found 1269 images belonging to 2 classes.
ONNX Test Accuracy: 0.9826635145784082
ONNX Test Accuracy (%): 98.26635145784081
ONNX Throughput (samples/second): 218.0456164532443
ONNX Total inference time (seconds): 5.819883108139038


## printing shapes of onnx and tensorfloow


In [None]:
# Load the TensorFlow model
model = tf.keras.models.load_model('/home/gourav/Desktop/vss/capstone/best_model.h5')

# Print input and output shapes and types
print("TensorFlow Model Inputs:")
for layer in model.inputs:
    print(f"Name: {layer.name}, Shape: {layer.shape}, Type: {layer.dtype}")

print("\nTensorFlow Model Outputs:")
for layer in model.outputs:
    print(f"Name: {layer.name}, Shape: {layer.shape}, Type: {layer.dtype}")

# Load the ONNX model
onnx_model_path = '/home/gourav/Desktop/vss/tensorrt/model.onnx'
onnx_model = onnx.load(onnx_model_path)

# Print input and output shapes and types
print("ONNX Model Inputs:")
for input in onnx_model.graph.input:
    input_type = input.type.tensor_type
    shape = [dim.dim_value for dim in input_type.shape.dim]
    print(f"Name: {input.name}, Shape: {shape}, Type: {input_type.elem_type}")

print("\nONNX Model Outputs:")
for output in onnx_model.graph.output:
    output_type = output.type.tensor_type
    shape = [dim.dim_value for dim in output_type.shape.dim]
    print(f"Name: {output.name}, Shape: {shape}, Type: {output_type.elem_type}")



### converting to TensorRT

In [5]:

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path, shape=[1, 224, 224, 3]):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 256 << 20)  # 256 MiB
        
        # Parse ONNX model
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                raise RuntimeError("Failed to parse ONNX model.")
        
        # Set the input shape for the network
        network.get_input(0).shape = shape
        
        # Build the serialized engine
        serialized_engine = builder.build_serialized_network(network, config)
        if serialized_engine is None:
            raise RuntimeError("Failed to build TensorRT engine.")
        
        # Deserialize the engine
        engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(serialized_engine)
        return engine

def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)

def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

# Define paths and parameters
engine_name = "ssd_mobilenet_fp32.plan"
onnx_path = "/home/gourav/Desktop/vss/tensorrt/model_mobilenet.onnx"
batch_size = 1

# Load ONNX model to get input shape
model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

# Extract dimensions from ONNX model
d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size, d0, d1, d2]

# Build and save TensorRT engine
engine = build_engine(onnx_path, shape=shape)
save_engine(engine, engine_name)


[09/23/2024-22:09:55] [TRT] [W] onnx2trt_utils.cpp:377: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.


## Fp16

In [6]:
import tensorrt as trt
from onnx import ModelProto

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path, shape=[1, 224, 224, 3], use_fp16=False, use_int8=False, calibration_data=None):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:
        # Set memory pool limit
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 256 << 20)  # 256 MiB
        
        # Enable FP16 precision
        if use_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        
        # Enable INT8 precision
        if use_int8:
            if calibration_data is None:
                raise ValueError("Calibration data must be provided for INT8 precision.")
            config.set_flag(trt.BuilderFlag.INT8)
            # Create a calibration profile and set it up
            calib = trt.IInt8Calibrator()
            config.int8_calibrator = calib
            # You need to implement a proper calibration method for your dataset

        # Parse ONNX model
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                raise RuntimeError("Failed to parse ONNX model.")
        
        # Set the input shape for the network
        network.get_input(0).shape = shape
        
        # Build the serialized engine
        serialized_engine = builder.build_serialized_network(network, config)
        if serialized_engine is None:
            raise RuntimeError("Failed to build TensorRT engine.")
        
        # Deserialize the engine
        engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(serialized_engine)
        return engine

def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)

def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

# Define paths and parameters
engine_name = "ssd_mobilenet_fp16.plan"
onnx_path = "/home/gourav/Desktop/vss/tensorrt/model_mobilenet.onnx"
batch_size = 1

# Load ONNX model to get input shape
model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

# Extract dimensions from ONNX model
d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size, d0, d1, d2]

# Build and save TensorRT engine with FP16 and INT8 precision
use_fp16 = True  # Set to True to enable FP16 optimization
use_int8 = False  # Set to True to enable INT8 optimization
calibration_data = None  # Provide calibration data for INT8 if enabled

engine = build_engine(onnx_path, shape=shape, use_fp16=use_fp16, use_int8=use_int8, calibration_data=calibration_data)
save_engine(engine, engine_name)


[09/23/2024-22:10:53] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[09/23/2024-22:10:53] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.
[09/23/2024-22:10:53] [TRT] [W] Check verbose logs for the list of affected weights.
[09/23/2024-22:10:53] [TRT] [W] - 23 weights are affected by this issue: Detected subnormal FP16 values.
[09/23/2024-22:10:53] [TRT] [W] - 13 weights are affected by this issue: Detected values less than smallest positive FP16 subnormal value and converted them to the FP16 minimum subnormalized value.


## Inference 

In [7]:
# Load TensorRT engine and allocate buffers
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine
def nptype_fix(trt_type):
    import numpy as np
    mapping = {
        trt.DataType.FLOAT: np.float32,
        trt.DataType.HALF: np.float16,
        trt.DataType.INT8: np.int8,
        trt.DataType.INT32: np.int32,
        trt.DataType.BOOL: np.bool_,  # Use np.bool_ instead of np.bool
    }
    return mapping[trt_type]

# Then modify your allocate_buffers function:
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = nptype_fix(engine.get_binding_dtype(binding))  # Use the new function here
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp['device'], inp['host'], stream) for inp in inputs]
    
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out['host'], out['device'], stream) for out in outputs]
    
    # Synchronize the stream
    stream.synchronize()
    
    # Return the output data.
    return [out['host'] for out in outputs]


In [8]:

# Load the TensorRT engine
engine_path = './ssd_mobilenet_fp32.plan'
trt_runtime = trt.Runtime(TRT_LOGGER)
engine = load_engine(trt_runtime, engine_path)

# Create an execution context
context = engine.create_execution_context()

# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)

# Set up the test data generator
input_shape = engine.get_binding_shape(0)  # Assuming input is at index 0
batch_size, height, width, channels = input_shape
test_data_generator = ImageDataGenerator(rescale=1.0/255.0)
test_generator = test_data_generator.flow_from_directory(
    '/home/gourav/Desktop/vss/capstone/dataset/test',
    target_size=(height, width),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)

# Evaluate the TensorRT model
correct_predictions = 0
total_samples = len(test_generator.filenames)

start_time = time.time()

for i in range(total_samples):
    image, label = test_generator.next()
    
    # Prepare input data
    inputs[0]['host'] = np.ascontiguousarray(image.astype(np.float32))
    
    # Perform inference
    output = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0]
    
    # Process the output
    predicted_class = (output > 0.5).astype(int).flatten()
    correct_predictions += int(predicted_class == label)

end_time = time.time()

# Calculate metrics
accuracy = correct_predictions / total_samples
total_time = end_time - start_time
throughput = total_samples / total_time

# Print the results
print("TensorRT FP32 Test Accuracy:", accuracy)
print("TensorRT FP32 Test Accuracy (%):", accuracy * 100)
print("TensorRT FP32 Throughput (samples/second):", throughput)
print("TensorRT FP32 Total inference time (seconds):", total_time)


[09/23/2024-22:12:01] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
[09/23/2024-22:12:01] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
Found 1269 images belonging to 2 classes.


  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  dtype = nptype_fix(engine.get_binding_dtype(binding))  # Use the new function here
  if engine.binding_is_input(binding):
  input_shape = engine.get_binding_shape(0)  # Assuming input is at index 0


TensorRT FP32 Test Accuracy: 0.9826635145784082
TensorRT FP32 Test Accuracy (%): 98.26635145784081
TensorRT FP32 Throughput (samples/second): 492.4043798744254
TensorRT FP32 Total inference time (seconds): 2.5771501064300537


In [9]:

# Load the TensorRT engine
engine_path = './ssd_mobilenet_fp16.plan'
trt_runtime = trt.Runtime(TRT_LOGGER)
engine = load_engine(trt_runtime, engine_path)

# Create an execution context
context = engine.create_execution_context()

# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)

# Set up the test data generator
input_shape = engine.get_binding_shape(0)  # Assuming input is at index 0
batch_size, height, width, channels = input_shape
test_data_generator = ImageDataGenerator(rescale=1.0/255.0)
test_generator = test_data_generator.flow_from_directory(
    '/home/gourav/Desktop/vss/capstone/dataset/test',
    target_size=(height, width),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)

# Evaluate the TensorRT model
correct_predictions = 0
total_samples = len(test_generator.filenames)

start_time = time.time()

for i in range(total_samples):
    image, label = test_generator.next()
    
    # Prepare input data
    inputs[0]['host'] = np.ascontiguousarray(image.astype(np.float32))
    
    # Perform inference
    output = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0]
    
    # Process the output
    predicted_class = (output > 0.5).astype(int).flatten()
    correct_predictions += int(predicted_class == label)

end_time = time.time()

# Calculate metrics
accuracy = correct_predictions / total_samples
total_time = end_time - start_time
throughput = total_samples / total_time

# Print the results
print("TensorRT FP16 Test Accuracy:", accuracy)
print("TensorRT FP16 Test Accuracy (%):", accuracy * 100)
print("TensorRT FP16 Throughput (samples/second):", throughput)
print("TensorRT FP16 Total inference time (seconds):", total_time)


[09/23/2024-22:12:30] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
[09/23/2024-22:12:30] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
Found 1269 images belonging to 2 classes.


  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  dtype = nptype_fix(engine.get_binding_dtype(binding))  # Use the new function here
  if engine.binding_is_input(binding):
  input_shape = engine.get_binding_shape(0)  # Assuming input is at index 0


TensorRT FP16 Test Accuracy: 0.9818754925137904
TensorRT FP16 Test Accuracy (%): 98.18754925137904
TensorRT FP16 Throughput (samples/second): 683.9236832414038
TensorRT FP16 Total inference time (seconds): 1.8554701805114746


In [8]:
trt.__version__

'8.5.3.1'