In [1]:
import os
import time
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

from sklearn.model_selection import train_test_split
from pathlib import Path
import tensorflow as tf

In [2]:
# Function to measure the size of the model on disk
def get_model_size(model_path):
    total_size = os.path.getsize(model_path)
    return total_size / (1024 * 1024)  # Convert to MB

# Helper function to allocate buffers for TensorRT
def allocate_buffers(engine):
    """
    Allocates host and device buffers for TensorRT inference and prepares bindings.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    # Iterate over all tensors in the engine
    for tensor_name in engine:
        shape = engine.get_tensor_shape(tensor_name)  # Get tensor shape
        dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))  # Get tensor data type

        # Handle dynamic shapes by replacing -1 with default batch size
        if -1 in shape:
            shape = (32,) + tuple(-1 if dim == -1 else dim for dim in shape[1:])

        size = trt.volume(shape)  # Compute the size of the buffer
        device_mem = cuda.mem_alloc(size * dtype().itemsize)  # Allocate device memory
        bindings.append(int(device_mem))  # Append device memory pointer to bindings

        # Classify tensor as input or output
        if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
            inputs.append({"host": cuda.pagelocked_empty(size, dtype), "device": device_mem})
        elif engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.OUTPUT:
            outputs.append({"host": cuda.pagelocked_empty(size, dtype), "device": device_mem})

    return inputs, outputs, bindings, stream




# Function to perform inference using TensorRT engine
def infer_with_trt(engine, context, inputs, outputs, bindings, stream, batch_data):
    """
    Perform inference using TensorRT engine with dynamic input shapes.
    """
    # print(f"Batch data shape: {batch_data.shape}")
    # print(f"Expected input shape: {inputs[0]['host'].shape}")
    input_name = engine.get_tensor_name(0)  # Get the name of the first input tensor
    input_shape = batch_data.shape

    # Set the shape for the input tensor
    context.set_input_shape(input_name, input_shape)  # TensorRT 10.x API for setting dynamic shapes

    # Copy input data to the device
    np.copyto(inputs[0]["host"], batch_data.ravel())
    cuda.memcpy_htod_async(inputs[0]["device"], inputs[0]["host"], stream)

    # Execute inference
    context.execute_async_v3(int(stream.handle))  # Pass the stream handle as an integer

    # Copy output data from the device
    cuda.memcpy_dtoh_async(outputs[0]["host"], outputs[0]["device"], stream)
    stream.synchronize()


    return outputs[0]["host"].reshape(input_shape[0], -1)


# Measure inference time
def measure_inference_time(engine_path, X_test, batch_size, trials=50):
    # Load the TensorRT engine
    logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    num_batches = len(X_test) // batch_size
    
    inference_times = []

    # Warm-up phase
    print(f"Running {10} warm-up trials to initialize the engine...")
    for _ in range(10):
        for j in range(num_batches):
            batch_start = j * batch_size
            batch_end = (j + 1) * batch_size
            batch_data = X_test[batch_start:batch_end]
            infer_with_trt(engine, context, inputs, outputs, bindings, stream, batch_data)

    print(f"Running {trials} inference trials on {len(X_test)} test images...")
    for i in range(trials):
        start_time = time.perf_counter()

        # Loop over batches
        for j in range(num_batches):
            batch_start = j * batch_size
            batch_end = (j + 1) * batch_size
            batch_data = X_test[batch_start:batch_end]
            infer_with_trt(engine, context, inputs, outputs, bindings, stream, batch_data)

        end_time = time.perf_counter()
        inference_times.append(end_time - start_time)

        if i % 10 == 0:
            avg_inference = np.mean(inference_times)
            print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")

    # Compute throughput
    total_time = sum(inference_times)
    throughput = (trials * len(X_test)) / total_time
    return np.mean(inference_times), throughput

# Measure metrics
def measure_metrics(engine_path, X_test, Y_test, batch_size):
    # Load the TensorRT engine
    logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    num_batches = len(X_test) // batch_size

    all_predicted_classes = []

    # Perform inference and collect predictions
    print(f"Evaluating metrics on {len(X_test)} test images...")
    for j in range(num_batches):
        batch_start = j * batch_size
        batch_end = (j + 1) * batch_size
        batch_data = X_test[batch_start:batch_end]
        predictions = infer_with_trt(engine, context, inputs, outputs, bindings, stream, batch_data)
        probabilities = 1 / (1 + np.exp(-predictions))  # Sigmoid for binary classification
        predicted_classes = (probabilities > 0.5).astype(int)
        all_predicted_classes.extend(predicted_classes)

    # Handle remaining samples
    remaining_samples = len(X_test) % batch_size
    if remaining_samples > 0:
        batch_data = X_test[-remaining_samples:]
        predictions = infer_with_trt(engine, context, inputs, outputs, bindings, stream, batch_data)
        probabilities = 1 / (1 + np.exp(-predictions))
        predicted_classes = (probabilities > 0.5).astype(int)
        all_predicted_classes.extend(predicted_classes)

    all_predicted_classes = np.array(all_predicted_classes).flatten()
    Y_test = np.array(Y_test).flatten()

    # Compute metrics
    metrics_dict = {
        "accuracy": accuracy_score(Y_test, all_predicted_classes),
        "precision": precision_score(Y_test, all_predicted_classes, average="binary"),
        "recall": recall_score(Y_test, all_predicted_classes, average="binary"),
        "f1_score": f1_score(Y_test, all_predicted_classes, average="binary"),
    }

    print(f"Metrics: {metrics_dict}")
    return metrics_dict

In [3]:
engine_path = "../converted_models/ONNX/fp16_quantized_model.trt"

print(get_model_size(engine_path))

4.706439971923828


In [4]:

data_root = Path("../data/Monkeypox_Data/Original_Images")    # points to the folder containing the images that will be used for training

# hyperparameters
batch_size = 32         # size of the batch that will be fed to model
img_height = 224        # input image height
img_width = 224         # input image width
test_size = 0.14

# Load dataset without splitting
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,                                  # loads images from the data_root directory
    image_size=(img_height, img_width),         # resizes all images to (224, 224) pixels
    batch_size=batch_size,                      # set the batch size
    shuffle=False,                                # shufle data when loaded
    seed=42
)

# normalization_layer = layers.Rescaling(1./255)
# dataset = dataset.map(lambda x, y: (normalization_layer(x), y))

image_batches, labels = [], []
for image_batch, label_batch in dataset:
    image_batches.append(image_batch)
    labels.append(label_batch)

image_batches = np.concatenate(image_batches) # Flatten batches to get all images
labels = np.concatenate(labels)               # Flatten batches to get all labels  
print(f"Total Images: {image_batches.shape[0]} \nTotal Labels: {labels.shape[0]}")

# Split the data into test subset for benchmarking
_, X_test, _, Y_test = train_test_split(image_batches, labels, test_size=test_size, random_state=42)

# Normalize the data
X_test = X_test / 255.0

Found 228 files belonging to 2 classes.


2024-12-07 17:15:48.151633: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-07 17:15:48.224860: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-07 17:15:48.225101: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-07 17:15:48.228004: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-07 17:15:48.228233: I external/local_xla/xla/stream_executor

Total Images: 228 
Total Labels: 228


2024-12-07 17:15:48.764813: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:

inference_time, throughput = measure_inference_time(engine_path, X_test, batch_size)
print("Inference Time: ", inference_time)
print("Throughput Time: ", throughput)

Running 10 warm-up trials to initialize the engine...
[12/07/2024-17:15:49] [TRT] [E] IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) != nullptr. Address is not set for input tensor serving_default_keras_tensor_312:0. Call setInputTensorAddress or setTensorAddress before enqueue/execute.)
[12/07/2024-17:15:49] [TRT] [E] IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) != nullptr. Address is not set for input tensor serving_default_keras_tensor_312:0. Call setInputTensorAddress or setTensorAddress before enqueue/execute.)
[12/07/2024-17:15:49] [TRT] [E] IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) != nullptr. Address is not set for input tensor serving_default_keras_

In [6]:
metrics = measure_metrics(engine_path, X_test, Y_test, batch_size)
print(metrics)

Evaluating metrics on 32 test images...
[12/07/2024-17:15:49] [TRT] [E] IExecutionContext::enqueueV3: Error Code 3: API Usage Error (Parameter check failed, condition: mContext.profileObliviousBindings.at(profileObliviousIndex) != nullptr. Address is not set for input tensor serving_default_keras_tensor_312:0. Call setInputTensorAddress or setTensorAddress before enqueue/execute.)
Metrics: {'accuracy': 0.34375, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
{'accuracy': 0.34375, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
