# TensorRT

In [4]:
import pathlib
import os
import time
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# How to create the docker run directly in Lambda2
# docker run --gpus all -it --mount type=bind,source=/home/jacob-delgado/Documents/CAPSTONE,target=/workspace/CAPSTONE nvcr.io/nvidia/tensorflow:24.10-tf2-py3

### Benchmarking

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15426006686882890183
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 18310823936
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6426993958775620940
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:41:00.0, compute capability: 8.9"
xla_global_id: 416903419
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 22671196160
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4619222676308665363
physical_device_desc: "device: 1, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:82:00.0, compute capability: 8.9"
xla_global_id: 2144165316
]


2024-11-07 22:54:53.151232: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151401: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151534: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt
print(trt.trt_utils._pywrap_py_utils.get_linked_tensorrt_version())

(10, 5, 0)


### Benchmarking

In [5]:
data_root = pathlib.Path("../data/Monkeypox_Data/Original_Images")    # points to the folder containing the images that will be used for training

# hyperparameters
batch_size = 32         # size of the batch that will be fed to model
img_height = 224        # input image height
img_width = 224         # input image width
test_size = 0.2

# Load dataset without splitting
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,                                  # loads images from the data_root directory
    image_size=(img_height, img_width),         # resizes all images to (224, 224) pixels
    batch_size=batch_size,                      # set the batch size
    shuffle=True                                # shufle data when loaded
)

# Preprocess dataset (we only need the images for inference)
image_batches = []
for image_batch, _ in dataset:
    image_batches.append(image_batch)

image_batches = np.concatenate(image_batches)  # Flatten batches to get all images
print(f"Total images: {image_batches.shape[0]}")

# Split the data into test subset for benchmarking
_, X_test = train_test_split(image_batches, test_size=test_size, random_state=42)

Found 228 files belonging to 2 classes.


Total images: 228


2024-11-08 01:34:23.788544: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# run inference
trials = 1000
inference_times = []

# Load the SavedModel directly
saved_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')

# # Load the SavedModel using TFSMLayer, treating it as a Keras layer
# model_layer = tf.keras.layers.TFSMLayer('../best_model/model1/best_f1score_fold', call_endpoint='serving_default')

# # Wrap the TFSMLayer in a Sequential model for inference
# saved_model = tf.keras.Sequential([model_layer])

# Wrap the SavedModel for inference
def saved_model_call(inputs):
    return saved_model.signatures["serving_default"](inputs)["output_0"]

# Create a Keras Sequential model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
    tf.keras.layers.Lambda(saved_model_call)
])

print(f"Running {trials} inference trials on {len(X_test)} test images...")
for i in range(trials):
    start_time = time.perf_counter()

    # Run inference for a batch of images
    model.predict(X_test, batch_size=batch_size, verbose=0)

    end_time = time.perf_counter()
    inference_time = end_time - start_time
    inference_times.append(inference_time)

    if i % 50 == 0:
        avg_inference = np.mean(inference_times)  # Average inference time per trial
        print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")
        
    tf.keras.backend.clear_session()
        
# Compute throughput (images per second)
total_time = np.sum(inference_times)
throughput = (trials * len(X_test)) / total_time
print(f"Throughput: {throughput:.2f} images/second")

### TensorRT Optimization

In [25]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test'

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test'

# Define the conversion parameters
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,                   # You can use FP32 or INT8 if supported
    max_workspace_size_bytes=8000000000                         # 8GB, adjust as per your GPU memory
)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model_dir,
    conversion_params=conversion_params)

# Convert the model
converter.convert()
converter.summary()

# Save the optimized model
converter.save(optimized_model_dir)

INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)
2024-11-08 02:41:14.959551: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:14.959746: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:14.959935: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-08 02:41:14.960173: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-08 02:41:14.960536: 

INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Automatic mixed precision will be used on the whole TensorFlow Graph. This behavior can be deactivated using the environment variable: TF_TRT_EXPERIMENTAL_FEATURES=deactivate_mixed_precision.
More information can be found on: https://www.tensorflow.org/guide/mixed_precision.


INFO:tensorflow:Automatic mixed precision will be used on the whole TensorFlow Graph. This behavior can be deactivated using the environment variable: TF_TRT_EXPERIMENTAL_FEATURES=deactivate_mixed_precision.
More information can be found on: https://www.tensorflow.org/guide/mixed_precision.
2024-11-08 02:41:18.265120: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:18.265363: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:18.265574: I tensorfl

TRTEngineOP Name                 Device        # Nodes # Inputs      # Outputs     Input DTypes       Output Dtypes      Input Shapes       Output Shapes     
TRTEngineOp_003_000              device:GPU:0  304     1             1             ['float32']        ['float32']        [[-1, 224, 224 ... [[-1, 1]]         

	- AddV2: 63x
	- Cast: 4x
	- Const: 126x
	- Conv2D: 35x
	- DepthwiseConv2dNative: 17x
	- MatMul: 1x
	- Mean: 1x
	- Mul: 18x
	- Pad: 4x
	- Relu6: 35x

[*] Total number of TensorRT engines: 1
[*] % of OPs Converted: 99.35% [304/306]

INFO:tensorflow:Could not find TRTEngineOp_003_000 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.


2024-11-08 02:41:18.967039: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: NOT_FOUND: TRTEngineCacheResource not yet created
INFO:tensorflow:Could not find TRTEngineOp_003_000 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.


INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


In [26]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt
import tensorflow as tf

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test_INT8'

# Define the conversion parameters
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.INT8,  # Enable INT8 precision
    max_workspace_size_bytes=8000000000       # 8GB, adjust based on your GPU memory
)

# Create a converter
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model_dir,
    conversion_params=conversion_params
)

# Function to provide calibration data (generator)
def calibration_input_fn():
    dataset = tf.keras.utils.image_dataset_from_directory(
        "../data/Monkeypox_Data/Original_Images",
        image_size=(224, 224),  # Fixed image size must match model input
        batch_size=32,
        shuffle=True
    )
    for batch, _ in dataset.take(10):  # Limit to 10 batches for calibration
        yield tf.convert_to_tensor(batch)

# Convert the model and calibrate
converter.convert(calibration_input_fn=calibration_input_fn)

# Explicitly define input shapes for TensorRT during `build()`
print("Building and calibrating the model with fixed input shapes...")
converter.build(input_fn=lambda: [tf.ones((32, 224, 224, 3))])  # Fixed batch size and input shape

# Save the optimized model
converter.save(optimized_model_dir)
print(f"INT8 optimized model saved at {optimized_model_dir}")


INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)
2024-11-08 02:41:52.507599: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:52.507842: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:52.507973: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-08 02:41:52.508103: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-08 02:41:52.508373: 

INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Automatic mixed precision will be used on the whole TensorFlow Graph. This behavior can be deactivated using the environment variable: TF_TRT_EXPERIMENTAL_FEATURES=deactivate_mixed_precision.
More information can be found on: https://www.tensorflow.org/guide/mixed_precision.


INFO:tensorflow:Automatic mixed precision will be used on the whole TensorFlow Graph. This behavior can be deactivated using the environment variable: TF_TRT_EXPERIMENTAL_FEATURES=deactivate_mixed_precision.
More information can be found on: https://www.tensorflow.org/guide/mixed_precision.
2024-11-08 02:41:55.774077: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:55.774492: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 02:41:55.774613: I tensorfl

Building and calibrating the model with fixed input shapes...
Found 228 files belonging to 2 classes.


2024-11-08 02:42:08.620012: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


INFO:tensorflow:Assets written to: ../tensorRT_model/test_INT8/assets


INFO:tensorflow:Assets written to: ../tensorRT_model/test_INT8/assets


INT8 optimized model saved at ../tensorRT_model/test_INT8


### Run Inference on Optimized Model

In [27]:
# Load models
original_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')
optimized_model = tf.saved_model.load('../tensorRT_model/test_INT8')

# Wrap the SavedModel for inference
# def saved_model_call_og(inputs):
#     return original_model.signatures["serving_default"](inputs)["output_0"]

# def saved_model_call_op(inputs):
#     return optimized_model.signatures["serving_default"](inputs)["output_0"]

# # Create a Keras Sequential model
# original_model = tf.keras.Sequential([
#     tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
#     tf.keras.layers.Lambda(saved_model_call_og)
# ])

# optimized_model = tf.keras.Sequential([
#     tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
#     tf.keras.layers.Lambda(saved_model_call_op)
# ])


def measure_inference_time(model, X_test, batch_size, trials=500):
    inference_times = []

    print(f"Running {trials} inference trials on {len(X_test)} test images...")
    num_batches = len(X_test) // batch_size

    for i in range(trials):
        start_time = time.perf_counter()

        # Loop over the batches of X_test
        for j in range(num_batches):
            batch_start = j * batch_size
            batch_end = (j + 1) * batch_size
            batch_images = X_test[batch_start:batch_end]  # Get a batch of images
            inputs = tf.convert_to_tensor(batch_images)  # Ensure the batch is in tensor format

            # Run inference for the batch
            model.signatures["serving_default"](inputs)  # Perform inference directly

        end_time = time.perf_counter()
        inference_time = end_time - start_time
        inference_times.append(inference_time)

        if i % 100 == 0:
            avg_inference = np.mean(inference_times)  # Average inference time per trial
            print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")

        tf.keras.backend.clear_session()

    # Compute throughput (images per second)
    total_time = np.sum(inference_times)
    throughput = (trials * len(X_test)) / total_time
    return np.mean(inference_times), throughput

# Measure inference time for both models
# original_avg_time, original_throughput = measure_inference_time(
#     original_model, X_test, batch_size=batch_size
# )
optimized_avg_time, optimized_throughput = measure_inference_time(
    optimized_model, X_test, batch_size=batch_size
)


Running 500 inference trials on 46 test images...
Step 0: average inference time = 0.143881 seconds
Step 100: average inference time = 0.112207 seconds
Step 200: average inference time = 0.111715 seconds
Step 300: average inference time = 0.110767 seconds
Step 400: average inference time = 0.110440 seconds


In [28]:
# Print results
print("\nResults:")
# print(f"Original Model - Average Inference Time: {original_avg_time:.6f} seconds")
# print(f"Original Model - Throughput: {original_throughput:.2f} images/second")
print(f"Optimized Model - Average Inference Time: {optimized_avg_time:.6f} seconds")
print(f"Optimized Model - Throughput: {optimized_throughput:.2f} images/second")

# Compute speedup
# speedup_factor = original_avg_time / optimized_avg_time
# print(f"\nSpeedup Factor: {speedup_factor:.2f}")



Results:
Optimized Model - Average Inference Time: 0.110499 seconds
Optimized Model - Throughput: 416.29 images/second


In [None]:
# # Load the optimized TensorRT model
# saved_model_loaded = tf.saved_model.load('path/to/save/tensorrt_model')
# infer = saved_model_loaded.signatures['serving_default']

# # Example input data (adjust as per your model's input requirements)
# input_tensor = tf.convert_to_tensor(your_input_data)

# # Run inference
# output = infer(input_tensor)

# # Process output as needed