# TensorRT

In [2]:
import pathlib
import os
import time
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# How to create the docker run directly in Lambda2
# docker run --gpus all -it --mount type=bind,source=/home/jacob-delgado/Documents/CAPSTONE,target=/workspace/CAPSTONE nvcr.io/nvidia/tensorflow:24.10-tf2-py3

2024-11-08 21:17:37.778348: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 21:17:37.811657: I tensorflow/core/platform/cpu_feature_guard.cc:211] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Benchmarking

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15426006686882890183
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 18310823936
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6426993958775620940
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:41:00.0, compute capability: 8.9"
xla_global_id: 416903419
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 22671196160
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4619222676308665363
physical_device_desc: "device: 1, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:82:00.0, compute capability: 8.9"
xla_global_id: 2144165316
]


2024-11-07 22:54:53.151232: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151401: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151534: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt
print(trt.trt_utils._pywrap_py_utils.get_linked_tensorrt_version())

(10, 5, 0)


### Benchmarking

In [3]:
data_root = pathlib.Path("../data/Monkeypox_Data/Original_Images")    # points to the folder containing the images that will be used for training

# hyperparameters
batch_size = 32         # size of the batch that will be fed to model
img_height = 224        # input image height
img_width = 224         # input image width
test_size = 0.2

# Load dataset without splitting
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,                                  # loads images from the data_root directory
    image_size=(img_height, img_width),         # resizes all images to (224, 224) pixels
    batch_size=batch_size,                      # set the batch size
    shuffle=True                                # shufle data when loaded
)

Found 228 files belonging to 2 classes.


2024-11-08 21:17:41.483219: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:17:41.483391: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:17:41.519819: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

### Inference Setup

In [22]:
# Preprocess dataset (we only need the images for inference)
image_batches = []
for image_batch, _ in dataset:
    image_batches.append(image_batch)

image_batches = np.concatenate(image_batches)  # Flatten batches to get all images
print(f"Total images: {image_batches.shape[0]}")

# Split the data into test subset for benchmarking
_, X_test,  = train_test_split(image_batches, test_size=test_size, random_state=42)

Total images: 228


2024-11-08 21:51:00.443322: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Metrics Setup

In [4]:
image_batches, labels = [], []
for image_batch, label_batch in dataset:
    image_batches.append(image_batch)
    labels.append(label_batch)

image_batches = np.concatenate(image_batches)
labels = np.concatenate(labels)
print(f"Total images: {image_batches.shape[0]} and Labels: {labels.shape[0]}")

train_images_UNUSED, X_test, train_labels_UNUSED, Y_test = train_test_split(
    image_batches, labels, test_size=test_size, random_state=42
)

Total images: 228 and Labels: 228


2024-11-08 21:17:45.322350: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Run inference on original model


In [23]:
# run inference
trials = 1000
inference_times = []

# Load the SavedModel directly
saved_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')

# # Load the SavedModel using TFSMLayer, treating it as a Keras layer
# model_layer = tf.keras.layers.TFSMLayer('../best_model/model1/best_f1score_fold', call_endpoint='serving_default')

# # Wrap the TFSMLayer in a Sequential model for inference
# saved_model = tf.keras.Sequential([model_layer])

# Wrap the SavedModel for inference
def saved_model_call(inputs):
    return saved_model.signatures["serving_default"](inputs)["output_0"]

# Create a Keras Sequential model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
    tf.keras.layers.Lambda(saved_model_call)
])

print(f"Running {trials} inference trials on {len(X_test)} test images...")
for i in range(trials):
    start_time = time.perf_counter()

    # Run inference for a batch of images
    model.predict(X_test, batch_size=batch_size, verbose=0)

    end_time = time.perf_counter()
    inference_time = end_time - start_time
    inference_times.append(inference_time)

    if i % 50 == 0:
        avg_inference = np.mean(inference_times)  # Average inference time per trial
        print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")
        
    tf.keras.backend.clear_session()
        
# Compute throughput (images per second)
total_time = np.sum(inference_times)
throughput = (trials * len(X_test)) / total_time
print(f"Throughput: {throughput:.2f} images/second")

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'sequential_1/dense_1/bias:0' shape=(1,) dtype=float32>
  <tf.Variable 'sequential_1/dense_1/kernel:0' shape=(1280, 1) dtype=float32>
  <tf.Variable 'Conv_1_bn/beta:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1_bn/gamma:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1/kernel:0' shape=(1, 1, 320, 1280) dtype=float32>
  <tf.Variable 'block_16_project_BN/beta:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project_BN/gamma:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project/kernel:0' shape=(1, 1, 960, 320) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/gamma:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise/kernel:0' shape=(3, 3, 960, 1) dtype=float32>
  <tf.Variable 'block_16_expand_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 

KeyboardInterrupt: 

### TensorRT Optimization FP16

In [20]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test'

# Define the conversion parameters
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,                   # You can use FP32 or INT8 if supported
    max_workspace_size_bytes=8000000000                         # 8GB, adjust as per your GPU memory
)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model_dir,
    conversion_params=conversion_params)

# Convert the model
converter.convert()
converter.summary()

# Save the optimized model
converter.save(optimized_model_dir)

INFO:tensorflow:Linked TensorRT version: (10, 5, 0)
INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)


2024-11-08 21:01:33.206442: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:01:33.206646: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:01:33.206748: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-08 21:01:33.206819: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-08 21:01:33.207064: I external/local_xla/xla/stream_executor/cuda/cuda_e

INFO:tensorflow:Clearing prior device assignments in loaded saved model
INFO:tensorflow:Automatic mixed precision will be used on the whole TensorFlow Graph. This behavior can be deactivated using the environment variable: TF_TRT_EXPERIMENTAL_FEATURES=deactivate_mixed_precision.
More information can be found on: https://www.tensorflow.org/guide/mixed_precision.


2024-11-08 21:01:34.498053: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:01:34.498283: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 21:01:34.498390: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-08 21:01:34.498476: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-08 21:01:34.498732: I external/local_xla/xla/stream_executor/cuda/cuda_e

TRTEngineOP Name                 Device        # Nodes # Inputs      # Outputs     Input DTypes       Output Dtypes      Input Shapes       Output Shapes     
TRTEngineOp_000_000              device:GPU:0  304     1             1             ['float32']        ['float32']        [[-1, 224, 224 ... [[-1, 1]]         

	- AddV2: 63x
	- Cast: 4x
	- Const: 126x
	- Conv2D: 35x
	- DepthwiseConv2dNative: 17x
	- MatMul: 1x
	- Mean: 1x
	- Mul: 18x
	- Pad: 4x
	- Relu6: 35x

[*] Total number of TensorRT engines: 1
[*] % of OPs Converted: 99.35% [304/306]

INFO:tensorflow:Could not find TRTEngineOp_000_000 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.
INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


2024-11-08 21:01:34.779195: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: NOT_FOUND: TRTEngineCacheResource not yet created
INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


### TensorRT Optimization INT8

In [None]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt
import tensorflow as tf

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test_INT8'

# Define the conversion parameters
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.INT8,  # Enable INT8 precision
    max_workspace_size_bytes=8000000000       # 8GB, adjust based on your GPU memory
)

# Create a converter
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model_dir,
    conversion_params=conversion_params
)

# Function to provide calibration data (generator)
def calibration_input_fn():
    dataset = tf.keras.utils.image_dataset_from_directory(
        "../data/Monkeypox_Data/Original_Images",
        image_size=(224, 224),  # Fixed image size must match model input
        batch_size=32,
        shuffle=True
    )
    for batch, _ in dataset.take(10):  # Limit to 10 batches for calibration
        yield tf.convert_to_tensor(batch)

# Convert the model and calibrate
converter.convert(calibration_input_fn=calibration_input_fn)

# Explicitly define input shapes for TensorRT during `build()`
print("Building and calibrating the model with fixed input shapes...")
converter.build(input_fn=lambda: [tf.ones((32, 224, 224, 3))])  # Fixed batch size and input shape

# Save the optimized model
converter.save(optimized_model_dir)
print(f"INT8 optimized model saved at {optimized_model_dir}")


### Run Inference on Optimized Model

In [24]:
# Load models
original_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')
optimized_model = tf.saved_model.load('../tensorRT_model/test_INT8')


def measure_inference_time(model, X_test, batch_size, trials=500):
    inference_times = []

    print(f"Running {trials} inference trials on {len(X_test)} test images...")
    num_batches = len(X_test) // batch_size

    for i in range(trials):
        start_time = time.perf_counter()

        # Loop over the batches of X_test
        for j in range(num_batches):
            batch_start = j * batch_size
            batch_end = (j + 1) * batch_size
            batch_images = X_test[batch_start:batch_end]  # Get a batch of images
            inputs = tf.convert_to_tensor(batch_images) # Ensure the batch is in tensor format

            # Run inference for the batch
            model.signatures["serving_default"](inputs)  # Perform inference directly

        end_time = time.perf_counter()
        inference_time = end_time - start_time
        inference_times.append(inference_time)

        if i % 100 == 0:
            avg_inference = np.mean(inference_times)  # Average inference time per trial
            print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")

        tf.keras.backend.clear_session()

    # Compute throughput (images per second)
    total_time = np.sum(inference_times)
    throughput = (trials * len(X_test)) / total_time
    return np.mean(inference_times), throughput

# # Measure inference time for both models
original_avg_time, original_throughput = measure_inference_time(
    original_model, X_test, batch_size=batch_size
)
optimized_avg_time, optimized_throughput = measure_inference_time(
    optimized_model, X_test, batch_size=batch_size
)


Running 500 inference trials on 46 test images...
Step 0: average inference time = 0.504493 seconds
Step 100: average inference time = 0.491925 seconds


KeyboardInterrupt: 

In [None]:
# Print results
print("\nResults:")
print(f"Original Model - Average Inference Time: {original_avg_time:.6f} seconds")
print(f"Original Model - Throughput: {original_throughput:.2f} images/second")
print(f"Optimized Model - Average Inference Time: {optimized_avg_time:.6f} seconds")
print(f"Optimized Model - Throughput: {optimized_throughput:.2f} images/second")

# Compute speedup
speedup_factor = original_avg_time / optimized_avg_time
print(f"\nSpeedup Factor: {speedup_factor:.2f}")



Results:
Optimized Model - Average Inference Time: 0.043120 seconds
Optimized Model - Throughput: 1066.78 images/second


### Run Metrics on Optimized Model

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load models
original_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')
# optimized_model = tf.saved_model.load('../tensorRT_model/test_INT8')

def measure_metrics(model, X_test, Y_test, batch_size):
    print(f"Evaluating metrics on {len(X_test)} test images...")
    
    num_batches = len(X_test) // batch_size
    all_predicted_classes = []

    # Loop over the test dataset in batches
    for j in range(num_batches):
        batch_start = j * batch_size
        batch_end = (j + 1) * batch_size
        batch_images = X_test[batch_start:batch_end]
        inputs = tf.convert_to_tensor(batch_images)  # Convert batch to tensor

        # Run predictions
        result = model.signatures["serving_default"](inputs)  # Inference
        prediction_logits = result["output_0"].numpy()  # Extract logits
        probabilities = tf.nn.sigmoid(prediction_logits).numpy()  # Apply sigmoid
        predicted_classes = (probabilities > 0.5).astype(int)  # Threshold for binary classification

        # Collect predictions
        all_predicted_classes.extend(predicted_classes)

    # Handle any remaining images that don't fit evenly in batches
    remaining_samples = len(X_test) % batch_size
    if remaining_samples > 0:
        batch_images = X_test[-remaining_samples:]
        inputs = tf.convert_to_tensor(batch_images)
        result = model.signatures["serving_default"](inputs)
        prediction_logits = result["output_0"].numpy()
        probabilities = tf.nn.sigmoid(prediction_logits).numpy()
        predicted_classes = (probabilities > 0.5).astype(int)
        all_predicted_classes.extend(predicted_classes)

    # Flatten predictions and labels to ensure they are 1D arrays
    all_predicted_classes = np.array(all_predicted_classes).flatten()
    Y_test = np.array(Y_test).flatten()

    # Ensure the number of predictions matches the number of ground truth labels
    if len(all_predicted_classes) != len(Y_test):
        raise ValueError(f"Number of predicted classes ({len(all_predicted_classes)}) "
                         f"does not match the number of ground truth labels ({len(Y_test)}).")

    # Calculate metrics
    accuracy = accuracy_score(Y_test, all_predicted_classes)
    precision = precision_score(Y_test, all_predicted_classes, average="binary")
    recall = recall_score(Y_test, all_predicted_classes, average="binary")
    f1 = f1_score(Y_test, all_predicted_classes, average="binary")

    # Output metrics
    metrics_dict = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

    print(f"Metrics: {metrics_dict}")
    return metrics_dict

# Measure metrics for both models
original_metrics = measure_metrics(
    model=original_model, 
    X_test=X_test, 
    Y_test=Y_test, 
    batch_size=batch_size
)

# optimized_metrics = measure_metrics(
#     model=optimized_model, 
#     X_test=X_test, 
#     Y_test=Y_test, 
#     batch_size=batch_size
# )


Evaluating metrics on 46 test images...
Metrics: {'accuracy': 0.5652173913043478, 'precision': 0.6071428571428571, 'recall': 0.6538461538461539, 'f1_score': 0.6296296296296297}


In [21]:
print(tf.__version__)

2.16.1


In [15]:
print("\nComparison of Metrics:")
print(f"{'Metric':<12} {'Original Model':<15} {'Optimized Model':<15}")
print("-" * 42)
for metric in original_metrics.keys():
    print(f"{metric:<12} {original_metrics[metric]:<15.4f} {optimized_metrics[metric]:<15.4f}")

# Highlighting model differences
print("\nSummary of Changes:")
for metric in original_metrics.keys():
    change = optimized_metrics[metric] - original_metrics[metric]
    print(f"{metric:<12} Change: {change:.4f}")


Comparison of Metrics:
Metric       Original Model  Optimized Model
------------------------------------------
accuracy     0.5000          0.5000         
precision    0.5517          0.5517         
recall       0.6154          0.6154         
f1_score     0.5818          0.5818         

Summary of Changes:
accuracy     Change: 0.0000
precision    Change: 0.0000
recall       Change: 0.0000
f1_score     Change: 0.0000


In [None]:
# # Load the optimized TensorRT model
# saved_model_loaded = tf.saved_model.load('path/to/save/tensorrt_model')
# infer = saved_model_loaded.signatures['serving_default']

# # Example input data (adjust as per your model's input requirements)
# input_tensor = tf.convert_to_tensor(your_input_data)

# # Run inference
# output = infer(input_tensor)

# # Process output as needed