# TensorRT

In [1]:
import pathlib
import os
import time
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# How to create the docker run directly in Lambda2
# docker run --gpus all -it --mount type=bind,source=/home/jacob-delgado/Documents/CAPSTONE,target=/workspace/CAPSTONE nvcr.io/nvidia/tensorflow:24.10-tf2-py3

2024-11-08 00:56:35.257042: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 00:56:35.289649: I tensorflow/core/platform/cpu_feature_guard.cc:211] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Benchmarking

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15426006686882890183
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 18310823936
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6426993958775620940
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:41:00.0, compute capability: 8.9"
xla_global_id: 416903419
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 22671196160
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4619222676308665363
physical_device_desc: "device: 1, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:82:00.0, compute capability: 8.9"
xla_global_id: 2144165316
]


2024-11-07 22:54:53.151232: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151401: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 22:54:53.151534: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt
print(trt.trt_utils._pywrap_py_utils.get_linked_tensorrt_version())

(10, 5, 0)


### Benchmarking

In [10]:
data_root = pathlib.Path("../data/Monkeypox_Data/Original_Images")    # points to the folder containing the images that will be used for training

# hyperparameters
batch_size = 32         # size of the batch that will be fed to model
img_height = 224        # input image height
img_width = 224         # input image width
test_size = 0.2

# Load dataset without splitting
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,                                  # loads images from the data_root directory
    image_size=(img_height, img_width),         # resizes all images to (224, 224) pixels
    batch_size=batch_size,                      # set the batch size
    shuffle=True                                # shufle data when loaded
)

# Preprocess dataset (we only need the images for inference)
image_batches = []
for image_batch, _ in dataset:
    image_batches.append(image_batch)

image_batches = np.concatenate(image_batches)  # Flatten batches to get all images
print(f"Total images: {image_batches.shape[0]}")

# Split the data into test subset for benchmarking
_, X_test = train_test_split(image_batches, test_size=test_size, random_state=42)

Found 228 files belonging to 2 classes.


Total images: 228


2024-11-07 23:04:07.108850: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [2]:
# run inference
trials = 1000
inference_times = []

# Load the SavedModel directly
saved_model = tf.saved_model.load('../best_model/model1/best_f1score_fold')

# Load the SavedModel using TFSMLayer, treating it as a Keras layer
model_layer = tf.keras.layers.TFSMLayer('../best_model/model1/best_f1score_fold', call_endpoint='serving_default')

# Wrap the TFSMLayer in a Sequential model for inference
saved_model = tf.keras.Sequential([model_layer])

# # Wrap the SavedModel for inference
# def saved_model_call(inputs):
#     return saved_model.signatures["serving_default"](inputs)["output_0"]

# # Create a Keras Sequential model
# model = tf.keras.Sequential([
#     tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
#     tf.keras.layers.Lambda(saved_model_call)
# ])

print(f"Running {trials} inference trials on {len(X_test)} test images...")
for i in range(trials):
    start_time = time.perf_counter()

    # Run inference for a batch of images
    saved_model.predict(X_test, batch_size=batch_size, verbose=0)

    end_time = time.perf_counter()
    inference_time = end_time - start_time
    inference_times.append(inference_time)

    if i % 50 == 0:
        avg_inference = np.mean(inference_times)  # Average inference time per trial
        print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")
        
    tf.keras.backend.clear_session()
        
# Compute throughput (images per second)
total_time = np.sum(inference_times)
throughput = (trials * len(X_test)) / total_time
print(f"Throughput: {throughput:.2f} images/second")

2024-11-08 00:56:54.339635: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 00:56:54.339784: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-08 00:56:54.371890: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

AttributeError: module 'tf_keras.api._v2.keras.layers' has no attribute 'TFSMLayer'

### TensorRT Optimization

In [23]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test'

# Load the SavedModel
saved_model_dir = '../best_model/model1/best_f1score_fold'
optimized_model_dir = '../tensorRT_model/test'

# Define the conversion parameters
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,                   # You can use FP32 or INT8 if supported
    max_workspace_size_bytes=8000000000                         # 8GB, adjust as per your GPU memory
)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model_dir,
    conversion_params=conversion_params)

# Convert the model
converter.convert()
converter.summary()

# Save the optimized model
converter.save(optimized_model_dir)

INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Linked TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)


INFO:tensorflow:Loaded TensorRT version: (10, 5, 0)
2024-11-07 23:35:47.339794: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 23:35:47.339993: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 23:35:47.340098: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-07 23:35:47.340204: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-07 23:35:47.340475: 

INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Clearing prior device assignments in loaded saved model


INFO:tensorflow:Automatic mixed precision has been deactivated.


INFO:tensorflow:Automatic mixed precision has been deactivated.
2024-11-07 23:35:48.966673: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 23:35:48.966875: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-07 23:35:48.966982: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 2
2024-11-07 23:35:48.967062: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-11-07 23:35

TRTEngineOP Name                 Device        # Nodes # Inputs      # Outputs     Input DTypes       Output Dtypes      Input Shapes       Output Shapes     
TRTEngineOp_001_000              device:GPU:0  298     1             1             ['float32']        ['float32']        [[-1, 224, 224 ... [[-1, 1]]         

	- AddV2: 63x
	- Const: 125x
	- Conv2D: 35x
	- DepthwiseConv2dNative: 17x
	- MatMul: 1x
	- Mean: 1x
	- Mul: 17x
	- Pad: 4x
	- Relu6: 35x

[*] Total number of TensorRT engines: 1
[*] % of OPs Converted: 99.33% [298/300]

INFO:tensorflow:Could not find TRTEngineOp_001_000 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.


2024-11-07 23:35:59.057630: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: NOT_FOUND: TRTEngineCacheResource not yet created
INFO:tensorflow:Could not find TRTEngineOp_001_000 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.


INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


INFO:tensorflow:Assets written to: ../tensorRT_model/test/assets


### Run Inference on Optimized Model

In [None]:
# Load models
original_model = tf.saved_model.load(saved_model_dir)
optimized_model = tf.saved_model.load(optimized_model_dir)

# Wrap the SavedModel for inference
def saved_model_call_og(inputs):
    return original_model.signatures["serving_default"](inputs)["output_0"]

def saved_model_call_op(inputs):
    return optimized_model.signatures["serving_default"](inputs)["output_0"]

# Create a Keras Sequential model
original_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),  # Replace with your model's input shape
    tf.keras.layers.Lambda(saved_model_call_og)
])

optimized_model = tf.keras.Sequential([tensorflow.kerass=1000):
    inference_times = []

    print(f"Running {trials} inference trials on {len(X_test)} test images...")
    for i in range(trials):
        start_time = time.perf_counter()

        # Run inference for a batch of imagestensorflow.keras
        # Log average inference time every 50 steps
        if i % 50 == 0:
            avg_inference = np.mean(inference_times)
            print(f"Step {i}: average inference time = {avg_inference:.6f} seconds")

        tf.keras.backend.clear_session()

    # Compute throughput (images per second)
    total_time = np.sum(inference_times)
    throughput = (trials * len(X_test)) / total_time
    return np.mean(inference_times), throughput

# Measure inference time for both models
original_avg_time, original_throughput = measure_inference_time(
    original_model, X_test, batch_size=batch_size
)
optimized_avg_time, optimized_throughput = measure_inference_time(
    optimized_model, X_test, batch_size=batch_size
)


The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'sequential_1/dense_1/bias:0' shape=(1,) dtype=float32>
  <tf.Variable 'sequential_1/dense_1/kernel:0' shape=(1280, 1) dtype=float32>
  <tf.Variable 'Conv_1_bn/beta:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1_bn/gamma:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1/kernel:0' shape=(1, 1, 320, 1280) dtype=float32>
  <tf.Variable 'block_16_project_BN/beta:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project_BN/gamma:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project/kernel:0' shape=(1, 1, 960, 320) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/gamma:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise/kernel:0' shape=(3, 3, 960, 1) dtype=float32>
  <tf.Variable 'block_16_expand_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'sequential_1/dense_1/bias:0' shape=(1,) dtype=float32>
  <tf.Variable 'sequential_1/dense_1/kernel:0' shape=(1280, 1) dtype=float32>
  <tf.Variable 'Conv_1_bn/beta:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1_bn/gamma:0' shape=(1280,) dtype=float32>
  <tf.Variable 'Conv_1/kernel:0' shape=(1, 1, 320, 1280) dtype=float32>
  <tf.Variable 'block_16_project_BN/beta:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project_BN/gamma:0' shape=(320,) dtype=float32>
  <tf.Variable 'block_16_project/kernel:0' shape=(1, 1, 960, 320) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise_BN/gamma:0' shape=(960,) dtype=float32>
  <tf.Variable 'block_16_depthwise/kernel:0' shape=(3, 3, 960, 1) dtype=float32>
  <tf.Variable 'block_16_expand_BN/beta:0' shape=(960,) dtype=float32>
  <tf.Variable 

Running 1000 inference trials on 46 test images...


2024-11-08 00:08:53.749742: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


AttributeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 2436, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 2409, in run_step  *
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 2377, in predict_step  *
        return self(x, training=False)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 588, in __call__  *
        return super().__call__(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
        outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/sequential.py", line 397, in call  *
        return super().call(inputs, training=training, mask=mask)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/functional.py", line 514, in call  *
        return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/functional.py", line 671, in _run_internal_graph  *
        outputs = node.layer(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
        outputs = call_fn(inputs, *args, **kwargs)
    File "/tmp/__autograph_generated_filequhoyu6r.py", line 162, in error_handler  **
        raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
    File "/tmp/__autograph_generated_filequhoyu6r.py", line 34, in error_handler
        retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
    File "/tmp/__autograph_generated_fileinhmtgjw.py", line 54, in tf__call  **
        result = ag__.converted_call(ag__.ld(self).function, (ag__.ld(inputs),), dict(**ag__.ld(kwargs)), fscope)
    File "/tmp/__autograph_generated_filexta10ajq.py", line 12, in tf__saved_model_call_og  **
        retval_ = ag__.converted_call(ag__.ld(original_model).signatures['serving_default'], (ag__.ld(inputs),), None, fscope)['output_0']

    AttributeError: Exception encountered when calling layer 'lambda' (type Lambda).
    
    'Sequential' object has no attribute 'signatures'
    
    Call arguments received by layer 'lambda' (type Lambda):
      • inputs=tf.Tensor(shape=(None, 224, 224, 3), dtype=float32)
      • mask=None
      • training=False


In [None]:
optimized_avg_time_fp16, optimized_throughput_fp16 = measure_inference_time(
    optimized_model, X_test_fp16, batch_size=batch_size
)


In [None]:
# Print results
print("\nResults:")
print(f"Original Model - Average Inference Time: {original_avg_time:.6f} seconds")
print(f"Original Model - Throughput: {original_throughput:.2f} images/second")
print(f"Optimized Model - Average Inference Time: {optimized_avg_time:.6f} seconds")
print(f"Optimized Model - Throughput: {optimized_throughput:.2f} images/second")
# print(f"Optimized Model FP16 Input - Average Inference Time: {optimized_avg_time_fp16:.6f} seconds")
# print(f"Optimized Model FP16 Input - Throughput: {optimized_throughput_fp16:.2f} images/second")

# Compute speedup
speedup_factor = original_avg_time / optimized_avg_time
print(f"\nSpeedup Factor: {speedup_factor:.2f}")


In [None]:
# # Load the optimized TensorRT model
# saved_model_loaded = tf.saved_model.load('path/to/save/tensorrt_model')
# infer = saved_model_loaded.signatures['serving_default']

# # Example input data (adjust as per your model's input requirements)
# input_tensor = tf.convert_to_tensor(your_input_data)

# # Run inference
# output = infer(input_tensor)

# # Process output as needed