# TensorRT

In [1]:
import pathlib
import os
import time
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers

# How to start the Docker container directly on the Lambda2 machine:
# docker run --gpus all -it --mount type=bind,source=/home/jacob-delgado/Documents/CAPSTONE,target=/workspace/CAPSTONE nvcr.io/nvidia/tensorflow:24.10-tf2-py3

2024-11-13 20:29:57.727278: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-13 20:29:57.738266: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-13 20:29:57.748982: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-13 20:29:57.752176: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 20:29:57.761698: I tensorflow/core/platform/cpu_feature_guar

### Environment check

In [2]:
from tensorflow.python.client import device_lib

# Enumerate the devices TensorFlow can see in this kernel and show them.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12044829441102514942
xla_global_id: -1
]


I0000 00:00:1731544198.937166 1122414 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731544198.937338 1122414 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-13 20:29:58.970463: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [3]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Ask TF-TRT which TensorRT version it reports as linked and print the result.
linked_trt_version = trt.trt_utils._pywrap_py_utils.get_linked_tensorrt_version()
print(linked_trt_version)

(8, 6, 1)


### Benchmarking

In [4]:
# Folder containing the images that will be used for training.
data_root = pathlib.Path("../data/Monkeypox_Data/Original_Images")

# hyperparameters
batch_size = 32                     # number of images per batch fed to the model
img_height, img_width = 224, 224    # every image is resized to this (height, width)
test_size = 0.2                     # NOTE(review): not used in this cell; presumably a hold-out fraction -- confirm

# Load the whole directory as a single dataset (no train/validation split here).
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,
    shuffle=True,                               # shuffle the data when it is loaded
    batch_size=batch_size,
    image_size=(img_height, img_width),
)

# Pixel rescaling is currently disabled; the two lines below would enable it.
# normalization_layer = layers.Rescaling(1./255)
# dataset = dataset.map(lambda x, y: (normalization_layer(x), y))

Found 228 files belonging to 2 classes.


I0000 00:00:1731544198.995009 1122414 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731544198.995272 1122414 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-13 20:29:58.995405: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Location of the SavedModel that the TF-TRT converters below take as input.
saved_model_dir = "../best_model/model1/best_f1score_fold"

### TensorRT Optimization FP32

In [6]:
fp32_optimized_model_dir = "../tensorRT_model/fp32"

# Start from the TF-TRT default conversion parameters and override only the
# precision mode and the workspace size.
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP32,
    max_workspace_size_bytes=8_000_000_000,     # 8 GB (decimal); adjust to the GPU's memory
)
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params,
    input_saved_model_dir=saved_model_dir,
)

# Convert the model, then print the converter's summary of the result.
# converter.build() is left disabled in this cell.
converter.convert()
converter.summary()

# Write the converted model to disk and reset the Keras session state.
converter.save(fp32_optimized_model_dir)
tf.keras.backend.clear_session()

ERROR:tensorflow:Tensorflow needs to be built with TensorRT support enabled to allow TF-TRT to operate.




RuntimeError: Tensorflow has not been built with TensorRT support.

### TensorRT Optimization FP16

In [None]:
fp16_optimized_model_dir = "../tensorRT_model/fp16"

# Start from the TF-TRT default conversion parameters and override only the
# precision mode (FP16 here) and the workspace size.
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,
    max_workspace_size_bytes=8_000_000_000,     # 8 GB (decimal); adjust to the GPU's memory
)
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params,
    input_saved_model_dir=saved_model_dir,
)

# Convert the model, then print the converter's summary of the result.
# converter.build() is left disabled in this cell.
converter.convert()
converter.summary()

# Write the converted model to disk and reset the Keras session state.
converter.save(fp16_optimized_model_dir)
tf.keras.backend.clear_session()

### TensorRT Optimization INT8

In [None]:
int8_optimized_model_dir = "../tensorRT_model/int8"

# Start from the TF-TRT default conversion parameters and override only the
# precision mode (INT8 here) and the workspace size.
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    max_workspace_size_bytes=8_000_000_000,     # 8 GB (decimal); adjust to the GPU's memory
    precision_mode=trt.TrtPrecisionMode.INT8,
)

# Converter that reads the SavedModel at saved_model_dir.
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params,
    input_saved_model_dir=saved_model_dir,
)

# Function to provide calibration data (generator)
def calibration_input_fn(calibration_batch_size=32):
    """Yield image batches for TF-TRT INT8 calibration.

    Images are read from the same directory as the rest of the notebook,
    resized to 224x224 and divided by 255. Each yielded item is a 1-tuple
    ``(images,)`` because TF-TRT unpacks the yielded value into the positional
    inputs of the converted function; a tuple is accepted by every TF-TRT
    version, whereas a bare tensor is not.

    Args:
        calibration_batch_size: Number of images per calibration batch. Every
            yielded batch has exactly this many images; a trailing partial
            batch is dropped, since TF-TRT expects all calibration inputs to
            share one shape. The previous value of 156 produced one batch of
            156 and one of 72 for the 228 images found in this directory.

    Yields:
        tuple: ``(images,)`` with ``images`` of shape
        ``(calibration_batch_size, 224, 224, 3)``.

    Raises:
        ValueError: If the directory holds fewer images than one full batch,
            so no calibration data would be produced.
    """
    # Load unbatched so the data can be re-batched with drop_remainder=True;
    # image_dataset_from_directory itself keeps the ragged final batch.
    calibration_ds = tf.keras.utils.image_dataset_from_directory(
        "../data/Monkeypox_Data/Original_Images",
        image_size=(224, 224),  # same spatial size as the shape passed to build()
        batch_size=None,
        shuffle=True,
    ).batch(calibration_batch_size, drop_remainder=True)

    yielded_batches = 0
    for images, _ in calibration_ds:
        # NOTE(review): the main data pipeline in this notebook has its
        # Rescaling(1./255) step commented out. Calibration data must be
        # preprocessed exactly like inference data -- confirm whether the
        # model expects [0, 1] or [0, 255] inputs.
        yield (images / 255.0,)
        yielded_batches += 1

    if yielded_batches == 0:
        raise ValueError(
            "No calibration batches were produced: the image directory has "
            f"fewer than calibration_batch_size={calibration_batch_size} images."
        )

# Convert the model; calibration_input_fn supplies the INT8 calibration data.
converter.convert(calibration_input_fn=calibration_input_fn)


def _build_input_fn():
    """Yield one dummy input of shape (1, 224, 224, 3) for converter.build().

    build() expects a generator function. The value is yielded as a 1-tuple so
    it is unpacked into the single positional input on every TF-TRT version
    (older versions iterate over whatever is yielded).
    """
    yield (tf.ones((1, 224, 224, 3)),)


# Explicitly define input shapes for TensorRT during `build()`
# (fixed batch size of 1 and fixed 224x224x3 input).
print("Building and calibrating the model with fixed input shapes...")
converter.build(input_fn=_build_input_fn)

# Save the optimized model and reset the Keras session state.
converter.save(int8_optimized_model_dir)
print(f"INT8 optimized model saved at {int8_optimized_model_dir}")
tf.keras.backend.clear_session()
