In [None]:
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

### Check the TensorRT version

In [None]:
# Print the version string of the TensorRT Python package importable by python3.
!python3 -c 'import tensorrt; print("TensorRT version: {}".format(tensorrt.__version__))'

### Prepare the input image and ONNX model file

In [None]:
# Run the quickstart export script; per the section heading it is expected to
# produce the input image and the ONNX model file used below.
!python3 /workspace/TensorRT/quickstart/SemanticSegmentation/export.py

### Build TensorRT engine from the ONNX model

In [None]:
# Build a strongly typed engine from the ONNX model, optimized for a
# 1x3x1026x1282 input, and save it as fcn-resnet101.engine.
# NOTE(review): both file names are relative to the current working directory,
# while a later cell loads /workspace/fcn-resnet101.engine -- confirm the
# notebook is run from /workspace.
!trtexec --onnx=fcn-resnet101.onnx --saveEngine=fcn-resnet101.engine --optShapes=input:1x3x1026x1282 --stronglyTyped

### Import required modules

In [None]:
import numpy as np
import os
import ctypes
from cuda import cudart
import tensorrt as trt

import matplotlib.pyplot as plt
from PIL import Image

# Logger handed to the TensorRT runtime when the engine is deserialized.
TRT_LOGGER = trt.Logger()

# Select GPU 0. The call is made outside the assert so that the device is
# still selected when Python runs with -O (which strips assert statements).
set_device_status, = cudart.cudaSetDevice(0)
assert set_device_status == cudart.cudaError_t.cudaSuccess, \
    "cudaSetDevice(0) failed: {}".format(set_device_status)

# Filenames of TensorRT plan file and input/output images.
engine_file = "/workspace/fcn-resnet101.engine"
input_file  = "/workspace/input.ppm"
output_file = "/workspace/output.ppm"

### Utilities for input / output processing

In [None]:
def preprocess(image):
    """Turn an HWC image into the normalized CHW float32 array the model expects.

    For torchvision models, 8-bit pixel values are scaled into [0, 1] by
    dividing by 255 and then standardized per channel using
    mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].

    Args:
        image: Anything np.asarray accepts whose last axis holds the three
            color channels (height x width x 3).

    Returns:
        A float32 array with the channel axis moved to the front (3 x H x W).
    """
    channel_mean = np.array([0.485, 0.456, 0.406], dtype='float32')
    channel_stddev = np.array([0.229, 0.224, 0.225], dtype='float32')

    pixels = np.asarray(image).astype('float32')
    # Scale to [0, 1]; the per-channel statistics broadcast over the last axis.
    scaled = pixels / 255.0
    normalized = (scaled - channel_mean) / channel_stddev

    # Switch from HWC to CHW order
    return np.moveaxis(normalized, source=2, destination=0)

def postprocess(data):
    """Render an array of class indices as a palettized ('P' mode) PIL image.

    Args:
        data: Array of per-pixel class indices; it is cast to uint8 before
            being wrapped in an image (the caller passes a height x width map).

    Returns:
        A PIL image in mode 'P' whose palette assigns one color per class.
    """
    num_classes = 21
    # Color palette: class i gets the color (palette * i) % 255.
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    class_ids = np.arange(num_classes).reshape(-1, 1)
    colors = (class_ids * palette % 255).astype("uint8")

    # Plot the segmentation predictions for the 21 classes in different colors.
    segmentation = Image.fromarray(data.astype('uint8'), mode='P')
    segmentation.putpalette(colors)
    return segmentation


### Load TensorRT engine

Deserialize the TensorRT engine from the specified plan file.

In [None]:
def load_engine(engine_file_path):
    """Deserialize a TensorRT engine from a serialized plan file.

    Args:
        engine_file_path: Path of the serialized engine (plan) file.

    Returns:
        The engine object produced by the TensorRT runtime.

    Raises:
        AssertionError: If engine_file_path does not exist.
        RuntimeError: If the runtime could not deserialize the file contents.
    """
    assert os.path.exists(engine_file_path), \
        "Engine file not found: {}".format(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # deserialize_cuda_engine signals failure by returning None; fail here with
    # a clear message instead of letting the caller trip over a None engine.
    if engine is None:
        raise RuntimeError(
            "Failed to deserialize TensorRT engine from {}".format(engine_file_path))
    return engine

### Inference pipeline

Starting with a deserialized engine, the TensorRT inference pipeline consists of the following steps:
- Create an execution context and specify the input shape (based on the image dimensions for inference).
- Allocate CUDA device memory for input and output.
- Allocate CUDA page-locked host memory to efficiently copy back the output.
- Transfer the processed image data into input memory using an asynchronous host-to-device CUDA copy.
- Kick off the TensorRT inference pipeline using the asynchronous execute API.
- Transfer the segmentation output back into page-locked host memory using an asynchronous device-to-host CUDA copy.
- Synchronize the stream used for data transfers and inference execution to ensure all operations are complete.
- Finally, write out the segmentation output to an image file for visualization.

In [None]:
def infer(engine, input_file, output_file):
    """Segment one image with a deserialized TensorRT engine and save the result.

    The image is read and preprocessed, copied to the GPU, run through the
    engine on a dedicated CUDA stream, and the prediction is copied back into
    page-locked host memory. The raw class indices are dumped as text to
    'test.out' and the colorized segmentation is written to output_file as PPM.

    All CUDA allocations and the stream are released in a finally block, so
    they are freed even when a step fails part-way through.

    Args:
        engine: Deserialized TensorRT engine (as returned by load_engine).
        input_file: Path of the image to segment.
        output_file: Path the colorized segmentation image is written to.

    Raises:
        AssertionError: If a CUDA runtime call reports an error, the input
            shape is rejected, the engine has no output tensor, or inference
            cannot be enqueued.
    """
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height

    cuda_success = cudart.cudaError_t.cudaSuccess

    with engine.create_execution_context() as context:
        input_buffers = {}
        input_memories = {}
        device_allocations = []   # every cudaMalloc pointer, released in finally
        pinned_allocations = []   # every cudaMallocHost pointer, released in finally
        output_buffer = None
        output_memory = None
        stream = None

        try:
            tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
            input_names = [name for name in tensor_names
                           if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT]
            output_names = [name for name in tensor_names if name not in input_names]

            # Pass 1: fix the input shapes and allocate device memory for inputs.
            # Doing this before touching any output means the output shapes
            # queried below are resolved regardless of tensor enumeration order.
            for tensor in input_names:
                shape_ok = context.set_input_shape(tensor, (1, 3, image_height, image_width))
                assert shape_ok, \
                    "Input shape (1, 3, {}, {}) rejected for tensor {}".format(
                        image_height, image_width, tensor)
                input_buffers[tensor] = np.ascontiguousarray(input_image)
                err, input_memory = cudart.cudaMalloc(input_image.nbytes)
                assert err == cuda_success
                device_allocations.append(input_memory)
                input_memories[tensor] = input_memory
                context.set_tensor_address(tensor, input_memory)

            # Pass 2: allocate page-locked host memory and device memory for outputs.
            # If the engine has several outputs, only the last one is copied back
            # and written out (unchanged behavior); all of them are freed.
            for tensor in output_names:
                size = trt.volume(context.get_tensor_shape(tensor))
                dtype = trt.nptype(engine.get_tensor_dtype(tensor))

                err, output_buffer_ptr = cudart.cudaMallocHost(size * dtype().itemsize)
                assert err == cuda_success
                pinned_allocations.append(output_buffer_ptr)
                # Wrap the raw pinned allocation in a NumPy array without copying.
                pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
                output_buffer = np.ctypeslib.as_array(ctypes.cast(output_buffer_ptr, pointer_type), (size,))

                err, output_memory = cudart.cudaMalloc(output_buffer.nbytes)
                assert err == cuda_success
                device_allocations.append(output_memory)
                context.set_tensor_address(tensor, output_memory)

            assert output_buffer is not None, "Engine exposes no output tensor"

            err, stream = cudart.cudaStreamCreate()
            assert err == cuda_success

            # Transfer input data to the GPU for all input tensors
            for tensor_name, input_buffer in input_buffers.items():
                err, = cudart.cudaMemcpyAsync(input_memories[tensor_name], input_buffer.ctypes.data,
                                              input_buffer.nbytes,
                                              cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
                assert err == cuda_success

            # Run inference; execute_async_v3 reports enqueue failure via its return value.
            enqueued = context.execute_async_v3(stream)
            assert enqueued, "TensorRT failed to enqueue inference"

            # Transfer prediction output from the GPU.
            err, = cudart.cudaMemcpyAsync(output_buffer.ctypes.data, output_memory, output_buffer.nbytes,
                                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
            assert err == cuda_success
            # Synchronize the stream so the host buffer is complete before it is read.
            err, = cudart.cudaStreamSynchronize(stream)
            assert err == cuda_success

            # Dump the raw predictions as space-separated integers to 'test.out'
            # in the current working directory.
            output_d64 = np.array(output_buffer, dtype=np.int64)
            np.savetxt('test.out', output_d64.astype(int), fmt='%i', delimiter=' ', newline=' ')

            with postprocess(np.reshape(output_buffer, (image_height, image_width))) as img:
                print("Writing output image to file {}".format(output_file))
                img.convert('RGB').save(output_file, "PPM")
        finally:
            # Best-effort cleanup of every CUDA resource acquired above.
            for device_ptr in device_allocations:
                cudart.cudaFree(device_ptr)
            for host_ptr in pinned_allocations:
                cudart.cudaFreeHost(host_ptr)
            if stream is not None:
                cudart.cudaStreamDestroy(stream)
        

### Plot input image

In [None]:
# Show the input image; the context manager closes the image file once the
# pixels have been handed to matplotlib.
with Image.open(input_file) as input_preview:
    plt.imshow(input_preview)

### Run inference

In [None]:
# Deserialize the engine, run one inference on input_file and write the
# colorized segmentation to output_file; the engine is used as a context
# manager for the duration of the call.
print("Running TensorRT inference for FCN-ResNet101")
with load_engine(engine_file) as engine:
    infer(engine, input_file, output_file)

### Plot segmentation output

In [None]:
# Show the segmentation written by the inference cell; the context manager
# closes the image file once the pixels have been handed to matplotlib.
with Image.open(output_file) as segmentation_preview:
    plt.imshow(segmentation_preview)