## TensorRT
TensorRT is a library developed by NVIDIA.
It enables efficient inference for neural network models.
The library provides many tools, such as model optimization and pruning.
This functionality is available for many popular DNN frameworks, such as PyTorch and TensorFlow.

This notebook demonstrates how to use the library, with semantic segmentation as the example task.

The goal of semantic segmentation is to assign a class (in some cases, several classes) to each pixel of an image.

Pixels that are part of objects of the same type/class should be assigned the same class.

For a single image, the target is therefore either an array of the same size as the image holding one class ID per pixel, or n boolean arrays, each representing a separate class.

Your task is to design a deep neural network architecture that achieves
a Binary Accuracy higher than 0.99 on the validation dataset.

Install required packages (if not installed):

In [None]:
# TensorRT Python packages (nvidia-pyindex is installed first; presumably it registers NVIDIA's package index — confirm) plus TensorFlow.
!pip install nvidia-pyindex nvidia-tensorrt tensorflow
# Torch-TensorRT pinned to 1.3.0; --find-links lets pip resolve it from the GitHub release assets.
!pip install torch-tensorrt==1.3.0 --find-links https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.3.0
# Dataset helper package installed straight from GitHub; --no-deps leaves its dependencies untouched.
!pip install git+https://github.com/LukeTonin/simple-deep-learning --no-deps

In [None]:
from simple_deep_learning.mnist_extended.semantic_segmentation import create_semantic_segmentation_dataset
import matplotlib.pyplot as plt
import numpy as np
import torch
import local_utils as lu

In [None]:
# Batching and dataset sizes; sample counts are whole multiples of the batch size.
BATCH_SIZE = 32
NUM_OF_CLASSES = 10
NUM_OF_TRAINING_SAMPLES = 32 * BATCH_SIZE
NUM_OF_TEST_SAMPLES = 4 * BATCH_SIZE

# Input geometry: single-channel 64x64 images.
INPUT_CHANNELS = 1
INPUT_HEIGHT = 64
INPUT_WIDTH = 64

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Training split: seeded for reproducibility; only the first two return values are kept.
np.random.seed(0)
train_x, train_y, _, _ = create_semantic_segmentation_dataset(
    NUM_OF_TRAINING_SAMPLES, 1, (INPUT_WIDTH, INPUT_HEIGHT), 2, 9, NUM_OF_CLASSES, 0.1
)

# Test split: generated under a different seed; only the last two return values are kept.
np.random.seed(5)
_, _, test_x, test_y = create_semantic_segmentation_dataset(
    1, NUM_OF_TEST_SAMPLES, (INPUT_WIDTH, INPUT_HEIGHT), 2, 9, NUM_OF_CLASSES, 0.1
)

# Move the last axis to position 1 and convert every array to a float32 tensor.
train_x, train_y, test_x, test_y = (
    torch.from_numpy(np.moveaxis(array, -1, 1)).to(torch.float32)
    for array in (train_x, train_y, test_x, test_y)
)

# One shape per line, in the order: train_y, train_x, test_y, test_x.
print(*(tensor.shape for tensor in (train_y, train_x, test_y, test_x)), sep="\n")

# Batched loaders for training/testing and a batch-size-1 loader for timing runs.
train_loader = lu.CustomDataLoader(train_x, train_y, batch_size=BATCH_SIZE)
test_loader = lu.CustomDataLoader(test_x, test_y, batch_size=BATCH_SIZE)
eval_loader = lu.CustomDataLoader(test_x, test_y, batch_size=1)

In [None]:
# Preview up to four samples of the first training batch, each followed by its per-class masks.
for X, Y in train_loader:
    preview_count = min(4, train_loader.batch_size)
    for sample_idx in range(preview_count):
        plt.gray()
        plt.imshow(X[sample_idx][0])
        plt.show()

        class_masks = Y[sample_idx]
        fig, axes = plt.subplots(1, class_masks.shape[0], figsize=(20, 2))
        # Normalise whatever plt.subplots returned into a flat list of axes.
        axes = np.asarray(axes).ravel().tolist()
        for class_id, (axis, mask) in enumerate(zip(axes, class_masks)):
            axis.imshow(mask)
            axis.set_title(str(class_id))
        plt.show()
    break  # only the first batch is displayed

In [None]:
# Loss and metric shared by the training and evaluation cells.
criterion = lu.BinaryCrossEntropyLoss(1)
metric = lu.BinaryAccuracy()

# Alternative architecture, kept for experimentation:
# net = lu.UNet(INPUT_CHANNELS, NUM_OF_CLASSES, True)
net = lu.SimpleSegmenter(
    (INPUT_CHANNELS, INPUT_HEIGHT, INPUT_WIDTH),
    num_of_classes=NUM_OF_CLASSES,
)
optimizer = torch.optim.Adam(params=net.parameters(), lr=1e-4)

In [None]:
# Run the training routine with the train/test loaders, then plot the history it returns.
training_options = {"update_period": 2, "epoch_max": 60, "device": device}
net, history = lu.training(
    net, train_loader, test_loader, criterion, metric, optimizer, **training_options
)
lu.plot_history(history)

In [None]:
def evaluate(model, dtype=None, device=None, repeat=30):
    """Run a timed test pass of ``model`` over ``eval_loader`` and print the results.

    The module-level ``eval_loader`` is modified in place: its ``data`` and
    ``labels`` are converted to ``dtype``/``device`` before the timed region
    starts, and the converted tensors stay on the loader for later calls.

    Args:
        model: Model forwarded to ``lu.train_test_pass``.
        dtype: Target dtype for the evaluation data and labels. ``None`` leaves
            the dtype unchanged (assumes they are torch tensors).
        device: Target device for the evaluation data and labels, also forwarded
            to ``lu.train_test_pass``. ``None`` leaves the tensors where they are.
        repeat: Forwarded to ``lu.train_test_pass``; also scales the count given
            to ``lu.TimeMeasurement``.

    Returns:
        None. The loss, the accuracy and the timing summary are printed.
    """
    # Count handed to the timer: batches * batch size * repeats
    # (presumably used for per-sample timing; confirm in local_utils).
    tm = lu.TimeMeasurement("", len(eval_loader) * eval_loader.batch_size * repeat)

    # Convert outside the timed region. Keyword arguments make the None defaults
    # explicit no-ops instead of relying on positional overload resolution.
    eval_loader.data = eval_loader.data.to(device=device, dtype=dtype)
    eval_loader.labels = eval_loader.labels.to(device=device, dtype=dtype)

    with tm:
        # The first return value is not needed here.
        _, loss, acc = lu.train_test_pass(model,
                                          eval_loader,
                                          criterion,
                                          metric,
                                          optimizer=None,
                                          update_period=None,
                                          mode='test',
                                          device=device,
                                          repeat=repeat)
    print("loss =", loss)
    # The metric value was previously printed under the label "loss =".
    print("acc =", acc)
    print(str(tm))

In [None]:
def _show_class_maps(class_maps):
    """Draw one subplot per entry along the first axis of ``class_maps``, titled by its index."""
    fig, axes = plt.subplots(1, class_maps.shape[0], figsize=(20, 2))
    axes = np.asarray(axes).ravel().tolist()
    for class_id, (axis, class_map) in enumerate(zip(axes, class_maps)):
        axis.imshow(class_map)
        axis.set_title(str(class_id))
    plt.show()


# For up to eight samples of the first training batch, show the input image,
# then the target maps, then the network's predicted maps.
for X, Y in train_loader:
    net.eval()
    with torch.no_grad():
        Y_pred = net(X.to(device)).cpu()
    for sample_idx in range(min(8, train_loader.batch_size)):
        plt.gray()
        plt.imshow(X[sample_idx][0])
        plt.show()
        _show_class_maps(Y[sample_idx])       # targets
        _show_class_maps(Y_pred[sample_idx])  # predictions
    break  # only the first batch is displayed

In [None]:
# Alias for the network returned by training; the tracing and scripting cells below refer to it as `model`.
model = net

### Basic PyTorch

In [None]:
# Baseline timing: the plain PyTorch module with float32 data on the GPU.
evaluate(net, dtype=torch.float32, device="cuda")

### Trace model

In [None]:
# Trace with an example input of the real inference shape (1 x C x H x W), matching
# the batch-size-1 evaluation data and the input spec used for the TensorRT compile,
# instead of a hard-coded 32x32 placeholder.
traced_model = torch.jit.trace(
    model,
    torch.empty([1, INPUT_CHANNELS, INPUT_HEIGHT, INPUT_WIDTH]).to("cuda"),
)
# Bare expression: the notebook displays the traced module.
traced_model

In [None]:
# Time the traced module on the GPU; dtype is not passed, so evaluate()'s default (None) is used.
evaluate(traced_model, device=torch.device('cuda'))

### Script model

In [None]:
# Compile the model with TorchScript scripting (the previous section used tracing).
script_model = torch.jit.script(model)
# Bare expression: the notebook displays the scripted module.
script_model

In [None]:
# Time the scripted module; dtype and device fall back to evaluate()'s defaults (None).
evaluate(script_model)

### Compile with TensorRT

In [None]:
import torch_tensorrt

dtype = torch.float32
# Compile the traced module for a fixed batch size of 1 in float32 precision.
# The input dtype and the enabled precision both follow `dtype`, so changing
# that single variable keeps them consistent.
trt_ts_module = torch_tensorrt.compile(
    traced_model,
    inputs=[
        torch_tensorrt.Input((1, INPUT_CHANNELS, INPUT_HEIGHT, INPUT_WIDTH), dtype=dtype)
    ],
    enabled_precisions={dtype},
)

# Smoke-test the compiled module on a random input of the same shape and dtype.
input_data = torch.randn((1, INPUT_CHANNELS, INPUT_HEIGHT, INPUT_WIDTH))
input_data = input_data.to(dtype).to("cuda")

result = trt_ts_module(input_data)
# Save the compiled module to disk as TorchScript.
torch.jit.save(trt_ts_module, "trt_ts_module.ts")

In [None]:
# Time the TensorRT-compiled module with the same dtype it was compiled for.
evaluate(trt_ts_module, dtype=dtype, device="cuda")

## PTQ

In [None]:
# NOT SUPPORTED ON HOST DEVICE
# calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(eval_loader,
#                                                      use_cache=False,
#                                                      algo_type=torch_tensorrt.ptq.CalibrationAlgo.MINMAX_CALIBRATION,
#                                                      device=torch.device('cuda:0'))
# compile_spec = {
#          "inputs": [torch_tensorrt.Input([1, INPUT_CHANNELS, INPUT_HEIGHT, INPUT_WIDTH])],
#          "enabled_precisions": torch.int8,
#          "calibrator": calibrator,
#          "truncate_long_and_double": True
         
#      }
# trt_ptq = torch_tensorrt.compile(model, **compile_spec)


In [None]:
# `trt_ptq` only exists if the PTQ compilation cell has been enabled and run;
# calling evaluate() unconditionally would raise NameError otherwise.
if "trt_ptq" in globals():
    evaluate(trt_ptq, None, "cuda:0")
else:
    print("Skipping PTQ evaluation: trt_ptq is not defined (PTQ compilation is disabled).")