#### ДЗ_34  - TensorRT
#### Татур А.А.
##### 13.04.2023

#### Выполнить квантование уже имеющейся модели (в качестве модели взята VGG16 + Attention из предыдущей работы)

In [3]:
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm import tqdm
import matplotlib.pyplot as plt
import onnx, tf2onnx
import tensorrt as trt
import gc
from tensorflow.keras.models import load_model
from tensorflow import saved_model

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

#### Загружаем датасет с изображениями

In [6]:
df = pd.read_csv ('data_UTK.csv')
train, test = train_test_split(df, test_size=0.2, random_state=34)

#### Загружаем предобученную модель

In [29]:
own_model = load_model("own_model.h5")

In [30]:
own_model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_82 (Conv2D)          (None, 128, 128, 64)      1792      
                                                                 
 max_pooling2d_82 (MaxPoolin  (None, 64, 64, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_83 (Conv2D)          (None, 64, 64, 128)       73856     
                                                                 
 max_pooling2d_83 (MaxPoolin  (None, 32, 32, 128)      0         
 g2D)                                                            
                                                                 
 conv2d_84 (Conv2D)          (None, 32, 32, 256)       295168    
                                                                 
 max_pooling2d_84 (MaxPoolin  (None, 16, 16, 256)    

#### Засекаем время предсказания 1000 изображений

In [38]:
NUM_IMAGES = 1000
idx = np.random.randint (0,len(test),NUM_IMAGES)
expected = []
predicted = []

for idx in tqdm(enumerate(idx), desc='Prediction'):
    link = test.iloc[idx[1]]
    image = plt.imread(link[0])
    img_resized = cv2.resize (image, (128,128))/255 #,
    img_reshaped  = img_resized.reshape (1,128,128,3)
    predict = own_model.predict(img_reshaped, verbose=0)[0][0];
    
    expected.append (link[1])
    predicted.append (predict)

Prediction: 1000it [01:23, 11.92it/s]


#### Измерим точность до конвертации. Метрика - RMSE

In [39]:
mean_squared_error (expected, predicted)**0.5

8.67068697345302

In [212]:
del own_model
gc.collect()

134864

#### Конвертируем модель в ONNX формат

In [10]:
!python -m tf2onnx.convert --saved-model tmp_model --output temp.onnx

2023-04-13 15:10:03,326 - INFO - Signatures found in model: [serving_default].
2023-04-13 15:10:03,326 - INFO - Output names: ['dense_51']
2023-04-13 15:10:03,970 - INFO - Using tensorflow=2.10.1, onnx=1.13.0, tf2onnx=1.14.0/8f8d49
2023-04-13 15:10:03,970 - INFO - Using opset <onnx, 15>
2023-04-13 15:10:04,157 - INFO - Computed 0 values for constant folding
2023-04-13 15:10:04,391 - INFO - Optimizing ONNX model
2023-04-13 15:10:04,939 - INFO - After optimization: GlobalMaxPool +1 (0->1), Identity -2 (2->0), ReduceMax -1 (1->0), Squeeze +1 (0->1), Transpose -19 (20->1)
2023-04-13 15:10:04,971 - INFO - 
2023-04-13 15:10:04,971 - INFO - Successfully converted TensorFlow model tmp_model to ONNX
2023-04-13 15:10:04,971 - INFO - Model inputs: ['conv2d_82_input']
2023-04-13 15:10:04,971 - INFO - Model outputs: ['dense_51']
2023-04-13 15:10:04,971 - INFO - ONNX model is saved at temp.onnx


In [11]:
onnx_model = onnx.load_model('temp.onnx');

#### Меняем размер батча и сохраняем модель

In [8]:
BATCH_SIZE=1

In [13]:
inputs = onnx_model.graph.input
for input in inputs:
    dim1 = input.type.tensor_type.shape.dim[0]
    dim1.dim_value = BATCH_SIZE
    
model_name = "own_model.onnx"
onnx.save_model(onnx_model, model_name)
print("Done saving!")

Done saving!


#### Конвертируем в trt. Точность FP32

In [1]:
!trtexec --onnx=own_model.onnx --saveEngine=own_model.trt  --explicitBatch

&&&& RUNNING TensorRT.trtexec [TensorRT v8503] # trtexec --onnx=own_model.onnx --saveEngine=own_model.trt --explicitBatch
[04/13/2023-15:26:01] [I] === Model Options ===
[04/13/2023-15:26:01] [I] Format: ONNX
[04/13/2023-15:26:01] [I] Model: own_model.onnx
[04/13/2023-15:26:01] [I] Output:
[04/13/2023-15:26:01] [I] === Build Options ===
[04/13/2023-15:26:01] [I] Max batch: explicit batch
[04/13/2023-15:26:01] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[04/13/2023-15:26:01] [I] minTiming: 1
[04/13/2023-15:26:01] [I] avgTiming: 8
[04/13/2023-15:26:01] [I] Precision: FP32
[04/13/2023-15:26:01] [I] LayerPrecisions: 
[04/13/2023-15:26:01] [I] Calibration: 
[04/13/2023-15:26:01] [I] Refit: Disabled
[04/13/2023-15:26:01] [I] Sparsity: Disabled
[04/13/2023-15:26:01] [I] Safe mode: Disabled
[04/13/2023-15:26:01] [I] DirectIO mode: Disabled
[04/13/2023-15:26:01] [I] Restricted mode: Disabled
[04/13/2023-15:26:01] [I] Build only: Disabled

[04/13/2023-15:26:01] [W] --explicitBatch flag has been deprecated and has no effect!
[04/13/2023-15:26:01] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[04/13/2023-15:26:06] [W] [TRT] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
[04/13/2023-15:26:06] [W] [TRT] onnx2trt_utils.cpp:377: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[04/13/2023-15:26:15] [W] [TRT] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
[04/13/2023-15:26:19] [W] * GPU compute time is unstable, with coefficient of variance = 8.65841%.
[04/13/2023-15:

#### Конвертируем в trt. Точность FP16

In [2]:
!trtexec --onnx=own_model.onnx --saveEngine=own_model16.trt  --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16


&&&& RUNNING TensorRT.trtexec [TensorRT v8503] # trtexec --onnx=own_model.onnx --saveEngine=own_model16.trt --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
[04/13/2023-16:12:10] [I] === Model Options ===
[04/13/2023-16:12:10] [I] Format: ONNX
[04/13/2023-16:12:10] [I] Model: own_model.onnx
[04/13/2023-16:12:10] [I] Output:
[04/13/2023-16:12:10] [I] === Build Options ===
[04/13/2023-16:12:10] [I] Max batch: explicit batch
[04/13/2023-16:12:10] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[04/13/2023-16:12:10] [I] minTiming: 1
[04/13/2023-16:12:10] [I] avgTiming: 8
[04/13/2023-16:12:10] [I] Precision: FP32+FP16
[04/13/2023-16:12:10] [I] LayerPrecisions: 
[04/13/2023-16:12:10] [I] Calibration: 
[04/13/2023-16:12:10] [I] Refit: Disabled
[04/13/2023-16:12:10] [I] Sparsity: Disabled
[04/13/2023-16:12:10] [I] Safe mode: Disabled
[04/13/2023-16:12:10] [I] DirectIO mode: Disabled
[04/13/2023-16:12:10] [I] Restr

[04/13/2023-16:12:10] [W] --explicitBatch flag has been deprecated and has no effect!
[04/13/2023-16:12:10] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[04/13/2023-16:12:15] [W] [TRT] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
[04/13/2023-16:12:15] [W] [TRT] onnx2trt_utils.cpp:377: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[04/13/2023-16:12:34] [W] [TRT] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[04/13/2023-16:12:34] [W] [TRT] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.
[04/13/2023-16:12:34] [W] [TRT] Check verbose

#### Проверяем работу модели FP32

#### Заружаем trt файл

In [5]:
f = open("own_model.trt", "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) 
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

#### Функция предикта

In [10]:
def predict(batch):
    cuda.memcpy_htod_async(d_input, batch, stream)
    context.execute_async_v2(bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()    
    return output

In [20]:
NUM_IMAGES = 1000
idx = np.random.randint (0,len(test),NUM_IMAGES)
expected = []
predicted = []
for idx in tqdm(enumerate(idx), desc='Prediction'):

    link = test.iloc[idx[1]]
    image = plt.imread(link[0])
    img_resized = cv2.resize (image, (128,128))/255 #,
    img_reshaped  = img_resized.reshape (1,128,128,3)
    
    input_batch = np.array(np.repeat(np.expand_dims(np.array(img_reshaped, dtype=np.float32), axis=0), BATCH_SIZE, axis=0), dtype=np.float32)
       
    output = np.empty([BATCH_SIZE, 1], dtype = np.float32)

    d_input = cuda.mem_alloc(1 * input_batch.nbytes)
    d_output = cuda.mem_alloc(1 * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()
    
    trt_predictions = predict(input_batch).astype(np.float32)
    
    expected.append (link[1])
    predicted.append (trt_predictions[0][0])

Prediction: 1000it [00:04, 219.14it/s]


#### Точность

In [28]:
mean_squared_error (expected, predicted)**0.5

8.49594549477956

#### Проверяем работу модели FP16

In [4]:
f = open("own_model16.trt", "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) 
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

In [13]:
NUM_IMAGES = 1000
idx = np.random.randint (0,len(test),NUM_IMAGES)
expected = []
predicted = []
for idx in tqdm(enumerate(idx), desc='Prediction'):

    link = test.iloc[idx[1]]
    image = plt.imread(link[0])
    img_resized = cv2.resize (image, (128,128))/255 #,
    img_reshaped  = img_resized.reshape (1,128,128,3)
    
    input_batch = 255*np.array(np.repeat(np.expand_dims(np.array(img_reshaped, dtype=np.float32), axis=0), BATCH_SIZE, axis=0), dtype=np.float32)
       
    output = np.empty([BATCH_SIZE, 1], dtype = np.float16)

    d_input = cuda.mem_alloc(1 * input_batch.nbytes)
    d_output = cuda.mem_alloc(1 * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()
    
    trt_predictions = predict(input_batch).astype(np.float32)
    
    expected.append (link[1])
    predicted.append (trt_predictions[0][0])

Prediction: 1000it [00:03, 260.98it/s]


In [14]:
mean_squared_error (expected, predicted)**0.5

37.89230951152473

#### Вывод

Применение TensorRT позволило значительно ускорить работу модели , с более чем 1 минуты на 1000 итераций, до 4 секунд (для Fp32) И до 3 секунд (для FP16). Однако, при применении FP16 отмечается значительное падение точности, тогда как при использовании FP32 Она сохраняется