In [None]:
from google.colab import drive
import sys


# Mount Google Drive; the project code, data and weights used below all live under it.
drive.mount('/content/drive')

# Put the project directory on the import path (presumably so the `utils.*` imports
# in a later cell resolve -- confirm against the Drive layout).
# The membership check keeps the cell idempotent: re-running it no longer appends
# the same entry to sys.path again and again.
project_dir = '/content/drive/MyDrive/dnn_model_optimization'
if project_dir not in sys.path:
    sys.path.append(project_dir)

Mounted at /content/drive


In [None]:
!pip install -q torchmetrics torchinfo onnxruntime-gpu onnx onnxsim onnxoptimizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.4/153.4 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m678.1/678.1 kB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from torch.utils.data import DataLoader
from utils.torch_helpers import train_model, validate_model, warmup_torch_model, ctc_loss_log_differentiable_torch
from utils.torch_model import CRNN
from utils.data import decode_texts, load_data, OCRDataset
import torch
from torch import nn
from torchinfo import summary
from torchmetrics.text import CharErrorRate
from copy import deepcopy
import matplotlib.pyplot as plt
import time
import onnxruntime as ort
import numpy as np
from itertools import groupby


# Load the already-split data: each split is ((images, abits), labels); the
# alphabet is returned alongside the two splits.
train_split, val_split, alphabet = load_data(
    '/content/drive/MyDrive/dnn_model_optimization/data',
    split=True,
)
(train_imgs, train_abits), train_labels = train_split
(val_imgs, val_abits), val_labels = val_split

# Wrap the raw arrays into datasets.
train_dataset = OCRDataset(train_imgs, train_abits, train_labels)
val_dataset = OCRDataset(val_imgs, val_abits, val_labels)

# Both loaders share the same settings (fixed batch size, default ordering).
loader_kwargs = dict(batch_size=128)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

In [None]:
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

weights_path = '/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields_.pt'

# Build the network and restore the trained weights onto the selected device.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model = CRNN(len(alphabet))
state_dict = torch.load(weights_path, map_location=torch.device(device))
model.load_state_dict(state_dict)

# This notebook only runs inference/export, so put the model in eval mode explicitly
# instead of relying on the helper functions to do it (a freshly constructed
# nn.Module is in training mode, which changes BatchNorm/Dropout behaviour).
model.eval()

# Layer-level overview for a batch of 32 (image input + per-timestep field input).
summary(model, input_size=[(32, 1, 32, 400), (32, 50, 2)], device=device, depth=1)

Layer (type:depth-idx)                   Output Shape              Param #
CRNN                                     [32, 50, 46]              --
├─Sequential: 1-1                        [32, 256, 1, 50]          425,856
├─LSTM: 1-2                              [32, 50, 256]             528,384
├─LSTM: 1-3                              [32, 50, 256]             526,336
├─Sequential: 1-4                        [32, 50, 46]              11,822
Total params: 1,492,398
Trainable params: 1,492,398
Non-trainable params: 0
Total mult-adds (G): 7.49
Input size (MB): 1.65
Forward/backward pass size (MB): 413.47
Params size (MB): 5.97
Estimated Total Size (MB): 421.09

In [None]:
def _validation_report():
    """Run validate_model on the validation loader and label its rounded results."""
    rounded = [
        round(value, 6)
        for value in validate_model(model, val_loader, alphabet, device=device)
    ]
    return dict(zip(['batch_time', 'loss', 'metric'], rounded))


# Measure once cold, warm the model up, then measure again.
print('Original model before warmup: ', _validation_report())
warmup_torch_model(model, [(32, 1, 32, 400), (32, 50, 2)], device)
print('Original model after warmup: ', _validation_report())

Original model before warmup:  {'batch_time': 0.003233, 'loss': 14.042188, 'metric': 0.049073}
Original model after warmup:  {'batch_time': 0.002996, 'loss': 14.042188, 'metric': 0.049073}


# Model conversion to ONNX format and measurements
1. I had to replace the string value `padding='same'` with the integer `padding=1` so that the ONNX exporter could work properly.
2. I had to specify the `dynamic_axes` argument because of the LSTM layers.

In [None]:
# Grab the two model inputs from the first training batch; a single sample
# (batch of 1) from each is used as the example input for export.
first_batch = next(iter(train_loader))
x1, x2 = first_batch[0]

onnx_path = "/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields.onnx"
input_names = ['image_data', 'field_data']
output_names = ['output']

model.to(device)
example_inputs = tuple(t[0].unsqueeze(0).to(device) for t in (x1, x2))

# Axis 0 of every input and output is declared dynamic so the exported graph
# accepts any batch size, not just the batch of 1 used here.
torch.onnx.export(
    model,
    example_inputs,
    onnx_path,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes={name: {0: 'batch_size'} for name in input_names + output_names},
)



In [None]:
# Step 1: run onnxoptimizer on the exported model and write the result to *_opt.onnx.
!python -m onnxoptimizer "/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields.onnx" "/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields_opt.onnx"
# Step 2: run onnxsim on the optimized model; input and output paths are the same,
# so *_opt.onnx is overwritten in place with the simplified graph.
!onnxsim "/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields_opt.onnx" "/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields_opt.onnx"

[1;35mYour model contains "Tile" ops or/and "ConstantOfShape" ops. Folding these ops can make the [0m
[1;35msimplified model much larger. If it is not expected, please specify "--no-large-tensor" (which will [0m
[1;35mlose some optimization chances)[0m
Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m                  [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add                │ 1              │ 1                │
│ BatchNormalization │ 5              │ 5                │
│ Concat             │ 3              │ 3                │
│ Constant           │ 43             │ 43               │
│ ConstantOfShape    │ 2              │ 2                │
│ Conv               │ 5              │ 5                │
│ Gather             │ 2              │ 2                │
│ LSTM               │ 

In [None]:
# Benchmark the optimized ONNX model with ONNX Runtime, then score it on the
# validation set (loss + character error rate).
# (`groupby` is already imported in the notebook's import cell.)
sess = ort.InferenceSession("/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields_opt.onnx", providers=["CUDAExecutionProvider"])
output_names = [output.name for output in sess.get_outputs()]

batch_size = 128
num_samples = val_imgs.shape[0]

# Convert the inputs once instead of on every loop iteration:
# images get a channel axis inserted at position 1, both inputs become float32.
val_imgs_f32 = np.expand_dims(val_imgs, 1).astype('float32')
val_abits_f32 = val_abits.astype('float32')

# ---- Timing -----------------------------------------------------------------
# Pad with recycled samples so the set splits into full batches only.
# (-N) % B is 0 when N is already a multiple of B, so no extra batch is added;
# the modulo on the indices keeps the padding valid even when N < B.
n = (-num_samples) % batch_size
pad_idx = np.arange(n) % num_samples
val_imgs_batched = np.concatenate([val_imgs_f32, val_imgs_f32[pad_idx]], axis=0)
val_abits_batched = np.concatenate([val_abits_f32, val_abits_f32[pad_idx]], axis=0)

# Batch index goes FIRST: the result is (num_batches, batch_size, ...), so that
# iterating over axis 0 yields real batches of `batch_size` samples.
val_imgs_batched = val_imgs_batched.reshape(-1, batch_size, *val_imgs_f32.shape[1:])
val_abits_batched = val_abits_batched.reshape(-1, batch_size, *val_abits_f32.shape[1:])

# One untimed run so one-off session/provider initialisation is not counted;
# the PyTorch baseline was likewise measured after a warm-up.
sess.run(output_names, {'image_data': val_imgs_batched[0], 'field_data': val_abits_batched[0]})

runs = 10
start = time.perf_counter()
for _ in range(runs):
    for img_batch, abit_batch in zip(val_imgs_batched, val_abits_batched):
        sess.run(output_names, {'image_data': img_batch, 'field_data': abit_batch})
# Mean wall-clock time per batch of `batch_size` samples.
time_spent = (time.perf_counter() - start) / (runs * val_imgs_batched.shape[0])

# ---- Predictions on the real (unpadded) validation set ------------------------
# Stepping by batch_size never produces an empty trailing slice, including when
# num_samples is an exact multiple of batch_size.
y_pred = np.concatenate([
    sess.run(output_names, {'image_data': val_imgs_f32[lo: lo + batch_size],
                            'field_data': val_abits_f32[lo: lo + batch_size]})[0]
    for lo in range(0, num_samples, batch_size)
])

# ---- Loss and CER -------------------------------------------------------------
# Every prediction uses the full output sequence length.
input_lengths = torch.full((y_pred.shape[0],), y_pred.shape[1], dtype=torch.long)
# `val_labels` is rebound to a CPU long tensor (as before); as_tensor accepts an
# array or an existing tensor, so re-running this cell is safe.
val_labels = torch.as_tensor(val_labels, dtype=torch.long).to('cpu')
# Label id 0 is treated as padding: target length = number of non-zero entries.
target_lengths = torch.sum(val_labels != 0, axis=1)

criterion = ctc_loss_log_differentiable_torch
metric = CharErrorRate()

# torch.log assumes the ONNX output holds probabilities (not logits) -- TODO confirm.
loss = criterion(torch.log(torch.as_tensor(y_pred, dtype=torch.float32)), val_labels, input_lengths, target_lengths, device='cpu').item()

# NOTE(review): groupby() also collapses repeated characters in the GROUND TRUTH
# (e.g. "11" -> "1"), which looks unintended for reference texts. Behaviour is kept
# unchanged here so the number stays comparable with the PyTorch baseline; verify
# against how validate_model builds its reference strings.
reference_texts = [''.join(alphabet[k-1] for k, _ in groupby(e) if k != 0) for e in val_labels.cpu().numpy().astype(int)]
cer_value = metric(decode_texts(y_pred, alphabet, 0), reference_texts).item()
print(f'batch_time: {round(time_spent, 6)}, loss: {round(loss, 6)}, metric: {round(cer_value, 6)}')

batch_time: 0.008122, loss: 0.611737, metric: 0.047295


* inference w/ PyTorch: {'batch_time': 0.002996, 'loss': 14.042188, 'metric': 0.049073}
* inference w/ ORT: batch_time: 0.008122, loss: 0.611737, metric: 0.047295