In [1]:
from google.colab import drive
import sys


drive.mount('/content/drive')

sys.path.append('/content/drive/MyDrive/dnn_model_optimization')

Mounted at /content/drive


In [2]:
!pip install -q tf2onnx onnxruntime-gpu==1.14.1 onnx==1.14.1 onnxsim onnxoptimizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.7/454.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.2/136.2 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m678.1/678.1 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from utils.data import decode_texts, load_data
from copy import deepcopy
import matplotlib.pyplot as plt
import time
import onnxruntime as ort
import numpy as np
from itertools import groupby
import tensorflow as tf
from utils.tf_helpers import CER, CTCLoss, warmup_tf_model
from tqdm import trange
import gc
import onnxruntime as rt


((imgs, abits), labels), alphabet = load_data('/content/drive/MyDrive/dnn_model_optimization/data', split=False, blank_idx=-1)

model = tf.keras.models.load_model('/content/drive/MyDrive/dnn_model_optimization/weights/crnn_common_fields.h5',
                                   custom_objects={'CTCLoss': CTCLoss, 'CER': CER})

model = warmup_tf_model(model, [(128, 50, 2), (128, 32, 400)])

In [4]:
NRUNS = 10
times = list()
BATCH_SIZE = 128

start = time.time()

for i in trange(NRUNS):
  gc.collect()
  y_pred = model.predict([abits, imgs], batch_size=BATCH_SIZE, verbose=0)

print(f'Time spent: {(round((time.time()-start) / NRUNS / BATCH_SIZE, 6))}')

loss = CTCLoss(labels, y_pred)
cer = CER()
cer.update_state(labels, y_pred)

print(f'loss: {round(tf.reduce_mean(loss).numpy(), 6)}, Character Error Rate: {round(cer.result().numpy(), 6)}')

100%|██████████| 10/10 [01:16<00:00,  7.70s/it]


Time spent: 0.060159
loss: 0.2938440144062042, Character Error Rate: 0.0022150001022964716


# Model conversion and measurements

In [9]:
from tf2onnx.convert import from_keras

spec = (tf.TensorSpec((None, 50, 2), tf.float32, name="field_data"),
        tf.TensorSpec((None, 32, 400), tf.float32, name="image_data"),)

output_path = "/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx.onnx"

model_proto, external_tensor_storage = from_keras(model, input_signature=spec, output_path=output_path)



In [None]:
del model
gc.collect()

In [19]:
gc.collect()

sess = rt.InferenceSession(output_path, providers=['CUDAExecutionProvider'])

output_names = [output.name for output in sess.get_outputs()]

batch_size = 128
n = batch_size - imgs.shape[0] % batch_size

imgs_batched = imgs.copy()
abits_batched = abits.copy()

imgs_batched = np.concatenate([imgs_batched, imgs_batched[:n]], axis=0)
abits_batched = np.concatenate([abits_batched, abits_batched[:n]], axis=0)

imgs_batched = np.expand_dims(imgs_batched, 1).astype('float32').reshape(batch_size, -1, 32, 400)
abits_batched = abits_batched.astype('float32').reshape(batch_size, -1, 50, 2)

runs = 10

start = time.time()

for i in range(runs):
    y_pred = list()
    for i in range(imgs_batched.shape[0]):
        sess.run(output_names, {'image_data': imgs_batched[i], 'field_data': abits_batched[i]})
time_spent = (time.time()-start) / imgs_batched.shape[0] / runs

y_pred = list()
for i in range(1, imgs.shape[0] // batch_size + 2):
    y_pred.append(sess.run(output_names, {'image_data': imgs.astype('float32')[(i-1)*batch_size: i*batch_size],
                                          'field_data': abits[(i-1)*batch_size: i*batch_size].astype('float32')})[0])
y_pred = np.concatenate(y_pred)

loss = CTCLoss(labels, y_pred)
cer = CER()
cer.update_state(labels, y_pred)

print(f'batch_time: {round(time_spent, 6)}, loss: {round(tf.reduce_mean(loss).numpy(), 6)}, metric: {round(cer.result().numpy(), 6)}')

batch_time: 0.029454, loss: 0.2938440144062042, metric: 0.0022150001022964716


# Optimization with ONNXOptimizer and ONNXSimplifier

In [20]:
!python -m onnxoptimizer "/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx.onnx" "/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx_opt.onnx"
!onnxsim "/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx_opt.onnx" "/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx_opt.onnx"

Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m          [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add        │ 1              │ 1                │
│ Cast       │ 8              │ [1;32m6               [0m │
│ Concat     │ 5              │ [1;32m4               [0m │
│ Constant   │ 29             │ [1;32m28              [0m │
│ Conv       │ 5              │ 5                │
│ Expand     │ 2              │ 2                │
│ Gather     │ 2              │ [1;32m1               [0m │
│ LSTM       │ 2              │ 2                │
│ LeakyRelu  │ 5              │ 5                │
│ MatMul     │ 1              │ 1                │
│ MaxPool    │ 5              │ 5                │
│ Reshape    │ 4              │ 4                │
│ Shape      │ 4              │ [1;32m3               [0m │
│

In [21]:
gc.collect()

sess = rt.InferenceSession("/content/drive/MyDrive/dnn_model_optimization/weights/common_segments_crnn_tf2onnx_opt.onnx", providers=['CUDAExecutionProvider'])

output_names = [output.name for output in sess.get_outputs()]

batch_size = 128
n = batch_size - imgs.shape[0] % batch_size

imgs_batched = imgs.copy()
abits_batched = abits.copy()

imgs_batched = np.concatenate([imgs_batched, imgs_batched[:n]], axis=0)
abits_batched = np.concatenate([abits_batched, abits_batched[:n]], axis=0)

imgs_batched = np.expand_dims(imgs_batched, 1).astype('float32').reshape(batch_size, -1, 32, 400)
abits_batched = abits_batched.astype('float32').reshape(batch_size, -1, 50, 2)

runs = 10

start = time.time()

for i in range(runs):
    y_pred = list()
    for i in range(imgs_batched.shape[0]):
        sess.run(output_names, {'image_data': imgs_batched[i], 'field_data': abits_batched[i]})
time_spent = (time.time()-start) / imgs_batched.shape[0] / runs

y_pred = list()
for i in range(1, imgs.shape[0] // batch_size + 2):
    y_pred.append(sess.run(output_names, {'image_data': imgs.astype('float32')[(i-1)*batch_size: i*batch_size],
                                          'field_data': abits[(i-1)*batch_size: i*batch_size].astype('float32')})[0])
y_pred = np.concatenate(y_pred)

loss = CTCLoss(labels, y_pred)
cer = CER()
cer.update_state(labels, y_pred)

print(f'batch_time: {round(time_spent, 6)}, loss: {round(tf.reduce_mean(loss).numpy(), 6)}, metric: {round(cer.result().numpy(), 6)}')

batch_time: 0.028541, loss: 0.2938440144062042, metric: 0.0022150001022964716


In [None]:
from google.colab import runtime

runtime.unassign()