# **LPRNet Model**

In [3]:
!git clone https://github.com/sirius-ai/LPRNet_Pytorch.git
%cd LPRNet_Pytorch

Cloning into 'LPRNet_Pytorch'...
remote: Enumerating objects: 1071, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 1071 (delta 25), reused 22 (delta 22), pack-reused 1037 (from 1)[K
Receiving objects: 100% (1071/1071), 20.04 MiB | 9.92 MiB/s, done.
Resolving deltas: 100% (35/35), done.
/content/LPRNet_Pytorch


In [None]:
!ls

data  LICENSE  model  README.md  test_LPRNet.py  train_LPRNet.py  weights


Overwriting cv2.destroyAllWindows()

In [None]:
file_path = "test_LPRNet.py"
with open(file_path, "r") as file:
    lines = file.readlines()

with open(file_path, "w") as file:
    inside_finally = False
    for line in lines:
        if line.strip() == 'finally:':
            inside_finally = True
            file.write(line)  # write the finally line
        elif inside_finally and line.strip() == '':
            inside_finally = False  # end of finally block
            file.write(line)
        elif inside_finally:
            if "cv2.destroyAllWindows()" in line:
                # Replace cv2.destroyAllWindows() with pass, ensuring proper indentation
                file.write('         pass\n')  # Ensuring 'pass' is indented correctly
            else:
                # Indent other lines inside finally block
                file.write('    ' + line)
        else:
            file.write(line)

## **LPRNet Model Accuracy and Inference Speed**

In [None]:
!python ./test_LPRNet.py

Successful to build network!
  lprnet.load_state_dict(torch.load(args.pretrained_model))
load pretrained model successful!
[Info] Test Accuracy: 0.899 [899:61:40:1000]
[Info] Test Speed: 0.0015159180164337157s 1/1000]


In [5]:
# Installations
!apt-get install -y llvm-14-dev
!pip install apache-tvm

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  binfmt-support libffi-dev libpfm4 libz3-4 libz3-dev llvm-14 llvm-14-runtime llvm-14-tools
  python3-pygments python3-yaml
Suggested packages:
  llvm-14-doc python-pygments-doc ttf-bitstream-vera
The following NEW packages will be installed:
  binfmt-support libffi-dev libpfm4 libz3-4 libz3-dev llvm-14 llvm-14-dev llvm-14-runtime
  llvm-14-tools python3-pygments python3-yaml
0 upgraded, 11 newly installed, 0 to remove and 49 not upgraded.
Need to get 58.6 MB of archives.
After this operation, 354 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 python3-yaml amd64 5.4.1-1ubuntu1 [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 binfmt-support amd64 2.2.1-2 [55.8 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 llvm-14-runtime amd64 1:14.0.0-1ubuntu1.1 [

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from model.LPRNet import build_lprnet
import os
import torch.nn.utils.prune as prune
import copy
from data.load_data import CHARS, CHARS_DICT, LPRDataLoader
from PIL import Image, ImageDraw, ImageFont
from torch.autograd import Variable
from torch.utils.data import *
import argparse
import tvm
from tvm import te,relay
import numpy as np
import time

import cv2
import matplotlib.pyplot as plt

In [12]:
!pip install torch torchvision onnx onnxruntime tvm

Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting tvm
  Downloading tvm-1.0.0.tar.gz (5.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting appdirs (from tvm)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting docopt (from tvm)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting inform (from tvm)
  Downloading inform-1.32-py3-none-any.whl.metadata (11 kB)
Collecting quantiphy (from tvm)
  Downloading quantiphy-2.20-py3-none-any.whl.metadata (7.7 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metad

In [10]:
def get_model_size_onnx(model, input_tensor=None):
    # Set the model to evaluation mode
    model.eval()

    # If no input tensor is provided, use a default one based on the model's input shape
    if input_tensor is None:
        input_tensor = torch.randn(1, 3, 24, 94)

    # Export the model to ONNX format
    onnx_file = "./weights/model.onnx"
    torch.onnx.export(model, input_tensor, onnx_file, verbose=False, opset_version=12)

    # Get the size of the ONNX model file in bytes
    model_size_bytes = os.path.getsize(onnx_file)

    # Convert bytes to MB (1 MB = 1024 * 1024 bytes)
    model_size_MB = model_size_bytes / (1024 ** 2)


    return model_size_MB


In [None]:
# Initialize the model
lprnet_model = build_lprnet()
lprnet_model.eval()

# Prepare dummy input
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dummy_input = torch.randn(1, 3, 24, 94).to(device)  # Dummy input for ONNX export
lprnet_model.to(device)

# ONNX export path
onnx_model_path = './weights/lprnet_model.onnx'

# Export the model
try:
    torch.onnx.export(
        lprnet_model,
        dummy_input,
        onnx_model_path,
        export_params=True,  # Store the trained parameter weights
        opset_version=11,    # Use an ONNX version compatible with your use case
        do_constant_folding=True,  # Optimize constant folding
        input_names=['input'],  # Name of the input tensor
        output_names=['output'],  # Name of the output tensor
        dynamic_axes={
            'input': {0: 'batch_size'},  # Allow dynamic batch sizes
            'output': {0: 'batch_size'}
        },
        verbose=True
    )
    print(f'Model exported successfully to: {onnx_model_path}')
except Exception as e:
    print(f"Error exporting the model: {e}")

Model exported successfully to: ./weights/lprnet_model.onnx


## **LPRNet Model Size**

In [None]:
# Get the file size of the ONNX model
onnx_size = os.path.getsize(onnx_model_path) / (1024 * 1024)  # Size in MB
print(f"LPRNet model size: {onnx_size:.2f} MB")

LPRNet model size: 1.89 MB


In [None]:
# Install necessary libraries if not already installed
!pip install torch torchvision



#**MODEL OPTIMISATIONS**

##**1.   PRUNING**



In [7]:
class small_basic_block(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(small_basic_block, self).__init__()
        self.block = nn.Sequential(
            nn.Conv2d(ch_in, ch_out // 4, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out, kernel_size=1),
        )

    def forward(self, x):
        return self.block(x)

class LPRNet_pruned(nn.Module):
    def __init__(self, lpr_max_len, phase, class_num, dropout_rate):
        super(LPRNet_pruned, self).__init__()
        self.phase = phase
        self.lpr_max_len = lpr_max_len
        self.class_num = class_num
        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),
            small_basic_block(ch_in=64, ch_out=128),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),
            small_basic_block(ch_in=64, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            small_basic_block(ch_in=256, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1),
            nn.BatchNorm2d(num_features=class_num),
            nn.ReLU(),
        )
        self.container = nn.Sequential(
            nn.Conv2d(in_channels=448+self.class_num, out_channels=self.class_num, kernel_size=(1, 1), stride=(1, 1)),
        )

    def forward(self, x):
        keep_features = list()
        for i, layer in enumerate(self.backbone.children()):
            x = layer(x)
            if i in [2, 6, 13, 22]:  # Feature layers to keep
                keep_features.append(x)

        global_context = list()
        for i, f in enumerate(keep_features):
            if i in [0, 1]:
                f = nn.AvgPool2d(kernel_size=5, stride=5)(f)
            if i in [2]:
                f = nn.AvgPool2d(kernel_size=(4, 10), stride=(4, 2))(f)
            f_pow = torch.pow(f, 2)
            f_mean = torch.mean(f_pow)
            f = torch.div(f, f_mean)
            global_context.append(f)

        x = torch.cat(global_context, 1)
        x = self.container(x)
        logits = torch.mean(x, dim=2)

        return logits


In [8]:
def apply_pruning(model, amount=0.05):
    """
    Applies pruning to the model's layers to improve performance.
    """
    model_pruned = copy.deepcopy(model)  # Create a deep copy of the original model

    # Apply L1 pruning to Conv2D and Linear layers
    for name, module in model_pruned.named_modules():
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')
            print(f"Pruned {name} layer")

    print("Pruning applied successfully!")
    return model_pruned

def build_pruned_model(lpr_max_len=8, phase=False, class_num=66, dropout_rate=0.5, pruning_amount=0.05):
    """
    Builds a pruned version of the LPRNet model by first constructing the model,
    then applying pruning to it.
    """
    model = LPRNet_pruned(lpr_max_len, phase, class_num, dropout_rate)

    if phase == "train":
        return model.train()
    else:
        return model.eval()

### Pruned LPRNet Model Size

In [14]:
lprnet_model = build_lprnet(lpr_max_len=8,
        phase=False,
        class_num=60,
        dropout_rate=0.5)
pruned_model = apply_pruning(lprnet_model)
pruned_model.eval()

onnx_size = get_model_size_onnx(pruned_model)
print(f"LPRNet model size: {onnx_size:.4f} MB")

Pruned backbone.0 layer
Pruned backbone.4.block.0 layer
Pruned backbone.4.block.2 layer
Pruned backbone.4.block.4 layer
Pruned backbone.4.block.6 layer
Pruned backbone.8.block.0 layer
Pruned backbone.8.block.2 layer
Pruned backbone.8.block.4 layer
Pruned backbone.8.block.6 layer
Pruned backbone.11.block.0 layer
Pruned backbone.11.block.2 layer
Pruned backbone.11.block.4 layer
Pruned backbone.11.block.6 layer
Pruned backbone.16 layer
Pruned backbone.20 layer
Pruned container.0 layer
Pruning applied successfully!
LPRNet model size: 1.5871 MB


In [None]:
def get_parser():
    parser = argparse.ArgumentParser(description='parameters to test pruned net')
    parser.add_argument('--img_size', default=[94, 24], help='the image size')
    parser.add_argument('--test_img_dirs', default="./data/test", help='the test images path')
    parser.add_argument('--dropout_rate', default=0, help='dropout rate.')
    parser.add_argument('--lpr_max_len', default=8, help='license plate number max length.')
    parser.add_argument('--test_batch_size', default=100, help='testing batch size.')
    parser.add_argument('--phase_train', default=False, type=bool, help='train or test phase flag.')
    parser.add_argument('--num_workers', default=8, type=int, help='Number of workers used in dataloading')
    parser.add_argument('--cuda', default=True, type=bool, help='Use cuda to test model')
    parser.add_argument('--show', default=False, type=bool, help='show test image and its predict result or not.')
    parser.add_argument('--pretrained_model', default='./weights/Final_LPRNet_model.pth', help='pretrained base model')

    args = parser.parse_args()
    return args
def collate_fn(batch):
    imgs = []
    labels = []
    lengths = []
    for _, sample in enumerate(batch):
        img, label, length = sample
        imgs.append(torch.from_numpy(img))
        labels.extend(label)
        lengths.append(length)
    labels = np.asarray(labels).flatten().astype(np.float32)

    return (torch.stack(imgs, 0), torch.from_numpy(labels), lengths)

In [None]:
def Greedy_Decode_Eval(Net, datasets, args):
    epoch_size = len(datasets) // args.test_batch_size
    print("[Info] epoch_size: %d" % epoch_size)
    batch_iterator = iter(DataLoader(datasets, args.test_batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn))

    Tp = 0
    Tn_1 = 0
    Tn_2 = 0
    t1 = time.time()
    for i in range(epoch_size):
        images, labels, lengths = next(batch_iterator)
        start = 0
        targets = []
        for length in lengths:
            label = labels[start:start + length]
            targets.append(label)
            start += length
        targets = np.array([el.numpy() for el in targets])
        imgs = images.numpy().copy()

        if args.cuda:
            images = Variable(images.cuda())
        else:
            images = Variable(images)

        # forward
        prebs = Net(images)
        prebs = prebs.cpu().detach().numpy()

        # Greedy decode
        preb_labels = []
        for i in range(prebs.shape[0]):
            preb = prebs[i, :, :]
            preb_label = []
            for j in range(preb.shape[1]):
                preb_label.append(np.argmax(preb[:, j], axis=0))
            no_repeat_blank_label = []
            pre_c = preb_label[0]
            if pre_c != len(CHARS) - 1:
                no_repeat_blank_label.append(pre_c)
            for c in preb_label:
                if (pre_c == c) or (c == len(CHARS) - 1):
                    if c == len(CHARS) - 1:
                        pre_c = c
                    continue
                no_repeat_blank_label.append(c)
                pre_c = c
            preb_labels.append(no_repeat_blank_label)

        for i, label in enumerate(preb_labels):
            if len(label) != len(targets[i]):
                Tn_1 += 1
                continue
            if (np.asarray(targets[i]) == np.asarray(label)).all():
                Tp += 1
            else:
                Tn_2 += 1
    Acc = Tp * 1.0 / (Tp + Tn_1 + Tn_2)
    print("[Info] Test Accuracy: {} [{}:{}:{}:{}]".format(Acc, Tp, Tn_1, Tn_2, (Tp + Tn_1 + Tn_2)))
    t2 = time.time()
    print("[Info] Test Speed: {}s 1/{}]".format((t2 - t1) / len(datasets), len(datasets)))


In [None]:
# Replace cv2.destroyAllWindows() with a Colab-friendly visualization method
def close_visualizations():
    plt.close('all')  # Closes all open matplotlib figures

def test_pruned():
    args = get_parser()

    # Build LPRNet_pruned
    lprnet_pruned = build_pruned_model(lpr_max_len=args.lpr_max_len, phase=args.phase_train, class_num=len(CHARS), dropout_rate=args.dropout_rate)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet_pruned.to(device)
    print("Successfully built pruned network!")

    # Load the pretrained model and filter pruned weights
    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device, weights_only=True)
        filtered_checkpoint = {k: v for k, v in checkpoint.items() if k in lprnet_pruned.state_dict()}
        lprnet_pruned.load_state_dict(filtered_checkpoint, strict=False)
        print("Successfully loaded pruned pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(lprnet_pruned, test_dataset, args)
    finally:
        close_visualizations()


### Pruned LPRNet Model Accuracy and Inference Speed

In [None]:
def test_pruned():
    # Manually set the arguments as needed
    class Args:
        def __init__(self):
            self.img_size = [94, 24]
            self.test_img_dirs = './data/test'
            self.dropout_rate = 0.5
            self.lpr_max_len = 8
            self.test_batch_size = 100
            self.phase_train = False
            self.num_workers = 8
            self.cuda = True
            self.show = False
            self.pretrained_model = './weights/Final_LPRNet_model.pth'

    args = Args()
    lprnet= build_lprnet(lpr_max_len=args.lpr_max_len, phase=args.phase_train, class_num=len(CHARS), dropout_rate=args.dropout_rate)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet.to(device)
    print("Successfully built network!")
    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device)
        filtered_checkpoint = {k: v for k, v in checkpoint.items() if k in lprnet.state_dict()}
        lprnet.load_state_dict(filtered_checkpoint, strict=False)
        pruned_model=apply_pruning(lprnet, amount=0.20)
        print("Successfully loaded pruned pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(pruned_model, test_dataset, args)
    finally:
        close_visualizations()

test_pruned()

Successfully built network!
Pruned backbone.0 layer
Pruned backbone.4.block.0 layer
Pruned backbone.4.block.2 layer
Pruned backbone.4.block.4 layer
Pruned backbone.4.block.6 layer
Pruned backbone.8.block.0 layer
Pruned backbone.8.block.2 layer
Pruned backbone.8.block.4 layer
Pruned backbone.8.block.6 layer
Pruned backbone.11.block.0 layer
Pruned backbone.11.block.2 layer
Pruned backbone.11.block.4 layer
Pruned backbone.11.block.6 layer
Pruned backbone.16 layer
Pruned backbone.20 layer
Pruned container.0 layer
Pruning applied successfully!
Successfully loaded pruned pretrained model!
[Info] epoch_size: 10


  checkpoint = torch.load(args.pretrained_model, map_location=device)


[Info] Test Accuracy: 0.896 [896:63:41:1000]
[Info] Test Speed: 0.0006168961524963378s 1/1000]


##**2. QUANTIZATION**

In [None]:
class LPRNet_quantized(nn.Module):
    def __init__(self, lpr_max_len, phase, class_num, dropout_rate):
        super(LPRNet_quantized, self).__init__()
        self.phase = phase
        self.lpr_max_len = lpr_max_len
        self.class_num = class_num
        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),
            small_basic_block(ch_in=64, ch_out=128),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),
            small_basic_block(ch_in=64, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            small_basic_block(ch_in=256, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1),
            nn.BatchNorm2d(num_features=class_num),
            nn.ReLU(),
        )
        self.container = nn.Sequential(
            nn.Conv2d(in_channels=448+self.class_num, out_channels=self.class_num, kernel_size=(1, 1), stride=(1, 1)),
        )

    def forward(self, x):
        keep_features = list()
        for i, layer in enumerate(self.backbone.children()):
            x = layer(x)
            if i in [2, 6, 13, 22]:
                keep_features.append(x)

        global_context = list()
        for i, f in enumerate(keep_features):
            if i in [0, 1]:
                f = nn.AvgPool2d(kernel_size=5, stride=5)(f)
            if i in [2]:
                f = nn.AvgPool2d(kernel_size=(4, 10), stride=(4, 2))(f)
            f_pow = torch.pow(f, 2)
            f_mean = torch.mean(f_pow)
            f = torch.div(f, f_mean)
            global_context.append(f)

        x = torch.cat(global_context, 1)
        x = self.container(x)
        logits = torch.mean(x, dim=2)

        return logits

In [None]:
def fuse_conv_bn(conv, bn):
    """
    Correctly fuse Conv2D and BatchNorm2D layers.
    """
    with torch.no_grad():
        conv_weight = conv.weight.clone()
        conv_bias = conv.bias if conv.bias is not None else torch.zeros(conv.out_channels, device=conv.weight.device)

        # BatchNorm parameters
        bn_weight = bn.weight
        bn_bias = bn.bias
        bn_mean = bn.running_mean
        bn_var = bn.running_var
        bn_eps = bn.eps

        scale = bn_weight / torch.sqrt(bn_var + bn_eps)
        bias = bn_bias - bn_mean * scale

        # Fuse Conv weights and biases
        fused_weight = conv_weight * scale.view(-1, 1, 1, 1)
        fused_bias = conv_bias * scale + bias

        # Update Conv2D layer
        conv.weight.copy_(fused_weight)
        conv.bias = nn.Parameter(fused_bias)
    return conv

def fuse_lprnet_model(model):
    """
    Fuse all Conv2D and BatchNorm2D layers in the LPRNet_quantized model.
    """
    for name, module in model.backbone.named_children():
        if isinstance(module, nn.Conv2d):
            # Check for a BatchNorm2D layer following the Conv2D
            next_name = str(int(name) + 1)
            if next_name in model.backbone._modules:
                next_module = model.backbone._modules[next_name]
                if isinstance(next_module, nn.BatchNorm2d):
                    # Fuse Conv2D and BatchNorm2D
                    fused_conv = fuse_conv_bn(module, next_module)
                    # Replace layers in the model
                    model.backbone._modules[name] = fused_conv
                    model.backbone._modules[next_name] = nn.Identity()
    return model

def apply_quantization(model):
    """
    Applies dynamic quantization to the model to improve inference performance.
    """
    model_quantized = torch.quantization.quantize_dynamic(
        model, {torch.nn.Conv2d, torch.nn.Linear}, dtype=torch.qint8
    )
    print("Quantization applied successfully!")
    return model_quantized

def build_quantized_model(lpr_max_len=8, phase=False, class_num=66, dropout_rate=0.5):
    """
    Builds a quantized version of the LPRNet model.
    """
    model = LPRNet_quantized(lpr_max_len, phase, class_num, dropout_rate)

    if phase == "train":
        return model.train()
    else:
        return model.eval()

### Quantized LPRNet Model Size

In [None]:
lprnet_model = build_lprnet(lpr_max_len=8,
        phase=False,
        class_num=60,
        dropout_rate=0.5)
fused_model = fuse_lprnet_model(lprnet_model)
quantised_model = apply_quantization(fused_model)
quantised_model.eval()

get_model_size_onnx(quantised_model)

Quantization applied successfully!


1.5871267318725586

In [None]:
def get_parser():
    parser = argparse.ArgumentParser(description='parameters to train net')
    parser.add_argument('--img_size', default=[94, 24], help='the image size')
    parser.add_argument('--test_img_dirs', default="./data/test", help='the test images path')
    parser.add_argument('--dropout_rate', default=0, help='dropout rate.')
    parser.add_argument('--lpr_max_len', default=8, help='license plate number max length.')
    parser.add_argument('--test_batch_size', default=100, help='testing batch size.')
    parser.add_argument('--phase_train', default=False, type=bool, help='train or test phase flag.')
    parser.add_argument('--num_workers', default=8, type=int, help='Number of workers used in dataloading')
    parser.add_argument('--cuda', default=True, type=bool, help='Use cuda to train model')
    parser.add_argument('--show', default=False, type=bool, help='show test image and its predict result or not.')
    parser.add_argument('--pretrained_model', default='./weights/Final_LPRNet_model.pth', help='pretrained base model')

    args = parser.parse_args()

    return args

def collate_fn(batch):
    imgs = []
    labels = []
    lengths = []
    for _, sample in enumerate(batch):
        img, label, length = sample
        imgs.append(torch.from_numpy(img))
        labels.extend(label)
        lengths.append(length)
    labels = np.asarray(labels).flatten().astype(np.float32)

    return (torch.stack(imgs, 0), torch.from_numpy(labels), lengths)

In [None]:
def test_quantized():
    args = get_parser()

    # Build LPRNet_quantized
    lprnet_quantized = build_quantized_model(lpr_max_len=args.lpr_max_len, phase=args.phase_train, class_num=len(CHARS), dropout_rate=args.dropout_rate)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet_quantized.to(device)
    print("Successfully built quantized network!")

    # Load the pretrained model
    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device)
        lprnet_quantized.load_state_dict(checkpoint, strict=False)
        print("Successfully loaded quantized pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(lprnet_quantized, test_dataset, args)
    finally:
        close_visualizations()

In [None]:
def Greedy_Decode_Eval(Net, datasets, args):
    # TestNet = Net.eval()
    epoch_size = len(datasets) // args.test_batch_size
    batch_iterator = iter(DataLoader(datasets, args.test_batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn))

    Tp = 0
    Tn_1 = 0
    Tn_2 = 0
    t1 = time.time()
    for i in range(epoch_size):
        # load train data
        images, labels, lengths = next(batch_iterator)
        start = 0
        targets = []
        for length in lengths:
            label = labels[start:start+length]
            targets.append(label)
            start += length
        targets = np.array([el.numpy() for el in targets])
        imgs = images.numpy().copy()

        if args.cuda:
            images = Variable(images.cuda())
        else:
            images = Variable(images)

        # forward
        prebs = Net(images)
        # greedy decode
        prebs = prebs.cpu().detach().numpy()
        preb_labels = list()
        for i in range(prebs.shape[0]):
            preb = prebs[i, :, :]
            preb_label = list()
            for j in range(preb.shape[1]):
                preb_label.append(np.argmax(preb[:, j], axis=0))
            no_repeat_blank_label = list()
            pre_c = preb_label[0]
            if pre_c != len(CHARS) - 1:
                no_repeat_blank_label.append(pre_c)
            for c in preb_label: # dropout repeate label and blank label
                if (pre_c == c) or (c == len(CHARS) - 1):
                    if c == len(CHARS) - 1:
                        pre_c = c
                    continue
                no_repeat_blank_label.append(c)
                pre_c = c
            preb_labels.append(no_repeat_blank_label)
        for i, label in enumerate(preb_labels):
            # show image and its predict label
            if args.show:
                show(imgs[i], label, targets[i])
            if len(label) != len(targets[i]):
                Tn_1 += 1
                continue
            if (np.asarray(targets[i]) == np.asarray(label)).all():
                Tp += 1
            else:
                Tn_2 += 1
    Acc = Tp * 1.0 / (Tp + Tn_1 + Tn_2)
    print("[Info] Test Accuracy: {} [{}:{}:{}:{}]".format(Acc, Tp, Tn_1, Tn_2, (Tp+Tn_1+Tn_2)))
    t2 = time.time()
    print("[Info] Test Speed: {}s 1/{}]".format((t2 - t1) / len(datasets), len(datasets)))

### Quantized Model Accuracy and Inference Speed

In [None]:
def test_quantized():
    # Manually set the arguments as needed
    class Args:
        def __init__(self):
            self.img_size = [94, 24]
            self.test_img_dirs = './data/test'
            self.dropout_rate = 0.5
            self.lpr_max_len = 8
            self.test_batch_size = 100
            self.phase_train = False
            self.num_workers = 8
            self.cuda = True
            self.show = False
            self.pretrained_model = './weights/Final_LPRNet_model.pth'

    args = Args()
    lprnet = build_lprnet(
        lpr_max_len=args.lpr_max_len,
        phase=args.phase_train,
        class_num=len(CHARS),
        dropout_rate=args.dropout_rate
    )
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet.to(device)
    print("Successfully built network!")

    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device)
        filtered_checkpoint = {k: v for k, v in checkpoint.items() if k in lprnet.state_dict()}
        lprnet.load_state_dict(filtered_checkpoint, strict=False)
        fused_model = fuse_lprnet_model(lprnet)
        quantised_model = apply_quantization(fused_model)
        print("Successfully loaded and quantized pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(quantised_model, test_dataset, args)
    finally:
        close_visualizations()

# Call the function
test_quantized()


Successfully built network!
Quantization applied successfully!
Successfully loaded and quantized pretrained model!


  checkpoint = torch.load(args.pretrained_model, map_location=device)


[Info] Test Accuracy: 0.9 [900:60:40:1000]
[Info] Test Speed: 0.0005907914638519287s 1/1000]


#**MLC OPTIMIZATIONS**

##**1. PARALLELIZATION AND VECTORIZATION**

In [None]:
class small_basic_block(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(small_basic_block, self).__init__()
        self.block = nn.Sequential(
            nn.Conv2d(ch_in, ch_out // 4, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out, kernel_size=1),
        )

    def forward(self, x):
        return self.block(x)

class LPRNet_MLC_vectorization(nn.Module):
    def __init__(self, lpr_max_len, phase, class_num, dropout_rate):
        super(LPRNet_MLC_vectorization, self).__init__()
        self.phase = phase
        self.lpr_max_len = lpr_max_len
        self.class_num = class_num

        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1), # 0
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),  # 2
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),
            small_basic_block(ch_in=64, ch_out=128),    # *** 4 ***
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),  # 6
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),
            small_basic_block(ch_in=64, ch_out=256),   # 8
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),  # 10
            small_basic_block(ch_in=256, ch_out=256),   # *** 11 ***
            nn.BatchNorm2d(num_features=256),   # 12
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),  # 14
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),  # 16
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),  # 18
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1), # 20
            nn.BatchNorm2d(num_features=class_num),
            nn.ReLU(),  # *** 22 ***
        )

        self.container = nn.Sequential(
            nn.Conv2d(in_channels=448+self.class_num, out_channels=self.class_num, kernel_size=(1, 1), stride=(1, 1)),
        )

    def forward(self, x):
        keep_features = list()
        for i, layer in enumerate(self.backbone.children()):
            x = layer(x)
            if i in [2, 6, 13, 22]: # [2, 4, 8, 11, 22]
                keep_features.append(x)

        global_context = list()
        for i, f in enumerate(keep_features):
            print(f"Feature map {i} size before pooling: {f.shape}")
            if i in [0, 1]:
                f = nn.AvgPool2d(kernel_size=5, stride=5)(f)
            if i in [2]:
                f = nn.AvgPool2d(kernel_size=(4, 10), stride=(4, 2))(f)
            f_pow = torch.pow(f, 2)
            f_mean = torch.mean(f_pow)
            f = torch.div(f, f_mean)
            print(f"Feature map {i} size after pooling: {f.shape}")
            global_context.append(f)

        # Debug the global context tensor sizes before concatenation
        print(f"Global context tensor sizes: {[f.shape for f in global_context]}")

        x = torch.cat(global_context, 1)
        x = self.container(x)
        logits = torch.mean(x, dim=2)

        return logits

def build_lprnet_vectorization(lpr_max_len=8, phase=False, class_num=66, dropout_rate=0.5):
    Net = LPRNet_MLC_vectorization(lpr_max_len, phase, class_num, dropout_rate)
    if phase == "train":
        return Net.train()
    else:
        return Net.eval()

### Parallelized and Vectorized LPRNet Model Size

In [None]:
lprnet_model_vectorization = build_lprnet_vectorization(lpr_max_len=8,
        phase=False,
        class_num=60,
        dropout_rate=0.5)
lprnet_model_vectorization.eval()

get_model_size_onnx(lprnet_model_vectorization)

Feature map 0 size before pooling: torch.Size([1, 64, 22, 92])
Feature map 0 size after pooling: torch.Size([1, 64, 4, 18])
Feature map 1 size before pooling: torch.Size([1, 128, 20, 90])
Feature map 1 size after pooling: torch.Size([1, 128, 4, 18])
Feature map 2 size before pooling: torch.Size([1, 256, 18, 44])
Feature map 2 size after pooling: torch.Size([1, 256, 4, 18])
Feature map 3 size before pooling: torch.Size([1, 60, 4, 18])
Feature map 3 size after pooling: torch.Size([1, 60, 4, 18])
Global context tensor sizes: [torch.Size([1, 64, 4, 18]), torch.Size([1, 128, 4, 18]), torch.Size([1, 256, 4, 18]), torch.Size([1, 60, 4, 18])]


1.5870962142944336

### Parallelized and Vectorized LPRNet Model Accuracy and Inference Speed

In [None]:
def test_MLC_vectorization():
    # Manually set the arguments as needed
    class Args:
        def __init__(self):
            self.img_size = [94, 24]
            self.test_img_dirs = './data/test'
            self.dropout_rate = 0.5
            self.lpr_max_len = 8
            self.test_batch_size = 100
            self.phase_train = False
            self.num_workers = 8
            self.cuda = True
            self.show = False
            self.pretrained_model = './weights/Final_LPRNet_model.pth'

    args = Args()
    lprnet_MLC1 = build_lprnet_vectorization(
        lpr_max_len=args.lpr_max_len,
        phase=args.phase_train,
        class_num=len(CHARS),
        dropout_rate=args.dropout_rate
    )
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet_MLC1.to(device)
    print("Successfully built network!")

    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device)
        filtered_checkpoint = {k: v for k, v in checkpoint.items() if k in lprnet_MLC1.state_dict()}
        lprnet_MLC1.load_state_dict(filtered_checkpoint, strict=False)
        print("Successfully loaded pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(lprnet_MLC1, test_dataset, args)
    finally:
        close_visualizations()
# Call the function
test_MLC_vectorization()


Successfully built network!
Successfully loaded pretrained model!


  checkpoint = torch.load(args.pretrained_model, map_location=device)


Feature map 0 size before pooling: torch.Size([100, 64, 22, 92])
Feature map 0 size after pooling: torch.Size([100, 64, 4, 18])
Feature map 1 size before pooling: torch.Size([100, 128, 20, 90])
Feature map 1 size after pooling: torch.Size([100, 128, 4, 18])
Feature map 2 size before pooling: torch.Size([100, 256, 18, 44])
Feature map 2 size after pooling: torch.Size([100, 256, 4, 18])
Feature map 3 size before pooling: torch.Size([100, 68, 4, 18])
Feature map 3 size after pooling: torch.Size([100, 68, 4, 18])
Global context tensor sizes: [torch.Size([100, 64, 4, 18]), torch.Size([100, 128, 4, 18]), torch.Size([100, 256, 4, 18]), torch.Size([100, 68, 4, 18])]
Feature map 0 size before pooling: torch.Size([100, 64, 22, 92])
Feature map 0 size after pooling: torch.Size([100, 64, 4, 18])
Feature map 1 size before pooling: torch.Size([100, 128, 20, 90])
Feature map 1 size after pooling: torch.Size([100, 128, 4, 18])
Feature map 2 size before pooling: torch.Size([100, 256, 18, 44])
Feature m

##**2. LOOP BLOCKING**

In [None]:
def te_conv2d(input, weight, stride, padding):
    N, C, H, W = input.shape
    K, _, R, S = weight.shape
    P = (H + 2 * padding - R) // stride + 1
    Q = (W + 2 * padding - S) // stride + 1

    rc = te.reduce_axis((0, C), name='rc')
    ry = te.reduce_axis((0, R), name='ry')
    rx = te.reduce_axis((0, S), name='rx')

    padded_input = te.compute(
        (N, C, H + 2 * padding, W + 2 * padding),
        lambda n, c, h, w: te.if_then_else(
            te.all(h >= padding, h < H + padding, w >= padding, w < W + padding),
            input[n, c, h - padding, w - padding],
            0.0
        ),
        name="padded_input"
    )

    output = te.compute(
        (N, K, P, Q),
        lambda n, k, p, q: te.sum(
            padded_input[n, rc, p * stride + ry, q * stride + rx] * weight[k, rc, ry, rx],
            axis=[rc, ry, rx]
        ),
        name="conv2d"
    )
    return output

# Step 2: Define Tensor Expression for ReLU
def te_relu(input):
    return te.compute(input.shape, lambda *i: te.max(input(*i), 0), name="relu")

# Step 3: Define Tensor Expression for Average Pooling
def te_avg_pool(input, pool_size, stride):
    N, C, H, W = input.shape
    PH, PW = pool_size
    SH, SW = stride
    pooled_height = (H - PH) // SH + 1
    pooled_width = (W - PW) // SW + 1

    ph = te.reduce_axis((0, PH), name="ph")
    pw = te.reduce_axis((0, PW), name="pw")

    return te.compute(
        (N, C, pooled_height, pooled_width),
        lambda n, c, h, w: te.sum(input[n, c, h * SH + ph, w * SW + pw] / (PH * PW), axis=[ph, pw]),
        name="avg_pool"
    )

# Step 4: Create TVM-LPRNet Backbone
def build_lprnet_loopblocking(lpr_max_len, phase, class_num, dropout_rate):
    # Input placeholders
    N, C, H, W = 1, 3, 24, 94  # Example input size
    input = te.placeholder((N, C, H, W), name="input")

    # Define layers using TVM operations
    weight1 = te.placeholder((64, C, 3, 3), name="weight1")
    conv1 = te_conv2d(input, weight1, stride=1, padding=1)
    relu1 = te_relu(conv1)

    weight2 = te.placeholder((128, 64, 1, 1), name="weight2")
    conv2 = te_conv2d(relu1, weight2, stride=1, padding=0)
    relu2 = te_relu(conv2)

    # Add more layers as needed using `te_conv2d`, `te_relu`, etc.
    output = relu2

    # Scheduling: Apply loop blocking
    s = te.create_schedule(output.op)
    n, c, h, w = s[output].op.axis
    ho, hi = s[output].split(h, factor=8)
    wo, wi = s[output].split(w, factor=8)
    s[output].reorder(n, c, ho, wo, hi, wi)

    return s, [input, weight1, weight2, output]


In [None]:
class small_basic_block(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(small_basic_block, self).__init__()
        self.block = nn.Sequential(
            nn.Conv2d(ch_in, ch_out // 4, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out // 4, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(ch_out // 4, ch_out, kernel_size=1),
        )

    def forward(self, x):
        return self.block(x)

class LPRNet_loop_blocking(nn.Module):
    def __init__(self, lpr_max_len, phase, class_num, dropout_rate):
        super(LPRNet_loop_blocking, self).__init__()
        self.phase = phase
        self.lpr_max_len = lpr_max_len
        self.class_num = class_num
        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),
            small_basic_block(ch_in=64, ch_out=128),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),
            small_basic_block(ch_in=64, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            small_basic_block(ch_in=256, ch_out=256),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1),
            nn.BatchNorm2d(num_features=class_num),
            nn.ReLU(),
        )
        self.container = nn.Sequential(
            nn.Conv2d(in_channels=448 + self.class_num, out_channels=self.class_num, kernel_size=(1, 1), stride=(1, 1)),
        )

    def forward(self, x):
        keep_features = list()
        for i, layer in enumerate(self.backbone.children()):
            x = layer(x)
            if i in [2, 6, 13, 22]:  # Blocked feature selection
                keep_features.append(x)

        global_context = list()
        for i, f in enumerate(keep_features):
            # Apply loop blocking during pooling
            if i in [0, 1]:
                f = F.avg_pool2d(f, kernel_size=5, stride=5)
            elif i == 2:
                f = F.avg_pool2d(f, kernel_size=(4, 10), stride=(4, 2))

            # Normalize each feature map
            f_pow = f ** 2
            f_mean = torch.mean(f_pow, dim=[1, 2, 3], keepdim=True)
            f = f / (f_mean + 1e-8)  # Avoid division by zero
            global_context.append(f)

        # Concatenate global context tensors with loop blocking
        x = torch.cat(global_context, dim=1)
        x = self.container(x)

        # Compute logits
        logits = torch.mean(x, dim=2)
        return logits

def build_lprnet_loop_blocking(lpr_max_len=8, phase=False, class_num=66, dropout_rate=0.5):
    Net = LPRNet_loop_blocking(lpr_max_len, phase, class_num, dropout_rate)

    if phase == "train":
        return Net.train()
    else:
        return Net.eval()

### Loop Blocked LPRNet Model Size

In [None]:
lprnet_loop_blocking = build_lprnet_loop_blocking(lpr_max_len=8,
        phase=False,
        class_num=60,
        dropout_rate=0.5)
lprnet_loop_blocking.eval()

get_model_size_onnx(lprnet_loop_blocking)

1.5876893997192383

### Loop Blocked LPRNet Model Accuracy and Inference Speed

In [None]:
def test_loop_blocking():
    # Manually set the arguments as needed
    class Args:
        def __init__(self):
            self.img_size = [94, 24]
            self.test_img_dirs = './data/test'
            self.dropout_rate = 0.5
            self.lpr_max_len = 8
            self.test_batch_size = 100
            self.phase_train = False
            self.num_workers = 8
            self.cuda = True
            self.show = False
            self.pretrained_model = './weights/Final_LPRNet_model.pth'

    args = Args()
    lprnet_loop_blocking = build_lprnet_loop_blocking(
        lpr_max_len=args.lpr_max_len,
        phase=args.phase_train,
        class_num=len(CHARS),
        dropout_rate=args.dropout_rate
    )
    device = torch.device("cuda:0" if args.cuda else "cpu")
    lprnet_loop_blocking.to(device)
    print("Successfully built network!")

    if args.pretrained_model:
        checkpoint = torch.load(args.pretrained_model, map_location=device)
        filtered_checkpoint = {k: v for k, v in checkpoint.items() if k in lprnet_loop_blocking.state_dict()}
        lprnet_loop_blocking.load_state_dict(filtered_checkpoint, strict=False)
        print("Successfully loaded pretrained model!")
    else:
        print("[Error] Can't find pretrained model, please check!")
        return False

    # Prepare the test dataset
    test_img_dirs = os.path.expanduser(args.test_img_dirs)
    test_dataset = LPRDataLoader(test_img_dirs.split(','), args.img_size, args.lpr_max_len)

    try:
        Greedy_Decode_Eval(lprnet_loop_blocking, test_dataset, args)
    finally:
        close_visualizations()

# Call the function
test_loop_blocking()

Successfully built network!
Successfully loaded pretrained model!


  checkpoint = torch.load(args.pretrained_model, map_location=device)


[Info] Test Accuracy: 0.894 [894:69:37:1000]
[Info] Test Speed: 0.000720259666442871s 1/1000]
