The base idea is:
* **FIRST BASELINE** -> no detection, usage of ground truth for the bounding box | recognition part with CNN or CRNN

* **SECOND BASELINE** -> detection through groundedSAM2 and recognition with CNN/CRNN

* **THIRD BASELINE** -> detection method still to be decided, recognition with the RPnet model from the paper

# Imports

In [1]:
import os
#import cv2
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
import torch.nn as nn
import torch.optim as optim

In [2]:
# Report the visible CUDA device(s); nothing is printed on a CPU-only machine.
if torch.cuda.is_available():
    # Each probe is called only when its line is printed, one after the other.
    gpu_probes = (
        ("GPU Name:", lambda: torch.cuda.get_device_name(0)),
        ("Number of GPUs:", torch.cuda.device_count),
        ("Current Device Index:", torch.cuda.current_device),
    )
    for caption, probe in gpu_probes:
        print(caption, probe())

GPU Name: NVIDIA GeForce GTX 1070
Number of GPUs: 1
Current Device Index: 0


# Globals

In [3]:
# Preprocessing pipeline shared by every dataset split.
# Grayscale runs first, on the PIL image, so that resizing, tensor conversion
# and normalisation all operate on a single channel and the one-element
# mean/std given to Normalize matches the channel count of the tensor.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),  # drop colour so the model focuses on the plate characters
    transforms.Resize((256, 256)),  # fixed input size (aspect ratio is not preserved)
    # Augmentations kept disabled for now; they would go here, before ToTensor:
    # transforms.ColorJitter(brightness=0.2, contrast=0.2),  # undecided, maybe later
    # transforms.RandomRotation(degrees=3),  # small tilt; may already be present in the data
    # transforms.RandomPerspective(distortion_scale=0.2, p=0.5),  # perspective distortion; check the other dataset folders first
    transforms.ToTensor(),  # PIL image -> float tensor
    transforms.Normalize(mean=[0.5], std=[0.5]),  # (x - 0.5) / 0.5 on the single channel
])


# Character tables of the licence-plate encoding, kept as module-level globals
# because later cells use them. The indices stored in a file name point into
# these lists; the final 'O' of each table is the letter O, not the digit 0.
provinces = list("皖沪津渝冀晋蒙辽吉黑苏浙京闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新警学O")
alphabets = list("ABCDEFGHJKLMNPQRSTUVWXYZO")
ads = list("ABCDEFGHJKLMNPQRSTUVWXYZ0123456789O")
        

# Utils

In [None]:
class PlateCNN(nn.Module):
    """Small CNN with one classification head per licence-plate character slot.

    A shared convolutional trunk turns a single-channel image into a feature
    vector; independent linear heads then classify the province character,
    the alphabet letter and each alphanumeric position.

    Args:
        num_provinces: number of classes of the province head.
        num_alphabets: number of classes of the alphabet head.
        num_ads: number of classes of every alphanumeric head.
        num_ads_positions: how many alphanumeric heads to create (default 5).
        pool_size: output grid ``(height, width)`` of the final adaptive
            average pooling; an int means a square grid. The default
            ``(1, 1)`` averages over the whole feature map, so every head
            receives the same position-free 128-dim vector. A larger grid
            (for example ``(2, 8)``) keeps coarse spatial layout in the
            features at the cost of wider linear heads.
    """

    def __init__(self, num_provinces, num_alphabets, num_ads,
                 num_ads_positions=5, pool_size=(1, 1)):
        super().__init__()
        if isinstance(pool_size, int):
            pool_size = (pool_size, pool_size)
        pool_h, pool_w = pool_size

        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((pool_h, pool_w)),
        )

        # 128 channels for every cell of the pooled grid.
        feature_dim = 128 * pool_h * pool_w
        self.fc_province = nn.Linear(feature_dim, num_provinces)
        self.fc_alpha = nn.Linear(feature_dim, num_alphabets)
        self.fc_ads = nn.ModuleList(
            [nn.Linear(feature_dim, num_ads) for _ in range(num_ads_positions)]
        )

    def forward(self, x):
        """Return ``(province_logits, alphabet_logits, [ads_logits, ...])``.

        ``x`` is a batch of single-channel images (the first convolution
        expects one input channel). The third element is a list with one
        logits tensor per alphanumeric head.
        """
        x = self.features(x)
        x = x.flatten(1)  # (batch, 128 * pool_h * pool_w)
        out_prov = self.fc_province(x)
        out_alpha = self.fc_alpha(x)
        out_ads = [fc(x) for fc in self.fc_ads]
        return out_prov, out_alpha, out_ads
    


# Disabled earlier draft of train_one_epoch, kept for reference only: it is a
# bare string literal, so none of it runs. The live definition further down
# has the same training step plus per-epoch logging.
'''def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for images, labels in dataloader:
        #print("TYPE:", type(labels))       DEBUG
        #print("SHAPE:", getattr(labels, 'shape', None))    # DEBUG
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        out_prov, out_alpha, out_ads = model(images)
        loss = criterion(out_prov, labels[:, 0]) + \
               criterion(out_alpha, labels[:, 1])
        for i in range(5):
            loss += criterion(out_ads[i], labels[:, i+2])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)'''



# Disabled earlier draft of evaluate, kept for reference only: it is a bare
# string literal, so none of it runs. It returned the exact-plate accuracy
# alone; the live definition further down also returns per-character accuracy.
'''def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            out_prov, out_alpha, out_ads = model(images)
            pred_prov = out_prov.argmax(1)
            pred_alpha = out_alpha.argmax(1)
            pred_ads = [out.argmax(1) for out in out_ads]
            preds = torch.stack([pred_prov, pred_alpha] + pred_ads, dim=1)
            correct += (preds == labels).all(dim=1).sum().item()
            total += labels.size(0)
    return correct / total'''




def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    The batch loss is the sum of ``criterion`` over the province head
    (label column 0), the alphabet head (label column 1) and every
    alphanumeric head the model returns (label columns 2, 3, ...).

    Args:
        model: module returning ``(out_prov, out_alpha, out_ads)`` where
            ``out_ads`` is a sequence of logits tensors, one per head.
        dataloader: iterable of ``(images, labels)`` batches; ``labels`` is
            indexed as ``labels[:, column]``.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(logits, targets)``.
        device: device the batches are moved to.

    Returns:
        The average of the per-batch losses as a float.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0  # counted by hand so unsized iterables work too
    for batch_idx, (images, labels) in enumerate(dataloader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        out_prov, out_alpha, out_ads = model(images)
        loss = criterion(out_prov, labels[:, 0]) + \
               criterion(out_alpha, labels[:, 1])
        # Follow the heads the model actually returned instead of assuming five.
        for head_idx, head_logits in enumerate(out_ads):
            loss = loss + criterion(head_logits, labels[:, head_idx + 2])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

        # Verbose output: first batch only, first sample of that batch.
        if batch_idx == 0:
            print(f"[Batch {batch_idx}] Loss: {loss.item():.4f}")
            print("labels[0]:", labels[0].cpu().numpy())
            print("pred_prov[0]:", out_prov[0].argmax().item(), "label:", labels[0,0].item())
            print("pred_alpha[0]:", out_alpha[0].argmax().item(), "label:", labels[0,1].item())
            print("pred_ads[0]:", [head[0].argmax().item() for head in out_ads], "label:", [labels[0,2+i].item() for i in range(len(out_ads))])

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; cannot compute an average loss")
    avg_loss = total_loss / num_batches
    print(f"Epoch average loss: {avg_loss:.4f}")
    return avg_loss

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Predictions are the argmax of every head, stacked column-wise in the
    order province, alphabet, alphanumeric heads, and compared element-wise
    with the label tensor.

    Returns:
        ``(acc_all, acc_char)``: the fraction of samples whose whole row
        matches, and the fraction of individual label entries that match.
    """
    model.eval()
    plates_right = plates_seen = 0
    chars_right = chars_seen = 0
    with torch.no_grad():
        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            prov_logits, alpha_logits, ads_logits = model(batch_images)

            # One predicted class index per head, one column per head.
            columns = [prov_logits.argmax(1), alpha_logits.argmax(1)]
            columns.extend(logits.argmax(1) for logits in ads_logits)
            hits = torch.stack(columns, dim=1) == batch_labels

            plates_right += hits.all(dim=1).sum().item()
            plates_seen += batch_labels.size(0)
            chars_right += hits.sum().item()
            chars_seen += batch_labels.numel()
    return plates_right / plates_seen, chars_right / chars_seen

# Data

A sample image name is "025-95_113-154&383_386&473-386&473_177&454_154&383_363&402-0_0_22_27_27_33_16-37-15.jpg".
Each name can be split by "-" into seven fields. Those fields are explained as follows.

1) Area: Area ratio of license plate area to the entire picture area.

2) Tilt degree: Horizontal tilt degree and vertical tilt degree.

3) Bounding box coordinates: The coordinates of the left-up and the right-bottom vertices.

4) Four vertices locations: The exact (x, y) coordinates of the four vertices of LP in the whole image. These coordinates start from the right-bottom vertex.

5) License plate number: Each image in CCPD has only one LP. Each LP number is comprised of a Chinese character, a letter, and five letters or numbers. A valid Chinese license plate consists of seven characters:

       - province (1 character),
       - alphabets (1 character),
       - alphabets+digits (5 characters).

"0_0_22_27_27_33_16" is the index of each character. These three arrays are defined as follows. The last character of each array is letter O rather than a digit 0. We use O as a sign of "no character" because there is no O in Chinese license plate characters.

provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]


alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
             'X', 'Y', 'Z', 'O']


ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
       'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']


6) Brightness: The brightness of the license plate region.

7) Blurriness: The Blurriness of the license plate region.

In [70]:
class CarPlateDataset(Dataset):
    """Licence-plate dataset whose annotations are encoded in the file names.

    Every image name consists of seven '-'-separated fields (area, tilt,
    bounding box, four vertices, plate character indices, brightness,
    blurriness). Indexing the dataset returns the (optionally cropped and
    transformed) image together with the plate character indices.

    Args:
        img_dir: directory containing the images.
        transform: optional callable applied to the PIL image.
        cropped: when True the image is cropped to the bounding box stored in
            the file name, i.e. the ground-truth box replaces a detector.
    """

    # Character tables; the indices stored in a file name point into these.
    # The final 'O' of each table is the letter O, used as "no character".
    PROVINCES = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
    ALPHABETS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'O']
    ADS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

    # Only files with these (case-insensitive) suffixes are treated as samples.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, img_dir, transform=None, cropped = False):
        self.img_dir = img_dir
        self.transform = transform
        self.cropped = cropped
        # Skip stray non-image files (their names cannot be parsed) and sort,
        # because os.listdir returns entries in arbitrary order.
        self.image_names = sorted(
            name for name in os.listdir(img_dir)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )

    def __len__(self):
        return len(self.image_names)

    @staticmethod
    def _parse_points(field, count):
        """Parse ``count`` 'x&y' tokens joined by '_' into a (count, 2) float32 array."""
        tokens = field.split('_')
        if len(tokens) < count:
            raise ValueError(f"expected {count} points in {field!r}, found {len(tokens)}")
        points = []
        for token in tokens[:count]:
            x, y = token.split('&')[:2]
            points.append((int(x), int(y)))
        return np.array(points, dtype=np.float32)

    def parse_filename(self, filename):
        """Decode the annotations stored in an image file name.

        Args:
            filename: bare file name (no directory part), extension included.

        Returns:
            dict with the keys
            'area' (first field divided by 100),
            'tilt' (float32 array: horizontal, vertical tilt degree),
            'bbox_coords' (float32 array (2, 2): left-up and right-bottom corner),
            'vertices' (float32 array (4, 2), kept in file-name order; the
            dataset description says the order starts at the right-bottom vertex),
            'lp' (decoded plate string),
            'brightness', 'blurriness' (ints) and
            'lp_indexes' (list of at most seven character indices).

        Raises:
            ValueError: if the name does not contain seven '-'-separated
                fields or a field is malformed.
        """
        # Drop the extension as a real suffix; str.strip('.jpg') would strip a
        # character set from both ends instead.
        stem = os.path.splitext(filename)[0]
        fields = stem.split('-')
        if len(fields) < 7:
            raise ValueError(
                f"expected 7 '-'-separated fields in {filename!r}, found {len(fields)}"
            )

        # The file name stores the area as a percentage; dividing by 100 gives a 0-1 range.
        area = float(fields[0]) / 100

        h_tilt, v_tilt = (int(value) for value in fields[1].split('_')[:2])
        tilt_list = np.array([h_tilt, v_tilt], dtype=np.float32)

        bbox_coords_list = self._parse_points(fields[2], 2)  # left-up, right-bottom
        vertices_list = self._parse_points(fields[3], 4)     # four plate corners, file order

        # Character indices: province, alphabet letter, then alphanumerics.
        lp_numbers = list(map(int, fields[4].split('_')))[:7]
        lp = (
            self.PROVINCES[lp_numbers[0]]
            + self.ALPHABETS[lp_numbers[1]]
            + ''.join(self.ADS[index] for index in lp_numbers[2:])
        )

        brightness = int(fields[5])
        blurriness = int(fields[6])

        return {
            'area': area,
            'tilt': tilt_list,
            'bbox_coords': bbox_coords_list,
            'vertices': vertices_list,
            'lp': lp,
            'brightness': brightness,
            'blurriness': blurriness,
            'lp_indexes': lp_numbers
        }

    def __getitem__(self, idx):
        """Return ``(image, lp_indexes)`` for sample ``idx``.

        ``image`` is the RGB image, cropped to the file-name bounding box when
        ``cropped`` is set and passed through ``transform`` when one was
        given; ``lp_indexes`` is a long tensor of the plate character indices.
        """
        img_name = self.image_names[idx]
        img_path = os.path.join(self.img_dir, img_name)

        # Parse the file name to get the associated metadata.
        metadata = self.parse_filename(img_name)

        # convert() loads the pixels, so the file can be closed right away; it
        # also guarantees three channels whatever mode the file was saved in.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # The dataset serves both baselines: with cropped=True the detection
        # step is skipped and the ground-truth bounding box is used instead.
        if self.cropped:
            # PIL's crop takes (left, upper, right, lower) edge coordinates.
            (left, upper), (right, lower) = metadata['bbox_coords'].astype(int).tolist()
            image = image.crop((left, upper, right, lower))

        if self.transform:
            image = self.transform(image)

        # Image plus the plate indices as a tensor, ready for the CNN heads.
        return image, torch.tensor(metadata['lp_indexes'], dtype=torch.long)

    # The former get_cropped_plate helper (disabled in the original) is covered
    # by cropped=True in __getitem__; the decoded plate string is available as
    # parse_filename(name)['lp'].


In [71]:
# One dataset per split, each cropped to the ground-truth bounding box and
# passed through the shared preprocessing pipeline.
dataset_train, dataset_eval, dataset_test = (
    CarPlateDataset(img_dir=split_dir, transform=transform, cropped=True)
    for split_dir in ('./Data/train', './Data/eval', './Data/test')
)

# Only the training split is shuffled; eval and test keep the dataset order.
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_eval, dataloader_test = (
    DataLoader(split_dataset, batch_size=32, shuffle=False)
    for split_dataset in (dataset_eval, dataset_test)
)

# Disabled smoke test: the loop below sits inside a bare string literal, so it
# never runs; as the last expression of the cell the string is only echoed.
'''# Iterate through data, to check if everything is working
for images, metadata in dataloader_train:
    # Process your data here
    break'''

'# Iterate through data, to check if everything is working\nfor images, metadata in dataloader_train:\n    # Process your data here\n    break'

# Network

In [72]:
# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Each head is sized after its character table.
model = PlateCNN(
    num_provinces=len(provinces),
    num_alphabets=len(alphabets),
    num_ads=len(ads),
)
model = model.to(device)

# The same cross-entropy criterion is applied to every head; Adam updates all parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train

In [76]:
# Five training epochs; after each one the model is scored on the eval split
# (exact-plate accuracy and per-character accuracy).
for epoch in range(5):
    loss = train_one_epoch(
        model=model,
        dataloader=dataloader_train,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    acc, acc_char = evaluate(model=model, dataloader=dataloader_eval, device=device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Eval Acc: {acc:.4f}, Char Acc: {acc_char:.4f}")

Epoch 1, Loss: 8.6187, Eval Acc: 0.0000, Char Acc: 0.5408
Epoch 2, Loss: 8.6190, Eval Acc: 0.0000, Char Acc: 0.5471
Epoch 3, Loss: 8.5933, Eval Acc: 0.0000, Char Acc: 0.5440
Epoch 4, Loss: 8.5878, Eval Acc: 0.0000, Char Acc: 0.5432
Epoch 5, Loss: 8.5816, Eval Acc: 0.0000, Char Acc: 0.5463


# Test

In [None]:
# evaluate() returns (exact-plate accuracy, per-character accuracy). Both are
# unpacked here: formatting the returned tuple itself with ':.4f' raises a
# TypeError.
test_acc, test_acc_char = evaluate(model, dataloader_test, device)
print(f"Test Accuracy: {test_acc:.4f}, Char Acc: {test_acc_char:.4f}")

In [68]:
# Sanity check on a single training batch: label tensor shape and value range,
# printed next to the largest valid index of each character table.
first_batch = next(iter(dataloader_train), None)
if first_batch is not None:
    images, labels = first_batch
    lowest, highest = labels.min().item(), labels.max().item()
    print("labels shape:", labels.shape)
    print("labels min:", lowest, "labels max:", highest)
    print("province max:", len(provinces)-1, "alpha max:", len(alphabets)-1, "ads max:", len(ads)-1)

labels shape: torch.Size([32, 7])
labels min: 0 labels max: 33
province max: 33 alpha max: 24 ads max: 34
