# Imports

In [2]:
import torch
import numpy as np
import os
import torchvision
from PIL import Image
from torch.utils.data import DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim
import torch.nn as nn

# Globals

In [3]:
# GLOBALS NEEDED FOR ENCODING AND DECODING 

provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'O']
ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
seen = set()    #avoid duplicates
MY_DICTIONARY = []
for char_list in [provinces, alphabets, ads]:
    for char in char_list:
        if char not in seen:
            MY_DICTIONARY.append(char)
            seen.add(char)
char2idx = {c: i for i, c in enumerate(MY_DICTIONARY)}
idx2char = {i: c for i, c in enumerate(MY_DICTIONARY)}
BLANK_IDX = len(MY_DICTIONARY)  # CTC needs +1 for "blank" 

# DATASET 

test_path="/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/test"

# Utils

In [4]:
# FUNCTION TO LOAD THE DETECTION MODEL

def load_Fasterrcnn(device):
    model = fasterrcnn_resnet50_fpn(num_classes=2)  
    model.load_state_dict(torch.load('model_weights/best_frcnn_model_final_version.pth', map_location="cpu"))
    model.to(device)
    model.eval()
    return model

device="cpu"
model=load_Fasterrcnn(device)

# CAR PLATE TEXT FUNCTION

def get_plate(filename):
    fields = filename.split('-')

    text=str(fields[4])
    indices=text.split("_")
    province_character=provinces[int(indices[0])]
    alphabet_character=alphabets[int(indices[1])]
    ads_charachters=[ads[int(i)] for i in indices[2:]]
    plate_text=province_character+alphabet_character+"".join(ads_charachters)

    return plate_text

  model.load_state_dict(torch.load('model_weights/best_frcnn_model_final_version.pth', map_location="cpu"))


# Dataset

In [5]:
# DATASET CLASS

class LicensePlateRecognitionPipeline:
    def __init__(self, pdlpr_model_path, device='cpu'):

        self.device = device

        # Load Faster R-CNN model
        self.detection_model = load_Fasterrcnn(self.device)

        # Load PDLPR model
        self.pdlpr_model = PDLPR(num_classes=BLANK_IDX + 1, dropout=0.1).to(self.device)
        checkpoint = torch.load(pdlpr_model_path, map_location=self.device)
        self.pdlpr_model.load_state_dict(checkpoint['model_state_dict'])
        self.pdlpr_model.eval()

        # Transform for PDLPR
        self.transform = transforms.Compose([
            transforms.Resize((64, 256)),
            transforms.ToTensor(),
            transforms.Grayscale(num_output_channels=1),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

        print("Pipeline initialized successfully!")

    def frcnn_crop(self, image_path):
        """
        Detecting plates using Faster R-CNN

        Args:
            image_path: Path to the image

        Returns:
            cropped image
        """
        # Load and transform image
        img = Image.open(image_path)
        transform = transforms.Compose([transforms.ToTensor()])
        img_tensor = transform(img).unsqueeze(0).to(self.device)

        # Get predictions
        with torch.no_grad():
            predictions = self.detection_model(img_tensor)

        # Get the first (best) detection if any
        if len(predictions[0]['boxes']) > 0:
            box = predictions[0]['boxes'][0].cpu().numpy()
            x1, y1, x2, y2 = map(int, box)
            cropped = img.crop((x1, y1, x2, y2))
            return cropped
        else:
            print("No license plate detected!")
            return None

    def recognize_plate(self, plate_crop):
        """
        Recognizes text on a license plate using PDLPR

        Args:
            plate_crop: Cropped license plate (PIL Image)

        Returns:
            Recognized text
        """
        if plate_crop is None:
            return ""

        plate_crop_tr = self.transform(plate_crop).unsqueeze(0)
        plate_crop_tr = plate_crop_tr.to(self.device)

        with torch.no_grad():
            outputs = self.pdlpr_model(plate_crop_tr)
            pred_string = ctc_greedy_decoder(outputs, idx2char)

        return pred_string[0] if pred_string else ""

    def __call__(self, img_path):
        """
        Makes the object callable - main pipeline method

        Args:
            img_path: Path to the image

        Returns:
            Recognized license plate text
        """
        crop = self.frcnn_crop(img_path)
        plate = self.recognize_plate(crop)
        return plate
      
# COLLATE FUNCTION

def ctc_collate_fn(batch):
    '''
    basically what I do here is stacking all the images in a batch into a single tensor and
    then computing the len of each label (assuming different lenght plate can happen). (I could actually avoid this but it's more general)
    Finally just concatenating all the labels into a vector (pytorch CTC wantres them in a line, not list)
    then returning image-label-its lenght.
    I need this to tell CTC where labels finish and i do not care padding as CTC deals with that internally (NICE)
    '''
    images, labels = zip(*batch)
    images = torch.stack(images)
    label_lengths = torch.tensor([len(l) for l in labels], dtype=torch.long)
    labels = torch.cat(labels)
    return images, labels, label_lengths

# DECODER FUNCTION

def ctc_greedy_decoder(output, idx2char, blank=BLANK_IDX):
    '''
    Now, I know the network returns probabilities, as it does a softmax with logits of characters.
    I need to transform that probability into an actual char to compose the plate.
    I take the argmax of the softmax (most prob char), remove blanks used by CTC and possible
    duplicates CTC can actually produce.
    At the end I simply use the  mappings char-index index-char deified at the beginning to compose the plate.
    This is greedy as it just takes the argmax of every step, I think it's more than enough here.
    '''
    # output: [seq_len, batch, num_classes]
    out = output.permute(1, 0, 2)  # [batch, seq_len, num_classes]
    pred_strings = []
    for probs in out:
        pred = probs.argmax(1).cpu().numpy()
        prev = -1
        pred_str = []
        for p in pred:
            if p != blank and p != prev:
                pred_str.append(idx2char[p])
            prev = p
        pred_strings.append(''.join(pred_str))
    return pred_strings

# Network

In [6]:
# PDLPR MODEL FOLLOWING PAPER ARCHITECTURE

# --- Focus Structure Module ---
class Focus(nn.Module):
    def __init__(self, in_channels=1, out_channels=64, dropout=0.1):  # Fixed __init__
        super(Focus, self).__init__()  # Fixed __init__
        self.conv = nn.Conv2d(in_channels * 4, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.LeakyReLU(inplace=True)
        self.dropout = nn.Dropout(dropout)
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        # Slice and concat - Focus structure downsampling
        patch1 = x[..., ::2, ::2]
        patch2 = x[..., ::2, 1::2]
        patch3 = x[..., 1::2, ::2]
        patch4 = x[..., 1::2, 1::2]
        x = torch.cat([patch1, patch2, patch3, patch4], dim=1)
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)
        x = self.dropout(x)
        return x


# --- CNN Block used in RESBLOCK and downsampling ---
def conv_block(in_channels, out_channels, kernel_size=3, stride=1, padding=1):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
        nn.BatchNorm2d(out_channels),
        nn.LeakyReLU(inplace=True)
    )


# --- Residual Block ---
class ResBlock(nn.Module):
    def __init__(self, channels):  # Fixed __init__
        super(ResBlock, self).__init__()  # Fixed __init__
        self.block = nn.Sequential(
            conv_block(channels, channels),
            conv_block(channels, channels)
        )

    def forward(self, x):
        return x + self.block(x)


# --- IGFE Module ---
class IGFE(nn.Module):
    def __init__(self, dropout=0.1):  # Fixed __init__
        super(IGFE, self).__init__()  # Fixed __init__
        self.focus = Focus(1, 64, dropout)  # Changed to 1 channel for grayscale
        self.down1 = conv_block(64, 128, stride=2)
        self.res1 = ResBlock(128)
        self.res2 = ResBlock(128)
        self.down2 = conv_block(128, 256, stride=2)
        self.res3 = ResBlock(256)
        self.res4 = ResBlock(256)
        self.final_conv = nn.Conv2d(256, 512, kernel_size=1)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((6, 18))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.focus(x)
        x = self.down1(x)
        x = self.dropout(x)
        x = self.res1(x)
        x = self.res2(x)
        x = self.down2(x)
        x = self.dropout(x)
        x = self.res3(x)
        x = self.res4(x)
        x = self.final_conv(x)
        x = self.adaptive_pool(x)
        return x


# --- Positional Encoding for 2D feature maps ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, height, width):  # Fixed __init__
        super().__init__()  # Fixed __init__
        self.pe = nn.Parameter(torch.randn(1, d_model, height, width))

    def forward(self, x):
        return x + self.pe


# --- Transformer Encoder Block ---
class EncoderBlock(nn.Module):
    def __init__(self, d_model):  # Fixed __init__
        super(EncoderBlock, self).__init__()  # Fixed __init__
        self.self_attn = nn.MultiheadAttention(d_model, num_heads=8, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        B, C, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)  # [B, HW, C]

        # Self-attention
        attn_out, _ = self.self_attn(x, x, x)
        x = self.norm1(x + self.dropout(attn_out))

        # FFN
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))

        return x.transpose(1, 2).view(B, C, H, W)


# --- PDLPR Recognition Model (CTC-based) ---
class PDLPR(nn.Module):
    def __init__(self, num_classes, dropout=0.1):  # Fixed __init__
        super(PDLPR, self).__init__()  # Fixed __init__
        self.igfe = IGFE(dropout)
        self.pos_encoding = PositionalEncoding(512, 6, 18)
        self.encoder = nn.Sequential(*[EncoderBlock(512) for _ in range(3)])

        # CTC head - outputs sequence of characters
        # After flattening, we have [B, W, C*H] = [B, 18, 512*6] = [B, 18, 3072]
        self.ctc_head = nn.Sequential(
            nn.Linear(512 * 6, 256),  # Fixed dimension: 512*6 = 3072
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Feature extraction
        x = self.igfe(x)  # [B, 512, 6, 18]
        x = self.pos_encoding(x)
        x = self.encoder(x)  # [B, 512, 6, 18]

        # Flatten to sequence for CTC
        B, C, H, W = x.shape
        x = x.permute(0, 3, 1, 2)  # [B, W, C, H]
        x = x.contiguous().view(B, W, -1)  # [B, W, C*H] = [B, 18, 3072]

        # Linear projection to reduce dimension
        x = self.ctc_head(x)  # [B, W, num_classes]
        x = x.permute(1, 0, 2)  # [W, B, num_classes] for CTC

        return x

# Train

In [None]:
# THERE IS NO NEED TO TRAIN ANYTHING HERE SINCE WE JUST HAVE TO MAKE INFERENCE WITH THE DETECTION AND RECOGNITION MODEL

# Evaluation

In [7]:
test_path = "/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/test"
online_model=LicensePlateRecognitionPipeline("/home/filippo/Documents/Visual Studio Code/best_pdlpr_model.pth")

def test_on_directory(model, test_dir):
    results = []
    
    # Get all image files
    valid_extensions = ('.jpg', '.jpeg', '.png')
    image_files = [f for f in os.listdir(test_dir) if f.lower().endswith(valid_extensions)]
    
    for img_file in image_files:
        img_path = os.path.join(test_dir, img_file)
        try:
            # Process image
            plate_text = model(img_path)
            
            # Store results
            results.append({
                'image': img_file,
                'plate_text': plate_text
            })
            
            # Display results
            print(f"Image: {img_file}")
            print(f"Detected plate: {plate_text}\n")
            
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}\n")
    
    return results

# Run test on directory
print(f"Testing on directory: {test_path}")
results = test_on_directory(online_model, test_path)

  model.load_state_dict(torch.load('model_weights/best_frcnn_model_final_version.pth', map_location="cpu"))
  checkpoint = torch.load(pdlpr_model_path, map_location=self.device)
  checkpoint = torch.load(pdlpr_model_path, map_location=self.device)


Pipeline initialized successfully!
Testing on directory: /home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/test
Image: 04254310344827586-90_262-173&515_551&618-551&618_185&616_173&515_551&517-0_0_3_26_30_30_26_33-95-45.jpg
Detected plate: 皖AD26629

Image: 04254310344827586-90_262-173&515_551&618-551&618_185&616_173&515_551&517-0_0_3_26_30_30_26_33-95-45.jpg
Detected plate: 皖AD26629

Image: 027942708333333333-92_259-283&464_515&585-515&585_294&544_283&464_512&477-0_0_5_24_24_31_32_30-96-27.jpg
Detected plate: 皖AD00786

Image: 027942708333333333-92_259-283&464_515&585-515&585_294&544_283&464_512&477-0_0_5_24_24_31_32_30-96-27.jpg
Detected plate: 皖AD00786

Image: 043534482758620686-89_269-150&502_510&612-499&600_152&612_150&513_510&502-0_0_3_0_29_29_29_32-114-6.jpg
Detected plate: 皖ADB5550

Image: 043534482758620686-89_269-150&502_510&612-499&600_152&612_150&513_510&502-0_0_3_0_29_29_29_32-114-6.jpg
Detected plate: 皖ADB5550

Image: 0569227430556-94_256-165&481_595&615-589&61

KeyboardInterrupt: 

# Evaluation Metrics

In [8]:
def evaluate_on_test_set(model, test_dir):
    results = []
    total_images = 0
    correct_predictions = 0
    
    # Get all image files
    valid_extensions = ('.jpg', '.jpeg', '.png')
    image_files = [f for f in os.listdir(test_dir) if f.lower().endswith(valid_extensions)]
    
    print("Starting evaluation on test set...\n")
    
    for img_file in image_files:
        img_path = os.path.join(test_dir, img_file)
        try:
            # Get ground truth
            ground_truth = get_plate(img_file)
            
            # Get prediction
            predicted_plate = model(img_path)
            
            # Compare prediction with ground truth
            is_correct = predicted_plate.strip() == ground_truth.strip()
            
            # Store results
            results.append({
                'image': img_file,
                'ground_truth': ground_truth,
                'prediction': predicted_plate,
                'correct': is_correct
            })
            
            total_images += 1
            if is_correct:
                correct_predictions += 1
            
            # Display results
            print(f"Image: {img_file}")
            print(f"Ground Truth: {ground_truth}")
            print(f"Prediction: {predicted_plate}")
            print(f"Correct: {is_correct}\n")
            
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}\n")
    
    # Calculate final accuracy
    accuracy = (correct_predictions / total_images) * 100 if total_images > 0 else 0
    
    print("\nFinal Results:")
    print(f"Total images processed: {total_images}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Accuracy: {accuracy:.2f}%")
    
    return results, accuracy

# Run evaluation on test set
print(f"Evaluating model on test directory: {test_path}")
evaluation_results, final_accuracy = evaluate_on_test_set(online_model, test_path)

Evaluating model on test directory: /home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/test
Starting evaluation on test set...

Image: 04254310344827586-90_262-173&515_551&618-551&618_185&616_173&515_551&517-0_0_3_26_30_30_26_33-95-45.jpg
Ground Truth: 皖AD26629
Prediction: 皖AD26629
Correct: True

Image: 04254310344827586-90_262-173&515_551&618-551&618_185&616_173&515_551&517-0_0_3_26_30_30_26_33-95-45.jpg
Ground Truth: 皖AD26629
Prediction: 皖AD26629
Correct: True

Image: 027942708333333333-92_259-283&464_515&585-515&585_294&544_283&464_512&477-0_0_5_24_24_31_32_30-96-27.jpg
Ground Truth: 皖AF00786
Prediction: 皖AD00786
Correct: False

Image: 027942708333333333-92_259-283&464_515&585-515&585_294&544_283&464_512&477-0_0_5_24_24_31_32_30-96-27.jpg
Ground Truth: 皖AF00786
Prediction: 皖AD00786
Correct: False

Image: 043534482758620686-89_269-150&502_510&612-499&600_152&612_150&513_510&502-0_0_3_0_29_29_29_32-114-6.jpg
Ground Truth: 皖ADA5558
Prediction: 皖ADB5550
Correct: False

Imag

KeyboardInterrupt: 