In [None]:
# IMPORTS AND UTILS

import torch
import os
from PIL import Image
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import torchvision.transforms as T
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CTCLoss
import torch.optim.lr_scheduler as lr_scheduler
import math

# DEVICE DEFINITION

device="cpu"

# BOUNDING BOX FUNCTION 

def get_bounding_box(file):
    numbers=file.split("-")
    values=numbers[3]
    values_v2=values.split("&")
    values_v3=[]
    for i in range(len(values_v2)):
        if "_" in values_v2[i]:
            values_v3.append(values_v2[i].split("_"))
    t=[values_v2[0],values_v3[0],values_v3[1],values_v3[2],values_v2[-1]]
    final_values = [int(x) for item in t for x in (item if isinstance(item, list) else [item])]
    x_coords=[final_values[0],final_values[2],final_values[4],final_values[6]]
    y_coords=[final_values[1],final_values[3],final_values[5],final_values[7]]
    x_min = min(x_coords)
    y_min = min(y_coords)
    x_max = max(x_coords)
    y_max = max(y_coords)
    return [float(x_min), float(y_min), float(x_max), float(y_max)]

# LOAD FASTER RCNN MODEL

def load_Fasterrcnn(device):
    model = fasterrcnn_resnet50_fpn(num_classes=2)  
    model.load_state_dict(torch.load('model_weights/best_frcnn_model.pth'))
    model.to(device)
    model.eval()
    return model

model=load_Fasterrcnn("cpu")

# CAR PLATE TEXT FUNCTION

provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'O']
ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

MY_DICTIONARY = provinces + [c for c in alphabet if c not in provinces] + [c for c in ads if c not in provinces and c not in alphabet]    
MY_DICTIONARY = list(dict.fromkeys(MY_DICTIONARY))  
char2idx = {c: i for i, c in enumerate(MY_DICTIONARY)}
idx2char = {i: c for i, c in enumerate(MY_DICTIONARY)}
BLANK_IDX = len(MY_DICTIONARY)  

def get_text(file):
    values=file.split("-")
    text=str(values[4])
    indices=text.split("_")
    province_character=provinces[int(indices[0])]
    alphabet_character=alphabet[int(indices[1])]
    ads_charachters=[ads[int(i)] for i in indices[2:]]
    plate_text=province_character+alphabet_character+"".join(ads_charachters)
    return plate_text

# CROP FUNCTION WITH PREDICTED BOUNDING BOX

def crop_image_with_ground_truth(full_path):
    filename = os.path.basename(full_path)  
    bb = get_bounding_box(filename)
    image = Image.open(full_path).convert("RGB")
    transform = T.ToTensor()
    img_tensor = transform(image)
    cropped = img_tensor[:, int(bb[1]):int(bb[3]), int(bb[0]):int(bb[2])]
    cropped_resized = F.interpolate(cropped.unsqueeze(0), size=(48, 144), mode='bilinear', align_corners=False)
    return cropped_resized.squeeze(0)

def crop_folder_with_ground_truth(folder_path):
    cropped_folder = []
    files = os.listdir(folder_path)
    for file in files:
        full_path = os.path.join(folder_path, file)
        gt_text = get_text(file)  # Get ground truth text
        cropped_image = crop_image_with_ground_truth(full_path)
        cropped_folder.append([cropped_image, gt_text])  # Store image and text pair
    return cropped_folder

# Dataset
class CroppedImages(Dataset):
    def __init__(self, folder, transformations):
        self.folder = folder
        self.transformations = transformations

    def __len__(self):
        return len(self.folder)

    def __getitem__(self, idx):
        try:
            image = self.folder[idx][0]
            gt_text = self.folder[idx][1]
            
            if image is None:
                raise ValueError(f"None image at index {idx}")
                
            if self.transformations:
                image = self.transformations(image)
                
            return image, gt_text
            
        except Exception as e:
            print(f"Error loading sample {idx}: {str(e)}")
            return torch.zeros(3, 48, 144), ""

def encode_labels(label_list, char2idx, max_len=8):
    encoded = []
    for label in label_list:
        label = label[:max_len].ljust(max_len)
        encoded.append([char2idx[c] for c in label])
    return torch.tensor(encoded, dtype=torch.long)

# CTC collate function (per CRNN baseline)
def ctc_collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)
    label_lengths = torch.tensor([len(l) for l in labels], dtype=torch.long)
    labels = torch.cat([encode_labels([l], char2idx) for l in labels])
    return images, labels, label_lengths

# PDLPR collate function (per PDLPR model)
def pdlpr_collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)
    
    # Converti labels in sequenze di indici per PDLPR
    max_length = 10  # Lunghezza massima targa (8 caratteri + start/end tokens)
    blank_idx = len(MY_DICTIONARY)  # Blank token
    
    targets = []
    for label in labels:
        # Crea sequenza: [start_token, char1, char2, ..., end_token]
        target = [blank_idx]  # Start token
        for char in label:
            if char in char2idx:
                target.append(char2idx[char])
        target.append(blank_idx)  # End token
        
        # Pad o tronca alla lunghezza massima
        if len(target) > max_length:
            target = target[:max_length]
        else:
            target.extend([blank_idx] * (max_length - len(target)))
        
        targets.append(target)
    
    targets = torch.tensor(targets, dtype=torch.long)
    return images, targets

def ctc_greedy_decoder(output, idx2char, blank=0):
    out = output.permute(1, 0, 2) 
    pred_strings = []
    for probs in out:
        pred = probs.argmax(1).cpu().numpy()
        prev = -1
        pred_str = []
        for p in pred:
            if p != blank and p != prev:
                pred_str.append(idx2char[p])
            prev = p
        pred_strings.append(''.join(pred_str))
    return pred_strings

trans = T.Compose([
    T.Resize((48, 144)),
    T.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5]),
])


  model.load_state_dict(torch.load('model_weights/best_frcnn_model.pth'))


In [8]:
# CHOICE OF THE DATASET

model=load_Fasterrcnn(device="cpu")
device="cpu"
model.eval()

cropped_folder_train=crop_folder_with_ground_truth("/home/filippo/Documents/Visual Studio Code/Computer_Vision/Prove/train_small")
cropped_folder_eval=crop_folder_with_ground_truth("/home/filippo/Documents/Visual Studio Code/Computer_Vision/Prove/eval_small")

train_dataset = CroppedImages(cropped_folder_train, trans)
eval_dataset = CroppedImages(cropped_folder_eval, trans)

# CTC DataLoaders (per CRNN baseline)
train_dataloader_ctc = DataLoader(train_dataset, batch_size=8, shuffle=False, collate_fn=ctc_collate_fn)
eval_dataloader_ctc = DataLoader(eval_dataset, batch_size=8, shuffle=False, collate_fn=ctc_collate_fn)

# PDLPR DataLoaders (per PDLPR model)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pdlpr_collate_fn)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, shuffle=False, collate_fn=pdlpr_collate_fn)


  model.load_state_dict(torch.load('model_weights/best_frcnn_model.pth'))


In [None]:
# MODEL

class PositionalEncoding(nn.Module):
    """Positional encoding per la sequenza di features"""
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class CNNBackbone(nn.Module):
    """CNN Backbone per estrazione features come descritto nel paper"""
    def __init__(self, input_channels=3):
        super().__init__()
        
        # Convolutional layers per feature extraction
        self.conv_layers = nn.Sequential(
            # Block 1
            nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            
            # Block 4
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            
            # Block 5
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )
        
    def forward(self, x):
        # x shape: [batch_size, channels, height, width]
        features = self.conv_layers(x)
        # features shape: [batch_size, 512, H', W']
        
        # Reshape per creare sequenza di features
        batch_size, channels, height, width = features.size()
        # Combina height e channels, mantieni width come sequenza
        features = features.view(batch_size, channels * height, width)
        features = features.permute(2, 0, 1)  # [seq_len, batch_size, feature_dim]
        
        return features

class AttentionDecoder(nn.Module):
    """Attention-based decoder come descritto nel PDLPR paper"""
    def __init__(self, feature_dim, hidden_dim, num_classes, max_length=20):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.max_length = max_length
        
        # LSTM decoder
        self.lstm = nn.LSTM(feature_dim + num_classes, hidden_dim, batch_first=False)
        
        # Attention mechanism
        self.attention = nn.Linear(hidden_dim + feature_dim, 1)
        
        # Output projection
        self.out = nn.Linear(hidden_dim, num_classes)
        
        # Embedding per previous character
        self.embedding = nn.Embedding(num_classes, num_classes)
        
    def forward(self, encoder_features, target=None, max_length=None):
        """
        encoder_features: [seq_len, batch_size, feature_dim]
        target: [batch_size, target_length] per training
        """
        if max_length is None:
            max_length = self.max_length
            
        batch_size = encoder_features.size(1)
        seq_len = encoder_features.size(0)
        
        # Initialize
        hidden = self._init_hidden(batch_size, encoder_features.device)
        outputs = []
        
        # Start token (blank)
        input_char = torch.zeros(batch_size, self.num_classes).to(encoder_features.device)
        
        for t in range(max_length):
            # Compute attention weights
            attention_weights = self._compute_attention(hidden[0], encoder_features)
            
            # Apply attention to get context
            context = torch.sum(attention_weights.unsqueeze(-1) * encoder_features, dim=0)
            
            # Concatenate context with previous character embedding
            lstm_input = torch.cat([context, input_char], dim=1).unsqueeze(0)
            
            # LSTM step
            output, hidden = self.lstm(lstm_input, hidden)
            
            # Predict next character
            char_logits = self.out(output.squeeze(0))
            outputs.append(char_logits)
            
            # Teacher forcing during training
            if target is not None and t < target.size(1) - 1:
                # Use ground truth
                next_char_idx = target[:, t + 1]
                input_char = self.embedding(next_char_idx)
            else:
                # Use prediction
                next_char_idx = char_logits.argmax(dim=1)
                input_char = self.embedding(next_char_idx)
        
        return torch.stack(outputs, dim=1)  # [batch_size, max_length, num_classes]
    
    def _init_hidden(self, batch_size, device):
        h0 = torch.zeros(1, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(1, batch_size, self.hidden_dim).to(device)
        return (h0, c0)
    
    def _compute_attention(self, hidden, encoder_features):
        """
        hidden: [batch_size, hidden_dim]
        encoder_features: [seq_len, batch_size, feature_dim]
        """
        seq_len, batch_size, feature_dim = encoder_features.size()
        
        # Expand hidden to match sequence length
        hidden_expanded = hidden.unsqueeze(0).expand(seq_len, -1, -1)
        
        # Concatenate hidden with encoder features
        combined = torch.cat([hidden_expanded, encoder_features], dim=2)
        
        # Compute attention scores
        attention_scores = self.attention(combined).squeeze(-1)  # [seq_len, batch_size]
        
        # Apply softmax
        attention_weights = F.softmax(attention_scores, dim=0)
        
        return attention_weights

class PDLPR(nn.Module):
    """
    PDLPR: Progressive Dilated License Plate Recognition
    Implementazione basata sul paper fornito
    """
    def __init__(self, num_classes, input_channels=3, hidden_dim=256, max_length=20):
        super().__init__()
        self.num_classes = num_classes
        self.max_length = max_length
        
        # CNN Backbone per feature extraction
        self.backbone = CNNBackbone(input_channels)
        
        # Calcola feature dimension dopo CNN
        # Assumendo input 64x256, dopo conv layers: feature_dim = 512 * 4 = 2048
        feature_dim = 512 * 4  # channels * remaining_height
        
        # Positional encoding
        self.pos_encoding = PositionalEncoding(feature_dim)
        
        # Attention-based decoder
        self.decoder = AttentionDecoder(feature_dim, hidden_dim, num_classes, max_length)
        
    def forward(self, x, target=None):
        """
        x: [batch_size, channels, height, width]
        target: [batch_size, target_length] per training (opzionale)
        """
        # Extract features usando CNN backbone
        encoder_features = self.backbone(x)  # [seq_len, batch_size, feature_dim]
        
        # Add positional encoding
        encoder_features = self.pos_encoding(encoder_features)
        
        # Decode con attention
        outputs = self.decoder(encoder_features, target)
        
        return outputs
    
    def predict(self, x, idx2char, blank_idx):
        """Prediction senza teacher forcing"""
        self.eval()
        with torch.no_grad():
            outputs = self.forward(x)
            predictions = outputs.argmax(dim=-1)  # [batch_size, max_length]
            
            # Decodifica predizioni
            results = []
            for pred in predictions:
                chars = []
                for char_idx in pred:
                    char_idx = char_idx.item()
                    if char_idx == blank_idx:  # Stop al blank
                        break
                    if char_idx in idx2char:
                        chars.append(idx2char[char_idx])
                results.append(''.join(chars))
            
            return results

model=PDLPR(len(MY_DICTIONARY)+1)
# Training function per PDLPR
def train_pdlpr_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    criterion = nn.CrossEntropyLoss(ignore_index=model.num_classes)  # Ignora blank per loss
    
    for batch_idx, (images, targets) in enumerate(dataloader):
        images = images.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass con teacher forcing
        outputs = model(images, targets[:, :-1])  # Escludi ultimo token dal target
        
        # Reshape per loss computation
        outputs = outputs.contiguous().view(-1, model.num_classes)
        targets = targets[:, 1:].contiguous().view(-1)  # Escludi primo token (start)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx}/{len(dataloader)}, Loss: {loss.item():.4f}')
    
    return total_loss / len(dataloader)

# Evaluation function
def evaluate_pdlpr(model, dataloader, idx2char, device):
    model.eval()
    correct = 0
    total = 0
    blank_idx = len(idx2char)  # Blank token index
    
    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            
            # Predici
            predictions = model.predict(images, idx2char, blank_idx)
            
            # Converti targets in stringhe per confronto
            target_strings = []
            for target in targets:
                chars = []
                for char_idx in target[1:-1]:  # Escludi start/end tokens (primi e ultimi blank)
                    char_idx = char_idx.item()
                    if char_idx < len(idx2char) and char_idx != blank_idx:  # Escludi blank tokens
                        chars.append(idx2char[char_idx])
                target_strings.append(''.join(chars))
            
            # Calcola accuracy
            for pred, target in zip(predictions, target_strings):
                total += 1
                if pred == target:
                    correct += 1
                print(f"Pred: '{pred}' | Target: '{target}' | Match: {pred == target}")
    
    return correct / total if total > 0 else 0

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.0005)
train_pdlpr_epoch(model,train_dataloader,optimizer,"cpu")

RuntimeError: The size of tensor a (1536) must match the size of tensor b (2048) at non-singleton dimension 2