In [None]:
import torch
import os
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms as T
from torch.utils.data import Dataset

In [None]:
# BOUNDING BOX FUNCTION --> defined inline here because importing it did not work

def get_bounding_box(file):
    """Return the axis-aligned box enclosing the four points encoded in a file name.

    The name is split on "-"; field 3 (0-based) is read as four ``x&y`` integer
    pairs joined by "_", e.g. ``386&473_177&454_154&383_363&402``.

    Args:
        file: File name, or a path to the file. Only the last path component is
            parsed, so dashes in directory names do not disturb the field count.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as floats.

    Raises:
        ValueError: If the name has fewer than four dash-separated fields, or
            field 3 is not exactly four integer ``x&y`` pairs.
    """
    name = os.path.basename(file)
    fields = name.split("-")
    if len(fields) < 4:
        raise ValueError(f"Cannot find the corner field in file name {name!r}")

    try:
        corners = [
            tuple(int(value) for value in corner.split("&"))
            for corner in fields[3].split("_")
        ]
    except ValueError as error:
        raise ValueError(f"Non-numeric corner coordinates in file name {name!r}") from error

    if len(corners) != 4 or any(len(corner) != 2 for corner in corners):
        raise ValueError(f"Expected four 'x&y' corners in file name {name!r}")

    x_coords, y_coords = zip(*corners)
    return [
        float(min(x_coords)),
        float(min(y_coords)),
        float(max(x_coords)),
        float(max(y_coords)),
    ]

In [None]:
# CAR PLATE TEXT FUNCTION

# Lookup tables for the plate-text indices stored in the file name:
# first index -> provinces, second index -> alphabet, remaining indices -> ads.
provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'O']
ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

def get_text(file):
    """Decode the plate string stored as table indices in a file name.

    The name is split on "-"; field 4 (0-based) is a "_"-separated list of
    integers. The first indexes ``provinces``, the second ``alphabet`` and every
    following one ``ads``.

    Args:
        file: File name to parse.

    Returns:
        The decoded characters concatenated into one string.
    """
    tokens = str(file.split("-")[4]).split("_")
    pieces = [provinces[int(tokens[0])], alphabet[int(tokens[1])]]
    pieces.extend(ads[int(token)] for token in tokens[2:])
    return "".join(pieces)

In [None]:
# DATASET 

# Every distinct symbol that can appear on a plate, in sorted order.
all_characters = sorted({*provinces, *alphabet, *ads})
# Characters are numbered from 1 so that index 0 stays free for the CTC blank.
char2idx = {char: idx for idx, char in enumerate(all_characters, start=1)}

class CCPD_dataset_recognition(Dataset):
    """Plate crops paired with their text encoded as class indices.

    Each item is built from one "jpg" file in ``path``: the bounding box and the
    plate text are both parsed from the file name, the image is cropped to the
    box, and the text is mapped through ``char2idx``.

    Args:
        path: Folder containing the image files.
        char2idx: Mapping from plate character to integer class index.
        transformations: Callable applied to the cropped PIL image, or a falsy
            value to return the crop unchanged.
    """

    def __init__(self,path,char2idx,transformations):
        self.path=path
        self.char2idx=char2idx
        self.transformations=transformations
        # os.listdir returns entries in arbitrary order; sorting makes a given
        # index refer to the same file on every run and machine.
        self.images=sorted(f for f in os.listdir(path) if f.endswith("jpg"))

    def __len__(self):
        """Number of image files found in the folder."""
        return len(self.images)

    def __getitem__(self,idx):
        """Return ``(cropped_image, label_tensor)`` for the file at position ``idx``."""
        file=self.images[idx]
        # Both annotations come from the file name, not from the image content.
        gt_bb=get_bounding_box(file)
        gt_text=get_text(file)
        gt_text_tensor= torch.tensor([self.char2idx[c] for c in gt_text], dtype=torch.long)
        full_path=os.path.join(self.path,file)
        image=Image.open(full_path)
        cropped_image=image.crop(gt_bb)
        if self.transformations:
            cropped_image=self.transformations(cropped_image)
        return cropped_image, gt_text_tensor
    
def crnn_collate_fn(batch):
    """Collate ``(image, label)`` samples into the flat layout used for CTC training.

    Args:
        batch: Sequence of ``(image_tensor, label_tensor)`` pairs.

    Returns:
        A tuple ``(images, labels, label_lengths)``: the images stacked along a
        new first dimension, all labels concatenated into one tensor, and a long
        tensor holding the length of each individual label.
    """
    images, labels = map(list, zip(*batch))
    stacked_images = torch.stack(images)
    lengths = torch.tensor(list(map(len, labels)), dtype=torch.long)
    flat_labels = torch.cat(labels)
    return stacked_images, flat_labels, lengths

path = "/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/train"

# Preprocessing applied to every plate crop: single-channel grayscale,
# resize to 64x256, then conversion to a tensor.
transformations = T.Compose([
    T.Grayscale(num_output_channels=1),
    T.Resize((64, 256)),
    T.ToTensor(),
])
recognition_dataset = CCPD_dataset_recognition(path, char2idx, transformations=transformations)
train_dataloader = DataLoader(
    recognition_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=crnn_collate_fn,
)

# Sanity check on one batch: it holds the stacked image tensors, the
# char2idx-encoded ground-truth labels (all concatenated) and the label lengths.
batch = next(iter(train_dataloader))
batch[0].shape

In [None]:
# MODEL

import torch.nn as nn

class CRNN(nn.Module):
    """CNN + bidirectional LSTM producing per-time-step class scores for CTC.

    The convolutional stack takes single-channel images and ends with an
    adaptive pooling that fixes the feature-map height to 2 while keeping the
    width. Every remaining column becomes one time step for the LSTM, and a
    linear layer maps each step to ``num_classes + 1`` scores.

    Args:
        num_classes: Number of label characters, not counting the CTC blank.
            Labels are expected in ``1..num_classes``; index 0 is the blank.
    """

    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(),
            # (2, 1) pooling halves the height only, preserving the sequence length.
            nn.Conv2d(256, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1)),
            nn.AdaptiveAvgPool2d((2, None)),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )

        # 1024 = 512 channels x height 2, flattened per column in forward().
        self.rnn = nn.LSTM(1024, 256, bidirectional=True, num_layers=2,dropout=0.5, batch_first=True)
        # 512 = 2 directions x 256 hidden units.
        self.fc = nn.Linear(512, num_classes + 1)  # +1 for CTC blank

    def forward(self, x):
        """Map images ``[B, 1, H, W]`` to class scores ``[T, B, num_classes + 1]``.

        ``T`` is the feature-map width after the CNN (``W // 4``, from the two
        2x2 poolings). The scores are unnormalised; callers apply
        ``log_softmax`` over the last dimension before the CTC loss.
        """
        x = self.cnn(x)
        b, c, h, w = x.size()
        assert h == 2, f"Expected height to be 2 after CNN, got {h}"
        x = x.permute(0, 3, 1, 2).contiguous().view(b, w, c * h)  # [B, W, 1024]
        x, _ = self.rnn(x)  # [B, W, 512]
        # Project the LSTM features onto the classes; without this step the
        # 512 hidden features would be used as class scores and fc never trained.
        x = self.fc(x)  # [B, W, num_classes + 1]

        return x.permute(1, 0, 2)  # for CTC loss: [T, B, C]

# char2idx numbers the characters 1..len(char2idx) (0 is the CTC blank), so the
# model must be sized from it: a smaller num_classes leaves the highest label
# without an output unit.
recognition_model=CRNN(len(char2idx))
# recognition_model(batch[0])
# for a batch of 8 images of size 64x256 this returns a tensor of shape
# [64, 8, len(char2idx) + 1] --> this output has to be decoded

In [None]:
# TRAINING LOOP

from torch.nn import CTCLoss

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = recognition_model.to(device)

# Index 0 is the CTC blank; zero_infinity turns infinite losses (label longer
# than the available time steps) into zeros instead of propagating inf.
loss_fn = CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(recognition_model.parameters(), lr=0.001)
epochs = 10

for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}\n-------')
    recognition_model.train()
    train_loss = 0.0

    # `batch_idx` (not `batch`) so the sample batch fetched earlier in the
    # notebook is not overwritten by a loop counter.
    for batch_idx, (images, targets, target_lengths) in enumerate(train_dataloader):
        # images = images.to(device)
        # targets = targets.to(device)
        # target_lengths = target_lengths.to(device)
        # Forward pass
        preds = recognition_model(images)  # preds: [T, B, C]
        # Do not unpack into `T, B, C`: `T` is the torchvision.transforms alias
        # imported at the top of the notebook, and rebinding it to an int makes
        # any later `T.Compose(...)` fail.
        seq_len, batch_size = preds.size(0), preds.size(1)
        # Each sample uses all seq_len time steps.
        input_lengths = torch.full(size=(batch_size,), fill_value=seq_len, dtype=torch.long)
        log_probs = preds.log_softmax(2)  # dim=2 is for classes

        # Compute loss
        loss = loss_fn(log_probs, targets, input_lengths, target_lengths)
        train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f'Batch {batch_idx}: Loss = {loss.item():.4f}')

    avg_loss = train_loss / len(train_dataloader)
    print(f'Average training loss: {avg_loss:.4f}')


In [None]:
# Save model weights (state dict only, not the full module)
torch.save(recognition_model.state_dict(), "crnn_ctc_model.pth") # LOSS: 33 (presumably the loss of the saved run -- TODO confirm)

# TO LOAD IT
# Rebuild the CRNN class defined in this notebook with the same num_classes the
# saved model was created with, then restore the weights:
# model = CRNN(num_classes)
# model.load_state_dict(torch.load("crnn_ctc_model.pth"))


In [None]:
# EVAL LOOP

eval_path="/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/eval"
recognition_dataset_eval=CCPD_dataset_recognition(eval_path,char2idx,transformations=transformations)
# shuffle=False keeps the evaluation order fixed.
eval_dataloader=DataLoader(recognition_dataset_eval,batch_size=8, shuffle=False, collate_fn=crnn_collate_fn)


recognition_model.eval()
test_loss = 0.0

with torch.inference_mode():
    for images, targets, target_lengths in eval_dataloader:
        preds = recognition_model(images)  # [T, B, C]
        log_probs = preds.log_softmax(2)
        # Do not unpack into `T, B, C`: `T` is the torchvision.transforms alias
        # imported at the top of the notebook and must stay a module.
        seq_len, batch_size = preds.size(0), preds.size(1)
        input_lengths = torch.full(size=(batch_size,), fill_value=seq_len, dtype=torch.long)
        loss = loss_fn(log_probs, targets, input_lengths, target_lengths)
        test_loss += loss.item()

    # Mean of the per-batch losses.
    test_loss /= len(eval_dataloader)
    print(f'Test loss: {test_loss:.4f}')


In [None]:
# DECODING RESULTS

def greedy_decode(ctc_output, idx2char):
    """Best-path CTC decoding of a batch of model outputs.

    For each sample the highest-scoring class is taken at every time step;
    consecutive repeats are merged and blanks (class 0) are dropped.

    Args:
        ctc_output: Tensor of shape [T, B, C] (output from the model).
        idx2char: Mapping from non-blank class index to character.

    Returns:
        A list with one decoded string per sample in the batch.
    """
    # argmax over classes gives [T, B]; transpose to one row of T steps per sample.
    best_paths = ctc_output.argmax(dim=2).t().tolist()
    pred_strings = []
    for path in best_paths:
        # Pair every step with its predecessor (-1 before the first step) so a
        # class is emitted only where a new run starts.
        previous_steps = [-1] + path[:-1]
        chars = [
            idx2char[current]
            for previous, current in zip(previous_steps, path)
            if current != previous and current != 0  # 0 is the CTC blank
        ]
        pred_strings.append("".join(chars))
    return pred_strings

# Inverse of char2idx: class index -> character (0, the CTC blank, has no entry).
idx2char = {idx: char for char, idx in char2idx.items()}

recognition_model.eval()
correct = 0
total = 0

with torch.inference_mode():
    for images, targets, target_lengths in eval_dataloader:
        outputs = recognition_model(images)  # [T, B, C]
        # Only classes 0..len(idx2char) can be decoded; any extra output units
        # are ignored so decoding never meets an unknown index. This is a no-op
        # when the model has exactly len(idx2char) + 1 outputs.
        outputs = outputs[:, :, :len(idx2char) + 1]
        predictions = greedy_decode(outputs, idx2char)

        # crnn_collate_fn concatenates all labels of a batch into one tensor;
        # split them back per sample with the recorded lengths and decode them
        # to get the ground-truth strings.
        gt_texts = [
            "".join(idx2char[index] for index in label.tolist())
            for label in torch.split(targets, target_lengths.tolist())
        ]

        # Exact-match accuracy: a plate counts only if every character is right.
        for pred, true in zip(predictions, gt_texts):
            if pred == true:
                correct += 1
            total += 1

if total:
    print(f"Accuracy: {correct / total * 100:.2f}%")
else:
    print("Accuracy: n/a (evaluation set is empty)")


In [None]:
# FINE-TUNING TrOCR

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")

# Needed before training with `labels`: the model derives the decoder inputs by
# shifting the labels right, which requires the start and padding token ids to
# be set on the top-level config.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# Expose the decoder vocabulary size on the top-level config as well.
model.config.vocab_size = model.config.decoder.vocab_size


In [None]:
class CCPD_dataset_recognition2(Dataset):
    """Plate crops paired with their plain-text label.

    Each item is built from one "jpg" file in ``path``: the bounding box and the
    plate text are parsed from the file name and the RGB image is cropped to
    the box.

    Args:
        path: Folder containing the image files.
    """

    def __init__(self,path):
        self.path=path
        # os.listdir returns entries in arbitrary order; sorting makes a given
        # index refer to the same file on every run and machine.
        self.images=sorted(f for f in os.listdir(path) if f.endswith("jpg"))

    def __len__(self):
        """Number of image files found in the folder."""
        return len(self.images)

    def __getitem__(self,idx):
        """Return ``{"image": cropped RGB PIL image, "text": plate string}``."""
        file=self.images[idx]
        # Both annotations come from the file name, not from the image content.
        gt_bb=get_bounding_box(file)
        gt_text=get_text(file)
        full_path=os.path.join(self.path,file)
        image=Image.open(full_path).convert("RGB")
        cropped_image=image.crop(gt_bb)
        return {"image":cropped_image, "text": gt_text}

In [None]:
def preprocess_for_trocr(example):
    """Turn one ``{"image", "text"}`` example into TrOCR training inputs.

    The image goes through the processor's image pipeline and the text through
    its tokenizer, as two separate calls: when the processor is given images and
    text together it returns the token ids under ``labels`` only, so reading
    ``input_ids`` from that result fails.

    Args:
        example: Mapping with an ``"image"`` (PIL image) and a ``"text"`` (str).

    Returns:
        Dict with ``pixel_values`` and ``labels``, each without the batch
        dimension. Padding positions in ``labels`` are set to -100 so that the
        loss ignores them.
    """
    pixel_values = processor(images=example["image"], return_tensors="pt").pixel_values
    labels = processor.tokenizer(
        example["text"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    ).input_ids
    # -100 is the index the cross-entropy loss skips; without this the model
    # would mostly be trained to predict padding.
    labels[labels == processor.tokenizer.pad_token_id] = -100
    return {"pixel_values": pixel_values.squeeze(0), "labels": labels.squeeze(0)}

In [None]:
from datasets import Dataset as HFDataset

train_path = "/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/train"

# Materialise every example of the torch-style dataset into a Hugging Face
# dataset, then run the TrOCR preprocessing on each example.
raw_data = CCPD_dataset_recognition2(path=train_path)
hf_dataset = HFDataset.from_list(
    [raw_data[index] for index in range(len(raw_data))]
)
hf_dataset = hf_dataset.map(function=preprocess_for_trocr)


In [None]:
from transformers import TrainingArguments, Trainer, default_data_collator

training_args = TrainingArguments(
    output_dir="./trocr-ccpd",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    # No evaluation during training. The strategy argument is left out on
    # purpose: "no" is its default, and the keyword was renamed from
    # `evaluation_strategy` to `eval_strategy` in newer transformers releases.
    logging_dir="./logs",
    # Mixed precision needs a CUDA device; requesting it on CPU raises an error.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    tokenizer=processor.tokenizer,
    data_collator=default_data_collator,
)
trainer.train()


In [None]:
# FINE-TUNING EasyOCR

In [None]:
# Training-image folder used by the cells of this section (absolute, machine-specific path).
path="/home/filippo/Documents/Visual Studio Code/Computer_Vision/Data/train"

In [None]:
class CCPD_dataset_recognition2(Dataset):
    """Plate crops paired with their plain-text label.

    NOTE: a class with this name is already defined earlier in the notebook;
    running this cell rebinds the name.

    Each item is built from one "jpg" file in ``path``: the bounding box and the
    plate text are parsed from the file name and the RGB image is cropped to
    the box.

    Args:
        path: Folder containing the image files.
    """

    def __init__(self,path):
        self.path=path
        # os.listdir returns entries in arbitrary order; sorting makes a given
        # index refer to the same file on every run and machine.
        self.images=sorted(f for f in os.listdir(path) if f.endswith("jpg"))

    def __len__(self):
        """Number of image files found in the folder."""
        return len(self.images)

    def __getitem__(self,idx):
        """Return ``{"image": cropped RGB PIL image, "text": plate string}``."""
        file=self.images[idx]
        # Both annotations come from the file name, not from the image content.
        gt_bb=get_bounding_box(file)
        gt_text=get_text(file)
        full_path=os.path.join(self.path,file)
        image=Image.open(full_path).convert("RGB")
        cropped_image=image.crop(gt_bb)
        return {"image":cropped_image, "text": gt_text}

# Trial instance over the training folder ("prova" presumably means "trial" -- TODO confirm).
dataset_prova=CCPD_dataset_recognition2(path)

In [None]:
def crop_folder(folder_path):
    """Crop every "jpg" image of a folder to its ground-truth bounding box.

    Only entries whose name ends with "jpg" are processed (the same filter the
    dataset classes in this notebook use), so stray files and sub-folders no
    longer make the file-name parsing fail. Files are handled in sorted order so
    the returned list is reproducible.

    Args:
        folder_path: Folder containing the image files.

    Returns:
        List of cropped images, one per matching file; empty if none match.
    """
    files = sorted(name for name in os.listdir(folder_path) if name.endswith("jpg"))
    return [
        crop_image_with_ground_truth(os.path.join(folder_path, file))
        for file in files
    ]

def crop_image_with_ground_truth(full_path):
    """Open an image as RGB and crop it to the box encoded in its file name.

    Args:
        full_path: Path to the image file.

    Returns:
        The cropped RGB image.
    """
    # Only the file name carries the annotation, so the directory part is dropped.
    box = get_bounding_box(os.path.basename(full_path))
    rgb_image = Image.open(full_path).convert("RGB")
    return rgb_image.crop(box)

# Folder for the cropped images, created relative to the working directory.
# NOTE(review): nothing in the visible part of the notebook writes into it yet.
output_image_directory='train_cropped_images'
os.makedirs(output_image_directory,exist_ok=True)

# Crops are all kept in memory as a list.
cropped_folder=crop_folder(path)