In [None]:
# Mount Google Drive (Colab only) so that paths under /content/drive become available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Training model

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from transformers import LogitsProcessor


from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split

import os
import random
import cv2
from PIL import Image
from sklearn.metrics import precision_score, recall_score
from tqdm import tqdm  # For progress tracking

from torchvision import transforms
from datasets import Dataset

from transformers import pipeline
from transformers.onnx import export
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### Set paths

In [None]:
# NOTE: model_pth and save_model_pth are the same folder, so saving the fine-tuned
# model overwrites the checkpoint that training started from.
model_pth = "/content/drive/MyDrive/lp_project/saved"         # Folder the initial model and processor are loaded from
train_data_dir = "/content/drive/MyDrive/lp_project/train_data"          # Training images; the label is the part of each file name before the first "_"
save_model_pth = "/content/drive/MyDrive/lp_project/saved"    # Folder the fine-tuned model and processor are saved to
test_dataset_path = "/content/drive/MyDrive/lp_project/test_data"           # Folder with the test images


In [None]:

# Keep Weights & Biases logging switched off.
os.environ["WANDB_DISABLED"] = "true"

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore processor, model and tokenizer from the same checkpoint folder;
# the model is placed on the selected device.
processor = TrOCRProcessor.from_pretrained(model_pth)
model = VisionEncoderDecoderModel.from_pretrained(model_pth)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_pth)



Config of the encoder: <class 'transformers.models.deit.modeling_deit.DeiTModel'> is overwritten by shared encoder config: DeiTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 384,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "model_type": "deit",
  "num_attention_heads": 6,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 384,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 1024,
  "decode

In [None]:
import os
# Prepare dataset by extracting LP numbers from filenames
def preprocess_data_from_folder(data_dir):
    """Collect image paths and plate labels from a folder of labelled images.

    The label is the part of the file name (extension removed) that precedes
    the first underscore, e.g. ``12b34567_001.jpg`` -> ``12b34567``.

    Args:
        data_dir: Folder containing ``.jpg`` / ``.png`` files.

    Returns:
        Dict with two parallel lists: ``"image_path"`` (full paths) and
        ``"text"`` (labels), ordered by file name.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the later
    # seeded train/validation split reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith((".jpg", ".png")):
            continue
        # Drop the extension first so a name without an underscore does not
        # carry ".jpg"/".png" into its label.
        stem = os.path.splitext(filename)[0]
        images.append(os.path.join(data_dir, filename))
        labels.append(stem.split("_")[0])

    return {"image_path": images, "text": labels}


# Load image paths and labels from the training folder
data = preprocess_data_from_folder(train_data_dir)

# Hold out 1% of the samples for validation (test_size=0.01); the remaining 99% are
# used for training. random_state fixes the shuffle used for the split.
train_images, val_images, train_labels, val_labels = train_test_split(
    data["image_path"], data["text"], test_size=0.01, random_state=42
)

# Create train and validation datasets
train_data = {"image_path": train_images, "text": train_labels}
val_data = {"image_path": val_images, "text": val_labels}

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)


In [None]:


# Random augmentation pipeline applied to every image before it reaches the processor.
transform = transforms.Compose([
    # Geometric transforms (keep plates readable)
    transforms.RandomRotation(degrees=15),                # Random rotation within +/-15 degrees
    transforms.RandomAffine(
        degrees=0,
        translate=(0.1, 0.1),  # Shift by up to 10% of width / height
        shear=5                 # Shear within +/-5 degrees
    ),
    transforms.RandomPerspective(
        distortion_scale=0.4,   # Moderate warping
        p=0.5                   # Applied to half of the images
    ),

    # Color/lighting adjustments
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.1,
        hue=0.05
    ),

    # Blur only (no noise is added here), applied to 30% of the images
    transforms.RandomApply(
        [transforms.GaussianBlur(kernel_size=3)],
        p=0.3
    ),

    # Resizing only; no normalization happens in this pipeline.
    # NOTE(review): the loaded encoder config reports image_size 384, so the processor
    # presumably resizes again after this 224x224 step — confirm this downscale is intended.
    transforms.Resize((224, 224)),

])



def collate_fn(batch):
    """Turn a list of dataset rows into one model-ready batch.

    Args:
        batch: Rows that each provide an ``image_path`` and a ``text`` field.

    Returns:
        Dict with ``pixel_values`` (augmented, processor-encoded images) and
        ``labels`` (token ids with padded positions replaced by -100).
    """
    images = []
    for item in batch:
        # Image.open is lazy and keeps the file open; convert() forces the read
        # and the context manager releases the handle straight away.
        with Image.open(item['image_path']) as img:
            images.append(transform(img.convert("RGB")))
    texts = [item['text'] for item in batch]

    # padding/truncation are text options, so they are not passed to the image call.
    pixel_values = processor(images=images, return_tensors="pt").pixel_values

    encoded = processor.tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        return_attention_mask=True
    )
    # Padded positions must not contribute to the loss: -100 is the target value
    # the cross-entropy loss ignores. The attention mask marks exactly the padded
    # positions, so real tokens are never masked by accident.
    labels = encoded.input_ids.masked_fill(encoded.attention_mask == 0, -100)

    return {"pixel_values": pixel_values, "labels": labels}



# Reshuffle the training samples every epoch; without shuffle=True each epoch
# replays the same batches in the same order.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

# NOTE(review): collate_fn applies the random training augmentations, so validation
# batches are augmented as well — use a deterministic collate function if this
# loader is used to measure validation quality.
val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn)

In [None]:
# Optimizer and learning-rate schedule.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are set explicitly to the defaults transformers.AdamW used (1e-6 and 0.0), because
# torch's own defaults differ (1e-8 and 0.01); the update rule therefore stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Multiply the learning rate by 0.95 on every scheduler.step() call.
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)



In [None]:
# Use the tokenizer's CLS token as the decoder start token.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
# Use the tokenizer's padding token as the model's pad token id.
model.config.pad_token_id = processor.tokenizer.pad_token_id

In [None]:
from torch.cuda.amp import autocast, GradScaler

# Mixed precision is only meaningful on CUDA. On the CPU fallback both the autocast
# context and the gradient scaler are created disabled, which turns them into no-ops.
use_amp = device.type == "cuda"

# Initialize scaler for mixed precision
scaler = GradScaler(enabled=use_amp)
# Define number of epochs
epochs = 50

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, batch in enumerate(train_dataloader):
        # Move data to device (GPU/CPU)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass under mixed precision (when enabled)
        with autocast(enabled=use_amp):
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

        # Scale loss and backpropagate
        scaler.scale(loss).backward()

        # Step optimizer and update the loss scale
        scaler.step(optimizer)
        scaler.update()

        # Read the loss once per step: every .item() call forces a device-to-host sync.
        loss_value = loss.item()
        running_loss += loss_value

        # Print loss every 10 batches
        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}, Loss: {loss_value}")

    # Decay the learning rate once per epoch
    scheduler.step()

    # Average loss for the epoch; max(..., 1) avoids a ZeroDivisionError when the
    # training loader yields no batches.
    avg_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs} completed. Average Loss: {avg_loss:.4f}")




In [None]:
# Save the fine-tuned model and processor.
# NOTE: save_model_pth is configured to the same folder as model_pth, so this
# overwrites the checkpoint the run was started from.
model.save_pretrained(save_model_pth)
processor.save_pretrained(save_model_pth)
print("Model and processor saved.")

In [None]:
class LicensePlateLogitsProcessor(LogitsProcessor):
    """Logits processor that restricts digit/letter tokens by decoding position.

    Rules applied to the next-token scores, based on the current length of
    ``input_ids`` (``input_ids.shape[1]``):

    * length < 3  -> single-letter tokens are forbidden;
    * length == 3 -> single-digit tokens are forbidden;
    * length > 3  -> single-digit tokens are forbidden for every row that
      already contains ``max_digits`` or more digit tokens.

    Only vocabulary entries that decode (after stripping whitespace) to exactly
    one character are classified; all other tokens are never masked.
    """

    def __init__(self, processor, max_digits=9):
        """Scan the vocabulary once and record single-digit / single-letter token ids.

        Args:
            processor: Processor whose ``decode`` and ``tokenizer.vocab_size`` are used.
            max_digits: Number of digit tokens after which further digits are blocked.
        """
        self.processor = processor
        self.max_digits = max_digits
        self.digit_token_ids = []
        self.char_token_ids = []
        for token_id in range(processor.tokenizer.vocab_size):
            token = processor.decode([token_id]).strip()
            if len(token) != 1:
                continue
            if token.isdigit():
                self.digit_token_ids.append(token_id)
            elif token.isalpha():
                self.char_token_ids.append(token_id)

    @staticmethod
    def _ids_tensor(ids, device):
        """Return the given token ids as a long tensor on ``device``."""
        return torch.tensor(ids, dtype=torch.long, device=device)

    def __call__(self, input_ids, scores):
        """Mask forbidden tokens in ``scores`` in place and return it.

        Args:
            input_ids: Token ids generated so far, one row per sequence/beam.
            scores: Next-token scores, one row per sequence/beam.

        Returns:
            The same ``scores`` tensor with forbidden entries set to ``-inf``.
        """
        current_step = input_ids.shape[1]
        neg_inf = -float('inf')

        if current_step < 3:
            # Same rule for every row, so one column assignment replaces the
            # per-row, per-token Python loop.
            char_ids = self._ids_tensor(self.char_token_ids, scores.device)
            scores[:, char_ids] = neg_inf
        elif current_step == 3:
            digit_ids = self._ids_tensor(self.digit_token_ids, scores.device)
            scores[:, digit_ids] = neg_inf
        else:
            # Count digit tokens per row in one vectorised pass instead of a
            # list-membership test for every generated token.
            digit_ids = self._ids_tensor(self.digit_token_ids, input_ids.device)
            digit_counts = torch.isin(input_ids, digit_ids).sum(dim=1)
            full_rows = (digit_counts >= self.max_digits).nonzero(as_tuple=True)[0]
            # Broadcast (rows, 1) x (1, digits) to address every digit column of
            # every row that reached the limit.
            scores[
                full_rows.to(scores.device).unsqueeze(1),
                digit_ids.to(scores.device).unsqueeze(0),
            ] = neg_inf

        return scores


In [None]:


save_data = []
def pad_to_square_cv2(image):
    """Pad an image array with black pixels so that height equals width.

    Args:
        image: NumPy array whose first two axes are (height, width); any further
            axes (e.g. channels) are left unpadded.

    Returns:
        The padded array, or ``image`` itself when it is already square.
    """
    height, width = image.shape[:2]

    # No padding needed if the image is already square
    if width == height:
        return image

    # Split the size difference over both sides. When the difference is odd the
    # second side receives the extra pixel, so the result is always exactly square.
    diff = abs(width - height)
    pad_before = diff // 2
    pad_after = diff - pad_before

    # One (before, after) pair per array axis, so grayscale (2-D) and colour (3-D)
    # arrays are both supported; only the shorter spatial axis is padded.
    padding = [(0, 0)] * image.ndim
    short_axis = 0 if width > height else 1
    padding[short_axis] = (pad_before, pad_after)

    # Black padding
    return np.pad(image, padding, mode='constant', constant_values=0)


# Full paths of every .jpg / .png file in the test folder.
all_files = [
    os.path.join(test_dataset_path, entry)
    for entry in os.listdir(test_dataset_path)
    if entry.endswith((".jpg", ".png"))
]

# Logits processor enforcing the plate format rules during generation.
logits_processor = LicensePlateLogitsProcessor(processor)

def predict_license_plate(image_path):
    """Read one image from disk and return the decoded plate text.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The decoded prediction with special tokens removed.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    pil_image = Image.fromarray(image)  # Convert to PIL Image

    # The training loop leaves the model in train mode; switch to eval mode so
    # train-only layers such as dropout are inactive during inference.
    model.eval()

    # Process image and make prediction
    inputs = processor(images=pil_image, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs.pixel_values,
        logits_processor=[logits_processor],
        # NOTE(review): max_length caps the whole decoder sequence, which presumably
        # includes the start token, so fewer than 9 plate tokens can be produced —
        # confirm this matches the plate format.
        max_length=9,
        num_beams=10,   # Beam search for better accuracy
        early_stopping=True
    )
    prediction = processor.decode(outputs[0], skip_special_tokens=True)

    return prediction

# Evaluation variables
true_labels = []
predicted_labels = []

# Predict every test image and compare with the label encoded in its file name
for filename in tqdm(all_files, desc="Processing Images"):
    # os.path.basename is separator-aware, unlike splitting on "/" by hand.
    filename___ = os.path.basename(filename)
    # The label is the text before the first underscore. The extension is dropped
    # first so that a name without an underscore does not keep ".jpg"/".png".
    ground_truth = os.path.splitext(filename___)[0].split("_")[0]

    true_labels.append(ground_truth)

    # Get prediction, normalised by removing spaces and lower-casing.
    # NOTE(review): the ground truth is compared verbatim, so labels in file names
    # are presumably lower-case already — confirm.
    predicted_label = predict_license_plate(filename)
    predicted_label = predicted_label.replace(" ", "").lower()
    predicted_labels.append(predicted_label)

    # Text before the first "." is kept as the identifier written to the results file
    name_fn = filename___.split('.')[0]
    save_data.append((name_fn, predicted_label))



# Calculate metrics
# LP-Level Accuracy: share of plates predicted exactly right
lp_exact_matches = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == pred)
# Guard against an empty test folder, which would otherwise raise ZeroDivisionError.
lp_accuracy = lp_exact_matches / len(true_labels) if true_labels else 0.0

# Print results
print(f"Total Samples: {len(true_labels)}")
print(f"LP-Level Exact Match Accuracy: {lp_accuracy:.4f}")


#### Save results to a .txt file

In [None]:
# Order the (name, prediction) pairs numerically by name.
# int() raises ValueError if any name is not a plain integer.
sorted_data = sorted(save_data, key=lambda x: int(x[0]))

In [None]:

with open("result_all.txt", "w") as file:
    # One line per entry: name, two spaces, prediction, two spaces.
    for item in sorted_data:
        name, plate = item[0], item[1]
        print(f"{name}  {plate}  ", file=file)


#### Export to ONNX

In [None]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [None]:
# NOTE(review): these paths live under LP_detection/lp_project, unlike the lp_project
# folders configured at the top of the notebook — confirm they are the intended ones.
image_path = '/content/drive/MyDrive/LP_detection/lp_project/test.jpg'
# Force three channels (as the training and evaluation code does) and release the file handle.
with Image.open(image_path) as img:
    image = img.convert("RGB")

# Prepare the image for the model (processor will handle necessary transformations)
inputs = processor(images=image, return_tensors="pt").to(device)  # Move inputs to the same device as the model

# Dummy decoder input IDs (a tensor of zeros). Two positions are used instead of one so
# the traced graph is less likely to be specialised to a single-token decoder sequence.
decoder_input_ids = torch.zeros((inputs["pixel_values"].shape[0], 2), dtype=torch.long).to(device)

# Export the model to ONNX format with opset version 14
onnx_output_path = "/content/drive/MyDrive/LP_detection/lp_project/trocr_model.onnx"

# Define the input names (according to the expected inputs of your model)
input_names = ["pixel_values", "decoder_input_ids"]
output_names = ["logits"]  # This is the output of the model (logits from the decoder)

# Export the model
torch.onnx.export(model,
                  (inputs["pixel_values"], decoder_input_ids),  # Provide both image tensor and dummy decoder input
                  onnx_output_path,
                  input_names=input_names,
                  output_names=output_names,
                  opset_version=14,  # Use opset version 14
                  dynamic_axes={
                      # NOTE(review): height/width are declared dynamic, but the encoder config
                      # reports a fixed image_size — other sizes may not actually be accepted.
                      "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
                      # The decoder sequence grows by one token per decoding step, so its
                      # length axis must be dynamic as well, on both the input and the logits.
                      "decoder_input_ids": {0: "batch_size", 1: "sequence_length"},
                      "logits": {0: "batch_size", 1: "sequence_length"}})

print(f"Model exported to {onnx_output_path}")

Model exported to /content/drive/MyDrive/LP_detection/lp_project/trocr_model.onnx
