In [13]:
!pip install transformers
!pip install verovio
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [20]:
import random
import string
import math
import itertools
import os
import numpy as np
import pandas as pd
import imgaug
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn.model_selection
import xml.etree.ElementTree as ET
import cv2
import torch
import PIL
from PIL import Image

# Download a dataset from Kaggle

In [1]:
#!kaggle datasets download -d riotulab/saudi-license-plate-characters

Dataset URL: https://www.kaggle.com/datasets/riotulab/saudi-license-plate-characters
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading saudi-license-plate-characters.zip to /content
 80% 7.00M/8.71M [00:01<00:00, 10.0MB/s]
100% 8.71M/8.71M [00:01<00:00, 7.23MB/s]


In [None]:
#!unzip /content/saudi-license-plate-characters.zip -d /content/Data

# Load  pre-trained transformer model

In [14]:
# Load pre-trained transformer model (stepfun-ai/GOT-OCR2_0)
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0")
tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0")

The repository for stepfun-ai/GOT-OCR2_0 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/stepfun-ai/GOT-OCR2_0.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for stepfun-ai/GOT-OCR2_0 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/stepfun-ai/GOT-OCR2_0.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for stepfun-ai/GOT-OCR2_0 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/stepfun-ai/GOT-OCR2_0.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


qwen.tiktoken:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/149 [00:00<?, ?B/s]

In [None]:
# Define the directory where the XML files and images are stored
xml_dir = '/content/Data/License-Characters-by-2-27classes/train'
image_dir = '/content/Data/License-Characters-by-2-27classes/train'

In [16]:
# Set dataset directories
train_dir = '/content/Data/License-Characters-by-2-27classes/train'
test_dir = '/content/Data/License-Characters-by-2-27classes/test'

In [22]:
def load_data_from_folder(folder):
    images = []
    labels = []
    for file_name in os.listdir(folder):
        if file_name.endswith(".jpg") or file_name.endswith(".png"):  # Adjust if other image formats are used
            img_path = os.path.join(folder, file_name)
            xml_path = os.path.splitext(img_path)[0] + '.xml'

            # Load image
            img = Image.open(img_path)
            images.append(img)

            # Load label from XML
            label = parse_xml(xml_path)
            labels.append(label)

    return images, labels

In [8]:

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    filename = root.find('filename').text

    labels = []
    for obj in root.findall('object'):
        label = obj.find('name').text
        labels.append(label)

    return labels

def has_three_alphabets(labels):
    alphabet_count = sum([1 for label in labels if label.isalpha()])
    return alphabet_count == 3

image_labels = []
for xml_filename in os.listdir(xml_dir):
    if xml_filename.endswith('.xml'):
        xml_file_path = os.path.join(xml_dir, xml_filename)

        image_filename, labels = parse_xml(xml_file_path)

        if has_three_alphabets(labels):
            image_path = os.path.join(image_dir, image_filename)
            image_labels.append((image_path, labels))

In [23]:
# Load train and test datasets
train_images, train_labels = load_data_from_folder(train_dir)
test_images, test_labels = load_data_from_folder(test_dir)

In [28]:
len(train_images)

127

In [29]:
len(train_labels)

127

In [32]:

# Function to preprocess images (resize, normalize)
def preprocess_image(image):
    target_size = (224, 224)  # Adjust based on model input size
    img = image.resize(target_size)
    img = np.array(img)

    # Check if the image is grayscale and convert to RGB if needed
    if len(img.shape) == 2:  # Grayscale image has 2 dimensions
        img = np.stack((img,) * 3, axis=-1)  # Convert to 3 channels (RGB)

    img = img.astype(np.float32) / 255.0  # Normalize pixel values to range [0, 1]
    img = np.transpose(img, (2, 0, 1))  # Convert to CHW format
    return torch.tensor(img)


# Preprocessing function for dataset
max_target_length = 12  # Maximum length of a license plate number

def preprocess_data(images, labels):
    # Preprocess images
    pixel_values = [preprocess_image(img) for img in images]

    # Tokenize labels
    tokenized_labels = [tokenizer(label, max_length=max_target_length, padding="max_length", truncation=True).input_ids for label in labels]

    # Replace padding token ids with -100 to ignore them in the loss calculation
    tokenized_labels = [[(label if label != tokenizer.pad_token_id else -100) for label in seq] for seq in tokenized_labels]

    return {"pixel_values": pixel_values, "labels": tokenized_labels}

# Preprocess train and test datasets
train_data = preprocess_data(train_images, train_labels)
test_data = preprocess_data(test_images, test_labels)

# Convert datasets into Hugging Face's Dataset object
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)


ValueError: too many values to unpack (expected 2)

In [30]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust batch size based on available GPU memory
    per_device_eval_batch_size=4,
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=3,
    num_train_epochs=3,  # Adjust as necessary
    learning_rate=5e-5,  # Adjust learning rate as necessary
)

# Define the Seq2Seq trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,  # Use the tokenizer only (no feature extractor)
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_GOT_OCR2_0")
tokenizer.save_pretrained("./fine_tuned_GOT_OCR2_0")

NameError: name 'dataset' is not defined