In [None]:
!pip install datasets

In [None]:
!pip install openpyxl

In [None]:
!pip install transformers torch

In [None]:
!pip install tiktoken

In [None]:
!pip install verovio

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq
import torch


In [None]:
# 1. Read the dataset from Excel file
file_path = '/content/OCR_training.xlsx'
df = pd.read_excel(file_path)

# Drop rows with a missing path or label *before* casting to str: astype(str)
# would otherwise turn empty cells into the literal string 'nan', which would
# then be tokenized and used as a training input/label.
# reset_index keeps a clean 0..n-1 index so positional-looking lookups such as
# df['images_paths'][0] still work and Dataset.from_pandas adds no index column.
df = df.dropna(subset=['images_paths', 'plate_numbers']).reset_index(drop=True)

df['images_paths'] = df['images_paths'].astype(str)
df['plate_numbers'] = df['plate_numbers'].astype(str)

# 2. Create a Hugging Face Dataset from pandas dataframe
hf_dataset = Dataset.from_pandas(df)

In [None]:
# Inspect the column names of the loaded DataFrame
df.columns

Index(['images_paths', 'plate_numbers'], dtype='object')

In [None]:
# Show the first entry of the images_paths column (row label 0)
df.loc[0, 'images_paths']

'/content/drive/MyDrive/T5/Images1 - After FT - 9462 Vid/0.jpg'

In [None]:
# 3. Load the pre-trained tokenizer and model from the Hugging Face Hub
from transformers import AutoModel, AutoTokenizer, pipeline

# Checkpoint identifier shared by the tokenizer and the model.
MODEL_ID = "stepfun-ai/GOT-OCR2_0"

# trust_remote_code=True lets transformers execute Python code shipped inside
# the model repository.
# NOTE(review): consider pinning `revision=` to an audited commit.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [None]:
# 4. Preprocess the dataset
def preprocess_function(df):
    """Tokenize image paths as inputs and plate numbers as labels.

    Args:
        df: Column-indexable container (e.g. a pandas DataFrame or a batch
            dict of lists) exposing 'images_paths' and 'plate_numbers'.

    Returns:
        The tokenizer output for the image paths, with an extra 'labels'
        entry holding the token ids of the plate numbers. Both are
        truncated/padded to 128 tokens.

    Raises:
        KeyError: If either expected column is missing.
    """
    # NOTE(review): this tokenizes the path *strings*, not the image pixels —
    # confirm this is the intended input for an OCR model.
    # The column is 'images_paths' (see df.columns); the previous spelling
    # 'images_pathes' raised KeyError.
    inputs = tokenizer(list(df['images_paths']), truncation=True, padding='max_length', max_length=128)
    labels = tokenizer(list(df['plate_numbers']), truncation=True, padding='max_length', max_length=128).input_ids
    inputs['labels'] = labels
    return inputs

# Tokenize through Dataset.map so the result is an indexable, per-example
# dataset that Trainer can batch. Calling preprocess_function on the DataFrame
# directly returns a single tokenizer encoding for the whole table instead.
# The raw string columns are dropped because only token ids are needed.
tokenized_dataset = hf_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=hf_dataset.column_names,
)

In [None]:

# 5. Define training arguments
# No evaluation dataset is passed to the Trainer below, so:
#   * the evaluation strategy is left at its default ("no"); the explicit
#     `evaluation_strategy` keyword is omitted because it is renamed in newer
#     transformers releases and the default already disables evaluation;
#   * load_best_model_at_end must be False — enabling it without a matching
#     evaluation/save strategy makes TrainingArguments raise a ValueError.
training_args = TrainingArguments(
    output_dir="./results",             # Output directory
    per_device_train_batch_size=8,      # Batch size per device for training
    num_train_epochs=3,                 # Number of training epochs
    weight_decay=0.01,                  # Weight decay
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=10,                   # Log every 10 steps
    save_steps=500,                     # Save checkpoint every 500 steps
    save_total_limit=3,                 # Limit total checkpoints
    load_best_model_at_end=False,       # No evaluation -> no "best" model to reload
    report_to="none"                    # Disable reporting (like WandB, etc.)
)

# 6. Define data collator (batches examples and pads inputs/labels for seq2seq-style training)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 7. Initialize Trainer
# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 8. Fine-tune the model
trainer.train()
