In [1]:
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
import os
from transformers import VisionEncoderDecoderModel, TrOCRProcessor, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SpanishOCRDataset(Dataset):
    """Map-style dataset pairing text images with their transcriptions.

    The CSV index must provide an ``image_path`` column (joined onto
    ``images_dir``) and a ``text`` column (the ground-truth transcription).

    Args:
        csv_file: Path to the CSV index file.
        images_dir: Directory that ``image_path`` values are joined onto.
        processor: Object called as ``processor(images=..., return_tensors="pt")``
            and exposing ``processor.tokenizer`` (a ``TrOCRProcessor`` in this
            notebook).
        max_target_length: Fixed token length the labels are padded/truncated to.
    """

    def __init__(self, csv_file, images_dir, processor, max_target_length=128):
        # keep_default_na=False: with pandas' default NA parsing, transcriptions
        # such as "NA", "null" or an empty cell are read as float NaN, which the
        # tokenizer cannot encode. Keep every cell as the literal text instead.
        self.data = pd.read_csv(csv_file, keep_default_na=False)
        self.images_dir = images_dir
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load and encode one sample.

        Returns:
            dict with ``pixel_values`` (processor output for the image, batch
            dimension squeezed away) and ``labels`` (token ids of the
            transcription with padding positions replaced by -100).
        """
        row = self.data.iloc[idx]
        img_path = os.path.join(self.images_dir, str(row["image_path"]))
        # str(): a purely numeric column is parsed as int/float by pandas.
        text = str(row["text"])

        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory copy that outlives it.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze(0)
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padding positions do not contribute to the training loss.
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": pixel_values,
            "labels": labels,
        }


In [3]:
def build_model(model_name="microsoft/trocr-base-printed"):
    """Load a pretrained TrOCR processor/model pair and prepare it for fine-tuning.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained`` for
            both the processor and the model. Defaults to the printed-text
            base checkpoint.

    Returns:
        ``(processor, model)``; the model has been moved to the selected
        device and its config carries the tokenizer's start and pad token ids.
    """
    processor = TrOCRProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)

    # Prefer CUDA, then Apple MPS, and fall back to CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    model = model.to(device)

    # Take the decoder start and padding ids from the tokenizer so the model
    # config agrees with how the dataset encodes its labels.
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    print("Model built")
    return processor, model

def build_dataset(processor, csv_file="data/spanish_data.csv", images_dir="data/images/"):
    """Create the training dataset.

    Args:
        processor: Processor handed to ``SpanishOCRDataset`` to encode samples.
        csv_file: Path to the CSV index; defaults to the notebook's data file.
        images_dir: Directory containing the images referenced by the CSV.

    Returns:
        The constructed ``SpanishOCRDataset``.
    """
    ds = SpanishOCRDataset(
        csv_file=csv_file,
        images_dir=images_dir,
        processor=processor,
    )
    print("Dataset built")
    return ds

def build_trainer(model, train_dataset):
    """Wrap the model and training data in a Hugging Face ``Trainer``.

    Args:
        model: Model to fine-tune.
        train_dataset: Dataset the trainer draws training samples from.

    Returns:
        A ``Trainer`` configured for training only (no evaluation dataset).
    """
    hyperparams = dict(
        output_dir="./spanish_ocr_model",
        per_device_train_batch_size=8,
        num_train_epochs=5,
        learning_rate=5e-5,
        logging_dir="./logs",
        fp16=False,        # mixed precision left off
        report_to="none",  # no external experiment tracker
    )
    training_args = TrainingArguments(**hyperparams)

    # No evaluation split exists yet, so eval_dataset stays None.
    ocr_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=None,
    )

    print("Trainer built")
    return ocr_trainer

In [4]:
# Load the pretrained processor/model pair, then build the training dataset,
# which uses that same processor to encode each sample.
processor, model = build_model()
train_dataset = build_dataset(processor)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transfor

Model built
Dataset built


In [None]:
# Assemble the Trainer from the model and dataset created in the previous
# cell, then start fine-tuning.
trainer = build_trainer(model, train_dataset)
trainer.train()

Trainer built


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
