# Handwriting OCR with TrOCR: IAM & Imgur5K Datasets
This notebook implements end-to-end handwriting OCR using the TrOCR model, following the assignment steps: data loading, preprocessing, model setup, training, evaluation, and saving. Datasets: IAM and Imgur5K. Model: microsoft/trocr-large-handwritten.

In [None]:
# Install all required libraries for handwriting OCR and TrOCR pipeline
!pip install torch torchvision torchaudio --quiet
!pip install transformers --quiet
!pip install jiwer editdistance --quiet
!pip install pillow --quiet

## Task 1: Load and Preprocess IAM and Imgur5K Datasets
- Download IAM and Imgur5K datasets
- Convert images to grayscale, resize to 384x384, normalize
- Prepare PyTorch Dataset and DataLoader
- Use line-level annotations for IAM

In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import requests
from io import BytesIO
import numpy as np

# Helper: load an image from disk and preprocess it
def preprocess_image(img_path, size=(384, 384)):
    """Load an image and convert it to a normalized grayscale tensor.

    Args:
        img_path: Path handed to ``PIL.Image.open``.
        size: ``(width, height)`` the image is resized to. Defaults to
            ``(384, 384)``, the value this helper previously hard-coded.

    Returns:
        A ``torch.float32`` tensor of shape ``(1, height, width)`` whose
        values are the 8-bit grayscale intensities divided by 255.
    """
    # The context manager releases the underlying file as soon as `convert`
    # has produced its own copy of the pixel data.
    with Image.open(img_path) as source:
        gray = source.convert('L')  # Grayscale
    gray = gray.resize(size)
    pixels = np.array(gray) / 255.0  # Normalize 0..255 -> 0..1
    # NOTE(review): the result has a single channel and no mean/std
    # normalization; the notebook later passes it to the model as
    # `pixel_values` -- confirm this matches what the encoder expects.
    return torch.tensor(pixels, dtype=torch.float32).unsqueeze(0)

# Example custom dataset (replace with actual annotation loading)
class HandwritingDataset(Dataset):
    """Dataset pairing handwriting image files with their transcriptions.

    Each item is produced lazily: the image file name is joined onto
    ``img_dir`` and passed through ``preprocess_image``.
    """

    def __init__(self, img_dir, annotations):
        """Store the image directory and the ``(img_name, text)`` pairs."""
        self.annotations = annotations  # List of (img_name, text)
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of annotated samples."""
        sample_count = len(self.annotations)
        return sample_count

    def __getitem__(self, idx):
        """Return ``(preprocessed_image, text)`` for the sample at ``idx``."""
        record = self.annotations[idx]
        file_name, transcription = record
        full_path = os.path.join(self.img_dir, file_name)
        return preprocess_image(full_path), transcription

# Example usage (replace with real paths and annotation loading)
# iam_dataset = HandwritingDataset('IAM/images', iam_annotations)
# imgur5k_dataset = HandwritingDataset('Imgur5K/images', imgur5k_annotations)
# dataloader = DataLoader(iam_dataset, batch_size=4, shuffle=True)

ModuleNotFoundError: No module named 'torch'

## Task 2: Initialize TrOCR Model and Processor
- Use microsoft/trocr-large-handwritten from Hugging Face
- Load model to GPU if available
- Prepare processor for image and text

In [None]:
from transformers import AutoProcessor, VisionEncoderDecoderModel

# Load the processor first; it does not depend on the target device.
processor = AutoProcessor.from_pretrained('microsoft/trocr-large-handwritten')

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Instantiate the pretrained encoder-decoder and move its weights to `device`.
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')
model = model.to(device)

# Example: process image and text
# inputs = processor(images=image, return_tensors='pt').to(device)
# labels = processor.tokenizer(text, return_tensors='pt', padding='max_length', max_length=128, truncation=True).input_ids.to(device)

## Task 3: Training Configuration
- AdamW optimizer (lr=5e-5), batch size 4, mixed precision
- 10% validation split, monitor CER for early stopping

In [None]:
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

# AdamW over every model parameter with the learning rate from the task spec.
optimizer = AdamW(model.parameters(), lr=5e-5)
# Loss scaling only applies to CUDA mixed precision. Disabling the scaler on
# CPU turns scale/step/update into plain pass-throughs and avoids the
# "CUDA is not available" warning.
scaler = GradScaler(enabled=(device.type == 'cuda'))

# Example train/val split (replace with real data)
# train_size = int(0.9 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=4)

## Task 4: Evaluation Functions
- Implement Character Error Rate (CER) and Word Error Rate (WER)
- Use editdistance or jiwer library

In [None]:
!pip install jiwer editdistance --quiet
import jiwer
import editdistance

def compute_cer(preds, labels):
    """Character Error Rate: summed edit distance over summed reference length.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings, aligned with ``preds``.

    Returns:
        The total ``editdistance.eval`` across all pairs divided by the total
        number of reference characters, or ``0`` when the references contain
        no characters at all.
    """
    # Materialise the pairs once; zip stops at the shorter of the two inputs.
    aligned = list(zip(preds, labels))
    edit_count = sum(editdistance.eval(hyp, ref) for hyp, ref in aligned)
    ref_chars = sum(len(ref) for _, ref in aligned)
    if ref_chars > 0:
        return edit_count / ref_chars
    return 0

def compute_wer(preds, labels):
    """Word Error Rate of ``preds`` measured against ``labels`` via ``jiwer.wer``.

    Args:
        preds: Predicted string(s).
        labels: Reference string(s).

    Returns:
        Whatever ``jiwer.wer`` returns when called with the references first
        and the predictions second.
    """
    references, hypotheses = labels, preds
    return jiwer.wer(references, hypotheses)

# Example usage:
# cer = compute_cer(['hello'], ['h3llo'])
# wer = compute_wer(['hello world'], ['h3llo world'])

## Task 5: Fine-tune the Model
- Train for 10+ epochs, combine IAM and Imgur5K
- Log CER and WER after each epoch

In [None]:
num_epochs = 10

# Padding positions in the labels must not contribute to the loss; the
# cross-entropy loss used by the model ignores targets equal to -100.
pad_token_id = processor.tokenizer.pad_token_id
# Mixed precision only makes sense (and only works silently) on CUDA.
use_amp = device.type == 'cuda'
# Number of input channels the encoder was configured with (None if the
# config does not expose it, in which case images are passed through as-is).
encoder_channels = getattr(model.config.encoder, 'num_channels', None)


def _match_encoder_channels(images):
    """Replicate a single grayscale channel to the encoder's channel count."""
    if encoder_channels and encoder_channels > 1 and images.shape[1] == 1:
        return images.repeat(1, encoder_channels, 1, 1)
    return images


for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    for images, texts in train_loader:
        images = _match_encoder_channels(images.to(device))
        labels = processor.tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        labels[labels == pad_token_id] = -100  # ignore padding in the loss
        with autocast(enabled=use_amp):
            outputs = model(pixel_values=images, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
    # ---- Validation ----
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, texts in val_loader:
            images = _match_encoder_channels(images.to(device))
            labels = list(texts)
            generated_ids = model.generate(pixel_values=images)
            preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
            val_preds.extend(preds)
            val_labels.extend(labels)
    cer = compute_cer(val_preds, val_labels)
    wer = compute_wer(val_preds, val_labels)
    print(f"Epoch {epoch+1}: CER={cer:.4f}, WER={wer:.4f}")

## Task 6: Evaluate on Held-out IAM Test Set
- Evaluate on test split, report final CER and WER
- Display predictions and errors for 5 samples

In [None]:
# Example test evaluation (replace with real test_loader)
model.eval()
# Number of input channels the encoder was configured with (None if unknown).
encoder_channels = getattr(model.config.encoder, 'num_channels', None)
test_preds, test_labels = [], []
with torch.no_grad():
    for images, texts in test_loader:
        images = images.to(device)
        # The dataset yields single-channel tensors; replicate the channel when
        # the encoder config asks for more than one.
        if encoder_channels and encoder_channels > 1 and images.shape[1] == 1:
            images = images.repeat(1, encoder_channels, 1, 1)
        labels = list(texts)
        generated_ids = model.generate(pixel_values=images)
        preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
        test_preds.extend(preds)
        test_labels.extend(labels)
cer = compute_cer(test_preds, test_labels)
wer = compute_wer(test_preds, test_labels)
print(f"Test CER: {cer:.4f}, Test WER: {wer:.4f}")

# Show up to 5 sample predictions (slicing avoids an IndexError when the test
# set holds fewer than 5 samples)
for gt, pred in zip(test_labels[:5], test_preds[:5]):
    print(f"GT: {gt} | Pred: {pred}")

## Task 7: Save the Fine-tuned Model
- Save model weights and processor in Hugging Face format

In [None]:
# Write the model first, then the processor, into the same output directory.
for saveable in (model, processor):
    saveable.save_pretrained('finetuned_trocr')
# Optionally: push to Hugging Face Hub
# from huggingface_hub import notebook_login
# notebook_login()
# model.push_to_hub('your-username/finetuned-trocr')
# processor.push_to_hub('your-username/finetuned-trocr')

## Task 8 (Optional): Generate Synthetic Handwriting Data
- Use TextRecognitionDataGenerator to create synthetic samples

In [None]:
# Example: Generate synthetic data using TextRecognitionDataGenerator
# !git clone https://github.com/Belval/TextRecognitionDataGenerator.git
# %cd TextRecognitionDataGenerator
# !pip install -r requirements.txt
# !python run.py -w 1000 -c 5 --output_dir ../synthetic_data
# Add generated images and labels to your training set

In [None]:
# Install required libraries for handwriting OCR and TrOCR pipeline
!pip install torch torchvision torchaudio --quiet
!pip install transformers --quiet
!pip install jiwer editdistance --quiet
!pip install pillow --quiet