In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to the dataset
dataset_path = "IAM\\gt_test.txt"
image_folder = "IAM\\image"  # Change this to the correct image folder

# Store data
dataset = []

# Read the dataset file
with open(dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")  # Tab-separated
        if len(parts) == 2:  # Ensure valid format
            filename, text = parts
            image_path = os.path.join(image_folder, filename)
            dataset.append({"image": image_path, "text": text})

# Print a sample
print(dataset[:3])

# Convert to Pandas DataFrame (for easy splitting)
df = pd.DataFrame(dataset)

# Split into Train (80%), Remaining (20%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Split Remaining into Validation (10%) and Test (10%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

# Print dataset sizes
print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

[{'image': 'IAM\\image\\c04-110-00.jpg', 'text': "Become a success with a disc and hey presto ! You're a star ... . Rolly sings with"}, {'image': 'IAM\\image\\c04-110-01.jpg', 'text': 'assuredness " Bella Bella Marie " ( Parlophone ) , a lively song that changes tempo mid-way .'}, {'image': 'IAM\\image\\c04-110-02.jpg', 'text': "I don't think he will storm the charts with this one , but it's a good start ."}]
Train: 2332, Validation: 291, Test: 292


In [2]:
from datasets import Dataset

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Print dataset
print(train_dataset)


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['image', 'text', '__index_level_0__'],
    num_rows: 2332
})


In [3]:
import torch

print("CUDA Available:", torch.cuda.is_available())
print("CUDA Device Count:", torch.cuda.device_count())
print("CUDA Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")
print("Current Device:", torch.cuda.current_device())


CUDA Available: True
CUDA Device Count: 1
CUDA Device Name: NVIDIA GeForce GTX 1650
Current Device: 0


In [4]:
from transformers import TrOCRProcessor
from PIL import Image
import torch

# Load processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten", use_fast=True)

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("###########\n",device,"\n###########")

# Preprocessing function
def preprocess(sample):
    image = Image.open(sample["image"]).convert("RGB")  # Open image
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.squeeze().to(device)  # Move to GPU
    
    # Encode text labels
    labels = processor.tokenizer(sample["text"], padding="max_length", truncation=True, return_tensors="pt").input_ids.squeeze().to(device)  # Move to GPU
    
    return {"pixel_values": pixel_values, "labels": labels}

# Apply preprocessing to each dataset
train_dataset = train_dataset.map(preprocess)
val_dataset = val_dataset.map(preprocess)
test_dataset = test_dataset.map(preprocess)

print(train_dataset[0])  # Now all data should be on GPU


###########
 cuda 
###########


Map: 100%|██████████| 2332/2332 [02:29<00:00, 15.62 examples/s]
Map: 100%|██████████| 291/291 [00:07<00:00, 39.13 examples/s]
Map: 100%|██████████| 292/292 [00:07<00:00, 38.75 examples/s]


{'image': 'IAM\\image\\m04-231-06.jpg', 'text': 'a great basket .', '__index_level_0__': 1867, 'pixel_values': [[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9921568632125854, 0.9686274528503418, 0.9843137264251709, 0.9764705896377563, 0

In [5]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load pre-trained TrOCR model and processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to GPU
model.to(device)

# Function to evaluate the model
def evaluate(model, test_dataset):
    correct = 0
    total = 0

    for _, sample in test_dataset.iterrows():  # Iterate over test samples
        image_path = sample["image"]
        label = sample["text"]

        # Open image
        image = Image.open(image_path).convert("RGB")

        # Preprocess image
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)  # Move to GPU

        # Predict
        with torch.no_grad():
            generated_ids = model.generate(pixel_values)
        predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        print(f"GT: {label} | Predicted: {predicted_text}")

        # Compare (Simple Accuracy)
        if predicted_text.strip().lower() == label.strip().lower():
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0
    print(f"Pre-trained Model Accuracy: {accuracy:.2%}")

# Run Evaluation Before Training
evaluate(model, test_df)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transfor

Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(


GT: after which the Israelites sighed . | Predicted: afk which the Israelites signed
GT: and the vague advantages of military glory ; | Predicted: and the vague advantages of military glory ;
GT: and have a look at the mining camp . | Predicted: and have a look at the mining camp .
GT: symbolises the bitterness that is hurled against us . | Predicted: symbolises the bitterness that is hurled against us .
GT: a verdict of " wilful murder against some person or | Predicted: a verdict of " wilful murder against some person or
GT: Her eyes darkened in torment . ' Are | Predicted: Hereyes darkened in torment . Are
GT: thinking that he and Angelina would have | Predicted: thinking that he and Angelina would have
GT: your heart and with all your soul and with all your might | Predicted: your heart and with all your soul and with all your night
GT: Little Phelpham . | Predicted: little Phelpham .
GT: We naturally cannot overlook that Magyar cookery owes | Predicted: We naturally cannot overloo

In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from PIL import Image

# # 1. Manual Memory Optimization
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.benchmark = True

# # 2. Dataset Class with On-the-Fly Processing
# class OCRDataset(Dataset):
#     def __init__(self, df, processor):
#         self.df = df
#         self.processor = processor
        
#     def __len__(self):
#         return len(self.df)
    
#     def __getitem__(self, idx):
#         image = Image.open(self.df.iloc[idx]['image']).convert("RGB").resize((256, 256))
#         text = self.df.iloc[idx]['text']
        
#         # Process on CPU
#         pixel_values = self.processor(
#             images=image, 
#             return_tensors="pt"
#         ).pixel_values.squeeze()
        
#         labels = self.processor.tokenizer(
#             text,
#             max_length=32,
#             padding="max_length",
#             truncation=True,
#             return_tensors="pt"
#         ).input_ids.squeeze()
        
#         return pixel_values, labels

# # 3. Initialize Components
# processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
# model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

# # 4. Aggressive Memory Config
# model.config.use_cache = False
# model.encoder.gradient_checkpointing = True
# model.decoder.gradient_checkpointing = True
# model = model.half().to('cuda')

# # 5. Training Setup
# BATCH_SIZE = 1
# GRAD_ACCUM = 8
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# # 6. DataLoader with Manual Batching
# def collate_fn(batch):
#     pixel_values = torch.stack([item[0] for item in batch]).to('cuda', dtype=torch.float16)
#     labels = torch.stack([item[1] for item in batch]).to('cuda')
#     return pixel_values, labels

# train_loader = DataLoader(
#     OCRDataset(train_df, processor),
#     batch_size=BATCH_SIZE,
#     collate_fn=collate_fn,
#     shuffle=True
# )

# # 7. Training Loop
# for epoch in range(3):
#     model.train()
#     optimizer.zero_grad()
    
#     for batch_idx, (pixel_values, labels) in enumerate(train_loader):
#         # Forward pass
#         outputs = model(pixel_values=pixel_values, labels=labels)
#         loss = outputs.loss / GRAD_ACCUM
        
#         # Backward pass
#         loss.backward()
        
#         # Gradient accumulation
#         if (batch_idx + 1) % GRAD_ACCUM == 0:
#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             optimizer.zero_grad()
            
#             # Memory cleanup
#             del pixel_values, labels, outputs
#             torch.cuda.empty_cache()
            
#     # Save after each epoch
#     torch.save(model.state_dict(), f"epoch_{epoch}.pt")




Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Error while downloading from https://cdn-lfs.hf.co/microsoft/trocr-base-stage1/881612edf38f0bd647f22e22fced77a71f379219ed9043ad121ab0b1e5f762f9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1743757824&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mzc1NzgyNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9taWNyb3NvZnQvdHJvY3ItYmFzZS1zdGFnZTEvODgxNjEyZWRmMzhmMGJkNjQ3ZjIyZTIyZmNlZDc3YTcxZjM3OTIxOWVkOTA0M2FkMTIxYWIwYjFlNWY3NjJmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=IzT-ysShdT26T%7ElSgiUMjOEfbW3yJ62Em5BIFeQVYkfCaFYfUOYO3l%7Eb2b

ValueError: Make sure to set the decoder_start_token_id attribute of the model's configuration.

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from tqdm import tqdm
import jiwer  # For accuracy metrics

# 1. Manual Memory Optimization
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# 2. Dataset Class
class OCRDataset(Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        image = Image.open(self.df.iloc[idx]['image']).convert("RGB").resize((256, 256))
        text = self.df.iloc[idx]['text']
        
        pixel_values = self.processor(
            images=image, 
            return_tensors="pt"
        ).pixel_values.squeeze()
        
        labels = self.processor.tokenizer(
            text,
            max_length=32,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze()
        
        return pixel_values, labels, text  # Return original text for accuracy calculation

# 3. Initialize Components
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

# Critical configuration for decoder
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# 4. Memory Config
model.config.use_cache = False
model.encoder.gradient_checkpointing = True
model.decoder.gradient_checkpointing = True
model = model.half().to('cuda')



Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [19]:
# 5. Training Setup
BATCH_SIZE = 1
GRAD_ACCUM = 8
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# 6. DataLoader
def collate_fn(batch):
    pixel_values = torch.stack([item[0] for item in batch]).to('cuda', dtype=torch.float16)
    labels = torch.stack([item[1] for item in batch]).to('cuda')
    texts = [item[2] for item in batch]  # Original texts
    return pixel_values, labels, texts

train_loader = DataLoader(
    OCRDataset(train_df, processor),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=True
)

# 7. Validation Loader (Add your val_df)
# Create validation subset of 50 samples
VAL_SUBSET_SIZE = 35
val_subset = val_df.iloc[:VAL_SUBSET_SIZE]  # First 50 validation samples

# Modified validation DataLoader
val_loader = DataLoader(
    OCRDataset(val_subset, processor),  # Use subset instead of full val_df
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=False
)


# 8. Metrics Calculation
def calculate_metrics(predictions, references):
    cer = jiwer.cer(references, predictions)
    wer = jiwer.wer(references, predictions)
    return cer, wer



In [22]:
# 9. Enhanced Training Loop (70 samples)
best_val_cer = float('inf')
all_metrics = []

# Create subsets
TRAIN_SUBSET_SIZE = 70
VAL_SUBSET_SIZE = 50

train_subset = train_df.iloc[:TRAIN_SUBSET_SIZE]
val_subset = val_df.iloc[:VAL_SUBSET_SIZE]

# Update DataLoaders
train_loader = DataLoader(
    OCRDataset(train_subset, processor),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=True
)

val_loader = DataLoader(
    OCRDataset(val_subset, processor),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=False
)

for epoch in range(3):
    # Training Phase
    model.train()
    optimizer.zero_grad()
    train_loss = 0
    
    epoch_progress = tqdm(
        enumerate(train_loader),  # Now using 70-sample loader
        total=len(train_loader),  # Automatically adjusted to 70/BATCH_SIZE
        desc=f"Epoch {epoch+1}/3 [Train]",
        unit="batch"
    )
    
    for batch_idx, (pixel_values, labels, _) in epoch_progress:
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss / GRAD_ACCUM
        loss.backward()
        
        train_loss += loss.item() * GRAD_ACCUM
        
        if (batch_idx + 1) % GRAD_ACCUM == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            del pixel_values, labels, outputs
            torch.cuda.empty_cache()
        
        epoch_progress.set_postfix({
            "loss": f"{train_loss/(batch_idx+1):.4f}",
            "lr": f"{optimizer.param_groups[0]['lr']:.2e}"
        })

    # Validation Phase (using 50 samples)
    model.eval()
    val_loss = 0
    all_preds = []
    all_texts = []
    
    with torch.no_grad():
        val_progress = tqdm(val_loader, desc=f"Epoch {epoch+1}/3 [Val]", unit="batch")
        for pixel_values, labels, texts in val_progress:
            outputs = model(pixel_values=pixel_values, labels=labels)
            val_loss += outputs.loss.item()
            
            generated_ids = model.generate(pixel_values)
            pred_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
            
            all_preds.extend(pred_texts)
            all_texts.extend(texts)
            
            del pixel_values, labels, outputs

    # Calculate metrics
    val_loss /= len(val_loader)
    cer, wer = calculate_metrics(all_preds, all_texts)
    
    # Store metrics
    epoch_metrics = {
        'epoch': epoch+1,
        'train_loss': train_loss/len(train_loader),
        'val_loss': val_loss,
        'cer': cer,
        'wer': wer
    }
    all_metrics.append(epoch_metrics)
    
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train Loss: {epoch_metrics['train_loss']:.4f}")
    print(f"Val Loss: {epoch_metrics['val_loss']:.4f}")
    print(f"Character Error Rate: {cer*100:.2f}%")
    print(f"Word Error Rate: {wer*100:.2f}%")
    
    if cer < best_val_cer:
        best_val_cer = cer
        torch.save(model.state_dict(), f"best_model_70samples_epoch{epoch+1}.pt")
        print(f"New best model saved (CER: {cer*100:.2f}%)")

print("\n=== Training Complete ===")
print(f"Best Validation CER: {min(m['cer'] for m in all_metrics)*100:.2f}%")
print(f"Final Validation CER: {all_metrics[-1]['cer']*100:.2f}%")


Epoch 1/3 [Train]:   0%|          | 0/70 [00:00<?, ?batch/s]

Epoch 1/3 [Train]: 100%|██████████| 70/70 [02:03<00:00,  1.77s/batch, loss=nan, lr=1.00e-05]
Epoch 1/3 [Val]: 100%|██████████| 50/50 [02:55<00:00,  3.52s/batch]



Epoch 1 Summary:
Train Loss: nan
Val Loss: nan
Character Error Rate: 100.00%
Word Error Rate: 100.00%
New best model saved (CER: 100.00%)


Epoch 2/3 [Train]: 100%|██████████| 70/70 [02:04<00:00,  1.79s/batch, loss=nan, lr=1.00e-05]
Epoch 2/3 [Val]: 100%|██████████| 50/50 [03:00<00:00,  3.60s/batch]



Epoch 2 Summary:
Train Loss: nan
Val Loss: nan
Character Error Rate: 100.00%
Word Error Rate: 100.00%


Epoch 3/3 [Train]: 100%|██████████| 70/70 [02:01<00:00,  1.74s/batch, loss=nan, lr=1.00e-05]
Epoch 3/3 [Val]: 100%|██████████| 50/50 [02:59<00:00,  3.58s/batch]


Epoch 3 Summary:
Train Loss: nan
Val Loss: nan
Character Error Rate: 100.00%
Word Error Rate: 100.00%

=== Training Complete ===
Best Validation CER: 100.00%
Final Validation CER: 100.00%



