In [None]:
# Mount Google Drive so the files under /content/drive/ become readable.
# This relies on the google.colab package, i.e. it is meant to run inside Colab.
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [None]:
import os
# Sanity check: list the top level of the mounted Drive (the dataset paths used
# below all live under the 'ocr_dataset' folder).
os.listdir('/content/drive/MyDrive/')


['ocr_dataset']

In [None]:
# Paths to the dataset: image folders of the easy, hard and bonus splits on the mounted Drive.
# NOTE(review): the later cells visible in this notebook define their own
# *_image_dir variables and never read these names — confirm this cell is still needed.
easy_images_path = '/content/drive/MyDrive/ocr_dataset/easy/images/'
hard_images_path = '/content/drive/MyDrive/ocr_dataset/hard/images/'
bonus_images_path = '/content/drive/MyDrive/ocr_dataset/bonus/images/'


In [None]:
import os
import random
import pandas as pd

# Global debug flag: when True, print the resolved paths and a preview of each labels file.
DEBUG = True

# How many images to draw per word from EACH difficulty set, and the seed that
# makes the draw repeatable.
SAMPLES_PER_WORD = 50
SAMPLE_SEED = 42

# Define directories for Easy and Hard datasets
easy_image_dir = '/content/drive/MyDrive/ocr_dataset/easy/images/'
hard_image_dir = '/content/drive/MyDrive/ocr_dataset/hard/images/'

if DEBUG:
    print(f"[DEBUG] Easy image directory: {easy_image_dir}")
    print(f"[DEBUG] Hard image directory: {hard_image_dir}")

# Define CSV files for easy and hard labels
easy_labels_file = '/content/drive/MyDrive/ocr_dataset/easy/labels.csv'
hard_labels_file = '/content/drive/MyDrive/ocr_dataset/hard/labels.csv'

if DEBUG:
    print(f"[DEBUG] Easy labels file: {easy_labels_file}")
    print(f"[DEBUG] Hard labels file: {hard_labels_file}")

# Load the CSV files (columns: 'filename', 'text')
easy_labels = pd.read_csv(easy_labels_file)
hard_labels = pd.read_csv(hard_labels_file)

if DEBUG:
    print(f"[DEBUG] Loaded easy_labels with shape: {easy_labels.shape}")
    print("[DEBUG] First 5 rows of easy_labels:")
    print(easy_labels.head())
    print(f"[DEBUG] Loaded hard_labels with shape: {hard_labels.shape}")
    print("[DEBUG] First 5 rows of hard_labels:")
    print(hard_labels.head())

# The vocabulary is taken from the Easy set; the Hard set is expected to contain the same words.
unique_words = easy_labels['text'].unique().tolist()
print(f"[DEBUG] Unique words: {len(unique_words)} words found.")


def _sample_word_rows(labels, word, set_name):
    """Return up to SAMPLES_PER_WORD randomly chosen rows of `labels` whose text is `word`.

    DataFrame.sample raises ValueError when asked for more rows than exist, so the
    request is capped at the number of available rows and a warning is printed
    whenever a word is under-represented (or absent) in the given set.
    """
    rows = labels[labels['text'] == word]
    n_take = min(SAMPLES_PER_WORD, len(rows))
    if n_take < SAMPLES_PER_WORD:
        print(f"[WARNING] Only {n_take} {set_name} images available for '{word}' "
              f"(wanted {SAMPLES_PER_WORD}).")
    if n_take == 0:
        return rows
    return rows.sample(n=n_take, random_state=SAMPLE_SEED)


# Collected {'img_path', 'label'} records; for each word the Easy samples come
# first, followed by the Hard samples.
all_data = []

for word in unique_words:
    label = word.lower()  # labels are stored in lowercase
    for set_name, labels, image_dir in (('easy', easy_labels, easy_image_dir),
                                        ('hard', hard_labels, hard_image_dir)):
        for sample in _sample_word_rows(labels, word, set_name).itertuples():
            img_path = os.path.join(image_dir, sample.filename)
            all_data.append({'img_path': img_path, 'label': label})

# Convert the list of data into a DataFrame
final_dataset = pd.DataFrame(all_data)

# Save the final dataset to a new CSV file
classification_csv_path = '/content/drive/MyDrive/ocr_dataset/ocr_classification.csv'
final_dataset.to_csv(classification_csv_path, index=False)

print(f"Generated classification CSV file: {classification_csv_path}")


[DEBUG] Easy image directory: /content/drive/MyDrive/ocr_dataset/easy/images/
[DEBUG] Hard image directory: /content/drive/MyDrive/ocr_dataset/hard/images/
[DEBUG] Easy labels file: /content/drive/MyDrive/ocr_dataset/easy/labels.csv
[DEBUG] Hard labels file: /content/drive/MyDrive/ocr_dataset/hard/labels.csv
[DEBUG] Loaded easy_labels with shape: (5000, 2)
[DEBUG] First 5 rows of easy_labels:
                 filename       text
0  easy_0_variation_0.png  algorithm
1  easy_0_variation_1.png  algorithm
2  easy_0_variation_2.png  algorithm
3  easy_0_variation_3.png  algorithm
4  easy_0_variation_4.png  algorithm
[DEBUG] Loaded hard_labels with shape: (5000, 2)
[DEBUG] First 5 rows of hard_labels:
                 filename       text
0  hard_0_variation_0.png  algorithm
1  hard_0_variation_1.png  algorithm
2  hard_0_variation_2.png  algorithm
3  hard_0_variation_3.png  algorithm
4  hard_0_variation_4.png  algorithm
[DEBUG] Unique words: 100 words found.
Generated classification CSV file: 

# Task 1 (V4)


In [None]:
import os
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from sklearn.model_selection import KFold

# Set the global debug flag.
# When True, OCRDataset prints a preview of the CSV and details of the first few
# items, and train()/evaluate() print information about their first batch.
DEBUG = False

# -------------------------------
# Dataset Definition with Debugging Checks
# -------------------------------
class OCRDataset(Dataset):
    """CSV-backed image classification dataset yielding ``(image, label_index)`` pairs."""

    def __init__(self, csv_file, transform=None, label2idx=None):
        """
        Args:
            csv_file (str): CSV with one row per sample and the columns 'img_path' and 'label'.
            transform (callable, optional): Applied to every loaded PIL image.
            label2idx (dict, optional): Label string -> class index. When omitted (or empty),
                labels are converted with ``int()`` instead.

        Construction only reports problems (unmapped labels, missing image files)
        by printing them; it never raises for them.
        """
        frame = pd.read_csv(csv_file)
        self.data = frame
        if DEBUG:
            print(f"[DEBUG] Loaded CSV file '{csv_file}' with {len(self.data)} samples.")
            print("[DEBUG] First 5 rows of CSV:")
            print(self.data.head())
        self.transform = transform
        self.label2idx = label2idx

        # Cross-check the CSV labels against the supplied mapping.
        if label2idx:
            unmapped = set(frame['label'].unique()) - set(label2idx.keys())
            if unmapped:
                print(f"[ERROR] The following labels are missing in label2idx mapping: {unmapped}")
            elif DEBUG:
                print("[DEBUG] All labels in the CSV are present in the label2idx mapping.")

        # Up-front pass over every row: warn about image paths that are not on disk.
        for _, record in frame.iterrows():
            candidate = record['img_path']
            if not os.path.exists(candidate):
                print(f"[WARNING] Image file does not exist: {candidate}")

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample `idx` and return ``(image, label)``.

        Raises:
            FileNotFoundError: the image path of the row does not exist.
            ValueError: a mapping was supplied but does not contain the row's label.
        """
        record = self.data.iloc[idx]
        img_path = record['img_path']
        label_str = record['label']

        # Diagnostics are limited to the first five indices to keep the log short.
        verbose = DEBUG and idx < 5
        if verbose:
            print(f"[DEBUG] __getitem__: Index {idx}: img_path='{img_path}', label='{label_str}'")

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"[ERROR] Image file not found: {img_path}")

        # Always decode as RGB; any grayscale conversion is left to the transform.
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
            if verbose:
                if isinstance(image, torch.Tensor):
                    print(f"[DEBUG] __getitem__: Transformed image shape (index {idx}): {image.shape}")
                else:
                    print(f"[DEBUG] __getitem__: Transformed image size (index {idx}): {image.size}")

        # Translate the label: via the mapping when there is one, else parse it as an int.
        mapping = self.label2idx
        if not mapping:
            label = int(label_str)
        elif label_str in mapping:
            label = mapping[label_str]
        else:
            raise ValueError(f"[ERROR] Label '{label_str}' not found in label2idx mapping!")

        if verbose:
            print(f"[DEBUG] __getitem__: Mapped label '{label_str}' -> {label}")

        return image, label

# -------------------------------
# Small CNN Model (MNIST-Style for 32x128 Inputs) with Debug Prints
# -------------------------------
class SmallCNN(nn.Module):
    """Two conv/pool stages followed by two fully connected layers, for 1-channel images."""

    def __init__(self, num_classes, input_height=32, input_width=128):
        """
        Args:
            num_classes (int): Size of the output layer (number of classes).
            input_height (int): Height of the input images. Defaults to 32.
            input_width (int): Width of the input images. Defaults to 128.

        The defaults reproduce the original fixed (1, 32, 128) configuration.
        The size of the first fully connected layer is derived from the input
        size instead of being hard-coded, so other resolutions work as well.

        Raises:
            ValueError: if either spatial dimension is smaller than 4 (the two
                2x2 poolings would leave no features).
        """
        super(SmallCNN, self).__init__()
        if input_height < 4 or input_width < 4:
            raise ValueError(
                f"input size must be at least 4x4, got {input_height}x{input_width}")
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)   # (16, H, W)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)                             # (16, H/2, W/2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)   # (32, H/2, W/2)
        self.pool2 = nn.MaxPool2d(2, 2)                            # (32, H/4, W/4)
        # Each 2x2 max-pool floors the size, so two of them give H // 4 and W // 4.
        flat_features = 32 * (input_height // 4) * (input_width // 4)
        self.fc1 = nn.Linear(flat_features, 128)  # 8192 inputs for the default 32x128
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch of shape (N, 1, H, W) to class logits of shape (N, num_classes)."""
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.view(x.size(0), -1)  # Flatten everything but the batch dimension
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# -------------------------------
# Training and Evaluation Functions with Debug Checks
# -------------------------------
def train(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over `loader`.

    Puts the model in training mode, performs one optimizer step per batch and
    returns the summed per-sample loss divided by ``len(loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0
    for step, batch in enumerate(loader):
        images, labels = (tensor.to(device) for tensor in batch)
        if DEBUG and step == 0:
            print(f"[DEBUG] Training batch 0: images.shape = {images.shape}, labels = {labels}")
        optimizer.zero_grad()
        batch_loss = criterion(model(images), labels)
        batch_loss.backward()
        optimizer.step()
        # criterion returns a batch mean; weight it by the batch size before summing.
        loss_sum += batch_loss.item() * images.size(0)
    return loss_sum / len(loader.dataset)

def evaluate(model, loader, criterion, device):
    """Score the model on `loader` without updating it.

    Returns ``(mean_loss, accuracy)``, both averaged over the number of samples
    actually seen while iterating the loader.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for step, batch in enumerate(loader):
            images, labels = (tensor.to(device) for tensor in batch)
            logits = model(images)
            loss_sum += criterion(logits, labels).item() * images.size(0)
            preds = logits.max(dim=1).indices  # predicted class = highest logit
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)
            if DEBUG and step == 0:
                print(f"[DEBUG] Evaluation batch 0: preds = {preds[:5]}, labels = {labels[:5]}")
    return loss_sum / n_seen, n_correct / n_seen

# -------------------------------
# Main Execution with K-Fold Cross Validation and Debugging
# -------------------------------
def main():
    """Train and validate SmallCNN on the word-classification CSV with K-fold cross validation.

    Builds the label->index mapping from the CSV, trains a fresh model for every
    fold, prints the loss/accuracy after each epoch, and finally prints the
    last-epoch metrics of every fold together with their mean.
    """
    # Path to your CSV file (update the path if needed)
    csv_file = '/content/drive/MyDrive/ocr_dataset/ocr_classification.csv'

    # Seed PyTorch so weight initialisation and DataLoader shuffling are
    # repeatable from run to run (the fold split itself is already seeded below).
    torch.manual_seed(42)

    # Read CSV and build a label-to-index mapping (order of first appearance).
    df = pd.read_csv(csv_file)
    unique_labels = df['label'].unique()
    label2idx = {label: idx for idx, label in enumerate(unique_labels)}
    num_classes = len(unique_labels)
    print(f"[DEBUG] Number of classes found: {num_classes}")
    print(f"[DEBUG] Label to index mapping:\n{label2idx}")

    # Transformation pipeline: single-channel, resized to 32x128 (height, width),
    # scaled to [-1, 1].
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 128)),  # (height, width)
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Create the dataset.
    dataset = OCRDataset(csv_file, transform=transform, label2idx=label2idx)
    print(f"[DEBUG] Dataset has {len(dataset)} samples.")

    # Training hyperparameters.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"[DEBUG] Using device: {device}")
    num_epochs = 20
    batch_size = 16
    learning_rate = 0.001
    k_folds = 5

    # Set up K-Fold Cross Validation.
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_results = {}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"\n--- Fold {fold+1}/{k_folds} ---")
        print(f"[DEBUG] Training indices: {train_idx[:10]} ... (total {len(train_idx)})")
        print(f"[DEBUG] Validation indices: {val_idx[:10]} ... (total {len(val_idx)})")

        # Create subsets for this fold.
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

        # Every fold starts from a freshly initialised model and optimizer.
        model = SmallCNN(num_classes=num_classes).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(num_epochs):
            train_loss = train(model, train_loader, criterion, optimizer, device)
            val_loss, val_acc = evaluate(model, val_loader, criterion, device)
            print(f"[DEBUG] Fold {fold+1}, Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Keep the metrics of the last epoch as this fold's result.
        fold_results[fold] = {'val_loss': val_loss, 'val_acc': val_acc}

    print("\n[DEBUG] Cross Validation Results:")
    for fold, metrics in fold_results.items():
        # Report folds 1-based, matching the "--- Fold i/k ---" headers above.
        print(f"Fold {fold + 1}: {metrics}")
    if fold_results:
        n_folds = len(fold_results)
        mean_loss = sum(m['val_loss'] for m in fold_results.values()) / n_folds
        mean_acc = sum(m['val_acc'] for m in fold_results.values()) / n_folds
        print(f"Mean over {n_folds} folds - Val Loss: {mean_loss:.4f}, Val Acc: {mean_acc:.4f}")

if __name__ == '__main__':
    main()


[DEBUG] Number of classes found: 100
[DEBUG] Label to index mapping:
{'algorithm': 0, 'biology': 1, 'cryptocurrency': 2, 'dichotomy': 3, 'encyclopedia': 4, 'flabbergasted': 5, 'gregarious': 6, 'hypothesis': 7, 'ineffable': 8, 'juxtaposition': 9, 'kinematics': 10, 'laryngitis': 11, 'metamorphosis': 12, 'neurology': 13, 'ophthalmology': 14, 'photosynthesis': 15, 'quadrilateral': 16, 'rhythm': 17, 'saccharine': 18, 'taxonomy': 19, 'ubiquitous': 20, 'vocabulary': 21, 'wavelength': 22, 'xenophobia': 23, 'youthful': 24, 'zephyr': 25, 'abbreviation': 26, 'benevolent': 27, 'conundrum': 28, 'doppelgänger': 29, 'esoteric': 30, 'facetious': 31, 'hierarchy': 32, 'idiosyncrasy': 33, 'juxtapositions': 34, 'knapsack': 35, 'lexicography': 36, 'mnemonic': 37, 'onomatopoeia': 38, 'paradigm': 39, 'quagmire': 40, 'resilience': 41, 'serendipity': 42, 'tangential': 43, 'vicarious': 44, 'whimsical': 45, 'xenon': 46, 'yacht': 47, 'zealous': 48, 'axiomatic': 49, 'blasé': 50, 'camaraderie': 51, 'decipher': 52, 

# Task 2

In [None]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from sklearn.model_selection import KFold
from torch.cuda.amp import autocast, GradScaler

# -------------------------------
# Global Debug Flag
# -------------------------------
# When True, OCRDataset prints a preview of the CSV and the first few encoded samples.
DEBUG = False

# -------------------------------
# Global Definitions: Character Mapping
# -------------------------------
# Alphabet: ASCII lowercase and uppercase letters, followed by the accented
# letters that occur in the dataset's words ('doppelgänger', 'blasé'). Characters
# missing from this string are silently left out of the CTC targets, so every
# character that appears in a label must be listed here. The accented letters
# are appended at the end so the indices of the ASCII letters do not change.
# NOTE(review): assumes labels store accented letters as single precomposed
# code points (NFC) — confirm against the CSV.
alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + "äé"
# For CTCLoss, index 0 is reserved for the blank token, so characters start at 1.
char2idx = {char: i + 1 for i, char in enumerate(alphabet)}
idx2char = {i + 1: char for i, char in enumerate(alphabet)}
num_classes = len(alphabet) + 1  # Plus one for the blank

# -------------------------------
# OCR Dataset for CRNN (Converts each label into a sequence of character indices)
# -------------------------------
class OCRDataset(Dataset):
    """CSV-backed OCR dataset yielding ``(image, target)`` pairs for CTC training.

    ``target`` is the label word encoded as a list of character indices taken
    from the module-level ``char2idx`` mapping.
    """

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (str): Path to the CSV file with columns 'img_path' and 'label'
            transform (callable, optional): Transformations to apply to the image.

        Construction prints a warning for every image path that does not exist
        and one warning listing all label characters that are absent from
        ``char2idx`` (those characters are left out of the targets).
        """
        self.data = pd.read_csv(csv_file)
        if DEBUG:
            print(f"[DEBUG] Loaded CSV file '{csv_file}' with {len(self.data)} samples.")
            print("[DEBUG] First 5 rows of CSV:")
            print(self.data.head())
        self.transform = transform

        # Warn if any image file does not exist.
        for path in self.data['img_path']:
            if not os.path.exists(path):
                print(f"[WARNING] Image file does not exist: {path}")

        # Characters outside the alphabet are dropped in __getitem__, which
        # shortens the training target without any other sign. Surface them once here.
        unknown = sorted({
            char
            for label in self.data['label'].astype(str)
            for char in label.strip()
            if char not in char2idx
        })
        if unknown:
            print(f"[WARNING] Characters not in the alphabet will be dropped from CTC targets: {unknown}")

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample `idx`; returns ``(image, target)`` with `target` a list of ints.

        Raises:
            FileNotFoundError: the image path of the row does not exist.
        """
        row = self.data.iloc[idx]
        img_path = row['img_path']
        label_str = row['label'].strip()
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"[ERROR] Image file not found: {img_path}")
        # Load the image in grayscale.
        image = Image.open(img_path).convert('L')
        if self.transform:
            image = self.transform(image)
        # Encode the word character by character; characters without an index are skipped.
        target = [char2idx[c] for c in label_str if c in char2idx]
        if DEBUG and idx < 5:
            print(f"[DEBUG] __getitem__: Index {idx}: img_path='{img_path}', label='{label_str}', target={target}")
        return image, target

# -------------------------------
# Collate Function for Variable-Length Targets
# -------------------------------
def ocr_collate_fn(batch):
    """
    Turn a list of (image, target) samples into the three tensors CTCLoss works with:
    the stacked images, every target sequence joined into one flat 1D tensor,
    and the length of each individual target.
    """
    images, targets = zip(*batch)
    lengths = [len(seq) for seq in targets]
    encoded = [torch.tensor(seq, dtype=torch.long) for seq in targets]
    stacked = torch.stack(images, 0)
    return stacked, torch.cat(encoded), torch.tensor(lengths, dtype=torch.long)

# -------------------------------
# CRNN Model Definition
# -------------------------------
class CRNN(nn.Module):
    def __init__(self, img_height, num_channels, num_classes, rnn_hidden=256, rnn_layers=2):
        """
        Convolutional-recurrent network for reading text from an image.

        Args:
            img_height (int): Height of the input image (accepted but not used by the layers).
            num_channels (int): Channels of the input image (1 for grayscale).
            num_classes (int): Size of the output distribution, blank token included.
            rnn_hidden (int): Hidden size of each LSTM direction.
            rnn_layers (int): Number of stacked LSTM layers.
        """
        super().__init__()
        feature_channels = 128
        conv_stack = [
            nn.Conv2d(num_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # halves height and width
            nn.Conv2d(64, feature_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # halves height and width again
        ]
        self.cnn = nn.Sequential(*conv_stack)
        # Average over the remaining height so every image column becomes one feature vector.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, None))
        self.rnn = nn.LSTM(
            input_size=feature_channels,
            hidden_size=rnn_hidden,
            num_layers=rnn_layers,
            bidirectional=True,
        )
        # Forward and backward LSTM states are concatenated, hence 2 * rnn_hidden.
        self.fc = nn.Linear(2 * rnn_hidden, num_classes)

    def forward(self, x):
        """Return per-time-step log-probabilities of shape (W', batch, num_classes)."""
        columns = self.adaptive_pool(self.cnn(x)).squeeze(2)   # (batch, 128, W')
        sequence, _ = self.rnn(columns.permute(2, 0, 1))       # width acts as the time axis
        logits = self.fc(sequence)                             # (W', batch, num_classes)
        return logits.log_softmax(2)

# -------------------------------
# Training and Evaluation Functions (with Mixed Precision)
# -------------------------------
def train_epoch(model, device, dataloader, criterion, optimizer, scaler):
    """Train the model for one epoch with CTC loss and (on CUDA) mixed precision.

    Args:
        model: Network whose output has shape (T, batch, num_classes).
        device: Device the batches are moved to.
        dataloader: Yields (images, concatenated_targets, target_lengths) batches.
        criterion: Loss called as criterion(output, targets, input_lengths, target_lengths).
        optimizer: Optimizer stepped once per batch through `scaler`.
        scaler: GradScaler used to scale the loss and step the optimizer.

    Returns:
        float: Sum of (batch loss * batch size) divided by ``len(dataloader.dataset)``.
    """
    model.train()
    epoch_loss = 0.0
    for images, targets, target_lengths in dataloader:
        images = images.to(device)
        targets = targets.to(device)
        target_lengths = target_lengths.to(device)

        optimizer.zero_grad()
        # Only request autocast when the batch actually lives on a CUDA device.
        with autocast(enabled=images.is_cuda):
            output = model(images)  # shape: (T, batch, num_classes)
            T, batch_size, _ = output.size()
            # All outputs have the same time length T.
            input_lengths = torch.full((batch_size,), T, dtype=torch.long, device=device)
            loss = criterion(output, targets, input_lengths, target_lengths)
        scaler.scale(loss).backward()
        # (Optional) Gradient clipping can be added here; with a GradScaler the
        # gradients must be unscaled first:
        # scaler.unscale_(optimizer)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item() * images.size(0)
    return epoch_loss / len(dataloader.dataset)

def evaluate_epoch(model, device, dataloader, criterion):
    """Compute the validation loss and decode a prediction for every sample.

    Returns:
        tuple: (summed batch-size-weighted loss divided by ``len(dataloader.dataset)``,
        list of decoded strings in the order the loader produced the samples).
    """
    model.eval()
    loss_sum = 0.0
    decoded = []
    with torch.no_grad():
        for batch in dataloader:
            images, targets, target_lengths = (tensor.to(device) for tensor in batch)
            log_probs = model(images)
            steps, n_items, _ = log_probs.size()
            # Every sequence in the batch spans the full `steps` time steps.
            input_lengths = torch.full(size=(n_items,), fill_value=steps, dtype=torch.long).to(device)
            batch_loss = criterion(log_probs, targets, input_lengths, target_lengths)
            loss_sum += batch_loss.item() * images.size(0)
            decoded.extend(decode_predictions(log_probs, blank=0))
    return loss_sum / len(dataloader.dataset), decoded

def decode_predictions(output, blank=0):
    """Greedy CTC decoding of a (T, batch, num_classes) score tensor into strings.

    For every sample the best class per time step is taken; blanks and
    immediate repeats are dropped and the rest is mapped through ``idx2char``
    (indices without an entry contribute an empty string).
    """
    scores = output.cpu().detach().numpy()
    _, n_items, _ = scores.shape
    best_path = scores.argmax(axis=2)  # (T, batch): winning class at each step
    decoded = []
    for item in range(n_items):
        chars = []
        last = blank
        for idx in best_path[:, item]:
            is_new_symbol = idx != blank and idx != last
            if is_new_symbol:
                chars.append(idx2char.get(idx, ''))
            last = idx
        decoded.append(''.join(chars))
    return decoded

# -------------------------------
# Main Execution with K-Fold Cross Validation
# -------------------------------
def main():
    """Train the CRNN with CTC loss under K-fold cross validation.

    For every fold a fresh model is trained for `num_epochs` epochs; after the
    last epoch a few ground-truth/prediction pairs are printed. The last-epoch
    validation loss and exact-match word accuracy of each fold are printed at the end.
    """
    csv_file = '/content/drive/MyDrive/ocr_dataset/ocr_classification.csv'

    # Seed PyTorch so weight initialisation and DataLoader shuffling are
    # repeatable from run to run (the fold split itself is already seeded below).
    torch.manual_seed(42)

    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 128)),  # (height, width)
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Read the CSV file for ground truth display.
    df = pd.read_csv(csv_file)
    dataset = OCRDataset(csv_file, transform=transform)
    print(f"[DEBUG] Dataset has {len(dataset)} samples.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"[DEBUG] Using device: {device}")

    num_epochs = 20
    batch_size = 32
    learning_rate = 0.001
    k_folds = 5

    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_results = {}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"\n--- Fold {fold+1}/{k_folds} ---")
        print(f"[DEBUG] Training samples: {len(train_idx)}, Validation samples: {len(val_idx)}")

        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, collate_fn=ocr_collate_fn)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=ocr_collate_fn)

        model = CRNN(img_height=32, num_channels=1, num_classes=num_classes,
                     rnn_hidden=256, rnn_layers=2).to(device)
        criterion = nn.CTCLoss(blank=0, zero_infinity=True)
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        # One scaler per fold so the loss-scale state of a previous model does not
        # carry over; loss scaling is switched off when no GPU is in use.
        scaler = GradScaler(enabled=(device.type == 'cuda'))

        for epoch in range(num_epochs):
            train_loss = train_epoch(model, device, train_loader, criterion, optimizer, scaler)
            val_loss, val_preds = evaluate_epoch(model, device, val_loader, criterion)
            print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

            if epoch == num_epochs - 1:  # Last epoch: display sample predictions
                # Never index past the end of a small validation fold.
                n_show = min(10, len(val_preds))
                print(f"Sample Predictions (Ground Truth vs Predicted) for first {n_show} validation samples:")
                for i in range(n_show):
                    # The validation loader is not shuffled, so prediction i
                    # belongs to CSV row val_idx[i].
                    true_label = df.iloc[val_idx[i]]['label']
                    print(f"True: {true_label}, Pred: {val_preds[i]}")

        # Exact-match word accuracy of the last epoch's predictions. A label that
        # contains characters missing from the alphabet can never be matched.
        true_labels = df.iloc[val_idx]['label'].str.strip().tolist()
        n_exact = sum(pred == truth for pred, truth in zip(val_preds, true_labels))
        word_acc = n_exact / len(true_labels) if true_labels else 0.0
        fold_results[fold] = {'val_loss': val_loss, 'val_word_acc': word_acc}

    print("\n[DEBUG] Cross Validation Results:")
    for fold, metrics in fold_results.items():
        # Report folds 1-based, matching the "--- Fold i/k ---" headers above.
        print(f"Fold {fold + 1}: {metrics}")

if __name__ == '__main__':
    main()


[DEBUG] Dataset has 10000 samples.
[DEBUG] Using device: cuda

--- Fold 1/5 ---
[DEBUG] Training samples: 8000, Validation samples: 2000


  scaler = GradScaler()
  with autocast():


Epoch 1/20: Train Loss: 3.3454, Val Loss: 3.0360
Epoch 2/20: Train Loss: 2.9535, Val Loss: 2.8694
Epoch 3/20: Train Loss: 2.7914, Val Loss: 2.7259
Epoch 4/20: Train Loss: 2.6846, Val Loss: 2.6544
Epoch 5/20: Train Loss: 2.5892, Val Loss: 2.5390
Epoch 6/20: Train Loss: 2.5084, Val Loss: 2.4744
Epoch 7/20: Train Loss: 2.4354, Val Loss: 2.3934
Epoch 8/20: Train Loss: 2.3793, Val Loss: 2.3767
Epoch 9/20: Train Loss: 2.3313, Val Loss: 2.3189
Epoch 10/20: Train Loss: 2.2980, Val Loss: 2.3048
Epoch 11/20: Train Loss: 2.2642, Val Loss: 2.2654
Epoch 12/20: Train Loss: 2.2225, Val Loss: 2.2332
Epoch 13/20: Train Loss: 2.1685, Val Loss: 2.1518
Epoch 14/20: Train Loss: 2.1105, Val Loss: 2.0827
Epoch 15/20: Train Loss: 2.0362, Val Loss: 2.0050
Epoch 16/20: Train Loss: 1.9619, Val Loss: 1.9276
Epoch 17/20: Train Loss: 1.8575, Val Loss: 1.8599
Epoch 18/20: Train Loss: 1.7450, Val Loss: 1.6958
Epoch 19/20: Train Loss: 1.6030, Val Loss: 1.5867
Epoch 20/20: Train Loss: 1.4394, Val Loss: 1.4084
Sample Pr

# Task 3


In [None]:
import os
import pandas as pd

# Global debug flag: when True, print the resolved paths and a preview of the labels file.
DEBUG = True

# How many images to draw per word, and the seed that makes the draw repeatable.
SAMPLES_PER_WORD = 50
SAMPLE_SEED = 42

# Define directory and labels file for Bonus dataset
bonus_image_dir = '/content/drive/MyDrive/ocr_dataset/bonus/images/'
bonus_labels_file = '/content/drive/MyDrive/ocr_dataset/bonus/labels.csv'

if DEBUG:
    print(f"[DEBUG] Bonus image directory: {bonus_image_dir}")
    print(f"[DEBUG] Bonus labels file: {bonus_labels_file}")

# Load the CSV file (columns: 'filename', 'text')
bonus_labels = pd.read_csv(bonus_labels_file)

if DEBUG:
    print(f"[DEBUG] Loaded bonus_labels with shape: {bonus_labels.shape}")
    print("[DEBUG] First 5 rows of bonus_labels:")
    print(bonus_labels.head())

# Vocabulary of the Bonus set, in order of first appearance.
unique_words = bonus_labels['text'].unique().tolist()
print(f"[DEBUG] Unique words: {len(unique_words)} words found.")

# Collected {'img_path', 'label'} records.
all_data = []

# For each unique word, select up to SAMPLES_PER_WORD random images from the Bonus set.
for word in unique_words:
    word_rows = bonus_labels[bonus_labels['text'] == word]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request and report words that are under-represented.
    n_take = min(SAMPLES_PER_WORD, len(word_rows))
    if n_take < SAMPLES_PER_WORD:
        print(f"[WARNING] Only {n_take} bonus images available for '{word}' "
              f"(wanted {SAMPLES_PER_WORD}).")
    bonus_samples = word_rows.sample(n=n_take, random_state=SAMPLE_SEED)

    # Add the img_path and label pairs to the all_data list
    for sample in bonus_samples.itertuples():
        img_path = os.path.join(bonus_image_dir, sample.filename)
        all_data.append({'img_path': img_path, 'label': word.lower()})  # Store label in lowercase

# Convert the list of data into a DataFrame
final_dataset = pd.DataFrame(all_data)

# Save the final dataset to a new CSV file
classification_csv_path = '/content/drive/MyDrive/ocr_dataset/ocr_bonus.csv'
final_dataset.to_csv(classification_csv_path, index=False)

print(f"Generated classification CSV file: {classification_csv_path}")


[DEBUG] Bonus image directory: /content/drive/MyDrive/ocr_dataset/bonus/images/
[DEBUG] Bonus labels file: /content/drive/MyDrive/ocr_dataset/bonus/labels.csv
[DEBUG] Loaded bonus_labels with shape: (5000, 2)
[DEBUG] First 5 rows of bonus_labels:
                  filename       text
0  bonus_0_variation_0.png  algorithm
1  bonus_0_variation_1.png  algorithm
2  bonus_0_variation_2.png  algorithm
3  bonus_0_variation_3.png  algorithm
4  bonus_0_variation_4.png  algorithm
[DEBUG] Unique words: 100 words found.
Generated classification CSV file: /content/drive/MyDrive/ocr_dataset/ocr_bonus.csv


In [None]:
import os
import random
import pandas as pd
from PIL import Image
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import KFold
from torch.cuda.amp import autocast, GradScaler

# -------------------------------
# Global Debug Flag & Definitions
# -------------------------------
# When True, OCRDataset prints a preview of its data and the first few encoded samples.
DEBUG = True

# Alphabet: ASCII lowercase and uppercase letters, followed by the accented
# letters that occur in the dataset's words ('doppelgänger', 'blasé'). Characters
# missing from this string are silently left out of the CTC targets, so every
# character that appears in a label must be listed here. The accented letters
# are appended at the end so the indices of the ASCII letters do not change.
# NOTE(review): assumes labels store accented letters as single precomposed
# code points (NFC) — confirm against the CSV.
alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + "äé"
# For CTCLoss, index 0 is reserved for the blank token, so characters start at 1.
char2idx = {char: i + 1 for i, char in enumerate(alphabet)}
idx2char = {i + 1: char for i, char in enumerate(alphabet)}
num_classes = len(alphabet) + 1  # plus one for the blank token

# -------------------------------
# Custom OCR Dataset (Accepts a DataFrame or CSV file)
# -------------------------------
class OCRDataset(Dataset):
    """OCR dataset (from a CSV file or a DataFrame) yielding ``(image, target)`` pairs.

    ``target`` is the label word encoded as a list of character indices taken
    from the module-level ``char2idx`` mapping.
    """

    def __init__(self, csv_file=None, df=None, transform=None):
        """
        Args:
            csv_file (str): Path to the CSV file with columns 'img_path' and 'label'
            df (pd.DataFrame): Alternatively, pass a DataFrame with the same columns.
                It takes precedence over `csv_file` and is stored without copying.
            transform (callable, optional): Transformations to apply to the image.

        Raises:
            ValueError: if neither `csv_file` nor `df` is given.

        Construction prints a warning for every image path that does not exist
        and one warning listing all label characters that are absent from
        ``char2idx`` (those characters are left out of the targets).
        """
        if df is not None:
            self.data = df
        elif csv_file is not None:
            self.data = pd.read_csv(csv_file)
        else:
            raise ValueError("Must provide csv_file or df")

        if DEBUG:
            print(f"[DEBUG] Loaded data with {len(self.data)} samples.")
            print("[DEBUG] First 5 rows:")
            print(self.data.head())

        self.transform = transform

        # Warn if any image file does not exist.
        for path in self.data['img_path']:
            if not os.path.exists(path):
                print(f"[WARNING] Image file does not exist: {path}")

        # Characters outside the alphabet are dropped in __getitem__, which
        # shortens the training target without any other sign. Surface them once here.
        unknown = sorted({
            char
            for label in self.data['label'].astype(str)
            for char in label.strip()
            if char not in char2idx
        })
        if unknown:
            print(f"[WARNING] Characters not in the alphabet will be dropped from CTC targets: {unknown}")

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample `idx`; returns ``(image, target)`` with `target` a list of ints.

        Raises:
            FileNotFoundError: the image path of the row does not exist.
        """
        row = self.data.iloc[idx]
        img_path = row['img_path']
        label_str = row['label'].strip()
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"[ERROR] Image file not found: {img_path}")
        # Load the image in grayscale.
        image = Image.open(img_path).convert('L')
        if self.transform:
            image = self.transform(image)
        # Encode the word character by character; characters without an index are skipped.
        target = [char2idx[c] for c in label_str if c in char2idx]
        if DEBUG and idx < 5:
            print(f"[DEBUG] __getitem__ idx {idx}: path='{img_path}', label='{label_str}', target={target}")
        return image, target

# -------------------------------
# Collate Function for Variable-Length Targets
# -------------------------------
def ocr_collate_fn(batch):
    """
    Batch builder for CTCLoss: returns the stacked images, all target sequences
    joined end to end in a single 1D tensor, and a tensor with the length of
    each original target.
    """
    images, targets = zip(*batch)
    image_batch = torch.stack(images, 0)
    target_tensors = []
    sizes = []
    for sequence in targets:
        sizes.append(len(sequence))
        target_tensors.append(torch.tensor(sequence, dtype=torch.long))
    flat_targets = torch.cat(target_tensors)
    return image_batch, flat_targets, torch.tensor(sizes, dtype=torch.long)

# -------------------------------
# CRNN Model Definition
# -------------------------------
class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-time-step class log-probabilities.

    Two conv/ReLU/max-pool stages extract features, an adaptive average pool
    collapses the height axis to 1, and a bidirectional LSTM followed by a
    linear layer scores every remaining width position.
    """

    def __init__(self, img_height, num_channels, num_classes, rnn_hidden=256, rnn_layers=2):
        """
        Args:
            img_height (int): Height of the input image. Accepted for
                interface compatibility; not used when building the layers.
            num_channels (int): Number of image channels (1 for grayscale).
            num_classes (int): Number of output classes (including the blank token).
            rnn_hidden (int): Hidden size of the LSTM (per direction).
            rnn_layers (int): Number of stacked LSTM layers.
        """
        super().__init__()

        # Build the two conv -> ReLU -> 2x2 max-pool stages in order; each
        # pool halves both height and width.
        stages = []
        in_planes = num_channels
        for out_planes in (64, 128):
            stages.append(nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=1, padding=1))
            stages.append(nn.ReLU(inplace=True))
            stages.append(nn.MaxPool2d(2, 2))
            in_planes = out_planes
        self.cnn = nn.Sequential(*stages)

        # Average over the height axis only; the width axis is left as is.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, None))
        self.rnn = nn.LSTM(input_size=128, hidden_size=rnn_hidden, num_layers=rnn_layers, bidirectional=True)
        # The bidirectional LSTM concatenates both directions, hence 2 * rnn_hidden.
        self.fc = nn.Linear(rnn_hidden * 2, num_classes)

    def forward(self, x):
        """Map an image batch to log-probabilities of shape (W', batch, num_classes)."""
        features = self.adaptive_pool(self.cnn(x))        # (batch, 128, 1, W')
        sequence = features.squeeze(2).permute(2, 0, 1)   # (W', batch, 128)
        hidden_states, _ = self.rnn(sequence)             # (W', batch, 2*rnn_hidden)
        scores = self.fc(hidden_states)                   # (W', batch, num_classes)
        return scores.log_softmax(2)

# -------------------------------
# Decoding Predictions (CTC decoding)
# -------------------------------
def decode_predictions(output, blank=0):
    """Greedy (best-path) CTC decoding of network outputs into strings.

    Args:
        output: Tensor indexed as ``(time, batch, class)``; the per-step
            argmax over the class axis is taken as the predicted symbol.
        blank (int): Class index treated as the CTC blank.

    Returns:
        list[str]: One decoded string per batch element. Consecutive repeats
        are collapsed and blanks dropped; indices missing from ``idx2char``
        contribute an empty string.
    """
    scores = output.cpu().detach().numpy()
    num_steps, num_samples, _ = scores.shape

    decoded = []
    for sample in range(num_samples):
        # Most likely class at every time step for this sample.
        best_path = [scores[step, sample, :].argmax() for step in range(num_steps)]
        # Same path shifted right by one, seeded with the blank, so each step
        # can be compared with its predecessor.
        predecessors = [blank] + best_path[:-1]
        chars = [
            idx2char.get(current, '')
            for current, before in zip(best_path, predecessors)
            if current != blank and current != before
        ]
        decoded.append(''.join(chars))
    return decoded

# -------------------------------
# Training and Evaluation Functions (with Mixed Precision)
# -------------------------------
def train_epoch(model, device, dataloader, criterion, optimizer, scaler):
    """Run one training pass over ``dataloader`` using autocast and a gradient scaler.

    Args:
        model: Network whose output is indexed ``(T, batch, num_classes)``.
        device: Device the batch tensors are moved to.
        dataloader: Iterable of ``(images, targets, target_lengths)`` batches
            that also exposes ``.dataset`` (its length normalises the loss).
        criterion: Loss called as ``criterion(output, targets, input_lengths,
            target_lengths)``.
        optimizer: Optimizer stepped through ``scaler``.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.

    Returns:
        float: Sum of per-batch losses weighted by batch size, divided by
        ``len(dataloader.dataset)``.
    """
    model.train()
    weighted_loss_sum = 0.0

    for images, targets, target_lengths in dataloader:
        images, targets, target_lengths = (
            images.to(device),
            targets.to(device),
            target_lengths.to(device),
        )

        optimizer.zero_grad()
        with autocast():
            log_probs = model(images)  # (T, batch, num_classes)
            num_steps, num_samples, _ = log_probs.size()
            # Every sample uses the full output length T as its input length.
            input_lengths = torch.full((num_samples,), num_steps, dtype=torch.long, device=device)
            batch_loss = criterion(log_probs, targets, input_lengths, target_lengths)

        # Scaled backward pass, then optimizer step and scale update.
        scaler.scale(batch_loss).backward()
        # (Optional) Gradient clipping could be added here.
        scaler.step(optimizer)
        scaler.update()

        weighted_loss_sum += batch_loss.item() * images.size(0)

    return weighted_loss_sum / len(dataloader.dataset)

def evaluate_epoch(model, device, dataloader, criterion):
    """Evaluate ``model`` over ``dataloader`` without gradient tracking.

    Args:
        model: Network whose output is indexed ``(T, batch, num_classes)``.
        device: Device the batch tensors are moved to.
        dataloader: Iterable of ``(images, targets, target_lengths)`` batches
            that also exposes ``.dataset`` (its length normalises the loss).
        criterion: Loss called as ``criterion(output, targets, input_lengths,
            target_lengths)``.

    Returns:
        tuple: ``(mean_loss, predictions)`` where ``mean_loss`` is the
        batch-size-weighted loss sum divided by ``len(dataloader.dataset)``
        and ``predictions`` is the list of decoded strings in batch order.
    """
    model.eval()
    weighted_loss_sum = 0.0
    decoded = []

    with torch.no_grad():
        for images, targets, target_lengths in dataloader:
            images, targets, target_lengths = (
                images.to(device),
                targets.to(device),
                target_lengths.to(device),
            )

            log_probs = model(images)
            num_steps, num_samples, _ = log_probs.size()
            # Every sample uses the full output length T as its input length.
            input_lengths = torch.full((num_samples,), num_steps, dtype=torch.long, device=device)

            batch_loss = criterion(log_probs, targets, input_lengths, target_lengths)
            weighted_loss_sum += batch_loss.item() * images.size(0)

            decoded += decode_predictions(log_probs, blank=0)

    return weighted_loss_sum / len(dataloader.dataset), decoded

# -------------------------------
# Main Training with K-Fold Cross Validation on Bonus Set
# -------------------------------
def main():
    """Train and validate the CRNN on the bonus set with K-fold cross-validation.

    Reads the bonus CSV, builds an augmenting training transform and a
    deterministic validation transform, then for each of the ``k_folds``
    splits trains a fresh CRNN (CTC loss, AdamW) for ``num_epochs`` epochs.
    Per-epoch losses are printed, sample predictions are shown every 5th
    epoch and at the last epoch, and a per-fold summary of the final
    validation loss is printed at the end.

    Fold numbers are printed 1-based everywhere (including the final
    summary); the ``fold`` value stored in the logs and used as the
    ``fold_results`` key stays 0-based.

    Returns:
        None. Results are only printed.
    """
    # Path to bonus CSV file (change as needed)
    csv_file = '/content/drive/MyDrive/ocr_dataset/ocr_bonus.csv'
    df = pd.read_csv(csv_file)
    if DEBUG:
        print("[INFO] Running Bonus Task training on bonus set")
        print(f"[INFO] Total samples: {len(df)}")

    # Define two sets of transforms
    # Training transform (with augmentation)
    train_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 128)),  # maintain a 4:1 aspect ratio
        transforms.RandomApply([transforms.RandomRotation(2, fill=(255,))], p=0.5),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    # Validation transform (deterministic)
    val_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"[INFO] Using device: {device}")

    # Training hyperparameters
    num_epochs = 20
    batch_size = 32
    learning_rate = 0.001
    k_folds = 5
    scaler = GradScaler()

    # Initialize K-Fold
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_results = {}

    # Log configuration details for later visualization
    config_log = {
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'k_folds': k_folds,
        'train_transform': str(train_transform),
        'val_transform': str(val_transform)
    }
    print("[INFO] Training configuration:")
    for k, v in config_log.items():
        print(f"  {k}: {v}")

    # For each fold, split the DataFrame into training and validation parts
    for fold, (train_idx, val_idx) in enumerate(kfold.split(df)):
        print(f"\n--- Fold {fold+1}/{k_folds} ---")
        print(f"[INFO] Training samples: {len(train_idx)}, Validation samples: {len(val_idx)}")
        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        # Create separate dataset objects with the appropriate transforms.
        train_dataset = OCRDataset(df=train_df, transform=train_transform)
        val_dataset   = OCRDataset(df=val_df, transform=val_transform)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=ocr_collate_fn)
        # shuffle=False keeps val_preds aligned with the row order of val_df.
        val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=ocr_collate_fn)

        # Initialize model, loss function and optimizer for this fold
        model = CRNN(img_height=32, num_channels=1, num_classes=num_classes, rnn_hidden=256, rnn_layers=2).to(device)
        criterion = nn.CTCLoss(blank=0, zero_infinity=True)
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

        # Log metrics per epoch for this fold.
        fold_epoch_logs = []

        for epoch in range(num_epochs):
            train_loss = train_epoch(model, device, train_loader, criterion, optimizer, scaler)
            val_loss, val_preds = evaluate_epoch(model, device, val_loader, criterion)
            log_entry = {
                'fold': fold,
                'epoch': epoch,
                'train_loss': train_loss,
                'val_loss': val_loss
            }
            fold_epoch_logs.append(log_entry)
            print(f"Fold {fold+1}, Epoch {epoch+1}/{num_epochs} -- Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

            # Every 5th epoch (and at the last epoch) show a few sample predictions
            if epoch == num_epochs - 1 or (epoch+1) % 5 == 0:
                print("  Sample Predictions (Ground Truth vs Predicted):")
                # Show up to 10 samples from the validation set.
                for i in range(min(10, len(val_preds))):
                    true_label = val_df.iloc[i]['label']
                    print(f"    True: {true_label:15} | Pred: {val_preds[i]}")

        # val_loss here is the value from the last epoch of this fold.
        fold_results[fold] = {
            'final_val_loss': val_loss,
            'epoch_logs': fold_epoch_logs
        }

    print("\n[INFO] Cross Validation Results:")
    for fold in fold_results:
        # Print the fold 1-based so the summary matches the "Fold N/k" labels
        # used during training (the dictionary key itself is 0-based).
        print(f"Fold {fold+1}: Final Validation Loss: {fold_results[fold]['final_val_loss']:.4f}")
        # (You can also save fold_results to a file for later visualization.)

# Entry point: start the cross-validation run only when this code is executed
# as the main module, not when it is imported from elsewhere.
if __name__ == '__main__':
    main()


[INFO] Running Bonus Task training on bonus set
[INFO] Total samples: 5000
[INFO] Using device: cuda
[INFO] Training configuration:
  num_epochs: 20
  batch_size: 32
  learning_rate: 0.001
  k_folds: 5
  train_transform: Compose(
    Grayscale(num_output_channels=1)
    Resize(size=(32, 128), interpolation=bilinear, max_size=None, antialias=True)
    RandomApply(
    p=0.5
    RandomRotation(degrees=[-2.0, 2.0], interpolation=nearest, expand=False, fill=(255,))
)
    ToTensor()
    Normalize(mean=(0.5,), std=(0.5,))
)
  val_transform: Compose(
    Grayscale(num_output_channels=1)
    Resize(size=(32, 128), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=(0.5,), std=(0.5,))
)

--- Fold 1/5 ---
[INFO] Training samples: 4000, Validation samples: 1000
[DEBUG] Loaded data with 4000 samples.
[DEBUG] First 5 rows:
                                            img_path      label
0  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm
1  /content/

  scaler = GradScaler()


[DEBUG] Loaded data with 1000 samples.
[DEBUG] First 5 rows:
                                            img_path      label
0  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm
1  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm
2  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm
3  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm
4  /content/drive/MyDrive/ocr_dataset/bonus/image...  algorithm


  with autocast():


[DEBUG] __getitem__ idx 3: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_45.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 2: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_30.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_13.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 4: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_17.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_39.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_32.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13

  with autocast():


[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_30.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 2: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_45.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_39.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 4: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_48.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 3: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_17.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_13.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13

  with autocast():


[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_13.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_39.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 4: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_17.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 2: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_30.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 3: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_45.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_25.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13

  with autocast():


[DEBUG] __getitem__ idx 4: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_26.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_13.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 2: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_17.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 3: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_48.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_45.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_39.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13

  with autocast():


[DEBUG] __getitem__ idx 4: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_25.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 2: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_30.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_39.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_13.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 0: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_45.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13]
[DEBUG] __getitem__ idx 1: path='/content/drive/MyDrive/ocr_dataset/bonus/images/bonus_0_variation_17.png', label='algorithm', target=[1, 12, 7, 15, 18, 9, 20, 8, 13