In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms
import pandas as pd
import json

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uljibuhborjigin/mongol-ocr-zcode/mn_ocr_synthetic/mn_ocr_synthetic/this_uni_to_z_outputs.csv")

print("Path to dataset files:", path)

ValueError: Invalid dataset handle: uljibuhborjigin/mongol-ocr-zcode/mn_ocr_synthetic/mn_ocr_synthetic/this_uni_to_z_outputs.csv

In [None]:
import requests
import os
import json

# Load Kaggle credentials
with open(os.path.expanduser("~/.kaggle/kaggle.json")) as f:
    kaggle_creds = json.load(f)

# Construct URL and headers
dataset_owner = "uljibuhborjigin"
dataset_name = "mongol-ocr-zcode"
file_path = "mn_ocr_synthetic/mn_ocr_synthetic/this_uni_to_z_outputs.csv"
output_path = "C:/MNOCR/dataset/ocr_dataset/this_uni_to_z_outputs.csv"

url = f"https://www.kaggle.com/api/v1/datasets/download/{dataset_owner}/{dataset_name}/{file_path}"
headers = {"Authorization": f"Bearer {kaggle_creds['key']}"}

# Download the file
response = requests.get(url, headers=headers, stream=True)
if response.status_code == 200:
    with open(output_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded to: {output_path}")
else:
    print("Failed to download:", response.status_code, response.text)


In [10]:
class OCRDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None, max_label_length=50):
        """
        Args:
            image_paths: List of paths to images
            labels: List of lists of character IDs (already converted to IDs)
            transform: Optional transform to be applied on images
            max_label_length: Maximum length for label padding
        """
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load image
        try:
            img_path = self.image_paths[idx]
            image = Image.open(img_path).convert('L')  # Convert to grayscale
            
            # Apply transformations
            if self.transform:
                image = self.transform(image)
            
            # Get label
            label = self.labels[idx]
            
            return image, label
        except Exception as e:
            print(f"Error loading image {self.image_paths[idx]}: {e}")
            # Return a placeholder in case of error
            if self.transform:
                placeholder = torch.zeros((1, 100, 400))  # Adjust size as needed
            else:
                placeholder = Image.new('L', (400, 100))
                placeholder = self.transform(placeholder) if self.transform else placeholder
            return placeholder, [1]  # Return a simple label

In [11]:
class CRNN(nn.Module):
    def __init__(self, input_channels, output_classes, hidden_size=64, num_layers=1):
        super(CRNN, self).__init__()
        
        # CNN Backbone - SimpleNet to avoid complexity
        self.cnn = nn.Sequential(
            # Layer 1
            nn.Conv2d(input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Layer 2
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            # Layer 3
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1)),  # Pool only height
            
            # Layer 4 
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1))  # Pool only height
        )
        
        # Calculate width after CNN using a sample input
        self.width_test = torch.randn(1, input_channels, 32, 200)  # Adjust sample size as needed
        cnn_output = self.cnn(self.width_test)
        _, channels, height, width = cnn_output.shape
        self.output_height = height
        self.width_after_cnn = width
        self.rnn_input_size = channels * height
        
        # RNN Layer - Simple and stable
        self.rnn = nn.GRU(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True
        )
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.2)
        
        # Final output layer
        self.linear = nn.Linear(hidden_size * 2, output_classes)  # *2 for bidirectional
    
    def forward(self, x):
        # X shape: (batch, channels, height, width)
        batch_size = x.size(0)
        
        # Apply CNN features extraction
        conv = self.cnn(x)
        
        # Prepare for RNN (batch, width, channels*height)
        # Permute from (batch, channels, height, width) to (batch, width, channels, height)
        conv = conv.permute(0, 3, 1, 2)
        # Reshape to (batch, width, channels*height)
        conv = conv.reshape(batch_size, conv.size(1), -1)
        
        # Apply RNN
        rnn_output, _ = self.rnn(conv)
        
        # Dropout
        rnn_output = self.dropout(rnn_output)
        
        # Linear layer for output probabilities
        output = self.linear(rnn_output)
        
        # Apply log_softmax for CTC Loss
        output = nn.functional.log_softmax(output, dim=2)
        
        return output

In [12]:
def train_ocr_model(image_paths, labels, char_to_id, id_to_char, batch_size=8, num_epochs=10, 
                    learning_rate=0.001, save_dir='checkpoints'):
    """
    Train OCR model end-to-end
    Args:
        image_paths: List of image paths
        labels: List of labels (each label is a list of character IDs)
        char_to_id: Dictionary mapping characters to IDs
        id_to_char: Dictionary mapping IDs to characters
        batch_size: Batch size for training
        num_epochs: Number of training epochs
        learning_rate: Initial learning rate
        save_dir: Directory to save model checkpoints
    """
    # Make sure save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Dataset and DataLoader
    transform = transforms.Compose([
        transforms.Resize((32, 200)),  # Consistent but not too large size
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize to [-1, 1]
    ])
    
    # Create dataset
    dataset = OCRDataset(image_paths, labels, transform=transform)
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=2,
        collate_fn=collate_fn  # Custom collate function defined below
    )
    
    # Create model
    vocab_size = len(char_to_id) + 1  # +1 for blank token in CTC
    model = CRNN(
        input_channels=1,  # Grayscale images
        output_classes=vocab_size,
        hidden_size=64,  # Small hidden size for stability
        num_layers=1  # Single layer for stability
    ).to(device)
    
    # Print model information
    print(f"Model width after CNN: {model.width_after_cnn}")
    print(f"Model RNN input size: {model.rnn_input_size}")
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total model parameters: {total_params:,}")
    
    # Loss function and optimizer
    criterion = nn.CTCLoss(blank=0, reduction='mean')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=2, factor=0.5, verbose=True
    )
    
    # Training loop
    global_step = 0
    best_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        valid_batches = 0
        
        for i, (images, targets, target_lengths) in enumerate(dataloader):
            try:
                # Move data to device
                images = images.to(device)
                targets = targets.to(device)
                target_lengths = target_lengths.to(device)
                
                # Clear gradients
                optimizer.zero_grad()
                
                # Forward pass
                outputs = model(images)
                
                # Check for NaNs in output
                if torch.isnan(outputs).any():
                    print(f"NaN detected in outputs at batch {i}, skipping batch")
                    continue
                
                # Calculate output lengths (assuming all sequences have same width)
                batch_size = images.size(0)
                output_lengths = torch.full(
                    (batch_size,), model.width_after_cnn, dtype=torch.long
                ).to(device)
                
                # Transpose output for CTC loss: (batch, time, classes) -> (time, batch, classes)
                outputs = outputs.permute(1, 0, 2)
                
                # Calculate loss
                loss = criterion(outputs, targets, output_lengths, target_lengths)
                
                # Check for NaN loss
                if torch.isnan(loss):
                    print(f"NaN loss at batch {i}, skipping batch")
                    continue
                
                # Backward pass and optimize
                loss.backward()
                
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
                
                # Check for NaN gradients
                has_nan_grad = False
                for param in model.parameters():
                    if param.grad is not None and torch.isnan(param.grad).any():
                        has_nan_grad = True
                        break
                        
                if has_nan_grad:
                    print(f"NaN gradients detected at batch {i}, skipping update")
                    continue
                
                # Update weights
                optimizer.step()
                
                # Update metrics
                batch_loss = loss.item()
                epoch_loss += batch_loss
                valid_batches += 1
                global_step += 1
                
                # Print progress
                if (i + 1) % 10 == 0:
                    print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(dataloader)}], "
                          f"Loss: {batch_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")
                
            except Exception as e:
                print(f"Error in batch {i}: {e}")
                continue
        
        # End of epoch
        if valid_batches > 0:
            avg_epoch_loss = epoch_loss / valid_batches
            print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}")
            
            # Update learning rate
            scheduler.step(avg_epoch_loss)
            
            # Save checkpoint
            if avg_epoch_loss < best_loss:
                best_loss = avg_epoch_loss
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_loss,
                }
                torch.save(checkpoint, os.path.join(save_dir, 'best_model.pt'))
                print(f"Saved best model checkpoint with loss: {best_loss:.4f}")
        
        # Always save the last model
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss / max(1, valid_batches),
        }
        torch.save(checkpoint, os.path.join(save_dir, f'model_epoch_{epoch+1}.pt'))
    
    print("Training complete!")
    return model

# Custom collate function to handle variable length labels
def collate_fn(batch):
    """
    Custom collate function for variable length OCR labels
    """
    # Separate images and labels
    images, labels = zip(*batch)
    
    # Stack all images into a batch
    images = torch.stack(images, 0)
    
    # Get lengths of each label
    lengths = torch.tensor([len(label) for label in labels])
    
    # Flatten and concatenate all labels
    labels_concat = torch.cat([torch.tensor(label) for label in labels])
    
    return images, labels_concat, lengths

In [13]:
def decode_predictions(predictions, id_to_char):
    """
    Decode model predictions to text
    Args:
        predictions: Model output after softmax
        id_to_char: Mapping from ID to character
    Returns:
        Decoded text
    """
    # Get best prediction (greedy decoding)
    _, max_indices = torch.max(predictions, dim=2)
    
    # Convert to numpy for easier processing
    max_indices = max_indices.cpu().numpy()
    
    # Decode batch
    decoded_texts = []
    
    for indices in max_indices:
        # Create text (CTC blank removal and repeated character removal)
        text = ""
        prev_idx = -1
        
        for idx in indices:
            # Skip blank token (usually 0) and repeated characters
            if idx != 0 and idx != prev_idx:
                text += id_to_char.get(idx, "")  # Get character from ID
            prev_idx = idx
        
        decoded_texts.append(text)
    
    return decoded_texts

In [None]:
df = pd.read_csv("../dataset/mini_qa_images/mini_qa.csv")
with open("../dataset/mini_qa_images/char_mappings.json", "r", encoding="utf-8") as f:
    loaded_mapping = json.load(f)

# Convert keys back to int for id_to_char (JSON keys are saved as strings)
char_to_id = loaded_mapping["char_to_id"]
id_to_char = {int(k): v for k, v in loaded_mapping["id_to_char"].items()}

print("Character mappings loaded successfully!")
print("Loaded char_to_id:", char_to_id)
print("Loaded id_to_char:", id_to_char)

def text_to_ids(text):
    return [char_to_id.get(char, 0) for char in text]  # Defaulting to 0 for unknown characters
def ids_to_text(ids):
    return ''.join([id_to_char.get(id, '?') for id in ids])  # Use '?' for unknown IDs
image_paths = []
image_labels = []
for index, row in df.iterrows():
    # if(index%20==0):
    #     print(f"{index} out of 100")
    text = row['question']
    path = f"../dataset/mini_qa_images/question/{index}.png"
    image_paths.append(path)
    image_labels.append(text_to_ids(text))

Character mappings loaded successfully!
Loaded char_to_id: {' ': 1, '?': 2, '᠂': 3, '᠃': 4, '᠋': 5, '᠌': 6, '᠍': 7, '\u180e': 8, 'ᠠ': 9, 'ᠡ': 10, 'ᠢ': 11, 'ᠣ': 12, 'ᠤ': 13, 'ᠥ': 14, 'ᠦ': 15, 'ᠧ': 16, 'ᠨ': 17, 'ᠩ': 18, 'ᠪ': 19, 'ᠬ': 20, 'ᠭ': 21, 'ᠮ': 22, 'ᠯ': 23, 'ᠰ': 24, 'ᠱ': 25, 'ᠲ': 26, 'ᠳ': 27, 'ᠴ': 28, 'ᠵ': 29, 'ᠶ': 30, 'ᠷ': 31, 'ᠹ': 32, '\u202f': 33, '︖': 34, '？': 35}
Loaded id_to_char: {1: ' ', 2: '?', 3: '᠂', 4: '᠃', 5: '᠋', 6: '᠌', 7: '᠍', 8: '\u180e', 9: 'ᠠ', 10: 'ᠡ', 11: 'ᠢ', 12: 'ᠣ', 13: 'ᠤ', 14: 'ᠥ', 15: 'ᠦ', 16: 'ᠧ', 17: 'ᠨ', 18: 'ᠩ', 19: 'ᠪ', 20: 'ᠬ', 21: 'ᠭ', 22: 'ᠮ', 23: 'ᠯ', 24: 'ᠰ', 25: 'ᠱ', 26: 'ᠲ', 27: 'ᠳ', 28: 'ᠴ', 29: 'ᠵ', 30: 'ᠶ', 31: 'ᠷ', 32: 'ᠹ', 33: '\u202f', 34: '︖', 35: '？'}


In [17]:
train_ocr_model(image_paths, image_labels, char_to_id, id_to_char)

Using device: cuda
Model width after CNN: 50
Model RNN input size: 256
Total model parameters: 369,252


RuntimeError: DataLoader worker (pid(s) 2416, 17908) exited unexpectedly