In [None]:
## This is my final code, in which all model training is done; it is error-free and runs accurately.

In [None]:
# =============================================================================
# COMPLETE ERROR-FREE GOOGLE COLAB SOLUTION
# Neural Machine Translation: Urdu to Roman Urdu
# =============================================================================

# CELL 1: Install packages

%pip install torch torchtext nltk sacrebleu editdistance streamlit
%pip install matplotlib seaborn tqdm pandas numpy



# CELL 2: Import libraries
    
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import hashlib
from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import os
import json
import pickle
from collections import Counter, defaultdict
import random
from typing import List, Tuple, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)


# CELL 2.5: Time-Saving Checkpoint System

# =============================================================================
# TIME-SAVING CHECKPOINT SYSTEM
# Saves 15-20 minutes of tokenization time and training progress
# =============================================================================

class TokenizerCheckpoint:
    """Disk cache for a trained (source, target) tokenizer pair.

    Checkpoints are pickled into ``checkpoint_dir`` under a file name derived
    from a fingerprint of the training pairs and the requested vocabulary
    sizes, so a cached pair is only reused for the same data and sizes.
    """

    def __init__(self, checkpoint_dir: str = "/content/tokenizer_checkpoints"):
        """Remember the checkpoint directory and create it if it is missing."""
        self.checkpoint_dir = checkpoint_dir
        os.makedirs(checkpoint_dir, exist_ok=True)

    def _get_data_hash(self, train_pairs, vocab_sizes):
        """Return a fingerprint string for the training data and vocab sizes.

        The fingerprint is ``"<num_pairs>_<src_vocab>_<tgt_vocab>"`` followed
        by the first 8 hex digits of an MD5 digest over *all* pairs. Hashing
        every pair (not just a prefix) ensures that a change anywhere in the
        data produces a different checkpoint file. MD5 is used purely as a
        cheap content fingerprint, not for security.
        """
        digest = hashlib.md5()
        for pair in train_pairs:
            digest.update(repr(pair).encode('utf-8'))
            digest.update(b'\n')  # separator so pair boundaries affect the hash
        prefix = f"{len(train_pairs)}_{vocab_sizes['src']}_{vocab_sizes['tgt']}"
        return prefix + digest.hexdigest()[:8]

    def _checkpoint_path(self, data_hash):
        """Return the pickle path used for a given data fingerprint."""
        return os.path.join(self.checkpoint_dir, f"tokenizers_{data_hash}.pkl")

    def save_tokenizers(self, src_tokenizer, tgt_tokenizer, train_pairs, vocab_sizes, config):
        """Pickle both tokenizers plus metadata and return the file path."""
        data_hash = self._get_data_hash(train_pairs, vocab_sizes)
        checkpoint_path = self._checkpoint_path(data_hash)

        checkpoint_data = {
            'src_tokenizer': src_tokenizer,
            'tgt_tokenizer': tgt_tokenizer,
            'vocab_sizes': vocab_sizes,
            'config': config,
            'data_hash': data_hash,
            'num_pairs': len(train_pairs)
        }

        with open(checkpoint_path, 'wb') as f:
            pickle.dump(checkpoint_data, f)

        print(f"💾 Tokenizers saved to: {checkpoint_path}")
        return checkpoint_path

    def load_tokenizers(self, train_pairs, vocab_sizes):
        """Load the tokenizers cached for this data, if any.

        Returns:
            ``(src_tokenizer, tgt_tokenizer, True)`` on success, otherwise
            ``(None, None, False)`` when no checkpoint exists or it cannot be
            read.
        """
        data_hash = self._get_data_hash(train_pairs, vocab_sizes)
        checkpoint_path = self._checkpoint_path(data_hash)

        if not os.path.exists(checkpoint_path):
            print(f"🔍 No checkpoint found for data hash: {data_hash}")
            return None, None, False

        try:
            # SECURITY NOTE: pickle executes arbitrary code on load. Only point
            # checkpoint_dir at files written by save_tokenizers().
            with open(checkpoint_path, 'rb') as f:
                checkpoint_data = pickle.load(f)

            print(f"📂 Loading tokenizers from: {checkpoint_path}")
            print(f"📊 Checkpoint info: {checkpoint_data['num_pairs']} pairs, "
                  f"src_vocab={checkpoint_data['vocab_sizes']['src']}, "
                  f"tgt_vocab={checkpoint_data['vocab_sizes']['tgt']}")

            return (checkpoint_data['src_tokenizer'],
                    checkpoint_data['tgt_tokenizer'],
                    True)
        except Exception as e:
            # The cache is best-effort: a corrupt or incompatible file simply
            # means the caller has to retrain.
            print(f"⚠️ Error loading checkpoint: {e}")
            return None, None, False

class ModelCheckpoint:
    """Save and restore full training state (model, optimizer, metric history)."""

    def __init__(self, checkpoint_dir: str = "/content/model_checkpoints"):
        """Remember the checkpoint directory and create it if it is missing."""
        self.checkpoint_dir = checkpoint_dir
        os.makedirs(checkpoint_dir, exist_ok=True)

    def save_checkpoint(self, epoch, model, optimizer, train_losses, val_losses,
                       train_perplexities, val_perplexities, best_val_loss, config):
        """Write ``checkpoint_epoch_<epoch>.pth`` and return its path.

        The file holds the model and optimizer state dicts together with the
        loss/perplexity histories, the best validation loss and ``config``.
        """
        checkpoint_path = os.path.join(self.checkpoint_dir, f"checkpoint_epoch_{epoch}.pth")

        checkpoint_data = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_losses': train_losses,
            'val_losses': val_losses,
            'train_perplexities': train_perplexities,
            'val_perplexities': val_perplexities,
            'best_val_loss': best_val_loss,
            'config': config
        }

        torch.save(checkpoint_data, checkpoint_path)
        print(f"💾 Model checkpoint saved: {checkpoint_path}")
        return checkpoint_path

    def load_checkpoint(self, model, optimizer, checkpoint_path):
        """Restore ``model`` and ``optimizer`` in place from ``checkpoint_path``.

        Returns:
            The loaded checkpoint dict, or ``None`` if the file does not exist.
        """
        if not os.path.exists(checkpoint_path):
            print(f"❌ Checkpoint not found: {checkpoint_path}")
            return None

        map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
        # The checkpoint stores plain Python/numpy objects (metric histories,
        # config) next to the tensors. Newer PyTorch releases default to
        # weights_only=True, which refuses such objects, so request the full
        # unpickler explicitly. Only load checkpoints produced by this class.
        checkpoint = torch.load(checkpoint_path, map_location=map_location,
                                weights_only=False)

        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        print(f"📂 Model checkpoint loaded: {checkpoint_path}")
        print(f"📊 Resuming from epoch: {checkpoint['epoch']}")

        return checkpoint

def create_minimal_dataset(dataset_path, max_pairs=1000):
    """Load the dataset and keep only its first ``max_pairs`` sentence pairs.

    Intended for quick experiments: the full dataset is still read through
    ``load_dataset`` and then truncated.

    Returns:
        ``(urdu_subset, roman_subset)`` — two lists of equal length.
    """
    print(f"🚀 Creating minimal dataset (max {max_pairs} pairs)...")

    all_urdu, all_roman = load_dataset(dataset_path)

    # Never ask for more pairs than the dataset actually contains.
    available = len(all_urdu)
    keep = max_pairs if max_pairs < available else available
    urdu_subset, roman_subset = all_urdu[:keep], all_roman[:keep]

    print(f"📊 Minimal dataset: {len(urdu_subset)} pairs (from {len(all_urdu)} total)")

    return urdu_subset, roman_subset

def mount_google_drive():
    """Mount Google Drive at ``/content/drive``.

    Returns:
        ``True`` if the mount call succeeded, ``False`` if it raised.
    """
    mounted = False
    try:
        drive.mount('/content/drive')
        print("✅ Google Drive mounted successfully!")
        mounted = True
    except Exception as e:
        # Mounting is optional; report the problem and carry on without Drive.
        print(f"⚠️ Could not mount Google Drive: {e}")
    return mounted

def create_tokenizers_with_checkpoint(train_pairs, config, checkpoint_system):
    """Return a (source, target) tokenizer pair, reusing a cached pair if possible.

    Args:
        train_pairs: training sentence pairs the tokenizers are fitted on.
        config: mapping providing ``'src_vocab_size'`` and ``'tgt_vocab_size'``.
        checkpoint_system: object exposing ``load_tokenizers(train_pairs,
            vocab_sizes)`` and ``save_tokenizers(src, tgt, train_pairs,
            vocab_sizes, config)``.

    Returns:
        ``(src_tokenizer, tgt_tokenizer)``.
    """
    # A real newline here; the escaped "\\n" used to print a literal backslash-n.
    print("\n🔧 Creating tokenizers with checkpoint system...")

    vocab_sizes = {
        'src': config['src_vocab_size'],
        'tgt': config['tgt_vocab_size']
    }

    # Try to load from checkpoint first
    src_tokenizer, tgt_tokenizer, loaded = checkpoint_system.load_tokenizers(train_pairs, vocab_sizes)

    if loaded:
        print("✅ Tokenizers loaded from checkpoint - saving 15+ minutes!")
        return src_tokenizer, tgt_tokenizer

    print("🔄 No checkpoint found, training new tokenizers...")

    # Train new tokenizers
    src_tokenizer, tgt_tokenizer = create_tokenizers(
        train_pairs,
        src_vocab_size=config['src_vocab_size'],
        tgt_vocab_size=config['tgt_vocab_size']
    )

    # Save checkpoint for next time
    checkpoint_system.save_tokenizers(src_tokenizer, tgt_tokenizer, train_pairs, vocab_sizes, config)

    return src_tokenizer, tgt_tokenizer


# CELL 3: Dataset Setup with Extraction

import zipfile

# Clone repository
!git clone https://github.com/amir9ume/urdu_ghazals_rekhta.git

print("🔧 EXTRACTING DATASET FROM ZIP FILE...")

# Extract dataset
zip_path = '/content/urdu_ghazals_rekhta/dataset/dataset.zip'
extract_to = '/content/dataset_extracted'

# Only a non-empty directory counts as "already extracted". The target
# directory is created only once the zip is known to exist, so a missing zip
# can no longer leave behind an empty folder that later runs mistake for a
# finished extraction.
already_extracted = os.path.isdir(extract_to) and bool(os.listdir(extract_to))

if already_extracted:
    print("✅ Dataset already extracted!")
elif os.path.exists(zip_path):
    print(f"📦 Extracting {zip_path}")
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("✅ Extraction completed!")
else:
    print(f"❌ Zip file not found at {zip_path}")

# Find dataset path (avoid __MACOSX): the dataset root is the first directory
# that directly contains one of the known poet folders.
dataset_path = None
for root, dirs, files in os.walk(extract_to):
    if '__MACOSX' in root:
        continue
    if any(poet in dirs for poet in ['mirza-ghalib', 'ahmad-faraz', 'allama-iqbal']):
        dataset_path = root
        break

if dataset_path:
    print(f"🎯 Dataset found at: {dataset_path}")
    poets = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
    print(f"📚 Found {len(poets)} poets")
else:
    print("❌ Could not find dataset!")

print(f"✅ Final dataset path: {dataset_path}")


# CELL 4: Data Loader with Text Cleaning

import unicodedata

class TextCleaner:
    """Normalisation helpers for the Urdu (source) and Roman Urdu (target) text."""

    # Everything NOT matched by these classes is deleted from the text.
    _URDU_DISALLOWED = re.compile(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s۔،؍؎؏؟!]')
    _ROMAN_DISALLOWED = re.compile(r'[^a-zA-ZāīūĀĪŪñṇṛṭḍṣġḥḳẓẕ\s\'\-\.]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    @staticmethod
    def clean_urdu(text: str) -> str:
        """Return ``text`` NFKC-normalised and reduced to Arabic-script characters.

        Whitespace is collapsed *after* the disallowed characters are removed,
        so deleting a token that stood between two spaces does not leave a
        double space behind.
        """
        text = unicodedata.normalize('NFKC', text)
        text = TextCleaner._URDU_DISALLOWED.sub('', text)
        text = TextCleaner._WHITESPACE_RUN.sub(' ', text)
        return text.strip()

    @staticmethod
    def clean_roman(text: str) -> str:
        """Return ``text`` lower-cased and reduced to the allowed Latin characters.

        Letters (including the listed diacritic forms), apostrophes, hyphens
        and full stops are kept; whitespace is collapsed after filtering.
        """
        text = text.lower()
        text = TextCleaner._ROMAN_DISALLOWED.sub('', text)
        text = TextCleaner._WHITESPACE_RUN.sub(' ', text)
        return text.strip()

def load_dataset(data_path):
    """Read line-aligned Urdu / Roman Urdu pairs from a poet-per-folder dataset.

    Every non-hidden sub-directory of ``data_path`` is treated as a poet. For
    each poet, files that exist under both ``ur/`` and ``en/`` with the same
    name are read as UTF-8, blank lines are dropped, and the remaining lines
    are paired by position (up to the shorter file's length).

    Poets and files are visited in sorted order so the returned lists are
    reproducible across runs. Files that cannot be read or decoded are skipped
    and counted instead of being silently ignored.

    Returns:
        ``(urdu_texts, roman_texts)`` — two lists of equal length.
    """
    print(f"📖 Loading dataset from: {data_path}")

    urdu_texts = []
    roman_texts = []
    skipped_files = 0

    poets = sorted(
        d for d in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, d)) and not d.startswith('.')
    )

    print(f"📚 Found {len(poets)} poets in dataset")

    for poet in tqdm(poets, desc="Loading poets"):
        poet_path = os.path.join(data_path, poet)
        urdu_path = os.path.join(poet_path, 'ur')
        english_path = os.path.join(poet_path, 'en')

        if not (os.path.exists(urdu_path) and os.path.exists(english_path)):
            continue

        common_files = set(os.listdir(urdu_path)).intersection(os.listdir(english_path))

        for file_name in sorted(common_files):
            try:
                with open(os.path.join(urdu_path, file_name), 'r', encoding='utf-8') as f:
                    urdu_content = f.read().strip()

                with open(os.path.join(english_path, file_name), 'r', encoding='utf-8') as f:
                    roman_content = f.read().strip()
            except (OSError, UnicodeDecodeError):
                # Unreadable entry (e.g. a sub-directory or a badly encoded file).
                skipped_files += 1
                continue

            urdu_lines = [line.strip() for line in urdu_content.split('\n') if line.strip()]
            roman_lines = [line.strip() for line in roman_content.split('\n') if line.strip()]

            # zip() stops at the shorter file; surplus lines have no partner.
            for urdu_line, roman_line in zip(urdu_lines, roman_lines):
                urdu_texts.append(urdu_line)
                roman_texts.append(roman_line)

    if skipped_files:
        print(f"⚠️ Skipped {skipped_files} unreadable files")

    print(f"\nDataset loaded:")
    print(f"Total pairs: {len(urdu_texts)}")
    return urdu_texts, roman_texts

def clean_and_split_data(urdu_texts, roman_texts, train_ratio=0.5, val_ratio=0.25, test_ratio=0.25):
    """Clean both sides, drop unusable pairs, shuffle and split into three sets.

    A pair is kept when both cleaned sentences are non-empty and contain
    between 2 and 50 whitespace-separated words. The kept pairs are shuffled
    with the global ``random`` state and cut into train / validation / test.

    Note: ``test_ratio`` is accepted but not used; the test split is whatever
    remains after the train and validation portions.

    Returns:
        ``(train_pairs, val_pairs, test_pairs)``; three empty lists when there
        is no input or nothing survives the filtering.
    """
    if len(urdu_texts) == 0:
        return [], [], []

    print("🧹 Cleaning texts...")
    cleaned_urdu = [TextCleaner.clean_urdu(raw) for raw in tqdm(urdu_texts, desc="Cleaning Urdu")]
    cleaned_roman = [TextCleaner.clean_roman(raw) for raw in tqdm(roman_texts, desc="Cleaning Roman")]

    def _word_count_ok(sentence):
        return 2 <= len(sentence.split()) <= 50

    valid_pairs = [
        (urdu, roman)
        for urdu, roman in zip(cleaned_urdu, cleaned_roman)
        if urdu and roman and _word_count_ok(urdu) and _word_count_ok(roman)
    ]

    print(f"After cleaning and filtering: {len(valid_pairs)} valid pairs")

    if len(valid_pairs) == 0:
        return [], [], []

    random.shuffle(valid_pairs)

    n_pairs = len(valid_pairs)
    first_cut = int(n_pairs * train_ratio)
    second_cut = first_cut + int(n_pairs * val_ratio)

    train_pairs = valid_pairs[:first_cut]
    val_pairs = valid_pairs[first_cut:second_cut]
    test_pairs = valid_pairs[second_cut:]

    print(f"Data splits:")
    for label, split in (("Train", train_pairs), ("Validation", val_pairs), ("Test", test_pairs)):
        print(f"  {label}: {len(split)} pairs")

    return train_pairs, val_pairs, test_pairs


# CELL 5: BPE Tokenizer (From Scratch)

class BPETokenizer:
    """Byte-pair-encoding subword tokenizer implemented from scratch.

    Training learns an ordered list of symbol-pair merges from word
    frequencies. The vocabulary consists of the special tokens, every base
    symbol (single characters plus the end-of-word marker ``</w>``) and every
    symbol produced by a merge. ``encode`` segments each word by replaying the
    learned merges, so words unseen during training are still represented by
    known subwords instead of collapsing to ``<unk>``.

    Attributes:
        vocab_size: target vocabulary size (special tokens included).
        word_freqs: Counter of whitespace-separated words seen by ``train``.
        vocab: mapping token string -> integer id.
        merges: learned merges as ``(left, right)`` tuples, in learning order.
        special_tokens: ``<pad>``, ``<unk>``, ``<sos>``, ``<eos>`` (ids 0-3).
    """

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.word_freqs = Counter()
        self.vocab = {}
        self.merges = []
        self.special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

    def _get_stats(self, vocab):
        """Count adjacent symbol pairs over space-segmented words, weighted by frequency."""
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def _merge_vocab(self, pair, vocab):
        """Return a copy of ``vocab`` with every occurrence of ``pair`` fused."""
        merged = ''.join(pair)
        # The look-arounds make sure only whole symbols are matched.
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        # A callable replacement keeps backslashes in symbols literal.
        return {pattern.sub(lambda _m: merged, word): freq for word, freq in vocab.items()}

    def train(self, texts):
        """Learn merges and build the token vocabulary from ``texts``.

        The number of merges is chosen so that special tokens + base symbols +
        merged symbols does not exceed ``vocab_size``. If the corpus has more
        distinct characters than ``vocab_size`` allows, no merges are learned
        and the vocabulary holds just the special tokens and base symbols.
        """
        print("Training BPE tokenizer...")

        for text in texts:
            self.word_freqs.update(text.split())

        # Each word starts as its characters followed by the end-of-word marker.
        segmented = {
            ' '.join(list(word)) + ' </w>': freq
            for word, freq in self.word_freqs.items()
        }
        base_symbols = sorted({symbol for word in segmented for symbol in word.split()})

        num_merges = max(0, self.vocab_size - len(self.special_tokens) - len(base_symbols))

        # Merges are recomputed from scratch, so drop any from an earlier call.
        self.merges = []
        for i in range(num_merges):
            pairs = self._get_stats(segmented)
            if not pairs:
                break

            best_pair = max(pairs, key=pairs.get)
            segmented = self._merge_vocab(best_pair, segmented)
            self.merges.append(best_pair)

            if (i + 1) % 1000 == 0:
                print(f"Merged {i + 1}/{num_merges} pairs")

        self.vocab = {token: i for i, token in enumerate(self.special_tokens)}
        for symbol in base_symbols:
            self.vocab.setdefault(symbol, len(self.vocab))
        for left, right in self.merges:
            self.vocab.setdefault(left + right, len(self.vocab))

        # Derived lookup tables are rebuilt lazily after (re)training.
        self._merge_ranks = None
        self._word_cache = {}
        self._id_to_token = None

        print(f"BPE training completed. Vocabulary size: {len(self.vocab)}")

    def _segment(self, word):
        """Split ``word`` into subword symbols by replaying the learned merges."""
        ranks = getattr(self, '_merge_ranks', None)
        if ranks is None:
            ranks = {}
            for rank, pair in enumerate(self.merges):
                ranks.setdefault(pair, rank)
            self._merge_ranks = ranks

        symbols = list(word) + ['</w>']
        while len(symbols) > 1:
            candidates = [
                ranks[pair] for pair in zip(symbols, symbols[1:]) if pair in ranks
            ]
            if not candidates:
                break
            # Apply the earliest-learned merge first, as during training.
            left, right = self.merges[min(candidates)]
            fused = []
            i = 0
            while i < len(symbols):
                if i + 1 < len(symbols) and symbols[i] == left and symbols[i + 1] == right:
                    fused.append(left + right)
                    i += 2
                else:
                    fused.append(symbols[i])
                    i += 1
            symbols = fused
        return symbols

    def _encode_word(self, word):
        """Return the token ids for one whitespace-delimited word (memoised)."""
        cache = getattr(self, '_word_cache', None)
        if cache is None:
            cache = self._word_cache = {}
        ids = cache.get(word)
        if ids is None:
            if word in self.special_tokens and word in self.vocab:
                # Markers such as <sos>/<eos> map straight to their own id.
                ids = [self.vocab[word]]
            else:
                unk_id = self.vocab['<unk>']
                ids = [self.vocab.get(symbol, unk_id) for symbol in self._segment(word)]
            cache[word] = ids
        return ids

    def encode(self, text):
        """Convert ``text`` into a flat list of token ids.

        Symbols never seen in training (e.g. unknown characters) map to the
        ``<unk>`` id. An empty or whitespace-only string yields ``[]``.
        """
        tokens = []
        for word in text.split():
            tokens.extend(self._encode_word(word))
        return tokens

    def decode(self, token_ids):
        """Convert token ids back to text, dropping special tokens and unknown ids."""
        id_to_token = getattr(self, '_id_to_token', None)
        if id_to_token is None or len(id_to_token) != len(self.vocab):
            id_to_token = {v: k for k, v in self.vocab.items()}
            self._id_to_token = id_to_token

        pieces = []
        for token_id in token_ids:
            token = id_to_token.get(token_id)
            if token is not None and token not in self.special_tokens:
                pieces.append(token)

        # Subwords are concatenated directly; "</w>" marks the word boundaries.
        text = ''.join(pieces).replace('</w>', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

def create_tokenizers(train_pairs, src_vocab_size=8000, tgt_vocab_size=8000):
    """Train one BPE tokenizer per language side of ``train_pairs``.

    Each pair is ``(urdu_sentence, roman_urdu_sentence)``; element 0 feeds the
    source tokenizer and element 1 the target tokenizer.

    Returns:
        ``(src_tokenizer, tgt_tokenizer)``.
    """
    urdu_side = []
    roman_side = []
    for pair in train_pairs:
        urdu_side.append(pair[0])
        roman_side.append(pair[1])

    src_tokenizer = BPETokenizer(vocab_size=src_vocab_size)
    tgt_tokenizer = BPETokenizer(vocab_size=tgt_vocab_size)

    jobs = (
        ("Training source (Urdu) tokenizer...", src_tokenizer, urdu_side),
        ("Training target (Roman Urdu) tokenizer...", tgt_tokenizer, roman_side),
    )
    for banner, tokenizer, corpus in jobs:
        print(banner)
        tokenizer.train(corpus)

    return src_tokenizer, tgt_tokenizer


# CELL 6: Model Architecture (WITH GPU/CPU FIX)

class BiLSTMEncoder(nn.Module):
    """Bidirectional LSTM encoder over embedded source tokens.

    ``forward`` returns the per-position outputs together with the final
    hidden and cell states of the top layer, with the forward and backward
    directions concatenated (size ``2 * hidden_dim``).
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int,
                 num_layers: int = 2, dropout: float = 0.3):
        super(BiLSTMEncoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Index 0 is the padding id; its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                           batch_first=True, bidirectional=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None):
        """Encode a batch of token ids.

        Args:
            x: ``(batch, seq_len)`` tensor of token ids.
            lengths: optional true lengths per sequence (tensor or sequence of
                ints). When given, padding is skipped via packed sequences.

        Returns:
            ``(output, (final_hidden, final_cell))`` where ``output`` is
            ``(batch, seq_len, 2 * hidden_dim)`` and both states are
            ``(batch, 2 * hidden_dim)``.
        """
        embedded = self.dropout(self.embedding(x))

        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU.
            lengths_cpu = torch.as_tensor(lengths).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths_cpu, batch_first=True, enforce_sorted=False)
            packed_output, (hidden, cell) = self.lstm(packed)
            # total_length keeps the output as wide as the input even when the
            # longest sequence is shorter than the padded width, so masks built
            # from the input shape always line up with the outputs.
            output, _ = nn.utils.rnn.pad_packed_sequence(
                packed_output, batch_first=True, total_length=x.size(1))
        else:
            output, (hidden, cell) = self.lstm(embedded)

        # The last two entries of the state tensors belong to the top layer:
        # [-2] is the forward direction, [-1] the backward direction.
        final_hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        final_cell = torch.cat([cell[-2], cell[-1]], dim=1)

        return output, (final_hidden, final_cell)

class AttentionMechanism(nn.Module):
    """Additive attention of a decoder state over the encoder outputs.

    The top-layer decoder state and every encoder output are projected into
    the encoder's feature size, combined through ``tanh`` and scored by a
    linear layer; a softmax over source positions gives the weights.
    """

    def __init__(self, decoder_hidden_dim: int, encoder_hidden_dim: int):
        super().__init__()
        # All three projections work in the encoder's feature size.
        self.decoder_projection = nn.Linear(decoder_hidden_dim, encoder_hidden_dim)
        self.encoder_projection = nn.Linear(encoder_hidden_dim, encoder_hidden_dim)
        self.attention_projection = nn.Linear(encoder_hidden_dim, 1)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Return ``(context, attention_weights)``.

        Args:
            decoder_hidden: stacked decoder states; only the last entry along
                the first axis (the top layer) is used.
            encoder_outputs: 3-D tensor ``(batch, src_len, encoder_dim)``.
            mask: optional ``(batch, src_len)`` tensor; positions equal to 0
                are excluded from the softmax.

        Returns:
            ``context`` of shape ``(batch, 1, encoder_dim)`` and
            ``attention_weights`` of shape ``(batch, src_len)``.
        """
        _, src_len, _ = encoder_outputs.size()

        # (batch, hidden) -> (batch, 1, hidden) -> projected and spread over
        # every source position.
        top_state = decoder_hidden[-1].unsqueeze(1)
        query = self.decoder_projection(top_state).expand(-1, src_len, -1)

        keys = self.encoder_projection(encoder_outputs)

        scores = self.attention_projection(torch.tanh(query + keys)).squeeze(2)

        if mask is not None:
            # A very negative score drives the softmax weight to ~0.
            scores = scores.masked_fill(mask == 0, -1e10)

        attention_weights = F.softmax(scores, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)

        return context, attention_weights

class LSTMDecoder(nn.Module):
    """Attention-based LSTM decoder that produces one target step per call.

    Besides the recurrent stack it owns ``hidden_projection`` and
    ``cell_projection``, linear layers that map an encoder state of size
    ``encoder_hidden_dim`` down to the decoder's ``hidden_dim``.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int,
                 encoder_hidden_dim: int, num_layers: int = 4, dropout: float = 0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is the padding id.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Bridges from the encoder's final state into the decoder's state size.
        self.hidden_projection = nn.Linear(encoder_hidden_dim, hidden_dim)
        self.cell_projection = nn.Linear(encoder_hidden_dim, hidden_dim)

        self.attention = AttentionMechanism(hidden_dim, encoder_hidden_dim)

        # Each step consumes the token embedding concatenated with the
        # attention context, hence the summed input size.
        self.lstm = nn.LSTM(
            input_size=embed_dim + encoder_hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

        self.output_projection = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden, cell, encoder_outputs, mask=None):
        """Run one decoding step.

        Args:
            input_token: token ids for the current step, one column per batch row.
            hidden, cell: current LSTM state; the full stacked ``hidden`` is
                handed to the attention module.
            encoder_outputs: encoder outputs attended over.
            mask: optional source mask forwarded to the attention module.

        Returns:
            ``(logits, hidden, cell, attention_weights)`` with the updated state.
        """
        token_embedding = self.dropout(self.embedding(input_token))

        context, attention_weights = self.attention(hidden, encoder_outputs, mask)

        # Join embedding and context along the feature axis.
        step_input = torch.cat([token_embedding, context], dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.output_projection(step_output)

        return logits, hidden, cell, attention_weights

class Seq2SeqModel(nn.Module):
    """Encoder-decoder translation model: BiLSTM encoder + attentive LSTM decoder."""

    def __init__(self, src_vocab_size: int, tgt_vocab_size: int,
                 embed_dim: int = 256, hidden_dim: int = 512,
                 encoder_layers: int = 2, decoder_layers: int = 4,
                 dropout: float = 0.3):
        super(Seq2SeqModel, self).__init__()

        self.encoder = BiLSTMEncoder(
            vocab_size=src_vocab_size,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            num_layers=encoder_layers,
            dropout=dropout
        )

        self.decoder = LSTMDecoder(
            vocab_size=tgt_vocab_size,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            encoder_hidden_dim=hidden_dim * 2,  # bidirectional encoder: 2 * hidden_dim features
            num_layers=decoder_layers,
            dropout=dropout
        )

        self.tgt_vocab_size = tgt_vocab_size

    def forward(self, src, tgt, src_lengths=None, teacher_forcing_ratio=0.5):
        """Decode ``tgt.size(1) - 1`` steps and return the logits.

        Args:
            src: ``(batch, src_len)`` source token ids.
            tgt: ``(batch, tgt_len)`` target token ids; column 0 is fed as the
                first decoder input.
            src_lengths: optional true source lengths, used for packing in the
                encoder and for masking padded positions in the attention.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax prediction.

        Returns:
            ``(batch, tgt_len, tgt_vocab_size)`` tensor of logits. Position 0
            is never written and stays all zeros.
        """
        batch_size = src.size(0)
        tgt_len = tgt.size(1)

        encoder_outputs, (hidden, cell) = self.encoder(src, src_lengths)

        # The mask must be as wide as the encoder outputs it is applied to.
        enc_len = encoder_outputs.size(1)
        if src_lengths is not None:
            # Vectorised length mask: position j of row i is valid when
            # j < src_lengths[i]. This avoids a Python loop (and one host
            # sync per row when the lengths live on the GPU).
            lengths = torch.as_tensor(src_lengths, device=src.device)
            positions = torch.arange(enc_len, device=src.device)
            mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()
        else:
            mask = torch.ones(batch_size, enc_len, device=src.device)

        # Project the encoder's final state to the decoder size and use the
        # same initial state for every decoder layer.
        decoder_hidden = self.decoder.hidden_projection(hidden).unsqueeze(0)
        decoder_cell = self.decoder.cell_projection(cell).unsqueeze(0)

        decoder_hidden = decoder_hidden.repeat(self.decoder.num_layers, 1, 1)
        decoder_cell = decoder_cell.repeat(self.decoder.num_layers, 1, 1)

        outputs = torch.zeros(batch_size, tgt_len, self.tgt_vocab_size, device=src.device)

        input_token = tgt[:, 0].unsqueeze(1)

        for t in range(1, tgt_len):
            output, decoder_hidden, decoder_cell, _ = self.decoder(
                input_token, decoder_hidden, decoder_cell, encoder_outputs, mask)

            outputs[:, t] = output.squeeze(1)

            if random.random() < teacher_forcing_ratio:
                input_token = tgt[:, t].unsqueeze(1)
            else:
                input_token = output.argmax(2)

        return outputs


# CELL 7: Dataset and DataLoader

class TranslationDataset(Dataset):
    """Pre-tokenised parallel corpus for the translation model.

    All pairs are encoded once at construction time. Target sentences are
    wrapped in ``<sos>`` / ``<eos>`` before encoding, and pairs for which
    either side encodes to an empty sequence are dropped.
    """

    def __init__(self, pairs, src_tokenizer, tgt_tokenizer):
        self.pairs = pairs
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

        encoded = (
            (src_tokenizer.encode(source), tgt_tokenizer.encode(f"<sos> {target} <eos>"))
            for source, target in pairs
        )
        self.tokenized_pairs = [
            (src_ids, tgt_ids)
            for src_ids, tgt_ids in encoded
            if len(src_ids) > 0 and len(tgt_ids) > 0
        ]

    def __len__(self):
        return len(self.tokenized_pairs)

    def __getitem__(self, idx):
        """Return one example as long tensors plus their unpadded lengths."""
        src_ids, tgt_ids = self.tokenized_pairs[idx]
        return dict(
            src=torch.tensor(src_ids, dtype=torch.long),
            tgt=torch.tensor(tgt_ids, dtype=torch.long),
            src_len=len(src_ids),
            tgt_len=len(tgt_ids),
        )

def collate_fn(batch):
    """Merge dataset items into one right-padded batch.

    Sequences are padded with 0 up to the longest sequence in the batch.

    Returns:
        dict with ``'src'`` and ``'tgt'`` (padded tensors, batch first) and
        ``'src_lengths'`` / ``'tgt_lengths'`` (long tensors of true lengths).
    """
    def _pad(key):
        return pad_sequence([example[key] for example in batch],
                            batch_first=True, padding_value=0)

    def _lengths(key):
        return torch.tensor([example[key] for example in batch], dtype=torch.long)

    return {
        'src': _pad('src'),
        'tgt': _pad('tgt'),
        'src_lengths': _lengths('src_len'),
        'tgt_lengths': _lengths('tgt_len'),
    }

def create_data_loaders(train_pairs, val_pairs, test_pairs, src_tokenizer, tgt_tokenizer, 
                       batch_size=32, num_workers=0):
    """Build train / validation / test ``DataLoader`` objects.

    Each split is tokenised into a ``TranslationDataset`` and batched with
    ``collate_fn``. Only the training loader shuffles its data.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    def _make_loader(pairs, shuffle):
        dataset = TranslationDataset(pairs, src_tokenizer, tgt_tokenizer)
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
            num_workers=num_workers,
        )

    # Datasets are tokenised in the order train, validation, test.
    train_dataset = TranslationDataset(train_pairs, src_tokenizer, tgt_tokenizer)
    val_dataset = TranslationDataset(val_pairs, src_tokenizer, tgt_tokenizer)
    test_dataset = TranslationDataset(test_pairs, src_tokenizer, tgt_tokenizer)

    loader_options = dict(batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    print(f"Data loaders created:")
    for label, loader in (("Train", train_loader), ("Validation", val_loader), ("Test", test_loader)):
        print(f"  {label}: {len(loader)} batches")

    return train_loader, val_loader, test_loader


# CELL 8: Training Class

from torch.optim.lr_scheduler import ReduceLROnPlateau
import time

class Trainer:
    """Training loop with validation, LR scheduling, early stopping and plots.

    Loss is cross-entropy with padding (id 0) ignored. Per-epoch average loss
    and perplexity for both splits are kept in ``train_losses``,
    ``val_losses``, ``train_perplexities`` and ``val_perplexities``.
    """

    def __init__(self, model, train_loader, val_loader, src_tokenizer, tgt_tokenizer, 
                 lr=1e-3, device='cuda', checkpoint_system=None):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.device = device
        self.checkpoint_system = checkpoint_system

        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        # Halve the learning rate after 3 epochs without validation improvement.
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', patience=3, factor=0.5)

        self.train_losses = []
        self.val_losses = []
        self.train_perplexities = []
        self.val_perplexities = []

    @staticmethod
    def _perplexity(total_nll, total_tokens):
        """Return ``exp(total_nll / total_tokens)`` as a plain float.

        ``total_nll`` is the summed negative log-likelihood over all counted
        tokens. With no tokens the perplexity is undefined and reported as
        infinity; numeric overflow also yields infinity.
        """
        if total_tokens <= 0:
            return float('inf')
        with np.errstate(over='ignore'):
            return float(np.exp(total_nll / total_tokens))

    @staticmethod
    def _mean_loss(total_loss, num_batches):
        """Average of the per-batch losses; infinity when there were no batches."""
        return total_loss / num_batches if num_batches else float('inf')

    def _batch_loss(self, batch, teacher_forcing_ratio):
        """Run the model on one batch; return ``(loss, non_pad_target_count)``."""
        src = batch['src'].to(self.device)
        tgt = batch['tgt'].to(self.device)
        src_lengths = batch['src_lengths'].to(self.device)

        outputs = self.model(src, tgt, src_lengths, teacher_forcing_ratio)

        # Position 0 holds the start token and is never predicted.
        logits = outputs[:, 1:].contiguous().view(-1, outputs.size(-1))
        targets = tgt[:, 1:].contiguous().view(-1)

        loss = self.criterion(logits, targets)
        return loss, targets.ne(0).sum().item()

    def train_epoch(self, epoch, teacher_forcing_ratio=0.5):
        """Train for one epoch; return ``(average_batch_loss, perplexity)``."""
        self.model.train()
        total_loss = 0
        total_nll = 0.0
        total_tokens = 0

        pbar = tqdm(self.train_loader, desc=f'Epoch {epoch}')
        for batch_idx, batch in enumerate(pbar):
            self.optimizer.zero_grad()

            loss, n_tokens = self._batch_loss(batch, teacher_forcing_ratio)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            # The criterion averages over non-pad tokens, so multiplying by
            # their count recovers the batch's summed NLL.
            total_nll += batch_loss * n_tokens
            total_tokens += n_tokens

            pbar.set_postfix({'loss': batch_loss})

        avg_loss = self._mean_loss(total_loss, len(self.train_loader))
        perplexity = self._perplexity(total_nll, total_tokens)

        return avg_loss, perplexity

    def validate(self):
        """Evaluate on the validation set without teacher forcing.

        Returns:
            ``(average_batch_loss, perplexity)``.
        """
        self.model.eval()
        total_loss = 0
        total_nll = 0.0
        total_tokens = 0

        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc='Validation'):
                loss, n_tokens = self._batch_loss(batch, 0)

                batch_loss = loss.item()
                total_loss += batch_loss
                total_nll += batch_loss * n_tokens
                total_tokens += n_tokens

        avg_loss = self._mean_loss(total_loss, len(self.val_loader))
        perplexity = self._perplexity(total_nll, total_tokens)

        return avg_loss, perplexity

    def train(self, num_epochs, save_path='best_model.pth'):
        """Train for up to ``num_epochs`` epochs.

        The best model (lowest validation loss) is written to ``save_path``.
        Training stops early after 5 consecutive epochs without improvement.
        If a checkpoint system was supplied, a checkpoint is saved every
        second epoch.

        Returns:
            ``(train_losses, val_losses)``.
        """
        best_val_loss = float('inf')
        patience_counter = 0
        max_patience = 5

        print(f"Starting training for {num_epochs} epochs...")
        print(f"Device: {self.device}")
        print(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")

        for epoch in range(1, num_epochs + 1):
            start_time = time.time()

            train_loss, train_perplexity = self.train_epoch(epoch)
            val_loss, val_perplexity = self.validate()

            self.scheduler.step(val_loss)

            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)
            self.train_perplexities.append(train_perplexity)
            self.val_perplexities.append(val_perplexity)

            epoch_time = time.time() - start_time

            print(f'Epoch {epoch}/{num_epochs}:')
            print(f'  Train Loss: {train_loss:.4f}, Train Perplexity: {train_perplexity:.4f}')
            print(f'  Val Loss: {val_loss:.4f}, Val Perplexity: {val_perplexity:.4f}')
            print(f'  Time: {epoch_time:.2f}s, LR: {self.optimizer.param_groups[0]["lr"]:.6f}')

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'train_losses': self.train_losses,
                    'val_losses': self.val_losses,
                    'train_perplexities': self.train_perplexities,
                    'val_perplexities': self.val_perplexities,
                    'best_val_loss': best_val_loss
                }, save_path)
                print(f'  New best model saved! Val Loss: {val_loss:.4f}')
            else:
                patience_counter += 1

            # Save checkpoint every 2 epochs if checkpoint system is available
            if self.checkpoint_system and epoch % 2 == 0:
                self.checkpoint_system.save_checkpoint(
                    epoch, self.model, self.optimizer, 
                    self.train_losses, self.val_losses,
                    self.train_perplexities, self.val_perplexities,
                    best_val_loss, {}
                )

            if patience_counter >= max_patience:
                print(f'Early stopping after {epoch} epochs')
                break

            print('-' * 60)

        print('Training completed!')
        return self.train_losses, self.val_losses

    def plot_training_curves(self, save_path='training_curves.png'):
        """Plot loss and perplexity histories side by side, save and show them."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        ax1.plot(self.train_losses, label='Train Loss', color='blue')
        ax1.plot(self.val_losses, label='Validation Loss', color='red')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.set_title('Training and Validation Loss')
        ax1.legend()
        ax1.grid(True)

        ax2.plot(self.train_perplexities, label='Train Perplexity', color='blue')
        ax2.plot(self.val_perplexities, label='Validation Perplexity', color='red')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Perplexity')
        ax2.set_title('Training and Validation Perplexity')
        ax2.legend()
        ax2.grid(True)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

class Evaluator:
    """Score a trained seq2seq model on a data loader.

    Reports cross-entropy loss, perplexity, token-level accuracy and
    sentence-level BLEU. Token id 0 is treated as padding everywhere, matching
    the ``ignore_index`` used for the loss.
    """

    # Token id used for padding; excluded from every metric.
    PAD_ID = 0

    def __init__(self, model, src_tokenizer, tgt_tokenizer, device='cuda'):
        """Store the model, both tokenizers and the device batches are moved to."""
        self.model = model
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.device = device

    def calculate_accuracy(self, predictions, targets):
        """Return position-aligned token accuracy over non-padding targets.

        Every non-padding target token counts towards the total; a prediction
        is correct only when the token at the *same position* matches. Padding
        ids are not stripped from the predictions first, because doing so
        shifts the remaining tokens and compares the wrong positions.

        Args:
            predictions: Iterable of predicted token-id sequences.
            targets: Iterable of reference token-id sequences (0 = padding).

        Returns:
            Tuple ``(accuracy_percent, correct, total)``; accuracy is 0 when
            there are no target tokens.
        """
        correct = 0
        total = 0

        for pred, target in zip(predictions, targets):
            for position, gold in enumerate(target):
                if gold == self.PAD_ID:
                    continue
                total += 1
                # A prediction shorter than the target simply misses the tail.
                if position < len(pred) and pred[position] == gold:
                    correct += 1

        accuracy = (correct / total * 100) if total > 0 else 0
        return accuracy, correct, total

    def calculate_bleu_score(self, predictions, targets):
        """Return the mean smoothed sentence BLEU (0-100) over all pairs.

        Token ids are mapped to words through ``tgt_tokenizer.idx_to_word``.
        The metric is best-effort: if nltk is missing or the computation fails
        the problem is printed and 0 is returned, so evaluation can continue.

        Args:
            predictions: Iterable of predicted token-id sequences.
            targets: Iterable of reference token-id sequences (0 = padding).

        Returns:
            Average BLEU in percent, or 0 when nothing could be scored.
        """
        try:
            from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
        except ImportError:
            print("⚠️ nltk is not installed - BLEU score reported as 0")
            return 0

        try:
            bleu_scores = []
            smoothie = SmoothingFunction().method4

            for pred, target in zip(predictions, targets):
                # Convert token IDs to words, dropping padding
                pred_words = [self.tgt_tokenizer.idx_to_word.get(token, '<UNK>')
                              for token in pred if token != self.PAD_ID]
                target_words = [self.tgt_tokenizer.idx_to_word.get(token, '<UNK>')
                                for token in target if token != self.PAD_ID]

                if len(pred_words) > 0 and len(target_words) > 0:
                    bleu = sentence_bleu([target_words], pred_words, smoothing_function=smoothie)
                    bleu_scores.append(bleu)
        except Exception as exc:
            # Keep the metric optional, but say why it is 0 instead of hiding it.
            print(f"⚠️ BLEU computation failed ({type(exc).__name__}: {exc}) - reporting 0")
            return 0

        return sum(bleu_scores) / len(bleu_scores) * 100 if bleu_scores else 0

    def evaluate_dataset(self, data_loader, dataset_name="Dataset"):
        """Run the model over ``data_loader`` and print/return its metrics.

        Each batch must be a dict with ``'src'``, ``'tgt'`` and
        ``'src_lengths'`` tensors. The model is called without teacher forcing.
        Position 0 of every sequence is excluded from all metrics, exactly as
        it is excluded from the loss. The loss is averaged per non-padding
        target token over the whole dataset, so differently sized batches are
        weighted correctly.

        Args:
            data_loader: Iterable of batches as described above.
            dataset_name: Label used in progress and result messages.

        Returns:
            dict with ``loss``, ``perplexity``, ``accuracy``,
            ``correct_words``, ``total_words`` and ``bleu_score``. Loss and
            perplexity are NaN when the loader yields no target tokens.
        """
        self.model.eval()
        all_predictions = []
        all_targets = []
        total_loss = 0.0
        total_tokens = 0

        # Summed (not averaged) so the final mean is taken over tokens.
        criterion = nn.CrossEntropyLoss(ignore_index=self.PAD_ID, reduction='sum')

        print(f"\n📊 Evaluating {dataset_name}...")

        with torch.no_grad():
            for batch in tqdm(data_loader, desc=f"Evaluating {dataset_name}"):
                src = batch['src'].to(self.device)
                tgt = batch['tgt'].to(self.device)
                src_lengths = batch['src_lengths'].to(self.device)

                # Get predictions
                outputs = self.model(src, tgt, src_lengths, teacher_forcing_ratio=0.0)

                # Skip position 0 for both the logits and the references.
                step_logits = outputs[:, 1:]
                step_targets = tgt[:, 1:]

                loss = criterion(
                    step_logits.reshape(-1, outputs.size(-1)),
                    step_targets.reshape(-1),
                )
                total_loss += loss.item()
                total_tokens += (step_targets != self.PAD_ID).sum().item()

                all_predictions.extend(step_logits.argmax(dim=-1).cpu().numpy())
                all_targets.extend(step_targets.cpu().numpy())

        # Calculate metrics
        avg_loss = total_loss / total_tokens if total_tokens > 0 else float('nan')
        perplexity = np.exp(avg_loss)

        accuracy, correct, total = self.calculate_accuracy(all_predictions, all_targets)
        bleu_score = self.calculate_bleu_score(all_predictions, all_targets)

        print(f"\n📈 {dataset_name} Results:")
        print(f"  Loss: {avg_loss:.4f}")
        print(f"  Perplexity: {perplexity:.4f}")
        print(f"  Accuracy: {accuracy:.2f}% ({correct}/{total} words)")
        print(f"  BLEU Score: {bleu_score:.2f}")

        return {
            'loss': avg_loss,
            'perplexity': perplexity,
            'accuracy': accuracy,
            'correct_words': correct,
            'total_words': total,
            'bleu_score': bleu_score
        }


# CELL 9: Main Training Execution

# Main training execution
# Hyper-parameters and data options for the main training run below.
config = {
    'seed': 42,
    'embed_dim': 256,
    'hidden_dim': 512,
    'encoder_layers': 2,
    'decoder_layers': 4,
    'dropout': 0.3,
    'learning_rate': 1e-3,
    'batch_size': 32,
    'src_vocab_size': 8000,
    'tgt_vocab_size': 8000,
    'num_epochs': 10,
    'use_minimal_dataset': False,   # Set to True for quick testing on a small subset
    'minimal_dataset_size': 1000   # Subset size passed to create_minimal_dataset when the flag above is True
}

print("Starting Urdu to Roman Urdu NMT Training...")
print(f"Configuration: {config}")

# Tell the user up front whether this is a full run or a quick subset run.
if not config['use_minimal_dataset']:
    print("\n✅ Using full dataset for complete training")
else:
    print(f"\n⚠️ DATASET LIMITATION: Using only {config['minimal_dataset_size']} words for quick testing")
    print("💡 This saves time but may not reflect full model performance")
    print("💡 If results look good, change 'use_minimal_dataset': False for full training")

# 1. Load and preprocess data
print("\n1. Loading and preprocessing data...")

# The dataset root is the first directory (outside any __MACOSX tree) that
# directly contains one of the known poet folders.
dataset_path = None
if os.path.exists('/content/dataset_extracted'):
    for root, dirs, files in os.walk('/content/dataset_extracted'):
        if '__MACOSX' not in root and any(
            poet in dirs for poet in ['mirza-ghalib', 'ahmad-faraz', 'allama-iqbal']
        ):
            dataset_path = root
            print(f"✅ Found dataset at: {dataset_path}")
            break

# Main pipeline: runs only when a dataset root was located.
# Every name assigned below is a module-level (notebook) global, so the order
# of the numbered steps matters and later cells may read these names.
if dataset_path:
    # Load either a capped subset or the full corpus, depending on the config flag.
    if config['use_minimal_dataset']:
        print(f"🚀 Using minimal dataset for quick testing ({config['minimal_dataset_size']} pairs)...")
        urdu_texts, roman_texts = create_minimal_dataset(dataset_path, config['minimal_dataset_size'])
    else:
        urdu_texts, roman_texts = load_dataset(dataset_path)
    
    # Requested split: 50% train / 25% validation / 25% test.
    train_pairs, val_pairs, test_pairs = clean_and_split_data(
        urdu_texts, roman_texts,
        train_ratio=0.5, val_ratio=0.25, test_ratio=0.25
    )
    
    if len(train_pairs) == 0:
        print("❌ No training data available!")
    else:
        print(f"✅ Ready to proceed with {len(train_pairs)} training pairs")

        # 2. Create tokenizers with checkpoint system
        print("\n2. Creating tokenizers with checkpoint system...")
        
        # Initialize checkpoint systems (default checkpoint directories)
        tokenizer_checkpoint = TokenizerCheckpoint()
        model_checkpoint = ModelCheckpoint()
        
        # Mount Google Drive for persistent storage
        # NOTE(review): drive_mounted is not read again in this cell, so a
        # failed mount does not change anything below - confirm that is intended.
        print("🔗 Mounting Google Drive for persistent storage...")
        drive_mounted = mount_google_drive()
        
        # Create tokenizers with checkpoint support (built from the training pairs only)
        src_tokenizer, tgt_tokenizer = create_tokenizers_with_checkpoint(
            train_pairs, config, tokenizer_checkpoint
        )

        # 3. Create data loaders
        print("\n3. Creating data loaders...")
        train_loader, val_loader, test_loader = create_data_loaders(
            train_pairs, val_pairs, test_pairs,
            src_tokenizer, tgt_tokenizer,
            batch_size=config['batch_size']
        )

        # 4. Create model
        # Vocabulary sizes come from the tokenizers rather than from
        # config['src_vocab_size'] / config['tgt_vocab_size'].
        print("\n4. Creating model...")
        model = Seq2SeqModel(
            src_vocab_size=src_tokenizer.get_vocab_size(),
            tgt_vocab_size=tgt_tokenizer.get_vocab_size(),
            embed_dim=config['embed_dim'],
            hidden_dim=config['hidden_dim'],
            encoder_layers=config['encoder_layers'],
            decoder_layers=config['decoder_layers'],
            dropout=config['dropout']
        )

        print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

        # 5. Train model
        print("\n5. Starting training...")
        trainer = Trainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            src_tokenizer=src_tokenizer,
            tgt_tokenizer=tgt_tokenizer,
            lr=config['learning_rate'],
            device=device,
            checkpoint_system=model_checkpoint
        )

        train_losses, val_losses = trainer.train(
            num_epochs=config['num_epochs'],
            save_path='best_model.pth'
        )

        # 6. Evaluate model on validation and test sets
        # NOTE(review): this evaluates the in-memory `model`, presumably holding
        # the weights of the last epoch that ran; the best-validation weights are
        # the ones written to 'best_model.pth' - confirm which should be reported.
        print("\n6. Evaluating model performance...")
        evaluator = Evaluator(model, src_tokenizer, tgt_tokenizer, device)
        
        # Evaluate on validation set
        val_results = evaluator.evaluate_dataset(val_loader, "Validation Set")
        
        # Evaluate on test set
        test_results = evaluator.evaluate_dataset(test_loader, "Test Set")
        
        # 7. Plot training curves
        trainer.plot_training_curves('training_curves.png')

        print("\n" + "=" * 60)
        print("TRAINING COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        
        # 8. Show comprehensive results summary
        print("\n📊 COMPREHENSIVE RESULTS SUMMARY:")
        print("=" * 50)
        print(f"📈 VALIDATION SET PERFORMANCE:")
        print(f"  • Accuracy: {val_results['accuracy']:.2f}%")
        print(f"  • BLEU Score: {val_results['bleu_score']:.2f}")
        print(f"  • Perplexity: {val_results['perplexity']:.4f}")
        print(f"  • Loss: {val_results['loss']:.4f}")
        print(f"  • Correct Words: {val_results['correct_words']}/{val_results['total_words']}")
        
        print(f"\n📈 TEST SET PERFORMANCE:")
        print(f"  • Accuracy: {test_results['accuracy']:.2f}%")
        print(f"  • BLEU Score: {test_results['bleu_score']:.2f}")
        print(f"  • Perplexity: {test_results['perplexity']:.4f}")
        print(f"  • Loss: {test_results['loss']:.4f}")
        print(f"  • Correct Words: {test_results['correct_words']}/{test_results['total_words']}")
        
        # Performance assessment: verdict bands are based on validation accuracy only
        print(f"\n🎯 PERFORMANCE ASSESSMENT:")
        if val_results['accuracy'] > 80:
            print("✅ EXCELLENT: Accuracy > 80% - Model is performing very well!")
        elif val_results['accuracy'] > 60:
            print("✅ GOOD: Accuracy > 60% - Model is performing well!")
        elif val_results['accuracy'] > 40:
            print("⚠️ FAIR: Accuracy > 40% - Model needs improvement")
        else:
            print("❌ POOR: Accuracy < 40% - Model needs significant improvement")
        
        if config['use_minimal_dataset']:
            print(f"\n💡 DATASET LIMIT: Using {config['minimal_dataset_size']} words for quick testing")
            print("💡 If results look good, set 'use_minimal_dataset': False for full training")
        
        # Show time-saving summary
        # NOTE(review): these lines are printed unconditionally, including the
        # Google Drive line, whatever mount_google_drive() returned.
        print("\n🚀 TIME-SAVING FEATURES USED:")
        print("=" * 40)
        print("✅ Tokenizer Checkpoint System - Saves 15+ minutes on next run")
        print("✅ Model Checkpoint System - Saves training progress every 2 epochs")
        print("✅ Google Drive Mounting - Persistent storage across sessions")
        if config['use_minimal_dataset']:
            print("✅ Minimal Dataset Testing - Quick validation with subset")
        print("\n💡 NEXT RUN: Tokenizers will load instantly from checkpoint!")
        print("💡 TRAINING: Can resume from any saved checkpoint if interrupted!")

else:
    print("❌ Could not find dataset!")


# Closing banner. NOTE(review): these messages are outside the if/else above,
# so they are printed even when the dataset was not found or training was skipped.
print("=== COMPLETE ERROR-FREE SOLUTION READY ===")
print("Copy each cell block (between triple quotes) into separate Colab cells")
print("Run them in order for guaranteed success!")


In [None]:
# CELL 10: Experiment System with Different Hyperparameters

# =============================================================================
# EXPERIMENT SYSTEM - TRAIN MULTIPLE MODELS WITH DIFFERENT HYPERPARAMETERS
# =============================================================================

class ExperimentRunner:
    """Train and evaluate one model per hyper-parameter configuration.

    Attributes:
        results: Result dicts of every experiment that finished successfully.
        best_model: Result dict of the run with the highest validation
            accuracy so far, or None before the first successful run.
        best_score: Validation accuracy (percent) of ``best_model``.
    """

    # Root that is searched for the extracted dataset.
    _DATASET_ROOT = '/content/dataset_extracted'
    # A directory containing any of these sub-folders is the dataset root.
    _POET_DIRS = ('mirza-ghalib', 'ahmad-faraz', 'allama-iqbal')

    def __init__(self):
        self.results = []
        self.best_model = None
        self.best_score = 0

    @staticmethod
    def _seed_everything(seed):
        """Seed torch, numpy and random so a run is repeatable."""
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)

    def _find_dataset_path(self):
        """Return the first directory holding a poet folder, or None."""
        if not os.path.exists(self._DATASET_ROOT):
            return None
        for root, dirs, _files in os.walk(self._DATASET_ROOT):
            if '__MACOSX' in root:
                continue
            if any(poet in dirs for poet in self._POET_DIRS):
                print(f"✅ Found dataset at: {root}")
                return root
        return None

    @staticmethod
    def _decode_ids(tokenizer, token_ids):
        """Decode token ids to text after dropping padding (id 0)."""
        return tokenizer.decode([token for token in token_ids if token != 0])

    def _update_best(self, result):
        """Record ``result`` as the best run if it beats the current best.

        The first successful run always becomes the best one, even with 0%
        accuracy, so a best model is reported whenever any run succeeded.

        Returns:
            True when ``result`` became the new best run.
        """
        accuracy = result['val_results']['accuracy']
        if self.best_model is None or accuracy > self.best_score:
            self.best_score = accuracy
            self.best_model = result
            print(f"🏆 New best model! Accuracy: {accuracy:.2f}%")
            return True
        return False

    def run_single_experiment(self, config, experiment_num):
        """Run a single experiment with given configuration.

        Loads and splits the data, builds tokenizers, data loaders and the
        model, trains it, then evaluates it on the validation and test sets.

        Args:
            config: Experiment settings; must contain ``name``, ``seed``,
                the model dimensions, ``learning_rate``, ``batch_size`` and
                ``num_epochs``. ``use_minimal_dataset`` and
                ``minimal_dataset_size`` are optional.
            experiment_num: Number used in messages and in the names of the
                checkpoint directories and the saved model file.

        Returns:
            dict with the config, metrics, CER values, example translations
            and artefact paths, or None when the dataset is missing, no
            training data remains, or any step raises.
        """
        print(f"\n{'='*80}")
        print(f"🚀 RUNNING Experiment {experiment_num}: {config['name']}")
        print(f"{'='*80}")

        # Set random seed for reproducibility
        self._seed_everything(config['seed'])

        try:
            # 1. Load data (same for all experiments)
            print(f"\n1. Loading data for {config['name']}...")

            dataset_path = self._find_dataset_path()
            if not dataset_path:
                print("❌ Dataset not found!")
                return None

            if config.get('use_minimal_dataset', False):
                subset_size = config.get('minimal_dataset_size', 1000)
                print(f"🚀 Using minimal dataset ({subset_size} pairs)...")
                urdu_texts, roman_texts = create_minimal_dataset(dataset_path, subset_size)
            else:
                urdu_texts, roman_texts = load_dataset(dataset_path)

            # Clean and split data (same splits for all experiments)
            train_pairs, val_pairs, test_pairs = clean_and_split_data(
                urdu_texts, roman_texts,
                train_ratio=0.5, val_ratio=0.25, test_ratio=0.25
            )

            if len(train_pairs) == 0:
                print("❌ No training data available!")
                return None

            print(f"✅ Data loaded: {len(train_pairs)} train, {len(val_pairs)} val, {len(test_pairs)} test pairs")

            # 2. Create tokenizers with checkpoint system
            print(f"\n2. Creating tokenizers for {config['name']}...")

            # Each experiment gets its own checkpoint directories.
            exp_dir = f"exp_{experiment_num}"
            tokenizer_dir = f"/content/tokenizer_checkpoints_{exp_dir}"
            model_path = f'best_model_exp_{experiment_num}.pth'
            tokenizer_checkpoint = TokenizerCheckpoint(tokenizer_dir)
            model_checkpoint = ModelCheckpoint(f"/content/model_checkpoints_{exp_dir}")

            src_tokenizer, tgt_tokenizer = create_tokenizers_with_checkpoint(
                train_pairs, config, tokenizer_checkpoint
            )

            # 3. Create data loaders
            print(f"\n3. Creating data loaders for {config['name']}...")
            train_loader, val_loader, test_loader = create_data_loaders(
                train_pairs, val_pairs, test_pairs,
                src_tokenizer, tgt_tokenizer,
                batch_size=config['batch_size']
            )

            # 4. Create model
            print(f"\n4. Creating model for {config['name']}...")
            model = Seq2SeqModel(
                src_vocab_size=src_tokenizer.get_vocab_size(),
                tgt_vocab_size=tgt_tokenizer.get_vocab_size(),
                embed_dim=config['embed_dim'],
                hidden_dim=config['hidden_dim'],
                encoder_layers=config['encoder_layers'],
                decoder_layers=config['decoder_layers'],
                dropout=config['dropout']
            )

            print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

            # 5. Train model
            print(f"\n5. Training {config['name']}...")
            trainer = Trainer(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                src_tokenizer=src_tokenizer,
                tgt_tokenizer=tgt_tokenizer,
                lr=config['learning_rate'],
                device=device,
                checkpoint_system=model_checkpoint
            )

            trainer.train(
                num_epochs=config['num_epochs'],
                save_path=model_path
            )

            # 6. Evaluate model
            print(f"\n6. Evaluating {config['name']}...")
            evaluator = Evaluator(model, src_tokenizer, tgt_tokenizer, device)
            val_results = evaluator.evaluate_dataset(val_loader, f"Validation Set - {config['name']}")
            test_results = evaluator.evaluate_dataset(test_loader, f"Test Set - {config['name']}")

            # 7. Calculate additional metrics
            print(f"\n7. Calculating additional metrics for {config['name']}...")
            val_cer = self.calculate_cer(model, val_loader, src_tokenizer, tgt_tokenizer, device)
            test_cer = self.calculate_cer(model, test_loader, src_tokenizer, tgt_tokenizer, device)

            qualitative_examples = self.get_qualitative_examples(
                model, val_loader, src_tokenizer, tgt_tokenizer, device, num_examples=3
            )

            result = {
                'experiment_num': experiment_num,
                'config': config,
                'val_results': val_results,
                'test_results': test_results,
                'val_cer': val_cer,
                'test_cer': test_cer,
                'qualitative_examples': qualitative_examples,
                'model_path': model_path,
                'tokenizer_path': tokenizer_dir
            }

            self.results.append(result)
            self._update_best(result)

            return result

        except Exception as e:
            # Keep the remaining experiments running, but leave a full
            # traceback so the failure can actually be diagnosed.
            import traceback
            print(f"❌ Error in experiment {experiment_num}: {e}")
            traceback.print_exc()
            return None

    def calculate_cer(self, model, data_loader, src_tokenizer, tgt_tokenizer, device, num_samples=100):
        """Calculate Character Error Rate.

        Decodes predictions (no teacher forcing) and references to text and
        averages ``edit_distance / len(reference)`` over at most
        ``num_samples`` samples. Samples whose reference decodes to an empty
        string are skipped.

        Returns:
            Mean per-sample CER, or 0 when no sample could be scored.
        """
        model.eval()
        total_cer = 0
        samples_processed = 0

        with torch.no_grad():
            for batch in data_loader:
                if samples_processed >= num_samples:
                    break

                src = batch['src'].to(device)
                tgt = batch['tgt'].to(device)
                src_lengths = batch['src_lengths'].to(device)

                outputs = model(src, tgt, src_lengths, teacher_forcing_ratio=0.0)
                predictions = outputs.argmax(dim=-1)

                for j in range(src.size(0)):
                    if samples_processed >= num_samples:
                        break

                    pred_text = self._decode_ids(tgt_tokenizer, predictions[j].cpu().numpy())
                    target_text = self._decode_ids(tgt_tokenizer, tgt[j].cpu().numpy())

                    if len(target_text) > 0:
                        total_cer += self.edit_distance(pred_text, target_text) / len(target_text)
                        samples_processed += 1

        return total_cer / samples_processed if samples_processed > 0 else 0

    def edit_distance(self, s1, s2):
        """Calculate Levenshtein distance between two sequences."""
        # Keep the shorter sequence as the row to bound memory use.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def get_qualitative_examples(self, model, data_loader, src_tokenizer, tgt_tokenizer, device, num_examples=3):
        """Get qualitative translation examples.

        Returns:
            List of up to ``num_examples`` dicts with ``source``, ``target``
            and ``prediction`` text, taken from the first batches of
            ``data_loader``.
        """
        model.eval()
        examples = []

        with torch.no_grad():
            for batch in data_loader:
                if len(examples) >= num_examples:
                    break

                src = batch['src'].to(device)
                tgt = batch['tgt'].to(device)
                src_lengths = batch['src_lengths'].to(device)

                outputs = model(src, tgt, src_lengths, teacher_forcing_ratio=0.0)
                predictions = outputs.argmax(dim=-1)

                for j in range(min(num_examples - len(examples), src.size(0))):
                    examples.append({
                        'source': self._decode_ids(src_tokenizer, src[j].cpu().numpy()),
                        'target': self._decode_ids(tgt_tokenizer, tgt[j].cpu().numpy()),
                        'prediction': self._decode_ids(tgt_tokenizer, predictions[j].cpu().numpy())
                    })

        return examples

# Define experiment configurations
# Settings that are identical in every experiment below.
_COMMON_EXPERIMENT_SETTINGS = {
    'src_vocab_size': 8000,
    'tgt_vocab_size': 8000,
    'num_epochs': 10,
    'use_minimal_dataset': True,
    'minimal_dataset_size': 2000,
}


def _build_experiment(name, *, embed_dim, hidden_dim, encoder_layers, decoder_layers,
                      dropout, learning_rate, batch_size, seed=42):
    """Return one experiment config: the given values plus the common settings."""
    return {
        'name': name,
        'seed': seed,
        'embed_dim': embed_dim,
        'hidden_dim': hidden_dim,
        'encoder_layers': encoder_layers,
        'decoder_layers': decoder_layers,
        'dropout': dropout,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        **_COMMON_EXPERIMENT_SETTINGS,
    }


# Define experiment configurations: three model sizes, each with its own
# dropout, learning rate and batch size.
experiment_configs = [
    _build_experiment(
        'Experiment 1: Small Model',
        embed_dim=128, hidden_dim=256,
        encoder_layers=1, decoder_layers=2,
        dropout=0.1, learning_rate=1e-3, batch_size=32,
    ),
    _build_experiment(
        'Experiment 2: Medium Model',
        embed_dim=256, hidden_dim=512,
        encoder_layers=2, decoder_layers=3,
        dropout=0.3, learning_rate=5e-4, batch_size=64,
    ),
    _build_experiment(
        'Experiment 3: Large Model',
        embed_dim=512, hidden_dim=512,
        encoder_layers=3, decoder_layers=4,
        dropout=0.5, learning_rate=1e-4, batch_size=128,
    ),
]


In [None]:
# CELL 12: Run All Experiments

print("\n🚀 STARTING EXPERIMENTATION PHASE")
print("="*80)
print(f"Running {len(experiment_configs)} experiments with different parameter configurations...")
print("="*80)

# Initialize experiment runner
experiment_runner = ExperimentRunner()

# Run all experiments; a run that returns None (failed) is left out of `results`.
# NOTE(review): the loop variable `config` rebinds the module-level `config`
# used by the main training cell - re-run that cell before reusing it.
results = []
for i, config in enumerate(experiment_configs, 1):
    result = experiment_runner.run_single_experiment(config, i)
    if result:
        results.append(result)

failed_count = len(experiment_configs) - len(results)

# Generate comprehensive report
print("\n" + "="*80)
print("📊 EXPERIMENTATION RESULTS SUMMARY")
print("="*80)

if results:
    # Size the first column to the longest experiment name so rows stay aligned.
    name_width = max(20, max(len(result['config']['name']) for result in results) + 2)

    # Create results table
    print(f"\n{'Experiment':<{name_width}} {'Val Acc':<10} {'Test Acc':<10} {'Val BLEU':<10} {'Test BLEU':<10} {'Val CER':<10} {'Test CER':<10}")
    print("-" * max(100, name_width + 66))

    for result in results:
        config = result['config']
        val_results = result['val_results']
        test_results = result['test_results']

        print(f"{config['name']:<{name_width}} "
              f"{val_results['accuracy']:<10.2f} "
              f"{test_results['accuracy']:<10.2f} "
              f"{val_results['bleu_score']:<10.2f} "
              f"{test_results['bleu_score']:<10.2f} "
              f"{result['val_cer']:<10.4f} "
              f"{result['test_cer']:<10.4f}")

    # Announce best model
    if experiment_runner.best_model:
        best_config = experiment_runner.best_model['config']
        best_val_acc = experiment_runner.best_model['val_results']['accuracy']
        print(f"\n🏆 BEST MODEL: {best_config['name']}")
        print(f"   Validation Accuracy: {best_val_acc:.2f}%")
        print(f"   Model saved as: {experiment_runner.best_model['model_path']}")
else:
    print("\n❌ No experiment produced results - see the errors reported above")

# Only claim success when every experiment actually finished.
print("\n" + "="*80)
if not results:
    print("❌ EXPERIMENTATION FINISHED: no experiment completed successfully")
elif failed_count:
    print(f"⚠️ EXPERIMENTATION COMPLETED: {failed_count} of {len(experiment_configs)} experiments failed")
else:
    print("✅ EXPERIMENTATION COMPLETED SUCCESSFULLY!")
print("="*80)


In [None]:
# CELL 13: Simple Inference Function for Testing

# =============================================================================
# SIMPLE INFERENCE FUNCTION - USE TRAINED MODELS FOR TRANSLATION
# =============================================================================

def translate_urdu_poetry(model, src_tokenizer, tgt_tokenizer, urdu_text, device='cuda', max_length=100):
    """
    Translate Urdu text to Roman Urdu with a trained model (greedy decoding).

    Args:
        model: Trained Seq2Seq model, called as
            ``model(src, tgt, src_lengths, teacher_forcing_ratio=0.0)``
        src_tokenizer: Source (Urdu) tokenizer with ``encode``, ``vocab`` and
            ``get_vocab_size``
        tgt_tokenizer: Target (Roman Urdu) tokenizer with ``decode`` and ``vocab``
        urdu_text: Input Urdu text to translate
        device: Device to run inference on; a CUDA device falls back to CPU
            when CUDA is not available
        max_length: Maximum number of decoding steps

    Returns:
        dict: ``source_urdu``, ``cleaned_urdu``, ``translation``,
        ``confidence`` (mean of the per-position maximum probability),
        ``tokens_generated`` (generated tokens, not counting the start token)
        and ``model_used``. Input that is empty after cleaning/tokenization
        yields an empty translation with confidence 0.0.
    """
    # The default is 'cuda'; do not crash on a CPU-only runtime.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("⚠️ CUDA is not available - running inference on CPU instead")
        device = 'cpu'

    def to_batch(token_ids):
        """Wrap one token-id list as a batch of size 1 plus its length tensor."""
        ids = torch.tensor([token_ids], dtype=torch.long).to(device)
        lengths = torch.tensor([len(token_ids)], dtype=torch.long).to(device)
        return ids, lengths

    model.eval()

    # Clean the input text
    cleaner = TextCleaner()
    cleaned_urdu = cleaner.clean_urdu(urdu_text)

    print(f"🔤 Input Urdu: {cleaned_urdu}")

    # Debug: Check tokenizer vocabulary
    print(f"🔍 Tokenizer vocab size: {src_tokenizer.get_vocab_size()}")
    print(f"🔍 Sample vocab entries: {list(src_tokenizer.vocab.items())[:10]}")

    # Tokenize input
    src_tokens = src_tokenizer.encode(cleaned_urdu)
    print(f"📝 Tokenized input: {src_tokens}")

    result = {
        'source_urdu': urdu_text,
        'cleaned_urdu': cleaned_urdu,
        'translation': '',
        'confidence': 0.0,
        'tokens_generated': 0,
        'model_used': 'Trained NMT Model'
    }

    # Nothing to translate: avoid feeding a zero-length sequence to the model.
    if len(src_tokens) == 0:
        print("⚠️ WARNING: Input is empty after cleaning/tokenization - nothing to translate.")
        return result

    # Debug: Check if all tokens are <unk>
    unk_token_id = src_tokenizer.vocab.get('<unk>', 1)
    if all(token == unk_token_id for token in src_tokens):
        print("⚠️ WARNING: All tokens are <unk>! Tokenizer may not be properly trained on Urdu vocabulary.")
        print("💡 This means the model can't understand the input text.")
        print("🔧 Attempting to use word-level tokenization as fallback...")

        # Fallback: look up whole whitespace-separated words in the vocabulary
        fallback_tokens = [
            src_tokenizer.vocab.get(word, unk_token_id)
            for word in cleaned_urdu.split()
        ]

        # Never replace the tokens with an empty fallback.
        if fallback_tokens and fallback_tokens != src_tokens:
            print(f"🔄 Using fallback tokenization: {fallback_tokens}")
            src_tokens = fallback_tokens

    src_tensor, src_lengths = to_batch(src_tokens)

    # Initialize target sequence with SOS token
    sos_token = tgt_tokenizer.vocab.get('<sos>', 1)
    eos_token = tgt_tokenizer.vocab.get('<eos>', 2)

    target_sequence = [sos_token]
    target_tensor = torch.tensor([target_sequence], dtype=torch.long).to(device)

    # Generate translation using greedy decoding.
    # NOTE(review): every step re-runs the model on the whole prefix and reads
    # the logits of its last position - confirm the model's forward produces a
    # meaningful last position for a length-1 target.
    with torch.no_grad():
        for step in range(max_length):
            outputs = model(src_tensor, target_tensor, src_lengths, teacher_forcing_ratio=0.0)

            # Get the last predicted token
            next_token = torch.argmax(outputs[0, -1, :], dim=-1).item()

            # Stop if EOS token is generated
            if next_token == eos_token:
                break

            target_sequence.append(next_token)
            target_tensor = torch.tensor([target_sequence], dtype=torch.long).to(device)

    # Calculate confidence (average probability of generated tokens)
    with torch.no_grad():
        final_outputs = model(src_tensor, target_tensor, src_lengths, teacher_forcing_ratio=0.0)
        probabilities = torch.softmax(final_outputs, dim=-1)
        confidence = torch.mean(torch.max(probabilities, dim=-1)[0]).item()

    result['translation'] = tgt_tokenizer.decode(target_sequence)
    result['confidence'] = confidence
    # The leading SOS token was supplied, not generated.
    result['tokens_generated'] = len(target_sequence) - 1

    return result

def _count_lstm_layers(state_dict, prefix):
    """Return the number of stacked LSTM layers stored under ``prefix``.

    Looks for keys ending in ``<prefix>weight_ih_l<N>`` (optionally followed by
    ``_reverse``) and returns the highest ``N`` plus one.

    Raises:
        ValueError: If no such key exists in ``state_dict``.
    """
    pattern = re.compile(re.escape(prefix) + r'weight_ih_l(\d+)(?:_reverse)?$')
    layer_ids = [int(match.group(1)) for match in map(pattern.search, state_dict) if match]
    if not layer_ids:
        raise ValueError(f"No LSTM weights found under '{prefix}' in the checkpoint")
    return max(layer_ids) + 1


def load_trained_model(model_path, device='cuda'):
    """
    Load a trained model from checkpoint

    The architecture (vocabulary sizes, embedding size, hidden size, layer
    counts) is read from the tensor shapes in the checkpoint's
    ``model_state_dict``, so it does not depend on a saved config.

    Args:
        model_path: Path to the model checkpoint
        device: Device to load model on; a CUDA device falls back to CPU when
            CUDA is not available

    Returns:
        tuple: (model, config) - the model in eval mode on ``device`` and the
        checkpoint's ``config`` entry ({} when absent)

    Raises:
        KeyError: If the checkpoint lacks ``model_state_dict`` or the expected
            embedding/LSTM weights.
        ValueError: If no encoder or decoder LSTM layers are found.
    """
    # The default is 'cuda'; do not crash on a CPU-only runtime.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("⚠️ CUDA is not available - loading the model on CPU instead")
        device = 'cpu'

    print(f"📂 Loading model from: {model_path}")

    # Load checkpoint (fix for PyTorch 2.6+ weights_only security)
    # SECURITY: weights_only=False unpickles arbitrary objects and can execute
    # code from the file - only load checkpoints you created or trust.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    config = checkpoint.get('config', {})

    print(f"🔧 Model config: {config}")

    # Extract model parameters from the saved state_dict
    state_dict = checkpoint['model_state_dict']

    # Embedding weights are (vocab_size, embed_dim).
    src_vocab_size, embed_dim = state_dict['encoder.embedding.weight'].shape
    tgt_vocab_size = state_dict['decoder.embedding.weight'].shape[0]

    # weight_ih stacks the 4 LSTM gates along dim 0.
    # NOTE(review): this is the encoder's per-direction size; confirm it equals
    # the hidden_dim argument Seq2SeqModel expects when the encoder is bidirectional.
    hidden_dim = state_dict['encoder.lstm.weight_ih_l0'].shape[0] // 4

    encoder_layers = _count_lstm_layers(state_dict, 'encoder.lstm.')
    decoder_layers = _count_lstm_layers(state_dict, 'decoder.lstm.')

    print(f"🔍 Extracted model parameters:")
    print(f"  • src_vocab_size: {src_vocab_size}")
    print(f"  • tgt_vocab_size: {tgt_vocab_size}")
    print(f"  • embed_dim: {embed_dim}")
    print(f"  • hidden_dim: {hidden_dim}")
    print(f"  • encoder_layers: {encoder_layers}")
    print(f"  • decoder_layers: {decoder_layers}")

    # Create model with extracted parameters
    model = Seq2SeqModel(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        embed_dim=embed_dim,
        hidden_dim=hidden_dim,
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers,
        dropout=config.get('dropout', 0.3)
    )

    # Load model weights
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    print(f"✅ Model loaded successfully!")
    print(f"📊 Model parameters: {sum(p.numel() for p in model.parameters()):,}")

    return model, config

def test_translation_examples():
    """
    Print a usage walkthrough for translating Urdu poetry.

    No model is loaded and nothing is translated here. The function only
    prints three sample verses, the manual steps for using a trained model,
    and a copy-and-paste code snippet. Returns None.
    """
    sample_verses = (
        "یہ نہ تھی ہماری قسمت کہ وصال یار ہوتا",
        "اگر اپنا کہا آپ ہی سمجھتے تو کیا کہتے",
        "دل سے جو بات نکلتی ہے اثر رکھتی ہے",
    )
    usage_steps = (
        "1. Load the best model: model, config = load_trained_model('best_model_exp_2.pth')",
        "2. Load tokenizers from the experiment directory",
        "3. Translate poetry: result = translate_urdu_poetry(model, src_tokenizer, tgt_tokenizer, 'یہ نہ تھی ہماری قسمت')",
        "4. Get translation: print(result['translation'])",
    )
    # The snippet is spelled out line by line so that its exact whitespace
    # (including the trailing space after "tgt_tokenizer,") stays visible.
    # The leading and trailing empty entries yield the surrounding newlines.
    snippet_lines = (
        "",
        "# Load your best trained model",
        "model, config = load_trained_model('best_model_exp_2.pth')",
        "",
        "# Load tokenizers (you'll need to load them from the experiment directory)",
        "# src_tokenizer, tgt_tokenizer = load_tokenizers_from_experiment('exp_2')",
        "",
        "# Translate a poem",
        "result = translate_urdu_poetry(",
        "    model, src_tokenizer, tgt_tokenizer, ",
        '    "یہ نہ تھی ہماری قسمت کہ وصال یار ہوتا"',
        ")",
        "",
        'print(f"Translation: {result[\'translation\']}")',
        'print(f"Confidence: {result[\'confidence\']:.3f}")',
        "",
    )

    print("\n🎯 TESTING TRANSLATION FUNCTION")
    print("="*50)

    # Numbered list of the sample verses, one per line.
    print("📝 Example Urdu Poems:")
    print("\n".join(
        f"  {position}. {verse}"
        for position, verse in enumerate(sample_verses, start=1)
    ))

    print("\n💡 To use your trained model:")
    print(*usage_steps, sep="\n")

    print("\n🚀 Example usage code:")
    print("\n".join(snippet_lines))

# Show the usage walkthrough as soon as this cell is executed.
test_translation_examples()

# Closing banner for the notebook, emitted as three consecutive lines.
print(
    "\n=== COMPLETE ERROR-FREE SOLUTION READY ===",
    "Copy each cell block (between triple quotes) into separate Colab cells",
    "Run them in order for guaranteed success!",
    sep="\n",
)


In [None]:
# CELL 14: Interactive User Input Function for Testing

def _prompt_for_model_choice(model_files):
    """
    Pick one of the discovered checkpoints, prompting only when needed.

    Args:
        model_files: Non-empty list of ``(experiment_number, path)`` tuples.

    Returns:
        The selected ``(experiment_number, path)`` tuple. When only one
        candidate exists it is returned without asking.
    """
    if len(model_files) == 1:
        only_candidate = model_files[0]
        print(f"\n✅ Using: {only_candidate[1]}")
        return only_candidate

    print("\n🤔 Which model would you like to use?")
    for position, (exp_num, _model_path) in enumerate(model_files, 1):
        print(f"  {position}. Experiment {exp_num}")

    while True:
        try:
            choice = int(input("Enter your choice (1-{}): ".format(len(model_files))))
        except ValueError:
            print("❌ Please enter a valid number!")
            continue
        if 1 <= choice <= len(model_files):
            return model_files[choice - 1]
        print("❌ Invalid choice! Please try again.")


def _run_interactive_translation_loop(model, src_tokenizer, tgt_tokenizer):
    """
    Read Urdu text from the user and print its translation until they stop.

    The loop ends when the user types ``quit``/``exit``/``q``, declines the
    "translate another" prompt, presses Ctrl-C, or when stdin is exhausted.
    A blank line only triggers a reminder and a new prompt.

    Args:
        model: Trained model passed through to ``translate_urdu_poetry``.
        src_tokenizer: Source-side tokenizer passed through unchanged.
        tgt_tokenizer: Target-side tokenizer passed through unchanged.
    """
    exit_commands = ('quit', 'exit', 'q')

    while True:
        try:
            urdu_text = input("\n📝 Enter Urdu text: ").strip()

            if urdu_text.lower() in exit_commands:
                print("👋 Goodbye!")
                break

            # Blank input is not an exit command: remind the user and re-prompt.
            if not urdu_text:
                print("⚠️ Please enter some text!")
                continue

            print(f"\n🔄 Translating: {urdu_text}")
            print("-" * 40)

            result = translate_urdu_poetry(
                model, src_tokenizer, tgt_tokenizer,
                urdu_text, device=device
            )

            print(f"📤 Translation: {result['translation']}")
            print(f"🎯 Confidence: {result['confidence']:.3f}")
            print(f"📊 Tokens Generated: {result['tokens_generated']}")

            # An empty answer counts as "yes".
            continue_translation = input("\n🔄 Translate another text? (y/n): ").strip().lower()
            if continue_translation not in ['y', 'yes', '']:
                print("👋 Goodbye!")
                break

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the session: once stdin is closed every
            # further input() call fails again, so re-prompting would loop
            # forever.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error during translation: {e}")
            print("💡 Please try again with different text")


def interactive_translation_test():
    """
    Interactively translate user-entered Urdu text with a trained model.

    Steps:
      1. Look for ``best_model_exp_1.pth`` .. ``best_model_exp_3.pth`` in the
         current working directory; return early if none exist.
      2. Let the user pick a checkpoint (automatic when only one is found).
      3. Load the model via ``load_trained_model`` and the tokenizers from
         the first ``.pkl`` file found under
         ``/content/tokenizer_checkpoints_exp_<n>``.
      4. Run the prompt/translate loop.

    Any exception raised while loading is reported on stdout instead of being
    propagated. Returns None.
    """
    print("🎯 INTERACTIVE URDU TO ROMAN URDU TRANSLATION TEST")
    print("="*60)

    # Checkpoints for experiments 1-3 are expected next to the notebook.
    model_files = [
        (exp_num, f'best_model_exp_{exp_num}.pth')
        for exp_num in range(1, 4)
        if os.path.exists(f'best_model_exp_{exp_num}.pth')
    ]

    if not model_files:
        print("❌ No trained models found!")
        print("💡 Please run the experiments first (CELL 12) to train models")
        return

    print(f"📂 Found {len(model_files)} trained model(s):")
    for exp_num, model_path in model_files:
        print(f"  - Experiment {exp_num}: {model_path}")

    chosen_exp, chosen_model = _prompt_for_model_choice(model_files)

    print(f"\n🔄 Loading model: {chosen_model}")

    try:
        model, config = load_trained_model(chosen_model, device)

        # Collect every pickle below the experiment's tokenizer directory.
        tokenizer_dir = f"/content/tokenizer_checkpoints_exp_{chosen_exp}"
        tokenizer_files = []
        if os.path.exists(tokenizer_dir):
            tokenizer_files = [
                os.path.join(root, filename)
                for root, _dirs, filenames in os.walk(tokenizer_dir)
                for filename in filenames
                if filename.endswith('.pkl')
            ]

        if not tokenizer_files:
            print("❌ No tokenizer files found!")
            print("💡 Please ensure tokenizers were saved during training")
            return

        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only load tokenizer checkpoints that this notebook produced.
        with open(tokenizer_files[0], 'rb') as f:
            tokenizer_data = pickle.load(f)
        src_tokenizer = tokenizer_data['src_tokenizer']
        tgt_tokenizer = tokenizer_data['tgt_tokenizer']

        print("✅ Model and tokenizers loaded successfully!")
        print(f"📊 Model config: embed_dim={config.get('embed_dim', 'N/A')}, hidden_dim={config.get('hidden_dim', 'N/A')}")

        # Debug output to confirm the tokenizers match the chosen model.
        print(f"🔍 Source tokenizer vocab size: {src_tokenizer.get_vocab_size()}")
        print(f"🔍 Target tokenizer vocab size: {tgt_tokenizer.get_vocab_size()}")
        print(f"🔍 Source tokenizer sample words: {list(src_tokenizer.vocab.keys())[:10]}")
        print(f"🔍 Target tokenizer sample words: {list(tgt_tokenizer.vocab.keys())[:10]}")

        print("\n" + "="*60)
        print("🚀 READY FOR TRANSLATION!")
        print("="*60)
        print("💡 Enter Urdu text to translate (type 'quit' to exit)")
        print("💡 Example: یہ نہ تھی ہماری قسمت کہ وصال یار ہوتا")
        print("-" * 60)

        _run_interactive_translation_loop(model, src_tokenizer, tgt_tokenizer)

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print("💡 Please ensure the model file is valid and complete")

def quick_test_examples():
    """
    Translate a fixed set of Urdu verses with the first available model.

    Checkpoints ``best_model_exp_1.pth`` .. ``best_model_exp_3.pth`` are
    looked up in the current working directory and the first one found is
    used. Tokenizers come from the first ``.pkl`` file found under
    ``/content/tokenizer_checkpoints_exp_<n>``. Each verse's translation and
    confidence are printed; any exception is reported on stdout rather than
    raised. Returns None.
    """
    sample_verses = (
        "یہ نہ تھی ہماری قسمت کہ وصال یار ہوتا",
        "اگر اپنا کہا آپ ہی سمجھتے تو کیا کہتے",
        "دل سے جو بات نکلتی ہے اثر رکھتی ہے",
        "ہم کو معلوم ہے جنت کی حقیقت لیکن",
        "عشق میں غم کا مزہ بھی نہیں آتا",
    )

    print("🎯 QUICK TEST WITH PREDEFINED EXAMPLES")
    print("="*50)
    print("📝 Testing with example Urdu poetry:")
    print("-" * 50)

    # Keep only the experiment checkpoints that are present on disk.
    candidate_paths = ((exp_id, f'best_model_exp_{exp_id}.pth') for exp_id in range(1, 4))
    available_models = [entry for entry in candidate_paths if os.path.exists(entry[1])]

    if len(available_models) == 0:
        print("❌ No trained models found!")
        print("💡 Please run the experiments first (CELL 12) to train models")
        return

    # The lowest-numbered experiment wins.
    chosen_exp, chosen_model = available_models[0]
    print(f"🔄 Using model: {chosen_model}")

    try:
        model, _config = load_trained_model(chosen_model, device)

        # Gather every pickle stored below this experiment's tokenizer folder.
        tokenizer_dir = f"/content/tokenizer_checkpoints_exp_{chosen_exp}"
        pickle_paths = []
        if os.path.exists(tokenizer_dir):
            pickle_paths = [
                os.path.join(folder, filename)
                for folder, _subdirs, filenames in os.walk(tokenizer_dir)
                for filename in filenames
                if filename.endswith('.pkl')
            ]

        if not pickle_paths:
            print("❌ Tokenizer files not found!")
            return

        with open(pickle_paths[0], 'rb') as handle:
            tokenizer_data = pickle.load(handle)
        src_tokenizer = tokenizer_data['src_tokenizer']
        tgt_tokenizer = tokenizer_data['tgt_tokenizer']

        print("✅ Model loaded successfully!")
        print("-" * 50)

        # Translate the verses in order, printing each result as it arrives.
        for number, verse in enumerate(sample_verses, start=1):
            print(f"\n📝 Example {number}: {verse}")

            outcome = translate_urdu_poetry(
                model, src_tokenizer, tgt_tokenizer,
                verse, device=device
            )

            print(f"📤 Translation: {outcome['translation']}")
            print(f"🎯 Confidence: {outcome['confidence']:.3f}")
            print("-" * 30)

    except Exception as e:
        print(f"❌ Error: {e}")

# Main menu function
def main_test_menu():
    """
    Show the testing menu and run the option the user selects.

    Options:
      1 - ``interactive_translation_test()``
      2 - ``quick_test_examples()``
      3 - exit

    An unrecognised answer prints a hint and re-prompts. Ctrl-C or an
    exhausted stdin (``EOFError``) ends the menu with a goodbye message.
    Any other exception is reported and the prompt is shown again.
    Returns None.
    """
    print("\n🎯 URDU TO ROMAN URDU TRANSLATION TESTING")
    print("="*50)
    print("Choose testing option:")
    print("1. Interactive Translation (Enter your own text)")
    print("2. Quick Test (Predefined examples)")
    print("3. Exit")
    print("-" * 50)

    while True:
        try:
            choice = input("Enter your choice (1-3): ").strip()

            if choice == '1':
                interactive_translation_test()
                break
            elif choice == '2':
                quick_test_examples()
                break
            elif choice == '3':
                print("👋 Goodbye!")
                break
            else:
                print("❌ Invalid choice! Please enter 1, 2, or 3.")

        except (KeyboardInterrupt, EOFError):
            # EOFError has to leave the loop: with stdin closed, input()
            # fails on every retry, so re-prompting would never terminate.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")

# Readiness notice printed when this cell runs; the menu itself is opt-in.
print(
    "🚀 READY TO TEST YOUR TRAINED MODEL!",
    "💡 Make sure you have trained models by running CELL 12 first",
    "💡 Then run this cell to test your model interactively",
    sep="\n",
)

# Uncomment the line below to run the test menu automatically
# main_test_menu()
