# Neural Machine Translation: Urdu to Roman Urdu
## BiLSTM Encoder-Decoder Architecture

**Assignment**: Project1 - Neural Machine Translation (15 Abs)  
**Objective**: Build a sequence-to-sequence model using BiLSTM encoder-decoder to translate Urdu text into Roman Urdu transliteration.

**Architecture**:
- Encoder: 2-layer Bidirectional LSTM
- Decoder: 4-layer LSTM with additive (Bahdanau) attention over the encoder outputs
- Custom BPE Tokenization (implemented from scratch)

**Dataset**: urdu_ghazals_rekhta - Classical Urdu poetry with Roman transliterations

In [None]:
# Install required packages
%pip install torch torchtext nltk sacrebleu editdistance streamlit
%pip install matplotlib seaborn tqdm pandas numpy

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import os
import json
import pickle
from collections import Counter, defaultdict
import random
from typing import List, Tuple, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed every random number generator the notebook uses so runs are repeatable
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

In [None]:
# =============================================================================
# DATASET EXTRACTION AND SETUP
# =============================================================================

import zipfile

# Clone the dataset repository
!git clone https://github.com/amir9ume/urdu_ghazals_rekhta.git

print("🔧 EXTRACTING DATASET FROM ZIP FILE...")

# Preferred archive location inside the cloned repository, and the extraction target
zip_path = '/content/urdu_ghazals_rekhta/dataset/dataset.zip'
extract_to = '/content/dataset_extracted'

# Only treat the dataset as extracted when the target directory actually has
# content. An empty directory is what a failed earlier attempt leaves behind,
# and must not short-circuit the extraction on the next run.
if os.path.isdir(extract_to) and os.listdir(extract_to):
    print("✅ Dataset already extracted!")
else:
    print(f"📦 Extracting {zip_path}")

    zip_found = os.path.exists(zip_path)
    if not zip_found:
        print(f"❌ Zip file not found at {zip_path}")
        # Try alternative paths
        alt_paths = [
            '/content/urdu_ghazals_rekhta/dataset.zip',
            '/content/dataset/dataset.zip',
            '/content/dataset.zip'
        ]

        for alt_path in alt_paths:
            if os.path.exists(alt_path):
                zip_path = alt_path
                zip_found = True
                print(f"✅ Found zip at: {zip_path}")
                break

    if zip_found:
        # Create the target only once there is something to put into it
        os.makedirs(extract_to, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("✅ Extraction completed!")
    else:
        print("❌ No dataset archive found; nothing was extracted")

# Find the dataset directory with poets: the first directory under the
# extraction target that contains at least one of the known poet folders
dataset_path = None
for root, dirs, files in os.walk(extract_to):
    if any(poet in dirs for poet in ['mirza-ghalib', 'ahmad-faraz', 'allama-iqbal']):
        dataset_path = root
        break

if dataset_path:
    print(f"🎯 Dataset found at: {dataset_path}")
    poets = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
    print(f"📚 Found {len(poets)} poets")

    # Show the layout of one poet directory as a quick sanity check
    if poets:
        sample_poet = poets[0]
        poet_path = os.path.join(dataset_path, sample_poet)
        subdirs = os.listdir(poet_path)
        print(f"📖 Sample poet '{sample_poet}' structure: {subdirs}")
else:
    print("❌ Could not find dataset with poets!")
    # Fallback to original paths (the last option is used even if it does not exist)
    if os.path.exists('/content/urdu_ghazals_rekhta/dataset'):
        dataset_path = '/content/urdu_ghazals_rekhta/dataset'
    elif os.path.exists('urdu_ghazals_rekhta/dataset'):
        dataset_path = 'urdu_ghazals_rekhta/dataset'
    else:
        dataset_path = 'dataset/dataset'
    print(f"Using fallback path: {dataset_path}")

print(f"✅ Final dataset path: {dataset_path}")

## Data Loader

In [None]:
import os
import re
import unicodedata
from tqdm import tqdm
from typing import List, Tuple
import random

class TextCleaner:
    """
    Text cleaning and normalization utilities for the Urdu / Roman Urdu corpus.

    All methods are static; the class only groups the helpers and the
    pre-compiled patterns they share.
    """

    # Compiled once at class creation instead of on every call
    _WHITESPACE_RE = re.compile(r'\s+')
    # Everything outside the Arabic-script blocks, whitespace and a few punctuation marks
    _URDU_DISALLOWED_RE = re.compile(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s۔،؍؎؏؟!]')
    # Everything outside Latin letters (plain and transliteration diacritics),
    # whitespace, apostrophe, hyphen and full stop. Digits are NOT kept.
    _ROMAN_DISALLOWED_RE = re.compile(r'[^a-zA-ZāīūĀĪŪñṇṛṭḍṣġḥḳẓẕ\s\'\-\.]')

    @staticmethod
    def clean_urdu(text: str) -> str:
        """
        Clean and normalize Urdu text.

        Applies NFKC normalization, drops every character that is not in the
        allowed Arabic-script ranges / punctuation set, then collapses runs of
        whitespace to a single space and strips the ends.
        """
        # Normalize Unicode
        text = unicodedata.normalize('NFKC', text)

        # Remove unwanted characters but keep essential punctuation
        text = TextCleaner._URDU_DISALLOWED_RE.sub('', text)

        # Collapse whitespace AFTER removing characters: deleting a symbol
        # that stood between two spaces would otherwise leave a double space
        text = TextCleaner._WHITESPACE_RE.sub(' ', text)

        return text.strip()

    @staticmethod
    def clean_roman(text: str) -> str:
        """
        Clean and normalize Roman Urdu text.

        Composes the text to NFC, lowercases it, drops every character that is
        not an allowed Latin letter, whitespace, apostrophe, hyphen or full
        stop, then collapses whitespace and strips the ends.
        """
        # Compose base letter + combining mark into the single code points the
        # allow-list is written with, so decomposed input keeps its diacritics
        text = unicodedata.normalize('NFC', text)

        # Convert to lowercase
        text = text.lower()

        # Keep only letters, spaces, and basic punctuation
        text = TextCleaner._ROMAN_DISALLOWED_RE.sub('', text)

        # Collapse whitespace last so removed characters cannot leave gaps
        text = TextCleaner._WHITESPACE_RE.sub(' ', text)

        return text.strip()

    @staticmethod
    def add_special_tokens(text: str, is_target: bool = False) -> str:
        """
        Add special tokens for sequence processing.

        Target-side text is wrapped as "<sos> ... <eos>"; source-side text is
        returned unchanged.
        """
        if is_target:
            return f"<sos> {text} <eos>"
        return text

def load_dataset(data_path='dataset/dataset'):
    """
    Load Urdu-Roman Urdu parallel corpus from the dataset.

    Expects ``data_path`` to contain one directory per poet, each with an
    ``ur`` and an ``en`` sub-directory holding files of the same name. Files
    present in both are read as UTF-8, blank lines are dropped, and the
    remaining lines are paired by position (extra lines on the longer side
    are ignored).

    Poets and files are visited in sorted order so the returned lists are
    identical from run to run; a seeded shuffle downstream is only
    reproducible if its input order is.

    Args:
        data_path: Directory that contains the poet directories.

    Returns:
        Tuple ``(urdu_texts, roman_texts)`` of equally long lists of lines.
    """
    urdu_texts = []
    roman_texts = []

    # Get all poet directories (sorted: os.listdir order is arbitrary)
    poets = sorted(
        d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))
    )
    print(f"Found {len(poets)} poets in dataset")

    for poet in tqdm(poets, desc="Loading poets"):
        poet_path = os.path.join(data_path, poet)
        urdu_path = os.path.join(poet_path, 'ur')
        english_path = os.path.join(poet_path, 'en')

        # Skip poets that do not have both language directories
        if not (os.path.isdir(urdu_path) and os.path.isdir(english_path)):
            continue

        # Files present on both sides (sorted: set iteration order is not stable)
        common_files = sorted(set(os.listdir(urdu_path)) & set(os.listdir(english_path)))

        for file_name in common_files:
            try:
                # Read Urdu text
                with open(os.path.join(urdu_path, file_name), 'r', encoding='utf-8') as f:
                    urdu_content = f.read().strip()

                # Read Roman Urdu text
                with open(os.path.join(english_path, file_name), 'r', encoding='utf-8') as f:
                    roman_content = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                # Unreadable or wrongly encoded file: report it and move on
                print(f"Error processing {poet}/{file_name}: {e}")
                continue

            # Split into non-empty lines and pair them by position
            urdu_lines = [line.strip() for line in urdu_content.split('\n') if line.strip()]
            roman_lines = [line.strip() for line in roman_content.split('\n') if line.strip()]

            # zip stops at the shorter side, i.e. pairs only the common prefix
            for urdu_line, roman_line in zip(urdu_lines, roman_lines):
                urdu_texts.append(urdu_line)
                roman_texts.append(roman_line)

    print(f"\nDataset loaded:")
    print(f"Total pairs: {len(urdu_texts)}")

    return urdu_texts, roman_texts

def clean_and_split_data(urdu_texts: List[str], roman_texts: List[str], 
                        train_ratio: float = 0.5, val_ratio: float = 0.25, test_ratio: float = 0.25):
    """
    Normalize both sides of the corpus, filter the pairs and split them.

    A pair is kept when both cleaned sentences are non-empty and each has
    between 2 and 50 whitespace-separated words. The kept pairs are shuffled
    with the module-level ``random`` generator and cut into train / validation
    / test portions. The test portion is whatever remains after the first two
    cuts; ``test_ratio`` is accepted but not used in the computation.

    Returns:
        Tuple ``(train_pairs, val_pairs, test_pairs)`` of lists of
        ``(urdu, roman)`` string tuples.
    """
    normalizer = TextCleaner()

    def has_valid_length(sentence: str) -> bool:
        # Word count must lie in the closed range [2, 50]
        return 2 <= len(sentence.split()) <= 50

    print("Cleaning texts...")
    cleaned_urdu = [normalizer.clean_urdu(raw) for raw in tqdm(urdu_texts, desc="Cleaning Urdu")]
    cleaned_roman = [normalizer.clean_roman(raw) for raw in tqdm(roman_texts, desc="Cleaning Roman")]

    # Drop empty sentences and pairs where either side is too short or too long
    valid_pairs = [
        (urdu, roman)
        for urdu, roman in zip(cleaned_urdu, cleaned_roman)
        if urdu and roman and has_valid_length(urdu) and has_valid_length(roman)
    ]

    print(f"After cleaning and filtering: {len(valid_pairs)} valid pairs")

    # Randomize the order in place before cutting the splits
    random.shuffle(valid_pairs)

    # Compute the two cut points
    pair_count = len(valid_pairs)
    first_cut = int(pair_count * train_ratio)
    second_cut = first_cut + int(pair_count * val_ratio)

    train_pairs, val_pairs, test_pairs = (
        valid_pairs[:first_cut],
        valid_pairs[first_cut:second_cut],
        valid_pairs[second_cut:],
    )

    print(f"Data splits:")
    print(f"  Train: {len(train_pairs)} pairs")
    print(f"  Validation: {len(val_pairs)} pairs")
    print(f"  Test: {len(test_pairs)} pairs")

    return train_pairs, val_pairs, test_pairs


## BPE Tokenizer

In [None]:
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Set
import pickle

class BPETokenizer:
    """
    Byte-Pair Encoding tokenizer implemented from scratch.

    Words are split on whitespace; every word is represented as its characters
    followed by the end-of-word marker ``</w>``. Training repeatedly merges the
    most frequent adjacent symbol pair. The vocabulary maps the special
    tokens, every base symbol and every merged symbol to an integer id.
    """

    def __init__(self, vocab_size: int = 10000):
        """
        Args:
            vocab_size: Target size of the vocabulary (special tokens and base
                characters included). The vocabulary can exceed it only when
                the training data has more distinct characters than fit.
        """
        self.vocab_size = vocab_size
        self.word_freqs = Counter()
        self.vocab = {}
        self.merges = []
        self.special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

    def _get_stats(self, vocab: Dict[str, int]) -> Dict[Tuple[str, str], int]:
        """
        Get frequency of consecutive symbol pairs.

        ``vocab`` maps space-separated symbol sequences (one per word type) to
        the frequency of that word.
        """
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq
        return pairs

    def _merge_vocab(self, pair: Tuple[str, str], vocab: Dict[str, int]) -> Dict[str, int]:
        """
        Return a copy of ``vocab`` in which every occurrence of ``pair`` as two
        adjacent whole symbols is replaced by their concatenation.
        """
        new_vocab = {}
        bigram = re.escape(' '.join(pair))
        # Look-arounds make sure only whole symbols match, never symbol suffixes/prefixes
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        merged = ''.join(pair)

        for word, freq in vocab.items():
            new_vocab[p.sub(merged, word)] = freq
        return new_vocab

    def _merge_ranks(self) -> Dict[Tuple[str, str], int]:
        """
        Return a ``pair -> priority`` lookup for ``self.merges`` (lower = learned earlier).

        The table is cached and rebuilt whenever ``self.merges`` is replaced or
        changes length, so it also works after ``load`` or direct assignment.
        """
        key = (id(self.merges), len(self.merges))
        cached = getattr(self, '_rank_cache', None)
        if cached is None or cached[0] != key:
            ranks = {}
            for index, pair in enumerate(self.merges):
                ranks.setdefault(tuple(pair), index)
            cached = (key, ranks)
            self._rank_cache = cached
        return cached[1]

    def train(self, texts: List[str]):
        """
        Train BPE on the given texts.

        Resets any previously learned state, counts word frequencies, then
        learns merges until the vocabulary reaches ``vocab_size`` or no pair
        is left to merge.
        """
        print("Training BPE tokenizer...")

        # Start from a clean state so repeated calls do not stack merges
        self.word_freqs = Counter()
        self.merges = []
        specials = set(self.special_tokens)

        # Initialize word frequencies (special tokens are atomic, never split)
        for text in texts:
            for word in text.split():
                if word not in specials:
                    self.word_freqs[word] += 1

        # Initialize the working vocabulary with character-level splits
        vocab = {}
        for word, freq in self.word_freqs.items():
            # Split word into characters and add end-of-word token
            vocab[' '.join(list(word)) + ' </w>'] = freq

        # The token vocabulary holds SYMBOLS (not whole words): special tokens
        # first, then every base symbol in sorted order for determinism
        base_symbols = {symbol for word in vocab for symbol in word.split()}
        base_symbols.add('</w>')
        self.vocab = {token: index for index, token in enumerate(self.special_tokens)}
        for symbol in sorted(base_symbols):
            self.vocab.setdefault(symbol, len(self.vocab))

        # Every merge adds at most one symbol, so this bounds the final size
        num_merges = max(0, self.vocab_size - len(self.vocab))

        for i in range(num_merges):
            pairs = self._get_stats(vocab)
            if not pairs:
                break

            best_pair = max(pairs, key=pairs.get)
            vocab = self._merge_vocab(best_pair, vocab)
            self.merges.append(best_pair)
            self.vocab.setdefault(''.join(best_pair), len(self.vocab))

            if (i + 1) % 1000 == 0:
                print(f"Merged {i + 1}/{num_merges} pairs")

        print(f"BPE training completed. Vocabulary size: {len(self.vocab)}")

    def _get_word_tokens(self, word: str) -> List[str]:
        """
        Tokenize a single word using learned BPE merges.

        Special tokens are returned as-is. Any other word is split into
        characters plus ``</w>`` and the learned merges are applied in the
        order they were learned.
        """
        # Only special tokens bypass BPE; an ordinary word that happens to
        # equal a non-final subword must still receive its </w> marker
        if word in self.special_tokens:
            return [word]

        symbols = list(word) + ['</w>']
        ranks = self._merge_ranks()
        unranked = float('inf')

        while len(symbols) > 1:
            pairs = self._get_word_pairs(symbols)
            # Apply the earliest-learned merge that is present in the word
            best = min(pairs, key=lambda pair: ranks.get(pair, unranked))
            if best not in ranks:
                break

            first, second = best
            merged = []
            i = 0
            while i < len(symbols):
                if i < len(symbols) - 1 and symbols[i] == first and symbols[i + 1] == second:
                    merged.append(first + second)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            symbols = merged

        return symbols

    def _get_word_pairs(self, word) -> Set[Tuple[str, str]]:
        """
        Get all adjacent symbol pairs from a word.

        ``word`` is either a list of symbols or a space-separated string of
        symbols. An empty word yields an empty set.
        """
        if isinstance(word, str):
            word = word.split()
        return set(zip(word, word[1:]))

    def encode(self, text: str) -> List[int]:
        """
        Encode text to token IDs.

        Symbols that are not in the vocabulary map to the ``<unk>`` id.
        """
        unk_id = self.vocab['<unk>']
        tokens = []

        for word in text.split():
            for token in self._get_word_tokens(word):
                tokens.append(self.vocab.get(token, unk_id))

        return tokens

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to text.

        Unknown ids and special tokens are dropped. Subwords are concatenated
        directly and each ``</w>`` marker becomes a word boundary.
        """
        # Create reverse vocabulary
        id_to_token = {v: k for k, v in self.vocab.items()}
        specials = set(self.special_tokens)

        pieces = []
        for token_id in token_ids:
            # int() lets numpy integers and 0-dim tensors be used as ids too
            token = id_to_token.get(int(token_id))
            if token is not None and token not in specials:
                pieces.append(token)

        # Subwords of one word must touch; only </w> introduces a space
        text = ''.join(pieces).replace('</w>', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def save(self, filepath: str):
        """
        Save tokenizer state (vocab, merges, vocab_size, special tokens) to a pickle file.
        """
        with open(filepath, 'wb') as f:
            pickle.dump({
                'vocab': self.vocab,
                'merges': self.merges,
                'vocab_size': self.vocab_size,
                'special_tokens': self.special_tokens
            }, f)

    def load(self, filepath: str):
        """
        Load tokenizer state from a file written by ``save``.

        SECURITY: unpickling executes code embedded in the file; only load
        tokenizer files from a trusted source.
        """
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        self.vocab = data['vocab']
        self.merges = data['merges']
        self.vocab_size = data['vocab_size']
        self.special_tokens = data['special_tokens']
        # Force the merge-priority table to be rebuilt for the loaded merges
        self._rank_cache = None

    def get_vocab_size(self) -> int:
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

def create_tokenizers(train_pairs: List[Tuple[str, str]], 
                     src_vocab_size: int = 8000, 
                     tgt_vocab_size: int = 8000) -> Tuple[BPETokenizer, BPETokenizer]:
    """
    Build one BPE tokenizer per language and train each on its side of the pairs.

    The first element of every pair feeds the source (Urdu) tokenizer, the
    second element feeds the target (Roman Urdu) tokenizer.

    Returns:
        Tuple ``(src_tokenizer, tgt_tokenizer)`` of trained tokenizers.
    """
    # Separate the two sides of the parallel corpus
    src_texts = []
    tgt_texts = []
    for pair in train_pairs:
        src_texts.append(pair[0])
        tgt_texts.append(pair[1])

    # Source side
    src_tokenizer = BPETokenizer(vocab_size=src_vocab_size)
    print("Training source (Urdu) tokenizer...")
    src_tokenizer.train(src_texts)

    # Target side
    tgt_tokenizer = BPETokenizer(vocab_size=tgt_vocab_size)
    print("Training target (Roman Urdu) tokenizer...")
    tgt_tokenizer.train(tgt_texts)

    return src_tokenizer, tgt_tokenizer


## Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import random

class BiLSTMEncoder(nn.Module):
    """
    2-layer Bidirectional LSTM Encoder.

    Embeds the source token ids, runs them through a bidirectional LSTM and
    returns the per-position outputs together with the last layer's final
    forward/backward states concatenated along the feature axis.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, 
                 num_layers: int = 2, dropout: float = 0.3):
        """
        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Embedding width.
            hidden_dim: LSTM hidden width per direction.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout on the embeddings and between LSTM layers.
        """
        super(BiLSTMEncoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer just triggers a warning
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, 
                           batch_first=True, bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0.0)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None):
        """
        Args:
            x: Token ids, shape (batch_size, seq_len).
            lengths: Optional true lengths per row (tensor on any device, or a
                sequence of ints). When given, padding is skipped via packing.

        Returns:
            ``(output, (final_hidden, final_cell))`` where output has shape
            (batch_size, seq_len, hidden_dim * 2) and both final states have
            shape (batch_size, hidden_dim * 2).
        """
        # x: (batch_size, seq_len)
        embedded = self.dropout(self.embedding(x))

        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU, even when
            # the data lives on the GPU
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, cpu_lengths, batch_first=True, enforce_sorted=False)
            output, (hidden, cell) = self.lstm(packed)
            # total_length keeps the time axis as wide as the input so that
            # masks built from the input shape stay aligned with the output
            output, _ = nn.utils.rnn.pad_packed_sequence(
                output, batch_first=True, total_length=x.size(1))
        else:
            output, (hidden, cell) = self.lstm(embedded)

        # output: (batch_size, seq_len, hidden_dim * 2)
        # hidden: (num_layers * 2, batch_size, hidden_dim)
        # cell: (num_layers * 2, batch_size, hidden_dim)

        # Combine forward and backward states of the last layer: the final two
        # entries along dim 0 are that layer's forward and backward directions
        final_hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        final_cell = torch.cat([cell[-2], cell[-1]], dim=1)

        return output, (final_hidden, final_cell)

class LSTMDecoder(nn.Module):
    """
    4-layer LSTM Decoder with Attention.

    ``encoder_hidden_dim`` is the full feature width of the encoder outputs
    and of the encoder's final state (for a bidirectional encoder: twice its
    per-direction hidden size). The attention module, the LSTM input and the
    state projections are all sized from that one value.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, 
                 encoder_hidden_dim: int, num_layers: int = 4, dropout: float = 0.3):
        """
        Args:
            vocab_size: Size of the target vocabulary.
            embed_dim: Embedding width.
            hidden_dim: Decoder LSTM hidden width.
            encoder_hidden_dim: Feature width of the encoder outputs / final state.
            num_layers: Number of stacked decoder LSTM layers.
            dropout: Dropout on the embeddings and between LSTM layers.
        """
        super(LSTMDecoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Project encoder hidden state to decoder hidden state size
        self.hidden_projection = nn.Linear(encoder_hidden_dim, hidden_dim)
        self.cell_projection = nn.Linear(encoder_hidden_dim, hidden_dim)

        # Attention over encoder outputs. encoder_hidden_dim already is the
        # width of those outputs, so it must not be doubled a second time
        self.attention = AttentionMechanism(hidden_dim, encoder_hidden_dim)

        # LSTM layers; the input is [embedding ; attention context]
        self.lstm = nn.LSTM(embed_dim + encoder_hidden_dim, hidden_dim, 
                           num_layers, batch_first=True,
                           dropout=dropout if num_layers > 1 else 0.0)

        # Output projection
        self.output_projection = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden, cell, encoder_outputs, mask=None):
        """
        Run one decoding step.

        Args:
            input_token: (batch_size, 1) ids of the previous target token.
            hidden: (num_layers, batch_size, hidden_dim) decoder hidden state.
            cell: (num_layers, batch_size, hidden_dim) decoder cell state.
            encoder_outputs: (batch_size, seq_len, encoder_hidden_dim).
            mask: Optional (batch_size, seq_len); zero marks positions the
                attention must ignore.

        Returns:
            ``(output, hidden, cell, attention_weights)`` with output of shape
            (batch_size, 1, vocab_size) and attention_weights of shape
            (batch_size, seq_len).
        """
        embedded = self.dropout(self.embedding(input_token))
        # embedded: (batch_size, 1, embed_dim)

        # Attend with the top layer's hidden state as the query
        context, attention_weights = self.attention(
            hidden[-1].unsqueeze(1), encoder_outputs, mask)
        # context: (batch_size, 1, encoder_hidden_dim)

        # Concatenate embedding and context
        lstm_input = torch.cat([embedded, context], dim=2)
        # lstm_input: (batch_size, 1, embed_dim + encoder_hidden_dim)

        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        # output: (batch_size, 1, hidden_dim)

        # Project to vocabulary size
        output = self.output_projection(output)
        # output: (batch_size, 1, vocab_size)

        return output, hidden, cell, attention_weights

class AttentionMechanism(nn.Module):
    """
    Additive (Bahdanau) Attention Mechanism.

    Scores every encoder position against the decoder query with
    ``v^T tanh(W_d q + W_e h)`` and returns the softmax-weighted sum of the
    encoder outputs.
    """

    def __init__(self, decoder_hidden_dim: int, encoder_hidden_dim: int):
        """
        Args:
            decoder_hidden_dim: Width of the decoder query vector.
            encoder_hidden_dim: Feature width of the encoder outputs.
        """
        super(AttentionMechanism, self).__init__()
        self.decoder_projection = nn.Linear(decoder_hidden_dim, encoder_hidden_dim)
        self.encoder_projection = nn.Linear(encoder_hidden_dim, encoder_hidden_dim)
        self.attention_projection = nn.Linear(encoder_hidden_dim, 1)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """
        Args:
            decoder_hidden: (batch_size, 1, decoder_hidden_dim) query.
            encoder_outputs: (batch_size, seq_len, encoder_hidden_dim).
            mask: Optional (batch_size, seq_len); positions equal to zero are
                excluded from the attention distribution.

        Returns:
            ``(context, attention_weights)`` of shapes
            (batch_size, 1, encoder_hidden_dim) and (batch_size, seq_len).
        """
        # Project decoder hidden state
        decoder_proj = self.decoder_projection(decoder_hidden)
        # decoder_proj: (batch_size, 1, encoder_hidden_dim)

        # Project encoder outputs
        encoder_proj = self.encoder_projection(encoder_outputs)
        # encoder_proj: (batch_size, seq_len, encoder_hidden_dim)

        # Calculate attention scores (the query broadcasts over seq_len)
        energy = torch.tanh(decoder_proj + encoder_proj)
        # energy: (batch_size, seq_len, encoder_hidden_dim)

        attention_scores = self.attention_projection(energy).squeeze(2)
        # attention_scores: (batch_size, seq_len)

        # Apply mask if provided. The dtype's most negative finite value is
        # used so the fill is representable in reduced precision as well,
        # and a fully masked row yields a uniform distribution, not NaN
        if mask is not None:
            attention_scores = attention_scores.masked_fill(
                mask == 0, torch.finfo(attention_scores.dtype).min)

        # Apply softmax to get attention weights
        attention_weights = F.softmax(attention_scores, dim=1)
        # attention_weights: (batch_size, seq_len)

        # Calculate context vector
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        # context: (batch_size, 1, encoder_hidden_dim)

        return context, attention_weights

class Seq2SeqModel(nn.Module):
    """
    Complete Sequence-to-Sequence Model with BiLSTM Encoder and LSTM Decoder.

    The encoder's final state is projected to the decoder width and copied to
    every decoder layer; the decoder then generates one token per step while
    attending over the encoder outputs.
    """

    # Id used for padding; matches the embeddings' padding_idx
    _PAD_ID = 0

    def __init__(self, src_vocab_size: int, tgt_vocab_size: int, 
                 embed_dim: int = 256, hidden_dim: int = 512, 
                 encoder_layers: int = 2, decoder_layers: int = 4,
                 dropout: float = 0.3):
        """
        Args:
            src_vocab_size: Size of the source vocabulary.
            tgt_vocab_size: Size of the target vocabulary.
            embed_dim: Embedding width for both encoder and decoder.
            hidden_dim: LSTM hidden width (per direction in the encoder).
            encoder_layers: Number of encoder LSTM layers.
            decoder_layers: Number of decoder LSTM layers.
            dropout: Dropout probability passed to encoder and decoder.
        """
        super(Seq2SeqModel, self).__init__()

        self.encoder = BiLSTMEncoder(
            vocab_size=src_vocab_size,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            num_layers=encoder_layers,
            dropout=dropout
        )

        self.decoder = LSTMDecoder(
            vocab_size=tgt_vocab_size,
            embed_dim=embed_dim,
            hidden_dim=hidden_dim,
            encoder_hidden_dim=hidden_dim * 2,  # Bidirectional
            num_layers=decoder_layers,
            dropout=dropout
        )

        self.tgt_vocab_size = tgt_vocab_size

    def _init_decoder_state(self, hidden, cell):
        """Project the encoder's final state and replicate it for every decoder layer."""
        layers = self.decoder.num_layers
        decoder_hidden = self.decoder.hidden_projection(hidden).unsqueeze(0).repeat(layers, 1, 1)
        decoder_cell = self.decoder.cell_projection(cell).unsqueeze(0).repeat(layers, 1, 1)
        return decoder_hidden, decoder_cell

    @staticmethod
    def _attention_mask(encoder_outputs, lengths):
        """Build a (batch, time) float mask with 1 on real positions and 0 on padding."""
        batch_size, time_steps = encoder_outputs.shape[:2]
        device = encoder_outputs.device
        if lengths is None:
            return torch.ones(batch_size, time_steps, device=device)
        lengths = torch.as_tensor(lengths, device=device)
        positions = torch.arange(time_steps, device=device)
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, src, tgt, src_lengths=None, teacher_forcing_ratio=0.5):
        """
        Compute decoder logits for a batch of target sequences.

        Args:
            src: (batch_size, src_seq_len) source token ids.
            tgt: (batch_size, tgt_seq_len) target token ids, starting with SOS.
            src_lengths: Optional true source lengths (tensor on any device or
                a sequence of ints).
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the gold token instead of the
                model's own prediction.

        Returns:
            (batch_size, tgt_seq_len, tgt_vocab_size) logits; position 0 is
            left as zeros because nothing is predicted for the SOS slot.
        """
        batch_size = src.size(0)
        tgt_len = tgt.size(1)

        # Sequence packing needs the lengths on the CPU regardless of where
        # the batch itself lives
        cpu_lengths = None if src_lengths is None else torch.as_tensor(src_lengths).cpu()

        # Encoder
        encoder_outputs, (hidden, cell) = self.encoder(src, cpu_lengths)

        # Mask sized from the encoder outputs so both always share a time axis
        mask = self._attention_mask(encoder_outputs, cpu_lengths)

        # Initialize decoder hidden states
        decoder_hidden, decoder_cell = self._init_decoder_state(hidden, cell)

        # Prepare outputs tensor
        outputs = torch.zeros(batch_size, tgt_len, self.tgt_vocab_size, device=src.device)

        # First input to decoder is SOS token
        input_token = tgt[:, 0].unsqueeze(1)  # (batch_size, 1)

        for t in range(1, tgt_len):
            output, decoder_hidden, decoder_cell, _ = self.decoder(
                input_token, decoder_hidden, decoder_cell, encoder_outputs, mask)

            outputs[:, t] = output.squeeze(1)

            # Teacher forcing
            if random.random() < teacher_forcing_ratio:
                input_token = tgt[:, t].unsqueeze(1)
            else:
                input_token = output.argmax(2)

        return outputs

    def inference(self, src, src_tokenizer, tgt_tokenizer, max_length=50):
        """
        Greedy decoding for a batch of source sequences.

        Switches the model to eval mode (it is not switched back). Source
        padding is detected as id 0 and assumed to be trailing; it is neither
        encoded nor attended to. Each row stops at its own EOS: the EOS token
        is kept and all later positions of that row are filled with the pad id.

        Args:
            src: (batch_size, src_seq_len) source token ids.
            src_tokenizer: Unused; kept for interface compatibility.
            tgt_tokenizer: Provides the ``<sos>`` and ``<eos>`` ids via ``.vocab``.
            max_length: Maximum number of tokens to generate per row.

        Returns:
            (batch_size, generated_len) tensor of token ids.
        """
        self.eval()
        with torch.no_grad():
            batch_size = src.size(0)
            device = src.device

            # True lengths from the padding id; at least 1 so packing accepts
            # rows that consist of padding only
            cpu_lengths = (src != self._PAD_ID).sum(dim=1).clamp(min=1).cpu()

            # Encoder
            encoder_outputs, (hidden, cell) = self.encoder(src, cpu_lengths)

            # Create mask
            mask = self._attention_mask(encoder_outputs, cpu_lengths)

            # Initialize decoder
            decoder_hidden, decoder_cell = self._init_decoder_state(hidden, cell)

            # Start with SOS token
            sos_id = tgt_tokenizer.vocab['<sos>']
            eos_id = tgt_tokenizer.vocab['<eos>']

            input_token = torch.full((batch_size, 1), sos_id, dtype=torch.long, device=device)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

            outputs = []

            for _ in range(max_length):
                output, decoder_hidden, decoder_cell, _ = self.decoder(
                    input_token, decoder_hidden, decoder_cell, encoder_outputs, mask)

                predicted = output.argmax(2)
                # Rows that already produced EOS emit padding from now on
                predicted = predicted.masked_fill(finished.unsqueeze(1), self._PAD_ID)
                outputs.append(predicted)

                finished = finished | (predicted.squeeze(1) == eos_id)
                input_token = predicted

                # Stop once every sequence has generated EOS
                if finished.all():
                    break

            # Concatenate outputs
            if outputs:
                outputs = torch.cat(outputs, dim=1)
            else:
                outputs = torch.empty(batch_size, 0, dtype=torch.long, device=device)

            return outputs


## Dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from typing import List, Tuple
import random

class TranslationDataset(Dataset):
    """
    Map-style dataset of pre-tokenized Urdu / Roman Urdu sentence pairs.

    Every pair is encoded once at construction time. The target side is
    wrapped as "<sos> ... <eos>" before encoding. Pairs for which either side
    encodes to an empty id list are left out.
    """

    def __init__(self, pairs: List[Tuple[str, str]], src_tokenizer, tgt_tokenizer):
        self.pairs = pairs
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

        # Encode everything up front so __getitem__ only builds tensors
        self.tokenized_pairs = []
        for source_sentence, target_sentence in pairs:
            source_ids = src_tokenizer.encode(source_sentence)
            target_ids = tgt_tokenizer.encode(f"<sos> {target_sentence} <eos>")

            # Skip pairs where one side produced no tokens
            if len(source_ids) == 0 or len(target_ids) == 0:
                continue
            self.tokenized_pairs.append((source_ids, target_ids))

    def __len__(self):
        """Number of usable (non-empty) pairs."""
        return len(self.tokenized_pairs)

    def __getitem__(self, idx):
        """Return the idx-th pair as long tensors together with both lengths."""
        source_ids, target_ids = self.tokenized_pairs[idx]
        return dict(
            src=torch.tensor(source_ids, dtype=torch.long),
            tgt=torch.tensor(target_ids, dtype=torch.long),
            src_len=len(source_ids),
            tgt_len=len(target_ids),
        )

def collate_fn(batch):
    """
    Merge a list of dataset samples into one padded batch.

    Source and target id tensors are right-padded with 0 to the longest
    sequence in the batch; the original lengths are returned as long tensors.
    """
    def column(key):
        # Collect one field from every sample, keeping batch order
        return [sample[key] for sample in batch]

    return {
        'src': pad_sequence(column('src'), batch_first=True, padding_value=0),
        'tgt': pad_sequence(column('tgt'), batch_first=True, padding_value=0),
        'src_lengths': torch.tensor(column('src_len'), dtype=torch.long),
        'tgt_lengths': torch.tensor(column('tgt_len'), dtype=torch.long),
    }

def create_data_loaders(train_pairs, val_pairs, test_pairs, src_tokenizer, tgt_tokenizer, 
                       batch_size=32, num_workers=0):
    """
    Tokenize the three splits and wrap each one in a DataLoader.

    Only the training loader shuffles; validation and test keep their order.
    All loaders use ``collate_fn`` for padding.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # Build the datasets first, in train / validation / test order
    datasets = [
        TranslationDataset(split_pairs, src_tokenizer, tgt_tokenizer)
        for split_pairs in (train_pairs, val_pairs, test_pairs)
    ]

    # One loader per dataset; the flag says whether that split is shuffled
    shuffle_flags = (True, False, False)
    train_loader, val_loader, test_loader = [
        DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle,
            collate_fn=collate_fn, num_workers=num_workers
        )
        for dataset, shuffle in zip(datasets, shuffle_flags)
    ]

    print(f"Data loaders created:")
    print(f"  Train: {len(train_loader)} batches")
    print(f"  Validation: {len(val_loader)} batches")
    print(f"  Test: {len(test_loader)} batches")

    return train_loader, val_loader, test_loader


## Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import os

class Trainer:
    """
    Training harness for the Seq2Seq model.

    Runs training epochs and validation passes, steps a ReduceLROnPlateau
    scheduler on the validation loss, saves a checkpoint whenever the
    validation loss improves, stops early after a fixed number of
    non-improving epochs, and keeps per-epoch loss / perplexity histories
    for plotting.
    """
    
    def __init__(self, model, train_loader, val_loader, src_tokenizer, tgt_tokenizer, 
                 lr=1e-3, device='cuda'):
        """
        Args:
            model: the network to train; it is moved to ``device``.
            train_loader: iterable of training batches (dicts with 'src',
                'tgt' and 'src_lengths' tensors).
            val_loader: iterable of validation batches, same layout.
            src_tokenizer: stored on the instance; not used by the trainer itself.
            tgt_tokenizer: stored on the instance; not used by the trainer itself.
            lr: Adam learning rate.
            device: device the model and batches are moved to.
        """
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.device = device
        
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', patience=3, factor=0.5)
        
        # Per-epoch histories, appended to by train()
        self.train_losses = []
        self.val_losses = []
        self.train_perplexities = []
        self.val_perplexities = []

    @staticmethod
    def _perplexity(total_nll, total_tokens):
        """
        Return exp(total_nll / total_tokens) as a plain float.

        ``total_nll`` is the summed token-level loss and ``total_tokens`` the
        number of tokens it was summed over. Returns ``inf`` when no tokens
        were scored, instead of dividing by zero.
        """
        if total_tokens <= 0:
            return float('inf')
        return float(np.exp(total_nll / total_tokens))

    def _batch_loss(self, batch, teacher_forcing_ratio):
        """
        Run the model on one batch and return ``(loss, n_tokens)``.

        ``loss`` is the criterion output (a tensor, still attached to the
        graph); ``n_tokens`` is the number of non-padding (non-zero) target
        positions the loss was averaged over.
        """
        src = batch['src'].to(self.device)
        tgt = batch['tgt'].to(self.device)
        src_lengths = batch['src_lengths'].to(self.device)

        outputs = self.model(src, tgt, src_lengths, teacher_forcing_ratio)

        # Drop position 0 (the SOS slot) from both predictions and targets,
        # then flatten to (N, vocab) / (N,) for the criterion.
        logits = outputs[:, 1:].contiguous().view(-1, outputs.size(-1))
        targets = tgt[:, 1:].contiguous().view(-1)

        loss = self.criterion(logits, targets)
        n_tokens = targets.ne(0).sum().item()  # Count non-padding tokens
        return loss, n_tokens
        
    def train_epoch(self, epoch, teacher_forcing_ratio=0.5):
        """
        Train for one epoch.

        Args:
            epoch: epoch number, used only for the progress-bar label.
            teacher_forcing_ratio: forwarded to the model's forward call.

        Returns:
            (avg_loss, perplexity): the mean of the per-batch losses, and
            exp of the token-weighted mean loss over the whole epoch.
        """
        self.model.train()
        total_loss = 0.0   # sum of per-batch mean losses
        total_nll = 0.0    # sum of token-level losses (batch mean * token count)
        total_tokens = 0
        
        pbar = tqdm(self.train_loader, desc=f'Epoch {epoch}')
        for batch in pbar:
            self.optimizer.zero_grad()
            
            # Forward pass
            loss, n_tokens = self._batch_loss(batch, teacher_forcing_ratio)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            
            # Statistics. The criterion averages over non-padding tokens, so
            # multiplying by the token count recovers the summed loss.
            batch_loss = loss.item()
            total_loss += batch_loss
            total_nll += batch_loss * n_tokens
            total_tokens += n_tokens
            
            pbar.set_postfix({'loss': batch_loss})
        
        avg_loss = total_loss / len(self.train_loader)
        perplexity = self._perplexity(total_nll, total_tokens)
        
        return avg_loss, perplexity
    
    def validate(self):
        """
        Evaluate the model on the validation loader without gradient tracking.

        The model is called with ``teacher_forcing_ratio=0``.

        Returns:
            (avg_loss, perplexity): the mean of the per-batch losses, and
            exp of the token-weighted mean loss over the validation set.
        """
        self.model.eval()
        total_loss = 0.0
        total_nll = 0.0
        total_tokens = 0
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc='Validation'):
                loss, n_tokens = self._batch_loss(batch, 0)
                
                batch_loss = loss.item()
                total_loss += batch_loss
                total_nll += batch_loss * n_tokens
                total_tokens += n_tokens
        
        avg_loss = total_loss / len(self.val_loader)
        perplexity = self._perplexity(total_nll, total_tokens)
        
        return avg_loss, perplexity
    
    def train(self, num_epochs, save_path='best_model.pth'):
        """
        Full training loop.

        For each epoch: train, validate, step the LR scheduler on the
        validation loss, record metrics, and write a checkpoint to
        ``save_path`` when the validation loss improves. Stops early after
        5 consecutive epochs without improvement.

        Args:
            num_epochs: maximum number of epochs to run.
            save_path: file the best checkpoint is written to.

        Returns:
            (train_losses, val_losses): the per-epoch loss histories.
        """
        best_val_loss = float('inf')
        patience_counter = 0
        max_patience = 5
        
        print(f"Starting training for {num_epochs} epochs...")
        print(f"Device: {self.device}")
        print(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")
        
        for epoch in range(1, num_epochs + 1):
            start_time = time.time()
            
            # Training
            train_loss, train_perplexity = self.train_epoch(epoch)
            
            # Validation
            val_loss, val_perplexity = self.validate()
            
            # Learning rate scheduling
            self.scheduler.step(val_loss)
            
            # Save metrics
            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)
            self.train_perplexities.append(train_perplexity)
            self.val_perplexities.append(val_perplexity)
            
            epoch_time = time.time() - start_time
            
            print(f'Epoch {epoch}/{num_epochs}:')
            print(f'  Train Loss: {train_loss:.4f}, Train Perplexity: {train_perplexity:.4f}')
            print(f'  Val Loss: {val_loss:.4f}, Val Perplexity: {val_perplexity:.4f}')
            print(f'  Time: {epoch_time:.2f}s, LR: {self.optimizer.param_groups[0]["lr"]:.6f}')
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'train_losses': self.train_losses,
                    'val_losses': self.val_losses,
                    'train_perplexities': self.train_perplexities,
                    'val_perplexities': self.val_perplexities,
                    'best_val_loss': best_val_loss
                }, save_path)
                print(f'  New best model saved! Val Loss: {val_loss:.4f}')
            else:
                patience_counter += 1
                
            # Early stopping
            if patience_counter >= max_patience:
                print(f'Early stopping after {epoch} epochs')
                break
                
            print('-' * 60)
        
        print('Training completed!')
        return self.train_losses, self.val_losses
    
    def plot_training_curves(self, save_path='training_curves.png'):
        """
        Plot the recorded loss and perplexity histories side by side.

        The figure is written to ``save_path`` and then shown.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
        
        # Loss curves
        ax1.plot(self.train_losses, label='Train Loss', color='blue')
        ax1.plot(self.val_losses, label='Validation Loss', color='red')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.set_title('Training and Validation Loss')
        ax1.legend()
        ax1.grid(True)
        
        # Perplexity curves
        ax2.plot(self.train_perplexities, label='Train Perplexity', color='blue')
        ax2.plot(self.val_perplexities, label='Validation Perplexity', color='red')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Perplexity')
        ax2.set_title('Training and Validation Perplexity')
        ax2.legend()
        ax2.grid(True)
        
        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

def load_model(model_class, model_path, src_vocab_size, tgt_vocab_size, device='cuda', **model_kwargs):
    """
    Rebuild a model and restore its weights from a checkpoint file.

    Args:
        model_class: callable that builds the model; called with
            ``src_vocab_size``, ``tgt_vocab_size`` and ``model_kwargs``.
        model_path: path of the checkpoint file to read.
        src_vocab_size: forwarded to ``model_class``.
        tgt_vocab_size: forwarded to ``model_class``.
        device: used as ``map_location`` for loading and as the device the
            model is moved to.
        **model_kwargs: extra keyword arguments forwarded to ``model_class``.

    Returns:
        (model, checkpoint): the restored model and the full checkpoint
        object read from ``model_path``.
    """
    constructor_kwargs = {
        'src_vocab_size': src_vocab_size,
        'tgt_vocab_size': tgt_vocab_size,
        **model_kwargs,
    }
    model = model_class(**constructor_kwargs)

    # The weights live under the 'model_state_dict' key of the checkpoint.
    checkpoint = torch.load(model_path, map_location=device)
    saved_weights = checkpoint['model_state_dict']
    model.load_state_dict(saved_weights)
    model.to(device)

    return model, checkpoint


## Evaluation

In [None]:
import torch
import numpy as np
from tqdm import tqdm
import editdistance
from collections import Counter
import math
import re

class Evaluator:
    """
    Evaluation metrics for translation quality: BLEU, character error rate
    and perplexity, plus helpers to generate and print sample translations.
    """
    
    def __init__(self, model, test_loader, src_tokenizer, tgt_tokenizer, device='cuda'):
        """
        Args:
            model: trained model; must provide ``inference`` and be callable
                on (src, tgt, src_lengths, teacher_forcing_ratio=...).
            test_loader: iterable of batches (dicts with 'src', 'tgt' and
                'src_lengths' tensors).
            src_tokenizer: passed through to ``model.inference``.
            tgt_tokenizer: provides ``vocab['<eos>']`` and ``decode``.
            device: device batches are moved to. The model itself is not
                moved here.
        """
        self.model = model
        self.test_loader = test_loader
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.device = device

    @staticmethod
    def _truncate_at_eos(token_ids, eos_id):
        """Return the prefix of ``token_ids`` (a numpy array) before the first ``eos_id``."""
        eos_positions = np.where(token_ids == eos_id)[0]
        if len(eos_positions) > 0:
            return token_ids[:eos_positions[0]]
        return token_ids
    
    def calculate_bleu(self, references, hypotheses, n=4):
        """
        Calculate corpus-level BLEU over whitespace-separated tokens.

        Clipped n-gram matches and hypothesis n-gram totals are pooled over
        all sentence pairs before each precision is computed, so a correct
        sentence shorter than ``n`` tokens does not pull the score down.

        Args:
            references: list of reference strings.
            hypotheses: list of hypothesis strings, aligned with references.
            n: highest n-gram order (uniform weights over orders 1..n).

        Returns:
            BLEU in [0, 1]; 0.0 if any n-gram order has no matches or the
            inputs are empty.
        """
        def ngram_counts(tokens, order):
            # An empty range (sentence shorter than `order`) yields an empty Counter.
            return Counter(
                tuple(tokens[i:i + order]) for i in range(len(tokens) - order + 1)
            )

        pairs = [(ref.split(), hyp.split()) for ref, hyp in zip(references, hypotheses)]

        # Modified precision for each n-gram order, pooled over the corpus
        precisions = []
        for order in range(1, n + 1):
            matched = 0
            total = 0
            for ref_tokens, hyp_tokens in pairs:
                ref_ngrams = ngram_counts(ref_tokens, order)
                hyp_ngrams = ngram_counts(hyp_tokens, order)
                # Clip each hypothesis n-gram count by its count in the reference
                matched += sum(
                    min(count, ref_ngrams[ngram]) for ngram, count in hyp_ngrams.items()
                )
                total += sum(hyp_ngrams.values())
            precisions.append(matched / total if total else 0.0)

        if not precisions or any(p == 0 for p in precisions):
            return 0.0

        # Brevity penalty on total corpus lengths
        ref_len = sum(len(ref_tokens) for ref_tokens, _ in pairs)
        hyp_len = sum(len(hyp_tokens) for _, hyp_tokens in pairs)

        if hyp_len > ref_len:
            bp = 1.0
        else:
            bp = math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0

        geometric_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))
        return bp * geometric_mean
    
    def calculate_cer(self, references, hypotheses):
        """
        Calculate Character Error Rate.

        Spaces are removed from both sides, then the total edit distance is
        divided by the total number of reference characters.

        Returns:
            The error rate, or 1.0 when the references contain no characters.
        """
        total_chars = 0
        total_errors = 0
        
        for ref, hyp in zip(references, hypotheses):
            ref_chars = list(ref.replace(' ', ''))
            hyp_chars = list(hyp.replace(' ', ''))
            
            total_chars += len(ref_chars)
            total_errors += editdistance.eval(ref_chars, hyp_chars)
        
        return total_errors / total_chars if total_chars > 0 else 1.0
    
    def calculate_perplexity(self):
        """
        Calculate perplexity on the test set.

        The model is called with ``teacher_forcing_ratio=0``. Perplexity is
        exp of the token-weighted mean cross-entropy over non-padding target
        tokens (padding id 0 is ignored).

        Returns:
            The perplexity as a float, or ``inf`` if no tokens were scored.
        """
        self.model.eval()
        total_nll = 0.0
        total_tokens = 0
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
        
        with torch.no_grad():
            for batch in tqdm(self.test_loader, desc='Calculating perplexity'):
                src = batch['src'].to(self.device)
                tgt = batch['tgt'].to(self.device)
                src_lengths = batch['src_lengths'].to(self.device)
                
                outputs = self.model(src, tgt, src_lengths, teacher_forcing_ratio=0)
                
                # Drop position 0 (the SOS slot) and flatten for the criterion
                outputs = outputs[:, 1:].contiguous().view(-1, outputs.size(-1))
                targets = tgt[:, 1:].contiguous().view(-1)
                
                loss = criterion(outputs, targets)
                n_tokens = targets.ne(0).sum().item()
                # The criterion returns a per-token mean; scale back to a sum
                # so batches with more tokens weigh proportionally more.
                total_nll += loss.item() * n_tokens
                total_tokens += n_tokens
        
        if total_tokens == 0:
            return float('inf')
        return float(np.exp(total_nll / total_tokens))
    
    def generate_translations(self, num_samples=None):
        """
        Generate translations for the test set.

        Args:
            num_samples: if truthy, stop once this many sentence pairs have
                been collected; otherwise process the whole loader.

        Returns:
            (references, hypotheses): two aligned lists of decoded strings.
        """
        self.model.eval()
        references = []
        hypotheses = []
        eos_id = self.tgt_tokenizer.vocab['<eos>']
        
        with torch.no_grad():
            for batch in tqdm(self.test_loader, desc='Generating translations'):
                if num_samples and len(references) >= num_samples:
                    break
                
                src = batch['src'].to(self.device)
                tgt = batch['tgt'].to(self.device)
                
                # Generate translations
                generated = self.model.inference(src, self.src_tokenizer, self.tgt_tokenizer)
                
                # Decode sequences
                for i in range(src.size(0)):
                    # Stop mid-batch so no more than num_samples pairs are returned
                    if num_samples and len(references) >= num_samples:
                        break

                    # Reference: drop the leading SOS, cut at the first EOS
                    ref_tokens = self._truncate_at_eos(tgt[i].cpu().numpy()[1:], eos_id)
                    ref_text = self.tgt_tokenizer.decode(ref_tokens.tolist())
                    
                    # Hypothesis: cut at the first EOS; empty output decodes to ""
                    if generated.size(1) > 0:
                        hyp_tokens = self._truncate_at_eos(generated[i].cpu().numpy(), eos_id)
                        hyp_text = self.tgt_tokenizer.decode(hyp_tokens.tolist())
                    else:
                        hyp_text = ""
                    
                    references.append(ref_text)
                    hypotheses.append(hyp_text)
        
        return references, hypotheses
    
    def evaluate(self, num_samples=None):
        """
        Comprehensive evaluation.

        BLEU and CER are computed on the generated translations (limited by
        ``num_samples``); perplexity is computed over the whole test loader.
        Results and up to five sample translations are printed.

        Returns:
            dict with keys 'bleu', 'cer', 'perplexity', 'references' and
            'hypotheses'.
        """
        print("Starting evaluation...")
        
        # Generate translations
        references, hypotheses = self.generate_translations(num_samples)
        
        # Calculate metrics
        bleu_score = self.calculate_bleu(references, hypotheses)
        cer = self.calculate_cer(references, hypotheses)
        perplexity = self.calculate_perplexity()
        
        # Print results
        print(f"\nEvaluation Results:")
        print(f"  BLEU Score: {bleu_score:.4f}")
        print(f"  Character Error Rate: {cer:.4f}")
        print(f"  Perplexity: {perplexity:.4f}")
        
        # Show some examples
        print(f"\nSample Translations:")
        for i in range(min(5, len(references))):
            print(f"\n{i+1}.")
            print(f"Reference: {references[i]}")
            print(f"Generated: {hypotheses[i]}")
        
        return {
            'bleu': bleu_score,
            'cer': cer,
            'perplexity': perplexity,
            'references': references,
            'hypotheses': hypotheses
        }

def translate_text(model, text, src_tokenizer, tgt_tokenizer, device='cuda', max_length=50):
    """
    Translate one input string with the trained model.

    Args:
        model: model providing ``eval()`` and ``inference(...)``.
        text: the source string to translate.
        src_tokenizer: encodes ``text`` into token ids.
        tgt_tokenizer: provides ``vocab['<eos>']`` and decodes output ids.
        device: device the encoded input tensor is moved to.
        max_length: forwarded to ``model.inference``.

    Returns:
        The decoded output up to (not including) the first EOS token, or an
        empty string if the model generated nothing.
    """
    model.eval()

    # Encode the input as a batch of one sequence
    encoded = src_tokenizer.encode(text)
    batch = torch.tensor([encoded], dtype=torch.long).to(device)

    with torch.no_grad():
        generated = model.inference(batch, src_tokenizer, tgt_tokenizer, max_length)

    # Nothing generated: return the empty translation
    if not generated.size(1) > 0:
        return ""

    token_ids = generated[0].cpu().numpy()
    eos_positions = np.where(token_ids == tgt_tokenizer.vocab['<eos>'])[0]
    if len(eos_positions) > 0:
        # Keep only what precedes the first EOS
        token_ids = token_ids[:eos_positions[0]]

    return tgt_tokenizer.decode(token_ids.tolist())


## Training Execution

In [None]:
# Main training execution
#
# End-to-end driver: locate the extracted dataset, split it, train the
# tokenizers, build loaders and the model, train, plot, evaluate, and print
# a few sample translations.

# Set up experiment configuration
config = {
    'seed': 42,
    'embed_dim': 256,
    'hidden_dim': 512,
    'encoder_layers': 2,
    'decoder_layers': 4,
    'dropout': 0.3,
    'learning_rate': 1e-3,
    'batch_size': 32,  # Reduced for Colab free tier
    'src_vocab_size': 8000,
    'tgt_vocab_size': 8000,
    'num_epochs': 10  # Reduced for Colab free tier
}

print("Starting Urdu to Roman Urdu NMT Training...")
print(f"Configuration: {config}")

# 1. Load and preprocess data
print("\n1. Loading and preprocessing data...")

# QUICK FIX: Find the correct dataset path (avoiding __MACOSX)
print("🔍 Searching for correct dataset...")
dataset_path = None

# Check if extraction was successful - SKIP __MACOSX folders
if os.path.exists('/content/dataset_extracted'):
    for root, dirs, files in os.walk('/content/dataset_extracted'):
        # Skip __MACOSX directories - they contain corrupted files
        if '__MACOSX' in root:
            continue
            
        # Look for poet directories
        poet_dirs = [d for d in dirs if any(poet in d for poet in ['mirza-ghalib', 'ahmad-faraz', 'allama-iqbal'])]
        if poet_dirs:
            dataset_path = root
            print(f"✅ Found valid dataset at: {dataset_path}")
            print(f"📚 Sample poets: {poet_dirs[:3]}")
            break

if dataset_path:
    print(f"✅ Using dataset: {dataset_path}")
    urdu_texts, roman_texts = load_dataset(dataset_path)
else:
    print("❌ Could not find valid dataset! Trying alternative extraction...")
    
    # Alternative: Extract directly without __MACOSX
    import zipfile
    zip_path = '/content/urdu_ghazals_rekhta/dataset/dataset.zip'
    extract_to = '/content/dataset_clean'
    
    if os.path.exists(zip_path):
        print(f"📦 Re-extracting {zip_path} without __MACOSX...")
        os.makedirs(extract_to, exist_ok=True)
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.infolist():
                # Skip __MACOSX files
                if '__MACOSX' not in member.filename and not member.filename.startswith('.'):
                    zip_ref.extract(member, extract_to)
        
        # Find dataset in clean extraction
        for root, dirs, files in os.walk(extract_to):
            if any(poet in dirs for poet in ['mirza-ghalib', 'ahmad-faraz', 'allama-iqbal']):
                dataset_path = root
                print(f"✅ Found clean dataset at: {dataset_path}")
                break
        
        if dataset_path:
            urdu_texts, roman_texts = load_dataset(dataset_path)
        else:
            print("❌ Still no valid dataset found!")
            urdu_texts, roman_texts = [], []
    else:
        print("❌ Zip file not found!")
        urdu_texts, roman_texts = [], []

train_pairs, val_pairs, test_pairs = clean_and_split_data(
    urdu_texts, roman_texts,
    train_ratio=0.5, val_ratio=0.25, test_ratio=0.25
)

# Check if we have data to proceed
if len(train_pairs) == 0:
    print("❌ No training data available! Please check dataset extraction.")
    print("🛑 STOPPING EXECUTION - No data to train on!")
    # Raise instead of calling the interactive exit() helper: this stops the
    # cell/script here without asking a notebook kernel to shut down.
    raise SystemExit("No training data available")
else:
    print(f"✅ Ready to proceed with {len(train_pairs)} training pairs")

# 2. Create tokenizers
print("\n2. Training tokenizers...")
src_tokenizer, tgt_tokenizer = create_tokenizers(
    train_pairs,
    src_vocab_size=config['src_vocab_size'],
    tgt_vocab_size=config['tgt_vocab_size']
)

# 3. Create data loaders
print("\n3. Creating data loaders...")
train_loader, val_loader, test_loader = create_data_loaders(
    train_pairs, val_pairs, test_pairs,
    src_tokenizer, tgt_tokenizer,
    batch_size=config['batch_size']
)

# 4. Create model
print("\n4. Creating model...")
model = Seq2SeqModel(
    src_vocab_size=src_tokenizer.get_vocab_size(),
    tgt_vocab_size=tgt_tokenizer.get_vocab_size(),
    embed_dim=config['embed_dim'],
    hidden_dim=config['hidden_dim'],
    encoder_layers=config['encoder_layers'],
    decoder_layers=config['decoder_layers'],
    dropout=config['dropout']
)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# 5. Train model
print("\n5. Starting training...")
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    src_tokenizer=src_tokenizer,
    tgt_tokenizer=tgt_tokenizer,
    lr=config['learning_rate'],
    device=device
)

train_losses, val_losses = trainer.train(
    num_epochs=config['num_epochs'],
    save_path='best_model.pth'
)

# Plot training curves (still part of step 5; no step header is printed)
trainer.plot_training_curves('training_curves.png')

# 6. Evaluate model
# NOTE(review): `model` holds the weights from the last epoch that ran; the
# best checkpoint written to best_model.pth is not reloaded before evaluation.
print("\n6. Evaluating model...")
evaluator = Evaluator(model, test_loader, src_tokenizer, tgt_tokenizer, device)
results = evaluator.evaluate(num_samples=200)  # Reduced for speed

# One blank line followed by a 60-character rule
print("\n" + "=" * 60)
print("TRAINING COMPLETED!")
print("=" * 60)
print(f"Final Results:")
print(f"  BLEU Score: {results['bleu']:.4f}")
print(f"  Character Error Rate: {results['cer']:.4f}")
print(f"  Perplexity: {results['perplexity']:.4f}")

# 7. Test with sample translations
print("\n7. Sample translations:")
sample_texts = [
    "محبت میں نہیں ہے فرق جینے اور مرنے کا",
    "دل ہی تو ہے نہ سنگ و خشت درد سے بھر نہ آئے کیوں",
    "ہزاروں خواہشیں ایسی کہ ہر خواہش پہ دم نکلے"
]

for text in sample_texts:
    translation = translate_text(model, text, src_tokenizer, tgt_tokenizer, device)
    print(f"Urdu: {text}")
    print(f"Roman: {translation}")
    print()