In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import shutil

# Define the base dataset path (read-only input)
dataset_base = r"/kaggle/input/dataset/dataset/dataset/dataset"

# Define the output directories (writable in Kaggle working directory)
output_base = r"/kaggle/working/shayari_consolidated"
urdu_all_dir = os.path.join(output_base, "Urdu_All")
roman_all_dir = os.path.join(output_base, "Roman_All")

# Create the main output directories if they don't exist
os.makedirs(urdu_all_dir, exist_ok=True)
os.makedirs(roman_all_dir, exist_ok=True)


def _copy_script_files(src_dir, dst_dir, label, poet):
    """Copy every regular file directly inside src_dir into dst_dir.

    Args:
        src_dir: Directory to copy from. Skipped silently when it is missing
            or is not a directory (so a stray file named "ur"/"en" cannot
            make os.listdir fail).
        dst_dir: Existing directory to copy into.
        label: Script name used in the progress message ("Urdu" or "Roman").
        poet: Poet folder name, used only in the progress message.

    Returns:
        The number of files copied.
    """
    if not os.path.isdir(src_dir):
        return 0
    copied = 0
    # Sorted so the copy order (and the log) is the same on every run.
    for file_name in sorted(os.listdir(src_dir)):
        src_file = os.path.join(src_dir, file_name)
        if os.path.isfile(src_file):
            shutil.copy2(src_file, os.path.join(dst_dir, file_name))
            print(f"Copied {label} file: {file_name} from {poet} to {dst_dir}")
            copied += 1
    return copied


# Get the list of poet folders
poet_dirs = sorted(os.listdir(dataset_base))

# Iterate through each poet's folder
for poet in poet_dirs:
    poet_path = os.path.join(dataset_base, poet)
    if not os.path.isdir(poet_path):  # Ignore stray files next to the poet folders
        continue

    # Create poet subfolders in the consolidated directories
    urdu_poet_dir = os.path.join(urdu_all_dir, poet)
    roman_poet_dir = os.path.join(roman_all_dir, poet)
    os.makedirs(urdu_poet_dir, exist_ok=True)
    os.makedirs(roman_poet_dir, exist_ok=True)

    # "ur" holds the Urdu-script files, "en" the Roman/English ones
    _copy_script_files(os.path.join(poet_path, "ur"), urdu_poet_dir, "Urdu", poet)
    _copy_script_files(os.path.join(poet_path, "en"), roman_poet_dir, "Roman", poet)

print("\nConsolidation complete!")
print(f"Urdu files in: {urdu_all_dir}")
print(f"Roman files in: {roman_all_dir}")
print("Structure: Each poet has a subfolder inside Urdu_All and Roman_All containing their shayari files.")

In [None]:
!pip install jiwer

In [None]:
# Sanity check of the consolidated output: list the per-poet folders of each
# script, then show the first 20 lines of the per-poet file listings.
!ls -l /kaggle/working/shayari_consolidated/Urdu_All
!ls -l /kaggle/working/shayari_consolidated/Roman_All
!ls -l /kaggle/working/shayari_consolidated/Urdu_All/* | head -n 20
!ls -l /kaggle/working/shayari_consolidated/Roman_All/* | head -n 20

In [None]:
# Install jiwer
!pip install jiwer

# Libraries
import os
import re
import unicodedata
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import jiwer
import numpy as np
import random
from tqdm import tqdm

# Random seed: seed Python, NumPy and PyTorch with the same value
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(42)

# Device: use CUDA when PyTorch reports it as available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Dataset paths (output of the consolidation cell)
URDU_BASE_DIR = '/kaggle/working/shayari_consolidated/Urdu_All'
ROMAN_BASE_DIR = '/kaggle/working/shayari_consolidated/Roman_All'
print(f"Urdu base dir: {URDU_BASE_DIR}")
print(f"Roman base dir: {ROMAN_BASE_DIR}")

# Clean text function (preserves Roman diacritics)
def clean_text(text, is_urdu=True):
    """Normalize a line to NFKC and delete every character outside the allowed set.

    Args:
        text: The raw line.
        is_urdu: When True, keep only code points U+0600-U+06FF and
            U+0750-U+077F plus whitespace and ``! ? . ,``. When False, keep
            word characters, whitespace, ``! ? . ,`` and the listed
            romanization diacritics.

    Returns:
        The filtered text with leading and trailing whitespace removed.
    """
    normalized = unicodedata.normalize('NFKC', text)
    disallowed = (
        r'[^\u0600-\u06FF\u0750-\u077F\s\!\?.,]'
        if is_urdu
        # For Roman, keep diacritics and basic punctuation
        else r'[^\w\s\!\?.,āīūḳḥñ]'
    )
    return re.sub(disallowed, '', normalized).strip()

# Validate Urdu/Roman text
def is_urdu_text(text):
    """Return True when text contains at least one character in U+0600-U+06FF."""
    for ch in text:
        if 0x0600 <= ord(ch) <= 0x06FF:
            return True
    return False

def is_roman_text(text):
    """Return True when no character disqualifies text as Roman script.

    A character is accepted when its code point is below U+0600 or when it is
    one of the explicitly whitelisted punctuation/diacritic characters. An
    empty string is accepted.
    """
    whitelisted = ' !?.,āīūḳḥñ'
    return not any(ord(ch) >= 0x0600 and ch not in whitelisted for ch in text)

# Load data with validation
def load_shayari_pairs(urdu_base, roman_base, max_lines_per_file=1000):
    """Load line-aligned (Urdu, Roman) sentence pairs from two parallel trees.

    Both trees must contain one sub-folder per poet. Files are matched by
    identical poet-folder name and file name; within a matched file the n-th
    non-blank Urdu line is paired with the n-th non-blank Roman line.

    Lines are paired *before* they are validated, and a pair is dropped as a
    whole when either side fails validation. Filtering the two sides
    independently would shift every later line of the file out of alignment
    as soon as a single line was rejected on one side only.

    Args:
        urdu_base: Directory holding the per-poet Urdu folders.
        roman_base: Directory holding the per-poet Roman folders.
        max_lines_per_file: Maximum number of valid pairs kept per file.

    Returns:
        A list of ``(urdu_line, roman_line)`` tuples; empty when the two
        trees share no poet folder.
    """
    pairs = []
    poets = set()

    urdu_poets = [d for d in os.listdir(urdu_base) if os.path.isdir(os.path.join(urdu_base, d))]
    roman_poets = [d for d in os.listdir(roman_base) if os.path.isdir(os.path.join(roman_base, d))]
    common_poets = set(urdu_poets) & set(roman_poets)
    print(f"Found {len(common_poets)} common poets: {common_poets}")

    if not common_poets:
        print("No common poets found! Check directory structure.")
        return pairs

    for poet in tqdm(sorted(common_poets), desc="Loading poets"):
        urdu_poet_dir = os.path.join(urdu_base, poet)
        roman_poet_dir = os.path.join(roman_base, poet)

        urdu_files = [f for f in os.listdir(urdu_poet_dir) if os.path.isfile(os.path.join(urdu_poet_dir, f))]
        roman_files = [f for f in os.listdir(roman_poet_dir) if os.path.isfile(os.path.join(roman_poet_dir, f))]
        print(f"Poet: {poet}, Urdu files: {len(urdu_files)}, Roman files: {len(roman_files)}")
        print(f"Sample Urdu files: {urdu_files[:2]}")
        print(f"Sample Roman files: {roman_files[:2]}")

        common_files = set(urdu_files) & set(roman_files)
        print(f"Poet: {poet}, Common files: {len(common_files)}")

        # Sorted: set iteration order varies between runs, which would change
        # the order of `pairs` and therefore any seeded split made from it.
        for file_name in sorted(common_files):
            ur_path = os.path.join(urdu_poet_dir, file_name)
            rom_path = os.path.join(roman_poet_dir, file_name)
            print(f"Processing pair: {file_name}")

            # Read each file once; unreadable or undecodable files are
            # reported and skipped so one bad file does not abort the load.
            try:
                with open(ur_path, 'r', encoding='utf-8-sig') as f_ur, \
                     open(rom_path, 'r', encoding='utf-8-sig') as f_rom:
                    raw_ur = f_ur.readlines()
                    raw_rom = f_rom.readlines()
            except (OSError, UnicodeError) as e:
                print(f"Error loading {file_name} for {poet}: {e}")
                continue

            ur_lines = [clean_text(line, is_urdu=True) for line in raw_ur if line.strip()]
            rom_lines = [clean_text(line, is_urdu=False) for line in raw_rom if line.strip()]

            if ur_lines and rom_lines:
                print(f"Sample raw Urdu lines: {raw_ur[:10]}")
                print(f"Sample cleaned Urdu: {ur_lines[:10]}")
                print(f"Sample raw Roman lines: {raw_rom[:10]}")
                print(f"Sample cleaned Roman: {rom_lines[:10]}")

            if len(ur_lines) != len(rom_lines):
                # Positional pairing is only trustworthy up to the first
                # divergence; surface the mismatch instead of hiding it.
                print(f"Warning: {file_name} has {len(ur_lines)} Urdu lines but {len(rom_lines)} Roman lines; "
                      f"pairing the first {min(len(ur_lines), len(rom_lines))} by position.")

            # Validate each positional pair jointly so alignment is preserved.
            valid_pairs = [
                (ur, rom)
                for ur, rom in zip(ur_lines, rom_lines)
                if ur and rom and is_urdu_text(ur) and is_roman_text(rom)
            ][:max_lines_per_file]

            if valid_pairs:
                pairs.extend(valid_pairs)
                poets.add(poet)
                print(f"Loaded {len(valid_pairs)} pairs for poet: {poet} from {file_name}")
            else:
                print(f"No valid pairs for {file_name} (min_len=0)")

    print(f"Total pairs loaded: {len(pairs)} from {len(poets)} poets.")
    return pairs

# Load pairs
pairs = load_shayari_pairs(URDU_BASE_DIR, ROMAN_BASE_DIR)
if not pairs:
    print("No pairs loaded! Check paths, file names, or content.")
    raise SystemExit

print(f"Sample pair: Urdu: '{pairs[0][0]}' | Roman: '{pairs[0][1]}'")

# Prepare sentences for word-level processing
urdu_sentences = [ur for ur, _ in pairs]
roman_sentences = [rom for _, rom in pairs]

# Data augmentation
# Two steps of the previous version were removed because they corrupted the data:
#   * appending (roman, urdu) "swapped" pairs put Roman text into the source
#     list and Urdu text into the target list of an Urdu -> Roman model;
#   * repeating every pair before the random train/val/test split placed exact
#     copies of test examples in the training set.
# AUG_MULTIPLIER is kept as a knob, but any oversampling (> 1) should be
# applied to the training split only, after the split has been made.
AUG_MULTIPLIER = 1
urdu_sentences = urdu_sentences * AUG_MULTIPLIER
roman_sentences = roman_sentences * AUG_MULTIPLIER
print(f"After augmentation: {len(urdu_sentences)} sentences.")

# Persist the aligned sentence lists so the next cell can reload them.
import pickle
with open('/kaggle/working/sentences.pkl', 'wb') as f:
    pickle.dump((urdu_sentences, roman_sentences), f)
print("Sentences saved to /kaggle/working/sentences.pkl")

In [None]:
import pickle
import torch.multiprocessing as mp

# Set multiprocessing start method; a RuntimeError from the call is ignored.
try:
    mp.set_start_method('spawn', force=True)
except RuntimeError:
    pass

# Load sentences written by the previous cell (a file this notebook produced itself)
with open('/kaggle/working/sentences.pkl', 'rb') as sentence_file:
    loaded_sentences = pickle.load(sentence_file)
urdu_sentences, roman_sentences = loaded_sentences
print(f"Loaded {len(urdu_sentences)} sentence pairs from disk.")

# Vocabulary
class Vocab:
    """Whitespace-token vocabulary with four reserved entries.

    Ids 0-3 are ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``; every token
    seen at least ``min_freq`` times follows in first-seen order.

    Attributes:
        freq: token -> occurrence count over all texts.
        itos: id -> token list.
        stoi: token -> id mapping.
    """

    def __init__(self, texts, min_freq=1):
        counts = {}
        for sentence in texts:
            for word in sentence.split():
                counts[word] = counts.get(word, 0) + 1
        self.freq = counts

        kept = [word for word, n in counts.items() if n >= min_freq]
        self.itos = ["<pad>", "<sos>", "<eos>", "<unk>"] + kept
        self.stoi = dict(zip(self.itos, range(len(self.itos))))

    def encode(self, text):
        """Map each whitespace token of text to its id (``<unk>`` id if unseen)."""
        unk_id = self.stoi["<unk>"]
        return [self.stoi.get(word, unk_id) for word in text.split()]

    def decode(self, ids):
        """Map ids back to tokens, skipping ``<pad>``, ``<sos>`` and ``<eos>``."""
        skipped = (0, 1, 2)
        return [self.itos[i] for i in ids if i not in skipped]

# Build vocabularies: source side from the Urdu sentences, target side from the Roman ones
src_vocab, tgt_vocab = (Vocab(corpus) for corpus in (urdu_sentences, roman_sentences))
print(f"Urdu vocab size: {len(src_vocab.itos)}, Roman vocab size: {len(tgt_vocab.itos)}")

# Dataset
class NMTDataset(Dataset):
    """Parallel-text dataset yielding fixed-length id tensors.

    Each item is ``(src_ids, tgt_ids)``: two ``torch.long`` tensors of length
    ``max_len`` laid out as ``<sos>`` (1), encoded tokens, ``<eos>`` (2), then
    ``<pad>`` (0) up to ``max_len``.
    """

    def __init__(self, src_texts, tgt_texts, src_vocab, tgt_vocab, max_len=20):
        self.src = src_texts
        self.tgt = tgt_texts
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.src)

    def _to_fixed_ids(self, vocab, text):
        """Encode text with vocab and frame/truncate/pad it to exactly max_len ids."""
        # Leave two slots for <sos>/<eos>
        body = vocab.encode(text)[:self.max_len - 2]
        ids = ([1] + body + [2])[:self.max_len]
        ids = ids + [0] * (self.max_len - len(ids))
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        return (
            self._to_fixed_ids(self.src_vocab, self.src[idx]),
            self._to_fixed_ids(self.tgt_vocab, self.tgt[idx]),
        )

# Data split
# Split by unique example rather than by row: rows holding the same sentence
# pair (or the same pair with source and target swapped) are grouped, and a
# whole group is assigned to exactly one of train/val/test. Splitting raw row
# indices would let copies of a test example end up in the training set.
example_groups = {}
for row, (ur, rom) in enumerate(zip(urdu_sentences, roman_sentences)):
    example_groups.setdefault(frozenset((ur, rom)), []).append(row)
grouped_rows = list(example_groups.values())

train_groups, temp_groups = train_test_split(range(len(grouped_rows)), test_size=0.2, random_state=42)
val_groups, test_groups = train_test_split(temp_groups, test_size=0.5, random_state=42)


def _rows_of(group_ids):
    """Flatten the selected groups back into a list of row indices."""
    return [row for g in group_ids for row in grouped_rows[g]]


train_idx = _rows_of(train_groups)
val_idx = _rows_of(val_groups)
test_idx = _rows_of(test_groups)

train_urdu = [urdu_sentences[i] for i in train_idx]
train_roman = [roman_sentences[i] for i in train_idx]
val_urdu = [urdu_sentences[i] for i in val_idx]
val_roman = [roman_sentences[i] for i in val_idx]
test_urdu = [urdu_sentences[i] for i in test_idx]
test_roman = [roman_sentences[i] for i in test_idx]
print(f"Train: {len(train_urdu)}, Val: {len(val_urdu)}, Test: {len(test_urdu)}")

train_dataset = NMTDataset(train_urdu, train_roman, src_vocab, tgt_vocab)
val_dataset = NMTDataset(val_urdu, val_roman, src_vocab, tgt_vocab)
test_dataset = NMTDataset(test_urdu, test_roman, src_vocab, tgt_vocab)

# Collate function
def collate_fn(batch):
    """Combine ``(src, tgt)`` tensor pairs into two batch-first, zero-padded tensors.

    Returns:
        ``(src_batch, tgt_batch)``, each padded with 0 to the longest
        sequence on its own side of the batch.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    sources, targets = zip(*batch)
    return (
        pad(sources, batch_first=True, padding_value=0),
        pad(targets, batch_first=True, padding_value=0),
    )

# Model classes
class Encoder(nn.Module):
    """Embedding layer followed by a batch-first, bidirectional LSTM.

    ``forward`` returns the LSTM's per-step outputs and its final
    ``(h, c)`` state unchanged.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_layers=2, dropout=0.3):
        super().__init__()
        # Dropout is switched off when there is only a single LSTM layer.
        lstm_dropout = dropout if n_layers > 1 else 0
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=lstm_dropout,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, final_state = self.lstm(embedded)
        h, c = final_state
        return outputs, (h, c)

class LuongAttention(nn.Module):
    """Attention that scores encoder outputs against a linearly projected decoder state.

    The projection maps ``hidden_dim*2`` features to ``hidden_dim*2`` features.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        width = hidden_dim * 2
        self.attn = nn.Linear(width, width)

    def forward(self, hidden, encoder_outputs):
        """Return ``(context, attn_weights)``.

        presumably ``hidden`` is (batch, hidden_dim*2) and ``encoder_outputs``
        is (batch, src_len, hidden_dim*2) -- TODO confirm against the callers.
        """
        # Score of every source position = <encoder output, projected query>
        query = self.attn(hidden).unsqueeze(2)
        scores = torch.bmm(encoder_outputs, query).squeeze(2)
        # Normalize over dim 1
        attn_weights = scores.softmax(dim=1)
        # Weighted sum of the encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attn_weights

class Decoder(nn.Module):
    """Single-step attention decoder.

    The LSTM hidden size is ``hidden_dim*2``. The output layer takes the
    concatenation of the LSTM output and the attention context
    (``hidden_dim*4`` features) and produces ``vocab_size`` logits.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_layers=2, dropout=0.3):
        super().__init__()
        state_width = hidden_dim * 2
        # Dropout is switched off when there is only a single LSTM layer.
        lstm_dropout = dropout if n_layers > 1 else 0
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, state_width, n_layers, dropout=lstm_dropout,
                            batch_first=True)
        self.attn = LuongAttention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 4, vocab_size)
        self.hidden_dim = state_width
        self.n_layers = n_layers

    def forward(self, tgt, hidden, encoder_outputs):
        """Run one decoding step and return ``(logits, hidden, attn_weights)``.

        presumably ``tgt`` is (batch, 1), since dimension 1 of the LSTM output
        is squeezed away -- TODO confirm against the callers.
        """
        step_output, hidden = self.lstm(self.embedding(tgt), hidden)
        query = step_output.squeeze(1)
        context, attn_w = self.attn(query, encoder_outputs)
        logits = self.fc(torch.cat((query, context), dim=1))
        return logits, hidden, attn_w

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, hidden_dim):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.hidden_dim = hidden_dim

    def _bridge(self, state):
        """Concatenate the last two slices of an encoder state and repeat them per decoder layer."""
        merged = torch.cat([state[-2], state[-1]], dim=1)
        return merged.unsqueeze(0).repeat(self.decoder.n_layers, 1, 1)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return the logits of ``tgt.size(1) - 1`` decoding steps, concatenated along dim 1.

        Step 0 is always fed ``tgt[:, 0]``. Each later step is fed the argmax
        of the previous step's logits when ``random.random()`` exceeds
        ``teacher_forcing_ratio``, and the ground-truth token otherwise.
        """
        encoder_outputs, (h, c) = self.encoder(src)
        hidden = (self._bridge(h), self._bridge(c))

        step_logits = []
        n_steps = tgt.size(1) - 1
        for t in range(n_steps):
            feed_prediction = t > 0 and random.random() > teacher_forcing_ratio
            if feed_prediction:
                input_t = step_logits[-1].argmax(-1)
            else:
                input_t = tgt[:, t].unsqueeze(1)
            logits, hidden, _ = self.decoder(input_t, hidden, encoder_outputs)
            step_logits.append(logits.unsqueeze(1))
        return torch.cat(step_logits, dim=1)

In [None]:
# Import required modules
import random
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import jiwer

# Hyperparameter ranges: candidate values sampled per trial by the random search
PARAM_GRID = dict(
    emb_dim=[128, 256, 512],
    hidden_dim=[128, 256, 512],
    n_layers=[1, 2, 3],
    dropout=[0.1, 0.3, 0.5],
    lr=[1e-3, 5e-4, 1e-4],
    batch_size=[16, 32, 64],  # Reduced to avoid OOM
    teacher_forcing_ratio=[0.3, 0.5, 0.7],
)

# Random search: number of trials and the running best result
N_TRIALS = 5
best_bleu, best_params = 0, None
best_model_path = '/kaggle/working/best_model.pt'

for trial in range(N_TRIALS):
    print(f"\n=== Trial {trial+1}/{N_TRIALS} ===")
    # One independent draw per hyperparameter, taken in PARAM_GRID's key order
    params = {name: random.choice(choices) for name, choices in PARAM_GRID.items()}
    print(f"Parameters: {params}")

    # DataLoaders: identical settings for all three splits; only training is shuffled
    loader_kwargs = dict(batch_size=params['batch_size'], collate_fn=collate_fn,
                         num_workers=0, pin_memory=True)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, **loader_kwargs)
    test_loader = DataLoader(test_dataset, **loader_kwargs)
    print(f"Loaders ready: Train {len(train_loader)} batches, Val {len(val_loader)}, Test {len(test_loader)}")

    # Initialize model: encoder and decoder share the sampled sizes
    shared_dims = (params['emb_dim'], params['hidden_dim'], params['n_layers'], params['dropout'])
    encoder = Encoder(len(src_vocab.itos), *shared_dims)
    decoder = Decoder(len(tgt_vocab.itos), *shared_dims)
    model = Seq2Seq(encoder, decoder, params['hidden_dim']).to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    # Targets equal to 0 are excluded from the loss
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Training function
    def train_model(model, train_loader, val_loader, optimizer, criterion, n_epochs=10, patience=3):
        """Train with early stopping on validation loss and return the best checkpoint.

        Reads ``trial``, ``params`` and ``device`` from the enclosing scope:
        ``params['teacher_forcing_ratio']`` is used for the training passes,
        and ``trial`` names the checkpoint file and the progress messages.

        Args:
            model: Called as ``model(src, tgt, teacher_forcing_ratio)``.
            train_loader: Iterable of ``(src, tgt)`` batches used for training.
            val_loader: Iterable of ``(src, tgt)`` batches used for validation.
            optimizer: Optimizer stepped once per training batch.
            criterion: Loss applied to flattened logits and flattened targets.
            n_epochs: Maximum number of epochs.
            patience: Number of consecutive non-improving epochs that triggers
                early stopping.

        Returns:
            ``(model, best_val_loss)`` with the model reloaded from the
            checkpoint saved at the lowest validation loss.
        """
        best_val_loss = float('inf')
        patience_counter = 0
        for epoch in range(n_epochs):
            model.train()
            train_loss = 0
            batch_count = 0
            for src, tgt in tqdm(train_loader, desc=f"Trial {trial+1} Epoch {epoch+1}"):
                src = src.to(device)
                tgt = tgt.to(device)
                optimizer.zero_grad()
                output = model(src, tgt, params['teacher_forcing_ratio'])
                # Flatten logits to (N, vocab) and drop the first target
                # position so each logit row lines up with the token it predicts.
                output = output.view(-1, output.size(-1))
                tgt = tgt[:, 1:].reshape(-1)
                loss = criterion(output, tgt)
                loss.backward()
                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                train_loss += loss.item()
                batch_count += 1

            # Mean of the per-batch losses.
            train_loss /= batch_count

            model.eval()
            val_loss = 0
            batch_count = 0
            with torch.no_grad():
                for src, tgt in val_loader:
                    src = src.to(device)
                    tgt = tgt.to(device)
                    # Ratio 0: after the first step the model is fed its own predictions.
                    output = model(src, tgt, 0)
                    output = output.view(-1, output.size(-1))
                    tgt = tgt[:, 1:].reshape(-1)
                    loss = criterion(output, tgt)
                    val_loss += loss.item()
                    batch_count += 1

            val_loss /= batch_count

            print(f'Trial {trial+1} Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}')

            if val_loss < best_val_loss:
                # New best: checkpoint the weights and reset the patience counter.
                best_val_loss = val_loss
                torch.save(model.state_dict(), f'/kaggle/working/model_trial_{trial+1}.pt')
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        # NOTE(review): this file is only written when some epoch improved on
        # float('inf'); if that never happens the load below reads whatever
        # file (if any) is already at this path -- confirm this is acceptable.
        model.load_state_dict(torch.load(f'/kaggle/working/model_trial_{trial+1}.pt'))
        return model, best_val_loss

    # Batch inference function
    def translate_batch(model, src_tensor, max_len=20):
        """Greedy-decode a batch of source sequences.

        Uses ``device`` from the enclosing scope.

        Args:
            model: Seq2Seq-style model exposing ``encoder`` and ``decoder``.
            src_tensor: Batch of source id sequences, one row per sentence.
            max_len: Maximum number of decoding steps.

        Returns:
            One list of predicted token ids per source row. Each list ends at
            (and includes) the row's first ``<eos>`` (id 2); tokens generated
            after it are discarded. A row that never emits ``<eos>`` keeps
            every generated token.
        """
        model.eval()
        steps = []
        with torch.no_grad():
            src_tensor = src_tensor.to(device)
            encoder_outputs, (h, c) = model.encoder(src_tensor)
            h_cat = torch.cat([h[-2], h[-1]], dim=1).unsqueeze(0).repeat(model.decoder.n_layers, 1, 1)
            c_cat = torch.cat([c[-2], c[-1]], dim=1).unsqueeze(0).repeat(model.decoder.n_layers, 1, 1)
            hidden = (h_cat, c_cat)
            batch_size = src_tensor.shape[0]
            inputs = torch.tensor([1] * batch_size, device=device).unsqueeze(1)
            # Tracks which rows have already produced <eos>, so decoding stops
            # once every row is done rather than only when all rows emit
            # <eos> on the very same step.
            finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

            for _ in range(max_len):
                logits, hidden, _ = model.decoder(inputs, hidden, encoder_outputs)
                pred_tokens = logits.argmax(-1).unsqueeze(1)
                steps.append(pred_tokens)
                finished |= pred_tokens.squeeze(1) == 2
                if finished.all():
                    break
                inputs = pred_tokens

        if not steps:
            # max_len <= 0: nothing was decoded.
            return [[] for _ in range(batch_size)]

        decoded = []
        for row in torch.cat(steps, dim=1).cpu().numpy().tolist():
            # Cut after the first <eos>; the rest is an artifact of batching.
            decoded.append(row[:row.index(2) + 1] if 2 in row else row)
        return decoded

    # Evaluation function (fixed)
    def evaluate_model(model, test_loader):
        """Compute perplexity, corpus BLEU and mean CER on a data loader.

        Uses ``device``, ``criterion``, ``src_vocab``, ``tgt_vocab`` and
        ``translate_batch`` from the enclosing scope.

        References and hypotheses are compared as token strings that stop at
        the first ``<eos>`` and exclude ``<pad>``/``<sos>``/``<eos>``/``<unk>``,
        so padding and anything generated after ``<eos>`` cannot affect BLEU
        or CER.

        Args:
            model: Trained Seq2Seq model.
            test_loader: DataLoader yielding ``(src, tgt)`` batches; its
                ``dataset`` is also used for the printed sample translations.

        Returns:
            ``(perplexity, bleu, avg_cer)``.
        """
        def _content_tokens(ids, vocab):
            """Decode ids up to the first <eos>, skipping special and out-of-range ids."""
            tokens = []
            for idx in ids:
                if idx == 2:
                    break
                if idx not in (0, 1, 3) and idx < len(vocab.itos):
                    tokens.append(vocab.itos[idx])
            return tokens

        def _has_urdu_script(text):
            """True when text contains a character in U+0600-U+06FF."""
            return any(0x0600 <= ord(ch) <= 0x06FF for ch in text)

        model.eval()
        references = []
        hypotheses = []
        losses = []

        with torch.no_grad():
            for src, tgt in tqdm(test_loader, desc="Evaluating"):
                src = src.to(device)
                tgt = tgt.to(device)
                output = model(src, tgt, 0)
                output = output.view(-1, output.size(-1))
                tgt_flat = tgt[:, 1:].reshape(-1)
                loss = criterion(output, tgt_flat)
                losses.append(loss.item())

                hypotheses_batch = translate_batch(model, src)
                for j, tgt_ids in enumerate(tgt.cpu().numpy().tolist()):
                    ref_tokens = _content_tokens(tgt_ids, tgt_vocab)
                    if not ref_tokens:
                        print(f"Warning: Empty sequence at index {j}, skipping.")
                        continue
                    if _has_urdu_script(' '.join(ref_tokens)):
                        print(f"Warning: Urdu script detected in ground truth at index {j}, skipping.")
                        continue
                    references.append([ref_tokens])
                    # An empty hypothesis is kept: it is a (bad) prediction,
                    # not a reason to drop the example from the score.
                    hypotheses.append(_content_tokens(hypotheses_batch[j], tgt_vocab))

        ref_strings = [' '.join(ref[0]) for ref in references]
        hyp_strings = [' '.join(hyp) for hyp in hypotheses]

        perplexity = np.exp(np.mean(losses)) if losses else float('inf')
        smooth = SmoothingFunction().method1
        bleu = corpus_bleu(references, hypotheses, smoothing_function=smooth) if references else 0.0
        # An empty hypothesis deletes every reference character: CER 1.0.
        cer_scores = [jiwer.cer(ref, hyp) if hyp else 1.0 for ref, hyp in zip(ref_strings, hyp_strings)]
        avg_cer = np.mean(cer_scores) if cer_scores else float('inf')

        print(f'Perplexity: {perplexity:.3f}, BLEU: {bleu:.3f}, CER: {avg_cer:.3f}')

        # Show a few sample translations from the evaluated dataset.
        samples = test_loader.dataset
        for i in range(min(5, len(samples))):
            src_ids, tgt_ids = samples[i]
            urdu = ' '.join(_content_tokens(src_ids.tolist(), src_vocab))
            ground_truth = ' '.join(_content_tokens(tgt_ids.tolist(), tgt_vocab))
            if _has_urdu_script(ground_truth):
                print(f"Warning: Urdu script in ground truth at test pair {i}, skipping.")
                continue
            # Feed the full source row, framed exactly as during training.
            hyp = translate_batch(model, src_ids.unsqueeze(0))[0]
            predicted = ' '.join(_content_tokens(hyp, tgt_vocab))
            print(f"\nUrdu: {urdu}\nGround Truth: {ground_truth}\nPredicted: {predicted}")

        return perplexity, bleu, avg_cer

    # Train and evaluate
    try:
        model, val_loss = train_model(model, train_loader, val_loader, optimizer, criterion, n_epochs=10)
        perplexity, bleu, cer = evaluate_model(model, test_loader)
        
        if bleu > best_bleu:
            best_bleu = bleu
            best_params = params
            torch.save(model.state_dict(), best_model_path)
            print(f"New best BLEU: {bleu:.3f}, saved to {best_model_path}")
    except RuntimeError as e:
        print(f"Trial {trial+1} failed with error: {e}")
        continue

# Summary of the search: the retained hyperparameters and the BLEU stored with them.
print(f"\nBest Parameters: {best_params}")
print(f"Best BLEU: {best_bleu:.3f}")