<a href="https://colab.research.google.com/github/MdA-Saad/NMT-Neural-Machine-Translation-/blob/main/seq2seq_es_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import re
import unicodedata
from io import open
from typing import List, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [None]:
import os

def load_parallel_data(en_path, es_path, num_samples=5):
    """
    Read line-aligned English and Spanish files and pair them up.

    Line i of ``en_path`` is paired with line i of ``es_path``. A pair is
    dropped when either side is empty after stripping whitespace.

    Args:
        en_path: Path to the UTF-8 English file, one sentence per line.
        es_path: Path to the UTF-8 Spanish file, one sentence per line.
        num_samples: Maximum number of loaded pairs to print as a preview.

    Returns:
        A list of ``{"en": ..., "es": ...}`` dicts in file order.

    Raises:
        FileNotFoundError: If either file does not exist. (Previously an error
            string was returned, which callers would then treat as the data.)
    """
    missing = [path for path in (en_path, es_path) if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            "Error: One or both files were not found. Check your file paths! "
            f"Missing: {missing}"
        )

    parallel_data = []

    # zip() stops at the shorter file, so trailing unmatched lines are ignored.
    with open(en_path, 'r', encoding='utf-8') as en_file, \
         open(es_path, 'r', encoding='utf-8') as es_file:
        for en_line, es_line in zip(en_file, es_file):
            # strip() also removes the trailing newline
            en_sent = en_line.strip()
            es_sent = es_line.strip()

            # Drop the whole pair if either side is blank, keeping alignment
            if en_sent and es_sent:
                parallel_data.append({"en": en_sent, "es": es_sent})

    print(f"Successfully loaded {len(parallel_data)} parallel sentences.")

    # Show a few examples
    print("\n--- Examples ---")
    for i, pair in enumerate(parallel_data[:max(num_samples, 0)]):
        print(f"[{i}] EN: {pair['en']}")
        print(f"    ES: {pair['es']}\n")

    return parallel_data

# Replace these with your actual filenames
# NOTE(review): both paths live under /content/drive/MyDrive, so they presumably
# only resolve inside a Colab session with Google Drive mounted — confirm.
EN_FILE = "/content/drive/MyDrive/Colab Notebooks/eng2esp_transformer/es-en/TED2020.en-es.en"
ES_FILE = "/content/drive/MyDrive/Colab Notebooks/eng2esp_transformer/es-en/TED2020.en-es.es"

# Runs eagerly when this cell executes; the result is kept in the module-level `data`.
data = load_parallel_data(EN_FILE, ES_FILE)

In [None]:
# VOCABULARY CLASS
class Vocab:
    """Word-level vocabulary mapping tokens to integer ids for one language.

    Ids 0-3 are reserved for the entries of ``SPECIAL_TOKENS``. Every other
    word gets the next free id the first time ``add_sentence`` sees it.
    """

    SPECIAL_TOKENS = {
        "<PAD>": 0,
        "<SOS>": 1,
        "<EOS>": 2,
        "<UNK>": 3,
    }

    def __init__(self, language: str):
        """Create a vocabulary holding only the special tokens.

        Args:
            language: ``"spanish"`` keeps accented letters and inverted
                punctuation during normalization; any other value uses the
                plain a-z rule.
        """
        self.language = language
        # Seed both directions of the mapping with the special tokens
        self.word2index = dict(self.SPECIAL_TOKENS)
        self.index2word = {idx: token for token, idx in self.SPECIAL_TOKENS.items()}
        self.n_words = len(self.SPECIAL_TOKENS)

    def normalize_string(self, string: str) -> str:
        """Lower-case, isolate punctuation, and strip unsupported characters.

        Returns:
            A single-space-separated string of tokens.
        """
        # Compose accents first: a decomposed "o" + combining acute would
        # otherwise be split apart by the character filter below.
        string = unicodedata.normalize("NFC", string).lower().strip()
        # Add spaces around punctuation
        string = re.sub(r"([.!?¿¡])", r" \1 ", string)
        # Keep language-specific characters
        if self.language == "spanish":
            string = re.sub(r"[^a-záéíóúüñ.!?¿¡]+", r" ", string)
        else:
            string = re.sub(r"[^a-z.!?]+", r" ", string)
        return re.sub(r"\s+", " ", string).strip()

    def add_sentence(self, sentence: str):
        """Add every previously unseen word of ``sentence`` to the vocabulary."""
        for word in self.normalize_string(sentence).split():
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.index2word[self.n_words] = word
                self.n_words += 1

    def numericalize(self, sentence: str, max_len: Optional[int] = None) -> List[int]:
        """Convert a sentence to ``[<SOS>, ids..., <EOS>]``.

        Args:
            sentence: Raw text; it is normalized before lookup.
            max_len: Optional cap on the total length including SOS and EOS.
                Values below 2 keep no words, because the two markers are
                always emitted.

        Returns:
            Token ids, with out-of-vocabulary words mapped to ``<UNK>``.
        """
        tokens = self.normalize_string(sentence).split()
        if max_len is not None:
            # Reserve two slots for SOS/EOS; clamp so a tiny max_len cannot
            # turn into a negative slice that keeps most of the sentence.
            tokens = tokens[:max(max_len - 2, 0)]

        unk_id = self.word2index["<UNK>"]
        ids = [self.word2index["<SOS>"]]
        ids.extend(self.word2index.get(t, unk_id) for t in tokens)
        ids.append(self.word2index["<EOS>"])
        return ids

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return self.n_words

In [None]:
# DATASET CLASS
class TranslationDataset(Dataset):
    """Dataset of parallel sentences, numericalized lazily on access.

    Each item is a ``(source_ids, target_ids)`` pair of 1-D long tensors
    produced by the corresponding vocabulary's ``numericalize``.
    """

    def __init__(self, source_sentences: List[str], target_sentences: List[str],
                 source_vocab: "Vocab", target_vocab: "Vocab", max_len: int = 50):
        """Store the sentence lists and vocabularies.

        Args:
            source_sentences: Raw source-language sentences.
            target_sentences: Raw target-language sentences, index-aligned
                with ``source_sentences``.
            source_vocab: Vocabulary used to numericalize source sentences.
            target_vocab: Vocabulary used to numericalize target sentences.
            max_len: Length cap forwarded to ``numericalize``.

        Raises:
            ValueError: If the two sentence lists differ in length.
        """
        # Raise explicitly: an assert would be stripped under `python -O`.
        if len(source_sentences) != len(target_sentences):
            raise ValueError("Source and target must have same length")
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two long tensors of token ids."""
        source = self.source_vocab.numericalize(self.source_sentences[idx], self.max_len)
        target = self.target_vocab.numericalize(self.target_sentences[idx], self.max_len)
        return (
            torch.tensor(source, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

def collate_fn(batch, source_pad_id=0, target_pad_id=0):
    """Pad a list of (source, target) tensor pairs into one length-sorted batch.

    Rows are reordered so source lengths are descending, which is what
    ``pack_padded_sequence(..., enforce_sorted=True)`` requires.

    Returns:
        A dict with the padded source/target tensors, their unpadded lengths,
        and ``perm_idx`` (the original batch index of each sorted row).
    """
    sources, targets = zip(*batch)

    def _lengths(seqs):
        # Unpadded length of every sequence, as a long tensor
        return torch.tensor([len(s) for s in seqs], dtype=torch.long)

    # Sorting the source lengths yields the row permutation applied to everything
    src_lens, order = _lengths(sources).sort(0, descending=True)

    src_batch = pad_sequence(sources, batch_first=True, padding_value=source_pad_id)
    tgt_batch = pad_sequence(targets, batch_first=True, padding_value=target_pad_id)

    collated = {}
    collated["source_padded"] = src_batch[order]
    collated["target_padded"] = tgt_batch[order]
    collated["source_lengths"] = src_lens
    collated["target_lengths"] = _lengths(targets)[order]
    collated["perm_idx"] = order  # To restore original order after decoding
    return collated


In [None]:
# ENCODER
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder.

    Produces per-token outputs of width ``hidden_size * 2`` and a single
    ``(1, batch, hidden_size)`` state that merges both directions.
    """

    def __init__(self, vocab_size, hidden_size, embed_size=None, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        if not embed_size:
            # Embedding width falls back to the hidden width
            embed_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)
        # Projects the concatenated forward/backward states back to hidden_size
        self.fc = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, source, source_lengths):
        """Encode a padded batch whose rows are sorted by descending length.

        Args:
            source: Token ids, indexed (batch, seq_len).
            source_lengths: Unpadded length of every row; moved to CPU for packing.

        Returns:
            ``(output, hidden)``: the unpacked GRU outputs and the merged
            final state with a leading dimension of 1.
        """
        token_vectors = self.embedding(source)
        token_vectors = self.dropout(token_vectors)

        # Packing lets the GRU skip padded positions
        packed_in = pack_padded_sequence(
            token_vectors,
            source_lengths.cpu(),
            batch_first=True,
            enforce_sorted=True,
        )
        packed_out, final_states = self.gru(packed_in)
        output, _ = pad_packed_sequence(packed_out, batch_first=True)

        # The last two entries of final_states are the two directions' states
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim=1)
        merged = torch.tanh(self.fc(both_directions))

        return output, merged.unsqueeze(0)

In [None]:
# ATTENTION (FIXED)
class Attention(nn.Module):
    """Additive attention over bidirectional encoder outputs."""

    def __init__(self, hidden_size):
        super().__init__()
        # Wa projects the decoder state, Ua the encoder outputs, va scores the sum
        self.Wa = nn.Linear(hidden_size, hidden_size, bias=False)
        self.Ua = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.va = nn.Parameter(torch.rand(hidden_size))

    def forward(self, decoder_hidden, encoder_outputs, source_mask=None):
        """
        Compute a context vector for the current decoder state.

        decoder_hidden: (1, batch, hidden_size)
        encoder_outputs: (batch, seq_len, hidden_size*2)
        source_mask: (batch, 1, seq_len) - True for valid positions

        Returns (context, weights) shaped (batch, hidden_size*2) and
        (batch, seq_len).
        """
        # (batch, 1, hidden_size): broadcast over source positions below
        query = self.Wa(decoder_hidden.squeeze(0)).unsqueeze(1)
        # (batch, seq_len, hidden_size)
        keys = self.Ua(encoder_outputs)

        energy = torch.tanh(query + keys)
        # One scalar score per source position: (batch, seq_len)
        scores = torch.matmul(energy, self.va.unsqueeze(1)).squeeze(2)

        if source_mask is not None:
            # Padded positions get a very negative score, i.e. ~0 weight
            padding = ~source_mask.squeeze(1)
            scores = scores.masked_fill(padding, -1e10)

        weights = F.softmax(scores, dim=1)
        # Weighted sum of encoder outputs: (batch, hidden_size*2)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, weights

In [None]:
# DECODER (FIXED)
class DecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, vocab_size, hidden_size, embed_size=None, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        if not embed_size:
            # Embedding width falls back to the hidden width
            embed_size = hidden_size
        context_size = hidden_size * 2  # width of the attention context vector

        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(embed_size + context_size, hidden_size, batch_first=True)
        self.fc_out = nn.Linear(hidden_size + context_size, vocab_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, target, encoder_outputs, hidden, source_mask=None, teacher_forcing_ratio=0.5):
        """
        Decode step by step, optionally feeding back the gold token.

        target: (batch, tgt_len)
        encoder_outputs: (batch, src_len, hidden_size*2)
        hidden: (1, batch, hidden_size)

        Returns logits of shape (batch, tgt_len, vocab_size); position 0 is
        never written and stays all zeros.
        """
        batch_size, target_len = target.size(0), target.size(1)
        logits = torch.zeros(batch_size, target_len, self.fc_out.out_features).to(target.device)

        # Decoding starts from the first target column (the SOS token)
        step_input = target[:, 0]

        for t in range(1, target_len):
            # (batch, 1, embed_size)
            token_emb = self.dropout(self.embedding(step_input)).unsqueeze(1)

            # (batch, hidden_size*2), computed from the previous hidden state
            context, _ = self.attention(hidden, encoder_outputs, source_mask)

            # The GRU consumes the embedding and the context side by side
            gru_in = torch.cat((token_emb, context.unsqueeze(1)), dim=2)
            gru_out, hidden = self.gru(gru_in, hidden)

            # Predict from the GRU output together with the same context
            step_logits = self.fc_out(torch.cat((gru_out.squeeze(1), context), dim=1))
            logits[:, t, :] = step_logits

            # A single coin flip per step decides for the whole batch
            use_gold = random.random() < teacher_forcing_ratio
            greedy = step_logits.argmax(1)
            if use_gold:
                step_input = target[:, t]
            else:
                step_input = greedy

        return logits

In [None]:
# UTILITY FUNCTIONS
def create_mask(source, pad_idx=0):
    """Return a boolean mask that is True wherever ``source`` is not padding.

    A singleton axis is inserted at position 1, so a (batch, seq_len) input
    yields a (batch, 1, seq_len) mask.
    """
    is_token = source.ne(pad_idx)
    return is_token.unsqueeze(1)

def translate_sentence(encoder, decoder, sentence, src_vocab, tgt_vocab, device, max_len=50):
    """Greedily translate a single sentence.

    Args:
        encoder: Called as ``encoder(src_tensor, src_len)`` and expected to
            return ``(encoder_outputs, hidden)``.
        decoder: Must expose ``attention``, ``embedding``, ``dropout``,
            ``gru`` and ``fc_out``; they are driven one step at a time.
        sentence: Raw source text.
        src_vocab: Source vocabulary, used for ``numericalize``.
        tgt_vocab: Target vocabulary, used for ``word2index``/``index2word``.
        device: Device on which the input tensors are created.
        max_len: Cap on the source length and on the number of decoding steps.

    Returns:
        The predicted tokens joined by single spaces, without SOS/EOS.

    Note:
        Both modules are switched to eval mode and left that way.
    """
    encoder.eval()
    decoder.eval()

    sos_id = tgt_vocab.word2index["<SOS>"]
    eos_id = tgt_vocab.word2index["<EOS>"]

    # Tokenize and numericalize the input sentence as a batch of one
    token_ids = src_vocab.numericalize(sentence, max_len)
    src_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    src_len = torch.tensor([len(token_ids)], dtype=torch.long, device=device)

    predicted = []
    with torch.no_grad():
        encoder_outputs, hidden = encoder(src_tensor, src_len)

        previous = sos_id
        for _ in range(max_len):
            step_input = torch.tensor([[previous]], dtype=torch.long, device=device)  # (1, 1)

            # No source mask is needed: a single sentence carries no padding
            context, _ = decoder.attention(hidden, encoder_outputs)  # (1, hidden*2)

            embedded = decoder.dropout(decoder.embedding(step_input))  # (1, 1, embed)
            rnn_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)
            output, hidden = decoder.gru(rnn_input, hidden)

            pred = decoder.fc_out(torch.cat((output.squeeze(1), context), dim=1))
            previous = pred.argmax(1).item()

            # Stop at EOS without recording it. Dropping "the last token"
            # afterwards would delete a real word whenever max_len is reached
            # before EOS is produced.
            if previous == eos_id:
                break
            predicted.append(previous)

    return ' '.join(tgt_vocab.index2word[i] for i in predicted)


In [None]:
# TRAINING FUNCTIONS
def train_epoch(encoder, decoder, dataloader, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.5, device='cpu'):
    """Run one optimization pass over ``dataloader``.

    Each batch is a dict with "source_padded", "target_padded" and
    "source_lengths". Gradients are clipped to ``clip`` separately for the
    encoder and the decoder before the optimizer step.

    Returns:
        The mean of the per-batch losses.
    """
    encoder.train()
    decoder.train()
    running_loss = 0

    for batch in dataloader:
        src = batch["source_padded"].to(device)
        tgt = batch["target_padded"].to(device)
        src_lens = batch["source_lengths"].to(device)

        optimizer.zero_grad()

        # Forward pass
        enc_out, enc_hidden = encoder(src, src_lens)
        pad_mask = create_mask(src).to(device)
        logits = decoder(tgt, enc_out, enc_hidden, pad_mask, teacher_forcing_ratio)

        # Position 0 (SOS) is never predicted, so drop it on both sides and
        # flatten to (batch * steps, vocab) against (batch * steps,)
        vocab_dim = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
        flat_gold = tgt[:, 1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_gold)
        batch_loss.backward()

        # Clip each module's gradient norm independently
        for module in (encoder, decoder):
            torch.nn.utils.clip_grad_norm_(module.parameters(), clip)

        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
# Inference
@torch.no_grad()
def evaluate(encoder, decoder, dataloader, criterion, device='cpu'):
    """Compute the mean per-batch loss over ``dataloader`` without gradients.

    Both modules are put in eval mode and the decoder runs with teacher
    forcing disabled, so every step is fed its own previous prediction.
    """
    encoder.eval()
    decoder.eval()
    running_loss = 0

    for batch in dataloader:
        src = batch["source_padded"].to(device)
        tgt = batch["target_padded"].to(device)
        src_lens = batch["source_lengths"].to(device)

        enc_out, enc_hidden = encoder(src, src_lens)
        pad_mask = create_mask(src).to(device)
        # Turn off teacher forcing
        logits = decoder(tgt, enc_out, enc_hidden, pad_mask, teacher_forcing_ratio=0.0)

        # Drop the SOS position and flatten for the criterion
        vocab_dim = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
        flat_gold = tgt[:, 1:].reshape(-1)

        running_loss += criterion(flat_logits, flat_gold).item()

    return running_loss / len(dataloader)

In [None]:
# TRAINING LOOP
def main(pairs=None):
    """Train the attention seq2seq English->Spanish model and print samples.

    Args:
        pairs: Optional list of ``{"en": ..., "es": ...}`` dicts. When
            omitted, the module-level ``data`` loaded earlier in this file is
            used. (The previous code read an undefined ``df``.)

    Raises:
        ValueError: If no usable sentence pairs are available.

    Side effects:
        Writes the best checkpoint (by validation loss) to
        ``best_translation_model.pt`` in the working directory.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Prepare data
    if pairs is None:
        pairs = data
    # A str here means the loader reported an error instead of returning pairs
    if isinstance(pairs, str) or not pairs:
        raise ValueError(f"No parallel sentences available for training: {pairs!r}")
    english_sentences = [pair["en"] for pair in pairs]
    spanish_sentences = [pair["es"] for pair in pairs]

    # Create train/val split
    train_eng, val_eng, train_spa, val_spa = train_test_split(
        english_sentences, spanish_sentences, test_size=0.1, random_state=42
    )

    print(f"Training samples: {len(train_eng)}")
    print(f"Validation samples: {len(val_eng)}")

    # Build vocabularies from the training split only, so validation words
    # that were never seen map to <UNK>
    english_vocab = Vocab("english")
    spanish_vocab = Vocab("spanish")

    for eng, spa in zip(train_eng, train_spa):
        english_vocab.add_sentence(eng)
        spanish_vocab.add_sentence(spa)

    print(f"English vocab size: {len(english_vocab)}")
    print(f"Spanish vocab size: {len(spanish_vocab)}")

    # Create datasets
    MAX_LEN = 30
    train_dataset = TranslationDataset(train_eng, train_spa, english_vocab, spanish_vocab, max_len=MAX_LEN)
    val_dataset = TranslationDataset(val_eng, val_spa, english_vocab, spanish_vocab, max_len=MAX_LEN)

    # Both loaders share one collate wrapper bound to the vocab PAD ids
    source_pad_id = english_vocab.word2index["<PAD>"]
    target_pad_id = spanish_vocab.word2index["<PAD>"]

    def collate(batch):
        return collate_fn(batch, source_pad_id=source_pad_id, target_pad_id=target_pad_id)

    # Create dataloaders
    BATCH_SIZE = 64
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=collate,
    )

    # Initialize model
    HIDDEN_SIZE = 256
    encoder = EncoderRNN(len(english_vocab), HIDDEN_SIZE).to(device)
    decoder = DecoderRNN(len(spanish_vocab), HIDDEN_SIZE).to(device)

    # Optimizer and loss function; padded target positions do not contribute
    optimizer = Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=target_pad_id)

    # Training loop
    N_EPOCHS = 10
    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):
        train_loss = train_epoch(
            encoder, decoder, train_loader, optimizer, criterion,
            clip=1.0, teacher_forcing_ratio=0.5, device=device
        )
        valid_loss = evaluate(encoder, decoder, val_loader, criterion, device=device)

        print(f"Epoch: {epoch+1}/{N_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {valid_loss:.4f}")

        # Save best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # The Vocab objects are pickled into the checkpoint alongside the weights
            torch.save({
                'epoch': epoch,
                'encoder_state_dict': encoder.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'english_vocab': english_vocab,
                'spanish_vocab': spanish_vocab,
            }, "best_translation_model.pt")
            print(f"Saved new best model with validation loss: {valid_loss:.4f}")

    print("Training complete!")

    # Test translation with a few examples
    print("\nTranslation Examples:")
    test_sentences = [
        "Hello, how are you?",
        "I love programming.",
        "What is your name?",
        "The weather is nice today."
    ]

    for sentence in test_sentences:
        translation = translate_sentence(
            encoder, decoder, sentence, english_vocab, spanish_vocab, device, max_len=MAX_LEN
        )
        print(f"EN: {sentence}")
        print(f"ES: {translation}\n")

if __name__ == "__main__":
    main()

In [None]:
#Evaluation
import sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER
from nltk.translate.meteor_score import meteor_score
import nltk
import numpy as np
from tqdm import tqdm

# Download required NLTK data once
# NOTE(review): presumably these corpora back the METEOR scorer imported above,
# and quiet=True presumably silences the download output — confirm.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

class MTEvaluator:
    """Comprehensive MT evaluation suite (BLEU, chrF++, TER, METEOR)."""

    def __init__(self, target_language='es'):
        """Create the metric objects.

        Args:
            target_language: ``'es'`` selects sacrebleu's ``intl`` tokenizer
                for BLEU; any other value selects ``13a``.
        """
        self.bleu = BLEU()
        self.chrf = CHRF(word_order=2)  # chrF++
        self.ter = TER()
        self.target_lang = target_language

    def normalize_text(self, text):
        """Light normalization for fair comparison: trim, lower-case, collapse whitespace."""
        return re.sub(r'\s+', ' ', text.strip().lower())

    @staticmethod
    def _per_segment_references(references):
        """Return one list of reference strings per hypothesis."""
        return [[ref] if isinstance(ref, str) else list(ref) for ref in references]

    @staticmethod
    def _reference_streams(per_segment):
        """Transpose per-hypothesis reference lists into sacrebleu reference streams.

        sacrebleu takes one list per reference *set*, each as long as the
        hypothesis list, rather than one list per hypothesis.
        """
        counts = {len(refs) for refs in per_segment}
        if len(counts) != 1 or 0 in counts:
            raise ValueError("Every hypothesis needs the same non-zero number of references")
        return [list(stream) for stream in zip(*per_segment)]

    @torch.no_grad()
    def generate_translations(self, encoder, decoder, dataloader,
                             src_vocab, tgt_vocab, device, max_len=50):
        """Greedily translate every batch of ``dataloader``.

        Batches are dicts with "source_padded", "target_padded",
        "source_lengths" and "perm_idx". Rows arrive sorted by source length;
        results are emitted in the pre-sort order recovered from ``perm_idx``.

        Returns:
            ``(sources, references, hypotheses)``: three index-aligned lists
            of space-joined token strings. References and sources are rebuilt
            from the numericalized tensors, not from the raw text.
        """
        encoder.eval()
        decoder.eval()

        sos_id = tgt_vocab.word2index["<SOS>"]
        eos_id = tgt_vocab.word2index["<EOS>"]
        pad_id = tgt_vocab.word2index["<PAD>"]
        tgt_special = {sos_id, eos_id, pad_id}
        src_pad_id = src_vocab.word2index["<PAD>"]

        sources, references, hypotheses = [], [], []

        for batch in tqdm(dataloader, desc="Generating translations"):
            source = batch["source_padded"].to(device)
            target = batch["target_padded"].to(device)
            source_lengths = batch["source_lengths"].to(device)

            # inv_perm[j] is the sorted-batch row holding original example j
            inv_perm = torch.argsort(batch["perm_idx"]).tolist()

            # Encode
            encoder_outputs, hidden_states = encoder(source, source_lengths)
            source_mask = create_mask(source).to(device)

            # Greedy decoding, starting every row from SOS
            batch_size = source.size(0)
            input_tokens = torch.full((batch_size,), sos_id, dtype=torch.long, device=device)
            unfinished = torch.ones(batch_size, dtype=torch.bool, device=device)
            steps = []  # one list of predicted ids (length batch_size) per step

            for _ in range(1, max_len):
                embedded = decoder.embedding(input_tokens.unsqueeze(1))
                context, _ = decoder.attention(hidden_states, encoder_outputs, source_mask)
                rnn_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)

                output, hidden_states = decoder.gru(rnn_input, hidden_states)
                pred = decoder.fc_out(torch.cat((output.squeeze(1), context), dim=1))
                pred_tokens = pred.argmax(1)

                # Rows that already produced EOS only emit PAD from now on
                pred_tokens[~unfinished] = pad_id
                unfinished &= (pred_tokens != eos_id)

                steps.append(pred_tokens.cpu().tolist())
                input_tokens = pred_tokens

                if not unfinished.any():
                    break

            target_rows = target.cpu().tolist()
            source_rows = source.cpu().tolist()

            # Read hypothesis, reference and source from the SAME sorted row.
            # Mixing row i with row inv_perm[i] would pair each hypothesis
            # with another example's reference.
            for row in inv_perm:
                hyp_ids = [step[row] for step in steps if step[row] not in tgt_special]
                ref_ids = [idx for idx in target_rows[row][1:] if idx not in tgt_special]
                src_ids = [idx for idx in source_rows[row] if idx != src_pad_id]

                hypotheses.append(' '.join(tgt_vocab.index2word.get(idx, '<UNK>') for idx in hyp_ids))
                references.append(' '.join(tgt_vocab.index2word.get(idx, '<UNK>') for idx in ref_ids))
                sources.append(' '.join(src_vocab.index2word.get(idx, '<UNK>') for idx in src_ids))

        return sources, references, hypotheses

    def compute_metrics(self, hypotheses, references):
        """Compute corpus BLEU, chrF++, TER and mean sentence METEOR.

        Args:
            hypotheses: List of system output strings.
            references: Either one reference string per hypothesis, or one
                list of reference strings per hypothesis.

        Returns:
            Dict with keys 'BLEU', 'chrF++', 'TER', 'METEOR' (scaled by 100)
            and 'n_samples'.

        Raises:
            ValueError: On empty input, mismatched lengths, or an uneven
                number of references per hypothesis.
        """
        if not hypotheses:
            raise ValueError("Cannot compute metrics without hypotheses")

        per_segment = self._per_segment_references(references)
        if len(per_segment) != len(hypotheses):
            raise ValueError("Hypotheses and references must have the same length")
        ref_streams = self._reference_streams(per_segment)

        # BLEU (sacrebleu handles tokenization properly)
        bleu = sacrebleu.corpus_bleu(hypotheses, ref_streams,
                                    tokenize='intl' if self.target_lang == 'es' else '13a')

        # chrF++ (better for Spanish morphology)
        chrf = sacrebleu.corpus_chrf(hypotheses, ref_streams, word_order=2)

        # TER
        ter = sacrebleu.corpus_ter(hypotheses, ref_streams)

        # METEOR (requires NLTK): whitespace-tokenize each whole reference
        # string. Indexing ref[0] would keep only its first character.
        meteor_scores = [
            meteor_score([ref.split() for ref in refs], hyp.split())
            for refs, hyp in zip(per_segment, hypotheses)
        ]
        meteor_avg = np.mean(meteor_scores)

        return {
            'BLEU': bleu.score,
            'chrF++': chrf.score,
            'TER': ter.score,
            'METEOR': meteor_avg * 100,  # Convert to percentage
            'n_samples': len(hypotheses)
        }

    def detailed_report(self, sources, references, hypotheses, n_samples=5):
        """Print the metric summary plus up to ``n_samples`` examples.

        Returns:
            The metrics dict produced by ``compute_metrics``.
        """
        metrics = self.compute_metrics(hypotheses, references)

        print("="*80)
        print("TRANSLATION EVALUATION REPORT")
        print("="*80)
        print(f"Total samples: {metrics['n_samples']}")
        print(f"BLEU:    {metrics['BLEU']:.2f}")
        print(f"chrF++:  {metrics['chrF++']:.2f}  ← Best for Spanish morphology")
        print(f"TER:     {metrics['TER']:.2f} (lower is better)")
        print(f"METEOR:  {metrics['METEOR']:.2f}")
        print("="*80)

        # Show examples
        print("\nEXAMPLE TRANSLATIONS:\n")
        for i in range(min(n_samples, len(sources))):
            print(f"Source (EN):      {sources[i]}")
            print(f"Reference (ES):   {references[i][0] if isinstance(references[i], list) else references[i]}")
            print(f"Hypothesis (ES):  {hypotheses[i]}")
            print("-"*80)

        return metrics

# Usage example:
def evaluate_model(encoder, decoder, val_loader, src_vocab, tgt_vocab, device,
                   max_len=40, n_samples=10):
    """Translate ``val_loader``, print an evaluation report, and return the metrics.

    Args:
        encoder, decoder: Trained modules passed to ``generate_translations``.
        val_loader: DataLoader to translate.
        src_vocab, tgt_vocab: Vocabularies used to turn ids back into text.
        device: Device on which the batches are processed.
        max_len: Decoding-length cap passed to ``generate_translations``.
        n_samples: Number of example translations printed in the report.

    Returns:
        The metrics dict returned by ``MTEvaluator.detailed_report``.
    """
    evaluator = MTEvaluator(target_language='es')

    # Generate translations
    sources, references, hypotheses = evaluator.generate_translations(
        encoder, decoder, val_loader, src_vocab, tgt_vocab, device, max_len=max_len
    )

    # detailed_report computes the metrics itself and returns them, so a
    # separate compute_metrics call would score the whole corpus twice.
    return evaluator.detailed_report(sources, references, hypotheses, n_samples=n_samples)

In [None]:
#Benchmarking
# After training, evaluate on a public WMT set
import datasets

# Identifiers of the benchmark actually loaded; the printed label below is
# built from these so it cannot drift from the data.
# NOTE(review): confirm that an 'en-es' config exists for 'wmt19' on the
# Hugging Face Hub — the original comments referred to WMT20 instead.
WMT_DATASET = 'wmt19'
WMT_CONFIG = 'en-es'
WMT_SPLIT = 'validation'

wmt_test = datasets.load_dataset(WMT_DATASET, WMT_CONFIG, split=WMT_SPLIT)

# Extract sentences
test_sources = [ex['translation']['en'] for ex in wmt_test]
# sacrebleu expects a list of reference *streams*: one inner list per
# reference set, each as long as the hypothesis list. There is a single
# reference per sentence here, hence exactly one stream.
test_references = [[ex['translation']['es'] for ex in wmt_test]]

# Translate your sources
# NOTE(review): assumes encoder, decoder, src_vocab, tgt_vocab and device exist
# as notebook globals; main() keeps its own copies local — TODO confirm how
# they are meant to be provided.
your_hypotheses = [translate_sentence(encoder, decoder, src, src_vocab, tgt_vocab, device)
                   for src in test_sources]

# Compute BLEU against official references
bleu = sacrebleu.corpus_bleu(your_hypotheses, test_references, tokenize='intl')
print(f"Your model BLEU on {WMT_DATASET} {WMT_CONFIG} ({WMT_SPLIT}): {bleu.score:.2f}")

# Compare to published results (figures as noted by the original author; unverified):
# - Google Translate (2023): ~43.5 BLEU
# - Modern Transformer (WMT winner): ~45.2 BLEU
# - Your Seq2Seq RNN baseline: ~25-32 BLEU (typical)