In [2]:
# imports

import os
import sys
import time
import math

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from torch.utils.data import DataLoader

import spacy
from collections import Counter

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random

from torchmetrics.functional.text import bleu_score

In [3]:
# Pick the compute device once for the whole notebook.
# Apple MPS keeps priority (the original target); CUDA is used when present so
# the notebook also runs on a GPU server; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device", device)

Device mps


In [4]:
# 1. Seed every random number generator used in this notebook so runs are repeatable.
SEED = 1234

# Python's `random`, NumPy's global RNG and PyTorch's default generator, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
del _seed_fn  # do not leak the loop helper into the notebook namespace

# Backend-specific generators are seeded only when that backend exists.
if torch.backends.mps.is_available():
    torch.mps.manual_seed(SEED)

# CUDA branch is kept for the case where the notebook is moved to a GPU server.
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

In [5]:
# Download (or load from the local cache) the WMT14 French-English dataset.
# This might take a few minutes as the dataset is large.
# The result is bound to the notebook-global `dataset`, which later cells index
# by split name ('train', 'validation').
dataset = load_dataset("wmt14", "fr-en")

# Print the dataset structure to verify which splits and row counts were loaded
print(dataset)

# Example: inspect the first training example. Each row is accessed below as
# row['translation'], a dict with an 'en' and a 'fr' entry.
print("First training example:", dataset['train'][0])

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40836715
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})
First training example: {'translation': {'en': 'Resumption of the session', 'fr': 'Reprise de la session'}}


In [6]:
# Load the spaCy French and English pipelines. In this notebook only their
# `.tokenizer` attribute is used (see Vocabulary.tokenizer_fr / tokenizer_eng).
spacy_fr = spacy.load("fr_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

In [7]:
# Vocabulary: manages the word <-> id mapping, including the size cap
# (the "80k vocabulary" limit mentioned in the paper).

class Vocabulary:
    """Bidirectional token/id mapping with four reserved special tokens.

    Ids 0-3 are always <PAD>, <SOS>, <EOS> and <UNK>. Regular words receive ids
    from 4 upwards, in decreasing order of corpus frequency.
    """

    def __init__(self, freq_threshold=2, max_size=80000):
        """Create a vocabulary holding only the special tokens.

        Args:
            freq_threshold: minimum count a word needs to be added.
            max_size: upper bound on vocabulary size, special tokens included.
        """
        specials = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.itos = dict(enumerate(specials))
        self.stoi = {token: position for position, token in enumerate(specials)}
        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        """Number of known tokens, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Lower-cased English tokens produced by the spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    @staticmethod
    def tokenizer_fr(text):
        """Lower-cased French tokens produced by the spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_fr.tokenizer(text)]

    def build_vocabulary(self, sentence_list, tokenizer):
        """Fill the mapping from an iterable of raw sentences.

        Args:
            sentence_list: iterable of strings.
            tokenizer: callable turning one string into a list of tokens.
        """
        # Count every token occurrence across the whole corpus.
        counts = Counter(
            token
            for sentence in sentence_list
            for token in tokenizer(sentence)
        )

        # Keep the most frequent words, leaving room for the 4 special tokens,
        # then drop anything rarer than the frequency threshold.
        next_id = 4
        for token, freq in counts.most_common(self.max_size - 4):
            if freq < self.freq_threshold:
                continue
            self.stoi[token] = next_id
            self.itos[next_id] = token
            next_id += 1

    def numericalize(self, text, tokenizer):
        """Convert a raw string into a list of ids; unknown tokens map to <UNK>."""
        tokens = tokenizer(text)
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in tokens]

In [8]:
# PyTorch Dataset that converts raw Hugging Face translation rows into id
# tensors using the vocabulary objects. Source tokens stay in natural order.

class WMT14Dataset_regular_order(Dataset):
    """English -> French pairs as (source_ids, target_ids) tensors, source not reversed."""

    def __init__(self, hf_dataset, source_vocab, target_vocab):
        """Store the row container and the source/target vocabularies."""
        self.hf_dataset, self.source_vocab, self.target_vocab = (
            hf_dataset,
            source_vocab,
            target_vocab,
        )

    def __len__(self):
        """Number of translation pairs."""
        return len(self.hf_dataset)

    def __getitem__(self, index):
        """Return one pair as two 1-D tensors, each wrapped in <SOS> ... <EOS>."""
        translation = self.hf_dataset[index]['translation']
        english, french = translation['en'], translation['fr']
        src_vocab, trg_vocab = self.source_vocab, self.target_vocab

        source_ids = [
            src_vocab.stoi["<SOS>"],
            *src_vocab.numericalize(english, src_vocab.tokenizer_eng),
            src_vocab.stoi["<EOS>"],
        ]
        target_ids = [
            trg_vocab.stoi["<SOS>"],
            *trg_vocab.numericalize(french, trg_vocab.tokenizer_fr),
            trg_vocab.stoi["<EOS>"],
        ]
        return torch.tensor(source_ids), torch.tensor(target_ids)

In [9]:
class WMT14Dataset(Dataset):
    """English -> French pairs as id tensors, with the SOURCE token order reversed.

    Reversing the input sentence implements the paper's first trick; the target
    sentence is left in its natural order.
    """

    def __init__(self, hf_dataset, source_vocab, target_vocab):
        """Store the row container and the source/target vocabularies."""
        self.hf_dataset, self.source_vocab, self.target_vocab = (
            hf_dataset,
            source_vocab,
            target_vocab,
        )

    def __len__(self):
        """Number of translation pairs."""
        return len(self.hf_dataset)

    def __getitem__(self, index):
        """Return (reversed_source_ids, target_ids), each wrapped in <SOS> ... <EOS>."""
        translation = self.hf_dataset[index]['translation']
        english, french = translation['en'], translation['fr']
        src_vocab, trg_vocab = self.source_vocab, self.target_vocab

        # Source: e.g. [4, 10, 55] ("the", "cat", "sat") becomes [55, 10, 4].
        # Only the words are reversed; <SOS>/<EOS> still open and close the sequence.
        forward_ids = src_vocab.numericalize(english, src_vocab.tokenizer_eng)
        source_ids = (
            [src_vocab.stoi["<SOS>"]]
            + list(reversed(forward_ids))
            + [src_vocab.stoi["<EOS>"]]
        )

        # Target: natural order, never reversed.
        target_ids = (
            [trg_vocab.stoi["<SOS>"]]
            + trg_vocab.numericalize(french, trg_vocab.tokenizer_fr)
            + [trg_vocab.stoi["<EOS>"]]
        )

        return torch.tensor(source_ids), torch.tensor(target_ids)

In [10]:
# Sentences have different lengths, so a batch cannot simply be stacked.
# This collate_fn pads the shorter sequences with the <PAD> id so that every
# batch becomes a rectangular tensor.

class MyCollate:
    """Callable collate_fn that pads (source, target) pairs to the batch maximum."""

    def __init__(self, pad_idx):
        """Remember the id used to fill the padded positions."""
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Return (source, target), each laid out as (max_seq_len, batch_size)."""

        def pad_column(position):
            # Gather one side of every pair and pad it to the longest sequence.
            sequences = [sample[position] for sample in batch]
            return pad_sequence(
                sequences, batch_first=False, padding_value=self.pad_idx
            )

        return pad_column(0), pad_column(1)

In [11]:
# 1. Load Data
# Select subsets: the first 10k training rows and the first 1k validation rows
train_subset = dataset['train'].select(range(10000))
valid_subset = dataset['validation'].select(range(1000))

print(f"Train Subset Size: {len(train_subset)}")
print(f"Valid Subset Size: {len(valid_subset)}")

# 2. Build Vocabulary
# Both vocabularies are built from the TRAINING subset only; validation words
# that never occur in it are mapped to <UNK> by Vocabulary.numericalize.
print("Building English Vocabulary...")
english_sentences = [item['translation']['en'] for item in train_subset]
vocab_en = Vocabulary(freq_threshold=1, max_size=80000) # Lowered freq_threshold for smaller dataset 10k
vocab_en.build_vocabulary(english_sentences, vocab_en.tokenizer_eng)


print("Building French Vocabulary...")
french_sentences = [item['translation']['fr'] for item in train_subset]
vocab_fr = Vocabulary(freq_threshold=1, max_size=80000) # Lowered freq_threshold for smaller dataset 10k
vocab_fr.build_vocabulary(french_sentences, vocab_fr.tokenizer_fr)

# 3. Create Dataset
# WMT14Dataset reverses the source sentence. The names below (note the doubled
# "set") are referenced by later cells, so they are kept as they are.
train_subsetset = WMT14Dataset(train_subset, vocab_en, vocab_fr)
valid_subsetset = WMT14Dataset(valid_subset, vocab_en, vocab_fr)

# 4. Create DataLoaders
# Only a training loader is built here; valid_subsetset is not wrapped in a DataLoader in this cell.
BATCH_SIZE = 32 # Reduced batch size for smaller dataset 10k
# <PAD> has id 0 in every Vocabulary, so this index is valid for both source and target padding.
pad_idx = vocab_en.stoi["<PAD>"]

train_loader = DataLoader(
    dataset=train_subsetset,
    batch_size=BATCH_SIZE,
    num_workers=0,
    shuffle=True,
    collate_fn=MyCollate(pad_idx=pad_idx)
)

# 5. Test it: pull a single batch and print its shape as a smoke test
print("Testing the pipeline...")
for src_batch, trg_batch in train_loader:
    print(f"Source Shape: {src_batch.shape}") # Expect [Seq_Len, Batch_Size]
    print(f"Target Shape: {trg_batch.shape}")
    break

Train Subset Size: 10000
Valid Subset Size: 1000
Building English Vocabulary...
Building French Vocabulary...
Testing the pipeline...
Source Shape: torch.Size([80, 32])
Target Shape: torch.Size([86, 32])


In [12]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that summarises a source sentence.

    Args:
        input_size: size of the source vocabulary.
        embedding_size: dimensionality of the word embeddings.
        hidden_size: LSTM hidden/cell state size.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used after the embedding and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Dropout applied to the embedded input words.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # `dropout=p` is the LSTM's own dropout between its stacked layers.
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode a whole batch of sentences at once.

        Args:
            x: token ids of shape (seq_length, batch_size).

        Returns:
            (hidden, cell): the final LSTM states, each of shape
            (num_layers, batch_size, hidden_size).
        """
        embedded = self.embedding(x)        # (seq_length, batch_size, embedding_size)
        embedded = self.dropout(embedded)
        # The per-step outputs are discarded; only the final states are passed on.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell


class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    Args:
        input_size: size of the target vocabulary (embedding table rows).
        embedding_size: dimensionality of the word embeddings.
        hidden_size: LSTM hidden/cell state size.
        output_size: number of output classes (the target vocabulary size).
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used after the embedding and inside the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        # Projects the LSTM output onto vocabulary logits.
        self.fc_out = nn.Linear(hidden_size, output_size)
        self.output_size = output_size

    def forward(self, x, hidden, cell):
        """Run a single decoding time step.

        Args:
            x: previous token ids, shape (batch_size,).
            hidden, cell: LSTM states, each (num_layers, batch_size, hidden_size).

        Returns:
            (predictions, hidden, cell) where predictions has shape
            (batch_size, output_size) and the states are the updated ones.
        """
        step_input = x.unsqueeze(0)                       # (1, batch_size): add the time axis
        embedded = self.dropout(self.embedding(step_input))  # (1, batch_size, embedding_size)
        outputs, state = self.rnn(embedded, (hidden, cell))  # outputs: (1, batch_size, hidden_size)
        hidden, cell = state
        # Drop the length-1 time axis before the linear projection.
        predictions = self.fc_out(outputs.squeeze(0))
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder):
        """Store the two sub-modules; `decoder` must expose `output_size`."""
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Produce logits for every target position.

        Args:
            source: source token ids, shape (src_len, batch_size).
            target: target token ids, shape (trg_len, batch_size); row 0 is
                used as the first decoder input (the <SOS> tokens).
            teacher_force_ratio: probability, drawn per time step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (trg_len, batch_size, target_vocab_size).
            Row 0 is never written and stays all zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.decoder.output_size

        # Allocate on the same device as the input rather than on a notebook-level
        # global, so the module works wherever its inputs live.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # First input to the decoder is the <SOS> row of the target
        x = target[0, :]  # shape: (batch_size,)

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            # Decide, for this step only, whether to use teacher forcing
            teacher_force = random.random() < teacher_force_ratio

            # Highest-scoring token according to the model
            best_guess = output.argmax(1)

            # Next input: ground truth if teacher forcing, else the model's guess
            x = target[t] if teacher_force else best_guess

        return outputs

In [13]:
# Weight initialisation helper (intended for use with `model.apply`)
def init_weights(m):
    """Re-draw every parameter reachable from `m` uniformly from [-0.08, 0.08]."""
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)

# Learning-rate decay helper
def adjust_learning_rate(optimizer, decay_factor=0.5):
    """Multiply the learning rate of every parameter group by `decay_factor`.

    The new rate of the first group is printed afterwards.
    """
    groups = optimizer.param_groups
    for group in groups:
        group['lr'] = group['lr'] * decay_factor
    print(f"üìâ Learning Rate decayed to: {optimizer.param_groups[0]['lr']}")

In [14]:
# --- Architecture Specs from Paper ---
INPUT_DIM = len(vocab_en)   # source (English) vocabulary size
OUTPUT_DIM = len(vocab_fr)  # target (French) vocabulary size
ENC_EMB_DIM = 1000  # Paper used 1000
DEC_EMB_DIM = 1000  # Paper used 1000
HID_DIM = 1000      # Paper used 1000
N_LAYERS = 4        # Paper used 4
DROPOUT = 0.2       # Regularisation choice made for this notebook

# --- Setup ---
# The loss is computed over FRENCH target tokens, so the padding index to
# ignore must come from the target vocabulary. (Both vocabularies currently
# assign 0 to <PAD>, but reading it from vocab_fr keeps this correct if the
# two ever diverge.)
TRG_PAD_IDX = vocab_fr.stoi["<PAD>"]

# Create Model
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
# The decoder embeds and predicts target tokens, hence OUTPUT_DIM is passed twice.
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec).to(device)

# Initialize Weights: uniform in [-0.08, 0.08] (see init_weights)
model.apply(init_weights)

# --- Optimization Specs ---
# NOTE: train_loader was already built with its own batch size; re-assigning
# BATCH_SIZE here does not change that loader.
BATCH_SIZE = 32    # Paper used 128
LEARNING_RATE = 0.7 # Paper used fixed 0.7 initially
CLIP = 5            # Paper threshold for gradient norm
TOTAL_EPOCHS = 8 # Paper trained for 7.5 epochs
# Optimizer: SGD without momentum
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [61]:
# --- Training Loop ---
# Plain SGD training with a step-wise learning-rate schedule: from the 6th
# epoch onwards (epoch index >= 5) the learning rate is halved twice per epoch,
# once at the middle batch and once after the last batch.
for epoch in range(TOTAL_EPOCHS):

    model.train()
    epoch_loss = 0

    # We need to know when we are "halfway" through
    num_batches = len(train_loader)
    halfway_point = num_batches // 2

    for i, (src, trg) in enumerate(train_loader):

        # --- THE HALF-EPOCH CHECK ---
        # `epoch` is 0-based, so this condition first holds in the 6th epoch.
        if epoch >= 5:
            # Decay right before processing the batch at the halfway index
            if i == halfway_point:
                adjust_learning_rate(optimizer, 0.5)

        # Standard Training Step
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)

        # Drop position 0 (the <SOS> row, for which the model writes no prediction)
        # and flatten time and batch so the loss sees (N, vocab) logits vs (N,) targets.
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Clip the global gradient norm to CLIP before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        epoch_loss += loss.item()

    # --- END OF EPOCH CHECK ---
    # After the loop finishes (end of epoch), if we are past epoch 5, decay again
    if epoch >= 5:
        adjust_learning_rate(optimizer, 0.5)

    # Report the mean per-batch loss for this epoch
    print(f'Epoch: {epoch+1:02} | Loss: {epoch_loss / len(train_loader):.3f}')

Epoch: 01 | Loss: 6.705
Epoch: 02 | Loss: 6.414
Epoch: 03 | Loss: 6.313
Epoch: 04 | Loss: 6.272
Epoch: 05 | Loss: 6.247
üìâ Learning Rate decayed to: 0.35
üìâ Learning Rate decayed to: 0.175
Epoch: 06 | Loss: 6.223
üìâ Learning Rate decayed to: 0.0875
üìâ Learning Rate decayed to: 0.04375
Epoch: 07 | Loss: 6.191
üìâ Learning Rate decayed to: 0.021875
üìâ Learning Rate decayed to: 0.0109375
Epoch: 08 | Loss: 6.184


In [19]:
def beam_search_decode_single(model, sentence, vocab_src, vocab_trg, beam_size=2, max_len=50, device='cpu'):
    """Translate one sentence with beam search and return the decoded string.

    Args:
        model: object exposing `eval()`, `encoder(src)` -> (hidden, cell) and
            `decoder(token, hidden, cell)` -> (logits, hidden, cell).
        sentence: raw source string, or an iterable of source tokens.
        vocab_src: source vocabulary (`stoi`, `tokenizer_eng`).
        vocab_trg: target vocabulary (`stoi`, `itos`).
        beam_size: number of hypotheses kept after each step (must be >= 1).
        max_len: maximum number of decoding steps.
        device: device the input tensors are moved to.

    Returns:
        The best hypothesis as space-joined target tokens, without <SOS> and
        truncated at the first <EOS>.

    Raises:
        ValueError: if `beam_size` is smaller than 1.

    Note:
        The model is switched to eval mode and left in that state.
    """
    if beam_size < 1:
        raise ValueError(f"beam_size must be >= 1, got {beam_size}")

    model.eval()

    # Look the special ids up once instead of on every comparison.
    unk_src = vocab_src.stoi["<UNK>"]
    sos_trg = vocab_trg.stoi["<SOS>"]
    eos_trg = vocab_trg.stoi["<EOS>"]

    # 1. Prepare Source
    if isinstance(sentence, str):
        tokens = vocab_src.tokenizer_eng(sentence)
    else:
        tokens = [token.lower() for token in sentence]

    indices = [vocab_src.stoi.get(t, unk_src) for t in tokens]
    # The model was trained on reversed source sentences (see WMT14Dataset).
    indices = [vocab_src.stoi["<SOS>"]] + indices[::-1] + [vocab_src.stoi["<EOS>"]]

    src_tensor = torch.LongTensor(indices).unsqueeze(1).to(device)  # (src_len, 1)

    with torch.no_grad():
        encoder_hidden, encoder_cell = model.encoder(src_tensor)

        # Hypothesis: (cumulative log-prob, [token ids], hidden, cell)
        hypotheses = [(0.0, [sos_trg], encoder_hidden, encoder_cell)]

        for _ in range(max_len):
            all_candidates = []

            for score, seq, hidden, cell in hypotheses:
                # Finished hypotheses are carried over unchanged.
                if seq[-1] == eos_trg:
                    all_candidates.append((score, seq, hidden, cell))
                    continue

                input_tensor = torch.LongTensor([seq[-1]]).to(device)

                # Predict the next-token distribution
                prediction, new_h, new_c = model.decoder(input_tensor, hidden, cell)

                # prediction is [1, vocab_size]; squeeze to [vocab_size] and normalise
                log_probs = F.log_softmax(prediction.squeeze(0), dim=0)

                # Expand with the 2 * beam_size best continuations, but never ask
                # topk for more entries than the vocabulary holds (that raises).
                k = min(beam_size * 2, log_probs.size(0))
                top_k_probs, top_k_ids = log_probs.topk(k)

                for log_prob, word_idx in zip(top_k_probs.tolist(), top_k_ids.tolist()):
                    all_candidates.append((score + log_prob, seq + [word_idx], new_h, new_c))

            # Prune to the best `beam_size` hypotheses
            hypotheses = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]

            if all(h[1][-1] == eos_trg for h in hypotheses):
                break

    best_seq = hypotheses[0][1]
    decoded_words = [vocab_trg.itos[idx] for idx in best_seq]

    if "<SOS>" in decoded_words:
        decoded_words.remove("<SOS>")
    if "<EOS>" in decoded_words:
        decoded_words = decoded_words[:decoded_words.index("<EOS>")]

    return " ".join(decoded_words)

In [20]:

def evaluate_bleu(data_subset, model, vocab_src, vocab_trg, device, beam_size=2):
    """Translate every pair in `data_subset` and return corpus BLEU-4 (0-100).

    Args:
        data_subset: iterable of rows shaped like
            {'translation': {'en': ..., 'fr': ...}}; must support len().
        model: model handed to beam_search_decode_single.
        vocab_src: source vocabulary.
        vocab_trg: target vocabulary; its `tokenizer_fr` is also used to
            normalise the references.
        device: device used for decoding.
        beam_size: beam width used for decoding.

    Returns:
        BLEU score as a float, scaled to percent.
    """
    targets = []      # Ground truths, one list of references per sentence
    predictions = []  # Model outputs

    print(f"Starting BLEU Evaluation on {len(data_subset)} samples (Beam={beam_size})...")

    for i, datum in enumerate(data_subset):
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1} sentences...")

        pair = datum['translation']
        src_text = pair['en']
        trg_text = pair['fr']

        # 1. Get Prediction (lower-cased, space-joined target tokens)
        pred_sentence = beam_search_decode_single(
            model,
            src_text,
            vocab_src,
            vocab_trg,
            beam_size,
            max_len=50,
            device=device
        )

        # 2. Put the reference in the same form as the prediction. Scoring the
        # raw, cased string against lower-cased tokens would miss matching
        # words purely because of casing and attached punctuation.
        ref_sentence = " ".join(vocab_trg.tokenizer_fr(trg_text))

        # 3. Collect Data
        # Targets must be a list of lists: [['ref_sentence']]
        targets.append([ref_sentence])
        predictions.append(pred_sentence)

    # 4. Compute Score
    print("Computing Score...")
    score = bleu_score(predictions, targets, n_gram=4)
    return score.item() * 100

In [21]:
# Run this after training.
# Scores the current `model` on the validation subset with a beam width of 12.
final_score = evaluate_bleu(
    valid_subset,
    model,             
    vocab_en, 
    vocab_fr, 
    device,            
    beam_size=12
)

# evaluate_bleu already returns the score scaled to percent
print(f"Final Single-Model BLEU: {final_score:.2f}")

Starting BLEU Evaluation on 1000 samples (Beam=12)...
Processed 50 sentences...
Processed 100 sentences...
Processed 150 sentences...
Processed 200 sentences...
Processed 250 sentences...
Processed 300 sentences...
Processed 350 sentences...
Processed 400 sentences...
Processed 450 sentences...
Processed 500 sentences...
Processed 550 sentences...
Processed 600 sentences...
Processed 650 sentences...
Processed 700 sentences...
Processed 750 sentences...
Processed 800 sentences...
Processed 850 sentences...
Processed 900 sentences...
Processed 950 sentences...
Processed 1000 sentences...
Computing Score...
Final Single-Model BLEU: 0.00


In [105]:
def show_random_translations(dataset, models, vocab_src, vocab_trg, device, n_samples=5, beam_size=2):
    """Print source, reference and model translation for random examples.

    Args:
        dataset: wrapper exposing `hf_dataset` (raw rows) and supporting len().
        models: passed unchanged to beam_search_decode_single. A list is only
            *labelled* "ENSEMBLE" in the header; no ensemble decoding is done here.
        vocab_src, vocab_trg: source/target vocabularies.
        device: device used for decoding.
        n_samples: number of examples to print (drawn with replacement).
        beam_size: beam width used for decoding.
    """
    model_type = "ENSEMBLE" if isinstance(models, list) else "SINGLE MODEL"
    print(f"--- Visualizing {n_samples} Random Samples ({model_type}) ---")

    separator = "-" * 50
    for example_no in range(1, n_samples + 1):
        # Draw a random row and read the raw Hugging Face text pair.
        idx = random.randint(0, len(dataset)-1)
        pair = dataset.hf_dataset[idx]['translation']
        src, trg = pair['en'], pair['fr']

        # Translate the English side.
        pred = beam_search_decode_single(models, src, vocab_src, vocab_trg, beam_size, device=device)

        # Show the three versions side by side.
        print(f"\nExample {example_no}:")
        print(f"SRC (English): {src}")
        print(f"TRG (French):  {trg}")
        print(f"PRED (Model):  {pred}")
        print(separator)

# --- Usage Examples ---

# 1. For Single Model
# `valid_subsetset` is the WMT14Dataset wrapper (it exposes `.hf_dataset`), not the raw subset.
show_random_translations(valid_subsetset, model, vocab_en, vocab_fr, device, n_samples=3)

# 2. For Ensemble (Pass the list of 5 models)
# Left disabled: `valid_dataset` and `ensemble_models` are not defined in the visible cells.
# show_random_translations(valid_dataset, ensemble_models, vocab_en, vocab_fr, DEVICE, n_samples=3)

--- Visualizing 3 Random Samples (SINGLE MODEL) ---

Example 1:
SRC (English): The Federal Security Service now spreads a big network of fake sites and there are tons of potential buyers of military weapons.
TRG (French):  Le Service f√©d√©ral de s√©curit√© a diffus√© un immense r√©seau de faux sites et ramasse √† la pelle les personnes d√©sireuses d'acheter des armes de combat.
PRED (Model):  , , , de . .
--------------------------------------------------

Example 2:
SRC (English): One thing is certain: these new provisions will have a negative impact on voter turn-out.
TRG (French):  Une chose est certaine: ces nouvelles dispositions influenceront n√©gativement le taux de participation.
PRED (Model):  , , , de . . . . .
--------------------------------------------------

Example 3:
SRC (English): Chantal Rouleau was one of the first women in Montreal to raise the alarm.
TRG (French):  Chantal Rouleau a √©t√© l'une des premi√®res √©lues de Montr√©al √† tirer la sonnette d'alarme.
PRED

In [107]:
# NOTE: `random` is already imported in the first cell; this re-import is redundant.
import random

# Use the beam_search_decode_single function defined in an earlier cell
model.eval()
print(f"--- DIAGNOSIS: VISUAL INSPECTION ---")

# Translate 5 randomly chosen validation pairs (drawn with replacement) and
# print input, reference and model output for eyeballing.
for i in range(5):
    idx = random.randint(0, len(valid_subset)-1)
    pair = valid_subset[idx]['translation']
    src = pair['en']
    trg = pair['fr']
    
    pred = beam_search_decode_single(model, src, vocab_en, vocab_fr, beam_size=12, device=device)
    
    print(f"Input:  {src}")
    print(f"Target: {trg}")
    print(f"Output: {pred}")
    print("-" * 30)

--- DIAGNOSIS: VISUAL INSPECTION ---
Input:  Recently he took up the street organ and became St. Petersburg's music man, because he was ready for this complex role with all his Bohemian existence, philosophy and image.
Target: Il a r√©cemment pris dans ses mains un orgue de Barbarie et il est devenu le symbole de cet instrument √† Saint-P√©tersbourg puisqu'il avait assez m√ªri pour ce r√¥le difficile de par son existence, sa philosophie et son image de boh√©mien.
Output: , , , de
------------------------------
Input:  Store on a sofa
Target: Une boutique depuis son canap√©
Output: 
------------------------------
Input:  There is a connection with the fact that an infant spends about 9 months in amniotic fluid in the womb; it is easier to get used to water after that.
Target: Dans la mesure o√π le b√©b√© √©volue environ 9 mois dans le liquide lymphatique du ventre de sa m√®re, il lui est plus facile de s'habituer ensuite √† l'eau.
Output: , , ,
------------------------------
Input:  Acc

In [109]:
# Create a tiny subset of the TRAINING data (which the model has seen):
# the first 100 rows, which are also part of the 10k rows used for training.
train_debug_subset = dataset['train'].select(range(100))

# A near-zero BLEU even on seen data points to under-fitting rather than a
# train/validation generalisation gap.
print("--- DIAGNOSIS: TRAINING SET CHECK ---")
train_score = evaluate_bleu(
    train_debug_subset, 
    model, 
    vocab_en, 
    vocab_fr, 
    device, 
    beam_size=2
)
print(f"Training Set BLEU: {train_score:.2f}")

--- DIAGNOSIS: TRAINING SET CHECK ---
Starting BLEU Evaluation on 100 samples (Beam=2)...
Processed 50 sentences...
Processed 100 sentences...
Computing Score...
Training Set BLEU: 0.00


In [112]:
# --- REVISED HYPERPARAMETERS ---
# These re-bind the notebook-global names set by the earlier paper-sized configuration.
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_fr)
ENC_EMB_DIM = 256  # Reduced from 1000 to save memory/speed up debugging
DEC_EMB_DIM = 256
HID_DIM = 512      # Reduced from 1000 to save memory
N_LAYERS = 2       # Reduced from 4 to 2 (Easier to train on small data)
DROPOUT = 0.5      # Increased from the earlier 0.2
N_EPOCHS = 10      # Give it a bit more time
CLIP = 1           # Tighter gradient clipping
DEVICE = device  # Alias of the detected device; train_one_epoch below reads this name
# --- MEMORY FIX SETTINGS ---
BATCH_SIZE = 32
# NOTE: not referenced by the training code visible in this cell.
GRAD_ACCUMULATION_STEPS = 1 # Set to 1 for Adam to simplify debugging

# Re-initialize DataLoader (same reversed-source dataset and padding collate as before)
pad_idx = vocab_en.stoi["<PAD>"]
train_loader = DataLoader(
    WMT14Dataset(train_subset, vocab_en, vocab_fr),
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    num_workers=0, 
    collate_fn=MyCollate(pad_idx)
)

SEEDS = [1] # Let's train just ONE model first to verify it works

def train_one_epoch(model, iterator, optimizer, criterion, clip, device=None):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: callable as `model(src, trg)` returning logits of shape
            (trg_len, batch_size, vocab_size).
        iterator: sized iterable of (src, trg) batches.
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss taking (N, vocab_size) logits and (N,) targets.
        clip: maximum global gradient norm.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function no longer depends on a
            notebook-level `DEVICE` global.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    epoch_loss = 0

    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg)

        # Skip position 0 (the <SOS> row) and flatten time and batch for the loss.
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Clip the global gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# --- TRAINING LOOP ---
# One fresh model is trained per entry in SEEDS. `model`, `optimizer` and
# `criterion` are re-bound on every iteration, so after the loop they refer to
# the model trained with the LAST seed.
for seed in SEEDS:
    print(f"\n{'='*20}")
    print(f"DEBUG TRAINING (Seed: {seed})")
    print(f"{'='*20}")
    
    # 1. Set Seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available(): torch.mps.manual_seed(seed)
    
    # 2. Initialize Model (Smaller & Simpler for Debugging)
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,OUTPUT_DIM, N_LAYERS, DROPOUT)
    model = Seq2Seq(enc, dec).to(DEVICE)
    model.apply(init_weights)
    
    # 3. OPTIMIZER FIX: Use Adam instead of SGD
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Ensure we use the French padding index
    TRG_PAD_IDX = vocab_fr.stoi["<PAD>"]
    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
    
    # 4. Train; perplexity is reported as exp(mean batch loss)
    for epoch in range(N_EPOCHS):
        loss = train_one_epoch(model, train_loader, optimizer, criterion, CLIP)
        print(f"Epoch {epoch+1} | Loss: {loss:.3f} | PPL: {math.exp(loss):.2f}")
        
    # 5. Save
    # NOTE: the file name does not include the seed, so with several seeds each
    # run would overwrite the previous checkpoint.
    torch.save(model.state_dict(), f"debug_model.pt")


DEBUG TRAINING (Seed: 1)
Epoch 1 | Loss: 6.452 | PPL: 634.18
Epoch 2 | Loss: 5.982 | PPL: 396.34
Epoch 3 | Loss: 5.787 | PPL: 326.17
Epoch 4 | Loss: 5.632 | PPL: 279.35
Epoch 5 | Loss: 5.552 | PPL: 257.84
Epoch 6 | Loss: 5.462 | PPL: 235.59
Epoch 7 | Loss: 5.410 | PPL: 223.55
Epoch 8 | Loss: 5.341 | PPL: 208.75
Epoch 9 | Loss: 5.323 | PPL: 205.01
Epoch 10 | Loss: 5.280 | PPL: 196.37


In [22]:
# NOTE: `random` is already imported in the first cell; this re-import is redundant.
import random

# Use the beam_search_decode_single function defined in an earlier cell
model.eval()
print(f"--- DIAGNOSIS: VISUAL INSPECTION ---")

# Same visual check as before, on whichever `model` is currently bound in the
# notebook: translate 5 random validation pairs and print them.
for i in range(5):
    idx = random.randint(0, len(valid_subset)-1)
    pair = valid_subset[idx]['translation']
    src = pair['en']
    trg = pair['fr']
    
    pred = beam_search_decode_single(model, src, vocab_en, vocab_fr, beam_size=12, max_len=50, device=device)
    
    print(f"Input:  {src}")
    print(f"Target: {trg}")
    print(f"Output: {pred}")
    print("-" * 30)

--- DIAGNOSIS: VISUAL INSPECTION ---
Input:  According to critics, Walmart can afford to sell the products cheaply partly because it pays little to its employees.
Target: Selon les critiques, Walmart peut se permettre de vendre ses produits bon march√© notamment en raison des salaires bas de ses employ√©s.
Output: n' ne qui vous les des sein mais mis ce un lignes rapidement int√©gration aux
------------------------------
Input:  But 300 people will come for culture, not 10,000. In the end, there's less management, money, everything dies out.
Target: Mais pour le c√¥t√© culturel, 300 personnes viendront, 10¬†milles ne viendront pas et au bout du compte c'est moins de management, d'argent, tout d√©p√©rit.
Output: car , a ne qui - a bon m√™me √† cas -t d' r√®gles , et garantir soutien nouvelles autorit√© communaut√© .
------------------------------
Input:  The buyer pays at an ATM.
Target: L'acheteur effectue le paiement sur les bornes automatiques.
Output: n' ne qui - a bon m√™me √† cas 

In [23]:
# Sanity check: can a tiny model memorise 50 sentence pairs? If it cannot, the
# data pipeline or the model wiring is suspect rather than the hyperparameters.

# --- 1. Create Micro-Dataset (Only 50 items) ---
micro_subset = dataset['train'].select(range(50)) # First 50 sentences only

# Re-build vocab on JUST these 50 sentences to ensure no <UNK>s
# (freq_threshold=1 keeps every word, as long as fewer than max_size - 4 distinct words occur)
vocab_en_micro = Vocabulary(freq_threshold=1, max_size=1000)
vocab_en_micro.build_vocabulary([item['translation']['en'] for item in micro_subset], vocab_en_micro.tokenizer_eng)

vocab_fr_micro = Vocabulary(freq_threshold=1, max_size=1000)
vocab_fr_micro.build_vocabulary([item['translation']['fr'] for item in micro_subset], vocab_fr_micro.tokenizer_fr)

# Loaders
pad_idx_micro = vocab_en_micro.stoi["<PAD>"]
micro_loader = DataLoader(
    WMT14Dataset(micro_subset, vocab_en_micro, vocab_fr_micro),
    batch_size=10, # Small batch size
    shuffle=True, 
    collate_fn=MyCollate(pad_idx_micro)
)

# --- 2. Setup Model (Tiny Config) ---
# These re-bind the notebook-global hyperparameter names once more.
INPUT_DIM = len(vocab_en_micro)
OUTPUT_DIM = len(vocab_fr_micro)
ENC_EMB_DIM = 64  
DEC_EMB_DIM = 64
HID_DIM = 128     
N_LAYERS = 1      # 1 Layer is enough for 50 sentences
DROPOUT = 0.0     # NO DROPOUT (We WANT to overfit)

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM ,N_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters(), lr=0.005) # High LR to learn fast
criterion = nn.CrossEntropyLoss(ignore_index=vocab_fr_micro.stoi["<PAD>"])

# --- 3. Train for 100 Epochs (Force Memorization) ---
# NOTE: unlike the earlier training loops, this one applies no gradient clipping.
# model(src, trg) is called with Seq2Seq's default teacher_force_ratio of 0.5.
print("--- STARTING MICRO-TRAINING ---")
for epoch in range(100):
    model.train()
    epoch_loss = 0
    for src, trg in micro_loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        # Drop the <SOS> position and flatten time and batch for the loss
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    # Report the mean batch loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1} | Loss: {epoch_loss / len(micro_loader):.4f}")

# --- 4. Verify ---
print("\n--- VISUAL CHECK (Should be Perfect) ---")
# Pick the FIRST sentence (index 0)
pair = micro_subset[0]['translation']
src = pair['en']
trg = pair['fr']
pred = beam_search_decode_single(model, src, vocab_en_micro, vocab_fr_micro, beam_size=2, max_len=50, device=device)

print(f"Input: {src}")
print(f"Target: {trg}")
print(f"Pred:   {pred}")

--- STARTING MICRO-TRAINING ---
Epoch 10 | Loss: 4.9017
Epoch 20 | Loss: 4.4378
Epoch 30 | Loss: 4.0045
Epoch 40 | Loss: 3.5894
Epoch 50 | Loss: 3.0060
Epoch 60 | Loss: 2.3533
Epoch 70 | Loss: 1.4639
Epoch 80 | Loss: 1.0707
Epoch 90 | Loss: 0.6613
Epoch 100 | Loss: 0.5569

--- VISUAL CHECK (Should be Perfect) ---
Input: Resumption of the session
Target: Reprise de la session
Pred:   madame la session


In [None]:
print("--- CHECKING OTHER EXAMPLES ---")

# Let's check the first 5 items, not just item 0.
# Depends on `model`, `micro_subset` and the micro vocabularies from the
# micro-training cell still being bound in the kernel.
for i in range(5):
    pair = micro_subset[i]['translation']
    src = pair['en']
    trg = pair['fr']
    
    # Translate
    pred = beam_search_decode_single(
        model, 
        src, 
        vocab_en_micro, 
        vocab_fr_micro, 
        beam_size=1, # Try Greedy search (Beam 1) to see raw output
        device=device    )
    
    print(f"[{i}] Input:  {src}")
    print(f"    Target: {trg}")
    print(f"    Pred:   {pred}")
    print("-" * 20)

--- CHECKING OTHER EXAMPLES ---
[0] Input:  Resumption of the session
    Target: Reprise de la session
    Pred:   je vous avez parfaitement raison et je vais v√©rifier si tout cela n ' a effectivement pas √©t√© fait .
--------------------
[1] Input:  I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
    Target: Je d√©clare reprise la session du Parlement europ√©en qui avait √©t√© interrompue le vendredi 17 d√©cembre dernier et je vous renouvelle tous mes vux en esp√©rant que vous avez pass√© de bonnes vacances.
    Pred:   je vous demande donc √† nouveau de faire le n√©cessaire pour que nous puissions disposer d' une cha√Æne n√©erlandaise .
--------------------
[2] Input:  Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural di

In [None]:
# --- HYPERPARAMETERS ---
# Requires the micro-dataset cell to have been run first: it defines
# vocab_en_micro, vocab_fr_micro, micro_subset and pad_idx_micro.
INPUT_DIM = len(vocab_en_micro)
OUTPUT_DIM = len(vocab_fr_micro)
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 128
N_LAYERS = 1
DROPOUT = 0.0

# --- KEY CHANGE: BATCH SIZE 1 ---
# With one sentence per batch nothing needs padding, so the encoder reads only
# real tokens. This isolates padding as a possible cause of poor memorisation.
micro_loader_single = DataLoader(
    WMT14Dataset(micro_subset, vocab_en_micro, vocab_fr_micro),
    batch_size=1,
    shuffle=True,
    collate_fn=MyCollate(pad_idx_micro)
)

# Initialize
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
# The decoder's output layer must cover the TARGET (French) vocabulary. Sizing it
# with INPUT_DIM (the English vocabulary) would leave French ids without a logit
# whenever the French vocabulary is the larger one.
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec).to(DEVICE)
model.apply(init_weights)
optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss(ignore_index=vocab_fr_micro.stoi["<PAD>"])

print("--- TRAINING WITH BATCH SIZE 1 (NO PADDING) ---")
for epoch in range(50): # 50 Epochs should be enough for Batch Size 1
    model.train()
    epoch_loss = 0
    for src, trg in micro_loader_single:
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg)
        # Drop the <SOS> position and flatten time and batch for the loss
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean batch loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1} | Loss: {epoch_loss / len(micro_loader_single):.4f}")

# --- CHECK ---
print("\n--- VISUAL CHECK (BATCH 1) ---")
pair = micro_subset[0]['translation']
src = pair['en']
trg = pair['fr']
pred = beam_search_decode_single(model, src, vocab_en_micro, vocab_fr_micro, beam_size=2, device=DEVICE)

print(f"Input: {src}")
print(f"Target: {trg}")
print(f"Pred:   {pred}")

NameError: name 'vocab_en_micro' is not defined