In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
import math
from collections import Counter
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from torchmetrics.functional.text import bleu_score
import spacy

# Load Spacy
# Module-level spaCy pipelines. In the code visible in this notebook only their
# `.tokenizer` attribute is used, by Vocabulary.tokenizer_eng (English) and
# Vocabulary.tokenizer_fr (French).
# NOTE(review): presumably both model packages must already be installed in the
# kernel's environment for spacy.load to succeed — confirm before a fresh run.
spacy_eng = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")

# --- 1. VOCABULARY CLASS ---
class Vocabulary:
    """Token <-> index lookup tables with four reserved special tokens.

    Indices 0-3 are always ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    corpus words are assigned indices from 4 upwards by ``build_vocabulary``.

    Args:
        freq_threshold: Minimum corpus count a word needs to be kept.
        max_size: Upper bound on the vocabulary size, special tokens included.
    """

    def __init__(self, freq_threshold=2, max_size=80000):
        specials = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.itos = dict(enumerate(specials))
        self.stoi = {token: position for position, token in enumerate(specials)}
        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Lower-cased English tokens produced by the module-level spaCy tokenizer."""
        return [piece.text.lower() for piece in spacy_eng.tokenizer(text)]

    @staticmethod
    def tokenizer_fr(text):
        """Lower-cased French tokens produced by the module-level spaCy tokenizer."""
        return [piece.text.lower() for piece in spacy_fr.tokenizer(text)]

    def build_vocabulary(self, sentence_list, tokenizer):
        """Count tokens over ``sentence_list`` and register the frequent ones.

        Only the ``max_size - 4`` most common tokens are considered, and of
        those only the ones seen at least ``freq_threshold`` times are added.
        Indices are handed out in descending-frequency order starting at 4.

        Args:
            sentence_list: Iterable of raw sentences.
            tokenizer: Callable mapping one sentence to an iterable of tokens.
        """
        frequencies = Counter(
            word
            for sentence in sentence_list
            for word in tokenizer(sentence)
        )
        # Four slots are already taken by the special tokens.
        candidates = frequencies.most_common(self.max_size - 4)
        kept_words = [word for word, count in candidates if count >= self.freq_threshold]
        for idx, word in enumerate(kept_words, start=4):
            self.stoi[word] = idx
            self.itos[idx] = word

    def numericalize(self, text, tokenizer):
        """Convert ``text`` to a list of indices, using ``<UNK>`` for unseen tokens."""
        indices = []
        for token in tokenizer(text):
            key = token if token in self.stoi else "<UNK>"
            indices.append(self.stoi[key])
        return indices

# --- 2. DATASET CLASS (WITH REVERSING) ---
class WMT14Dataset(torch.utils.data.Dataset):
    """Wraps a translation dataset and yields (source, target) index tensors.

    Each item of ``hf_dataset`` is read as ``item['translation']`` with the
    keys ``'en'`` (source) and ``'fr'`` (target). The source token indices are
    reversed before the ``<SOS>``/``<EOS>`` markers are attached; the target
    keeps its natural order.

    Args:
        hf_dataset: Indexable collection supporting ``len()``.
        source_vocab: Vocabulary used for the English side.
        target_vocab: Vocabulary used for the French side.
    """

    def __init__(self, hf_dataset, source_vocab, target_vocab):
        self.hf_dataset = hf_dataset
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        """Return ``(source_tensor, target_tensor)`` for the pair at ``index``."""
        translation = self.hf_dataset[index]['translation']
        english, french = translation['en'], translation['fr']
        src_vocab, trg_vocab = self.source_vocab, self.target_vocab

        # Source side: word order is flipped, then wrapped in <SOS> ... <EOS>.
        source_ids = src_vocab.numericalize(english, src_vocab.tokenizer_eng)
        source_ids.reverse()
        source_ids.insert(0, src_vocab.stoi["<SOS>"])
        source_ids.append(src_vocab.stoi["<EOS>"])

        # Target side: natural order, wrapped in <SOS> ... <EOS>.
        target_ids = [trg_vocab.stoi["<SOS>"]]
        target_ids.extend(trg_vocab.numericalize(french, trg_vocab.tokenizer_fr))
        target_ids.append(trg_vocab.stoi["<EOS>"])

        return torch.tensor(source_ids), torch.tensor(target_ids)

class MyCollate:
    """Collate function that pads a batch of (source, target) tensor pairs.

    Args:
        pad_idx: Value used to fill positions past the end of shorter sequences.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def _pad_column(self, batch, column):
        # Gather one side of every pair and pad it to the longest sequence;
        # batch_first=False puts the sequence dimension first.
        sequences = [example[column] for example in batch]
        return pad_sequence(sequences, batch_first=False, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Return ``(source, target)`` padded tensors for ``batch``."""
        padded_source = self._pad_column(batch, 0)
        padded_target = self._pad_column(batch, 1)
        return padded_source, padded_target

# --- 3. MODEL ARCHITECTURE ---
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that summarises a source batch into its final state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding width.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            the LSTM's own ``dropout`` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` pair.

        The per-step LSTM outputs are discarded; only the last state is used.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.lstm(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary (width of the output scores).
        emb_dim: Embedding width.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            the LSTM's own ``dropout`` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Token indices for the current step, one per batch element.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            unnormalised vocabulary scores and the states are the updated ones.
        """
        # Add a leading length-1 sequence dimension for the LSTM.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        lstm_out, next_state = self.lstm(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # Drop the length-1 sequence dimension again before projecting.
        prediction = self.fc_out(lstm_out.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch.
        decoder: Module exposing ``output_dim`` and returning
            ``(prediction, hidden, cell)`` for one decoding step.
        device: Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce vocabulary scores for every target position.

        Args:
            src: Source indices; the batch size is read from dimension 1.
            trg: Target indices; the sequence length is read from dimension 0.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                decoder's own argmax.

        Returns:
            Tensor of shape ``(trg_len, batch_size, decoder.output_dim)``.
            Position 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)

        hidden, cell = self.encoder(src)

        # The first decoder input is the first target row (the <SOS> tokens).
        decoder_input = trg[0, :]
        for step in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[step] = step_scores
            greedy_tokens = step_scores.argmax(1)
            use_ground_truth = random.random() < teacher_forcing_ratio
            decoder_input = trg[step] if use_ground_truth else greedy_tokens
        return outputs

def init_weights(m):
    """Re-initialise every parameter reachable from ``m`` with U(-0.08, 0.08).

    All parameters of ``m`` and of its sub-modules are overwritten in place.
    When this function is passed to ``Module.apply`` it is invoked once per
    sub-module, so parameters of nested modules are re-drawn on each visit;
    the final values are still uniform in the same range.
    """
    lower, upper = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, lower, upper)

In [2]:
# --- CONFIGURATION ---
BATCH_SIZE = 32
SUBSET_SIZE = 10000

# Device preference: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
print(f"Using Device: {DEVICE}")

# 1. Load Data
# Only the first SUBSET_SIZE training pairs and the first 1000 validation
# pairs are kept.
dataset = load_dataset("wmt14", "fr-en")
train_subset = dataset['train'].select(range(SUBSET_SIZE))
valid_subset = dataset['validation'].select(range(1000))

# 2. Build Vocab
# freq_threshold=1 keeps every token that occurs in the training subset
# (up to max_size entries).
print("Building Vocabularies...")
vocab_en = Vocabulary(freq_threshold=1, max_size=80000)
vocab_en.build_vocabulary(
    [example['translation']['en'] for example in train_subset],
    vocab_en.tokenizer_eng,
)

vocab_fr = Vocabulary(freq_threshold=1, max_size=80000)
vocab_fr.build_vocabulary(
    [example['translation']['fr'] for example in train_subset],
    vocab_fr.tokenizer_fr,
)

print(f"Unique English Words: {len(vocab_en)}")
print(f"Unique French Words: {len(vocab_fr)}")

# 3. DataLoaders
# <PAD> has the same index in both vocabularies, so one pad_idx serves the
# source and the target side.
pad_idx = vocab_en.stoi["<PAD>"]
train_loader = DataLoader(
    dataset=WMT14Dataset(train_subset, vocab_en, vocab_fr),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=MyCollate(pad_idx),
)

Using Device: mps


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Building Vocabularies...
Unique English Words: 10731
Unique French Words: 14472


In [3]:
# --- HYPERPARAMETERS ---
# Vocabulary sizes: INPUT_DIM sizes the encoder's embedding table, OUTPUT_DIM
# sizes the decoder's embedding table and its output projection.
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_fr)
# Embedding widths for the encoder and the decoder.
ENC_EMB_DIM = 1000
DEC_EMB_DIM = 1000
# LSTM hidden size and number of stacked layers, shared by encoder and decoder.
HID_DIM = 1000
N_LAYERS = 4
# Dropout probability passed to both the embedding dropout and the LSTMs.
DROPOUT = 0.2
# Number of passes over train_loader for each model.
N_EPOCHS = 8 
# Value handed to clip_grad_norm_ in train_one_epoch.
CLIP = 5

# One model is trained and saved per seed; the ensemble cell reloads one
# checkpoint per entry.
SEEDS = [1, 2, 3, 4, 5]

def train_one_epoch(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Callable as ``model(src, trg)``, returning per-position scores
            whose last dimension is the vocabulary size.
        iterator: Sized iterable yielding ``(src, trg)`` batches.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: Loss taking flattened scores and flattened target indices.
        clip: Value passed to ``clip_grad_norm_`` after each backward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for src, trg in iterator:
        src = src.to(DEVICE)
        trg = trg.to(DEVICE)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Position 0 is skipped on both sides: it corresponds to <SOS> in the
        # targets and is never written by the model.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

# --- TRAINING LOOP: ONE MODEL PER ENTRY IN SEEDS ---
# The progress counter and the final summary are derived from SEEDS, so the
# messages stay correct if the seed list is changed (previously "/5" and the
# seed value itself were used as the model index).
for model_num, seed in enumerate(SEEDS, start=1):
    print(f"\n{'='*20}")
    print(f"TRAINING MODEL {model_num}/{len(SEEDS)} (Seed: {seed})")
    print(f"{'='*20}")

    # 1. Set Seed
    # Seed every RNG in use: Python's `random` drives the teacher-forcing
    # draw in Seq2Seq.forward; torch drives weight init and loader shuffling.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available(): torch.mps.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed(seed)

    # 2. Initialize Fresh Model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
    model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)
    model.apply(init_weights) # Paper initialization

    # 3. Optimizer & Criterion
    # Padding positions are excluded from the loss.
    optimizer = optim.SGD(model.parameters(), lr=0.7)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    # 4. Train
    for epoch in range(N_EPOCHS):
        # Simplified schedule: the LR is halved at the start of every epoch
        # from the 6th onwards (0-based epoch >= 5), i.e. 0.7 for epochs 1-5,
        # then 0.35, 0.175, ...
        if epoch >= 5:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.5

        loss = train_one_epoch(model, train_loader, optimizer, criterion, CLIP)
        print(f"Seed {seed} | Epoch {epoch+1} | Loss: {loss:.3f} | PPL: {math.exp(loss):.2f}")

    # 5. Save Model
    # The file name is keyed by seed; the ensemble cell relies on this pattern.
    save_path = f"model_seed_{seed}.pt"
    torch.save(model.state_dict(), save_path)
    print(f"--> Saved {save_path}")

print(f"\nAll {len(SEEDS)} models trained and saved successfully.")


TRAINING MODEL 1/5 (Seed: 1)
Seed 1 | Epoch 1 | Loss: 6.703 | PPL: 814.91
Seed 1 | Epoch 2 | Loss: 6.408 | PPL: 606.40
Seed 1 | Epoch 3 | Loss: 6.315 | PPL: 553.00
Seed 1 | Epoch 4 | Loss: 6.272 | PPL: 529.49
Seed 1 | Epoch 5 | Loss: 6.253 | PPL: 519.52
Seed 1 | Epoch 6 | Loss: 6.212 | PPL: 498.53
Seed 1 | Epoch 7 | Loss: 6.197 | PPL: 491.13
Seed 1 | Epoch 8 | Loss: 6.190 | PPL: 488.07
--> Saved model_seed_1.pt

TRAINING MODEL 2/5 (Seed: 2)
Seed 2 | Epoch 1 | Loss: 6.709 | PPL: 819.88
Seed 2 | Epoch 2 | Loss: 6.402 | PPL: 603.22
Seed 2 | Epoch 3 | Loss: 6.310 | PPL: 549.95
Seed 2 | Epoch 4 | Loss: 6.273 | PPL: 530.30
Seed 2 | Epoch 5 | Loss: 6.253 | PPL: 519.66
Seed 2 | Epoch 6 | Loss: 6.214 | PPL: 499.47
Seed 2 | Epoch 7 | Loss: 6.201 | PPL: 493.25
Seed 2 | Epoch 8 | Loss: 6.193 | PPL: 489.09
--> Saved model_seed_2.pt

TRAINING MODEL 3/5 (Seed: 3)
Seed 3 | Epoch 1 | Loss: 6.712 | PPL: 821.81
Seed 3 | Epoch 2 | Loss: 6.391 | PPL: 596.40
Seed 3 | Epoch 3 | Loss: 6.297 | PPL: 542.87
See

In [7]:
def beam_search_decode_ensemble(models_list, sentence, vocab_src, vocab_trg, beam_size=2, max_len=50, device='cpu'):
    """Translate one sentence with beam search over an ensemble of models.

    At every step each model's decoder output is turned into log-probabilities
    and the log-probabilities are averaged across models. A hypothesis' score
    is the sum of these averaged values; no length normalisation is applied.
    Hypotheses that already end in ``<EOS>`` are carried forward unchanged.

    Args:
        models_list: Non-empty sequence of models exposing ``eval()``,
            ``encoder(src)`` -> ``(hidden, cell)`` and
            ``decoder(token, hidden, cell)`` -> ``(scores, hidden, cell)``.
            All models are switched to eval mode and left in it.
        sentence: Raw source string (tokenized with ``vocab_src.tokenizer_eng``)
            or an iterable of tokens (each one is lower-cased).
        vocab_src: Source vocabulary providing ``stoi`` and ``tokenizer_eng``.
        vocab_trg: Target vocabulary providing ``stoi`` and ``itos``.
        beam_size: Number of hypotheses kept after every step.
        max_len: Maximum number of decoding steps.
        device: Device on which the input tensors are created.

    Returns:
        The best hypothesis as a space-joined string, without ``<SOS>`` and
        cut at the first ``<EOS>``.

    Raises:
        ValueError: If ``models_list`` is empty.
    """
    if not models_list:
        raise ValueError("models_list must contain at least one model")

    # Ensure models are in eval mode
    for m in models_list:
        m.eval()

    # Loop-invariant special-token ids.
    sos_id = vocab_trg.stoi["<SOS>"]
    eos_id = vocab_trg.stoi["<EOS>"]

    # Prepare Input
    if isinstance(sentence, str):
        tokens = vocab_src.tokenizer_eng(sentence)
    else:
        tokens = [token.lower() for token in sentence]

    unk_src = vocab_src.stoi["<UNK>"]
    indices = [vocab_src.stoi.get(t, unk_src) for t in tokens]
    # The source is reversed to match how WMT14Dataset feeds training data.
    indices = [vocab_src.stoi["<SOS>"]] + indices[::-1] + [vocab_src.stoi["<EOS>"]]
    src_tensor = torch.LongTensor(indices).unsqueeze(1).to(device)

    with torch.no_grad():
        # One (hidden, cell) pair per model, in models_list order.
        initial_states = []
        for m in models_list:
            h, c = m.encoder(src_tensor)
            initial_states.append((h, c))

        # Each hypothesis is (score, token ids so far, per-model decoder states).
        hypotheses = [(0.0, [sos_id], initial_states)]

        for _ in range(max_len):
            all_candidates = []
            for score, seq, states_list in hypotheses:
                if seq[-1] == eos_id:
                    # Finished hypotheses compete with their final score.
                    all_candidates.append((score, seq, states_list))
                    continue

                last_token = torch.LongTensor([seq[-1]]).to(device)

                # Run all models and AVERAGE their log-probabilities.
                summed_log_probs = None
                new_states_list = []
                for m, (prev_h, prev_c) in zip(models_list, states_list):
                    pred, new_h, new_c = m.decoder(last_token, prev_h, prev_c)
                    new_states_list.append((new_h, new_c))

                    # Squeeze the batch dimension: [1, vocab_size] -> [vocab_size]
                    log_probs = F.log_softmax(pred.squeeze(0), dim=0)
                    if summed_log_probs is None:
                        summed_log_probs = log_probs
                    else:
                        # Out-of-place add: never mutate a model's own output.
                        summed_log_probs = summed_log_probs + log_probs

                avg_log_probs = summed_log_probs / len(models_list)

                # Expand with 2 * beam_size continuations, but never ask topk
                # for more entries than the vocabulary holds (it would raise).
                k = min(beam_size * 2, avg_log_probs.size(0))
                top_k_probs, top_k_ids = avg_log_probs.topk(k)

                # A single host transfer per tensor instead of one per element.
                for step_score, token_id in zip(top_k_probs.tolist(), top_k_ids.tolist()):
                    all_candidates.append((score + step_score, seq + [token_id], new_states_list))

            hypotheses = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]
            if all(h[1][-1] == eos_id for h in hypotheses):
                break

    best_seq = hypotheses[0][1]
    decoded_words = [vocab_trg.itos[idx] for idx in best_seq]
    if "<SOS>" in decoded_words: decoded_words.remove("<SOS>")
    if "<EOS>" in decoded_words: decoded_words = decoded_words[:decoded_words.index("<EOS>")]
    return " ".join(decoded_words)

# --- BLEU EVALUATOR ---
def evaluate_ensemble(data_subset, models_list, vocab_src, vocab_trg, device, beam_size):
    """Compute corpus BLEU (as a percentage) of the ensemble on ``data_subset``.

    Each item is read as ``item['translation']`` with ``'en'`` as the source
    and ``'fr'`` as the reference. Hypotheses come from
    ``beam_search_decode_ensemble`` and are therefore lower-cased tokens
    joined by single spaces. The reference is passed through the same target
    tokenizer so both sides share one tokenization; comparing against the raw
    reference text would leave punctuation attached to words and keep the
    original casing, making matching n-grams impossible in most sentences.

    Args:
        data_subset: Sized iterable of translation examples.
        models_list: Models forming the ensemble.
        vocab_src: Source vocabulary.
        vocab_trg: Target vocabulary; its ``tokenizer_fr`` is used on references.
        device: Device used for decoding.
        beam_size: Beam width used for decoding.

    Returns:
        BLEU-4 score multiplied by 100.
    """
    targets = []
    predictions = []
    print(f"Evaluating Ensemble on {len(data_subset)} samples...")

    for i, datum in enumerate(data_subset):
        if (i+1)%50 == 0: print(f"Processed {i+1}...")
        pair = datum['translation']
        pred = beam_search_decode_ensemble(
                models_list,
                pair['en'],
                vocab_src,
                vocab_trg,
                beam_size,
                max_len=50,
                device=device
            )
        # Tokenize and lower-case the reference exactly like the hypothesis.
        reference = " ".join(vocab_trg.tokenizer_fr(pair['fr']))
        # bleu_score expects a list of references per hypothesis.
        targets.append([reference])
        predictions.append(pred)

    return bleu_score(predictions, targets, n_gram=4).item() * 100

In [8]:
# 1. Load the Saved Models (one checkpoint per entry in SEEDS)
ensemble_models = []
print("Loading Ensemble Models...")

for seed in SEEDS:
    # Re-instantiate architecture
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
    m = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

    # Load weights
    # map_location places the tensors on the current DEVICE regardless of
    # where the checkpoint was saved; weights_only=True restricts unpickling
    # to tensors/plain containers (a state_dict needs nothing more) and
    # avoids the torch.load FutureWarning.
    m.load_state_dict(
        torch.load(f"model_seed_{seed}.pt", map_location=DEVICE, weights_only=True)
    )
    # Inference only from here on: disable dropout.
    m.eval()
    ensemble_models.append(m)
    print(f"Loaded model_seed_{seed}.pt")

# 2. Sanity Check
print("\n--- Manual Ensemble Check ---")
test_sen = "the cat sat on the mat"
print(f"Input: {test_sen}")
print(f"Output: {beam_search_decode_ensemble(ensemble_models, test_sen, vocab_en, vocab_fr, beam_size=2, device=DEVICE)}")

# 3. Final BLEU Score
final_score = evaluate_ensemble(valid_subset, ensemble_models, vocab_en, vocab_fr, DEVICE, beam_size=2)
print(f"\n=================================")
print(f"FINAL ENSEMBLE BLEU SCORE: {final_score:.2f}")
print(f"=================================")

Loading Ensemble Models...


  m.load_state_dict(torch.load(f"model_seed_{seed}.pt"))


Loaded model_seed_1.pt
Loaded model_seed_2.pt
Loaded model_seed_3.pt
Loaded model_seed_4.pt
Loaded model_seed_5.pt

--- Manual Ensemble Check ---
Input: the cat sat on the mat
Output: le de de
Evaluating Ensemble on 1000 samples...
Processed 50...
Processed 100...
Processed 150...
Processed 200...
Processed 250...
Processed 300...
Processed 350...
Processed 400...
Processed 450...
Processed 500...
Processed 550...
Processed 600...
Processed 650...
Processed 700...
Processed 750...
Processed 800...
Processed 850...
Processed 900...
Processed 950...
Processed 1000...

FINAL ENSEMBLE BLEU SCORE: 0.00
