In [15]:
# imports

import os
import sys
import time
import math

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from torch.utils.data import DataLoader

import spacy
from collections import Counter

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random

from torchmetrics.functional.text import bleu_score

In [16]:
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Device", DEVICE)

Device mps


In [17]:
# 1. Set Seeds for Reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check if MPS is available and set the seed
if torch.backends.mps.is_available():
    torch.mps.manual_seed(SEED)

# Check if CUDA is available (just in case you move this code to a server later)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

In [18]:
# Download and load the WMT14 French-English dataset
# This might take a few minutes as the dataset is large
dataset = load_dataset("wmt14", "fr-en")

# Print the dataset structure to verify
print(dataset)

# Example: Inspect the first training example
print("First training example:", dataset['train'][0])

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40836715
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})
First training example: {'translation': {'en': 'Resumption of the session', 'fr': 'Reprise de la session'}}


In [19]:
spacy_fr = spacy.load("fr_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

In [20]:
#First, we create a class to manage the mapping between words and IDs. This handles the "80k vocabulary" limit mentioned in the paper.

class Vocabulary:
    def __init__(self, freq_threshold=2, max_size=80000):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    @staticmethod
    def tokenizer_fr(text):
        return [tok.text.lower() for tok in spacy_fr.tokenizer(text)]

    def build_vocabulary(self, sentence_list, tokenizer):
        frequencies = Counter()
        idx = 4 # Start index after special tokens

        # 1. Count frequencies of all words
        for sentence in sentence_list:
            for word in tokenizer(sentence):
                frequencies[word] += 1

        # 2. Sort by frequency and keep top 'max_size' words
        # This matches the paper's strategy of capping vocab size
        common_words = frequencies.most_common(self.max_size - 4)

        # 3. Add valid words to our dictionary
        for word, count in common_words:
            if count >= self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1

    def numericalize(self, text, tokenizer):
        tokenized_text = tokenizer(text)
        
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [21]:
#Now we define a PyTorch Dataset that takes the raw Hugging Face data and converts it into numbers using the Vocabulary class above.

class WMT14Dataset_regular_order(Dataset):
    def __init__(self, hf_dataset, source_vocab, target_vocab):
        self.hf_dataset = hf_dataset
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        # Get the pair: {'fr': '...', 'en': '...'}
        pair = self.hf_dataset[index]['translation']
        src_text = pair['en']
        trg_text = pair['fr']

        # Convert text to indices
        # Add <SOS> at start and <EOS> at end
        numericalized_source = [self.source_vocab.stoi["<SOS>"]]
        numericalized_source += self.source_vocab.numericalize(src_text, self.source_vocab.tokenizer_eng)
        numericalized_source.append(self.source_vocab.stoi["<EOS>"])

        numericalized_target = [self.target_vocab.stoi["<SOS>"]]
        numericalized_target += self.target_vocab.numericalize(trg_text, self.target_vocab.tokenizer_fr)
        numericalized_target.append(self.target_vocab.stoi["<EOS>"])

        return torch.tensor(numericalized_source), torch.tensor(numericalized_target)

In [22]:
class WMT14Dataset(Dataset):
    def __init__(self, hf_dataset, source_vocab, target_vocab):
        self.hf_dataset = hf_dataset
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        pair = self.hf_dataset[index]['translation']
        src_text = pair['en']
        trg_text = pair['fr']

        # 1. Numericalize Source
        # Get list of numbers: [4, 10, 55] ("the", "cat", "sat")
        src_indices = self.source_vocab.numericalize(src_text, self.source_vocab.tokenizer_eng)
        
        # --- IMPLEMENTATION OF PAPER POINT #1: REVERSE INPUT ---
        # Reverse the list: [55, 10, 4] ("sat", "cat", "the")
        src_indices = src_indices[::-1] 
        
        # Add special tokens
        numericalized_source = [self.source_vocab.stoi["<SOS>"]]
        numericalized_source += src_indices
        numericalized_source.append(self.source_vocab.stoi["<EOS>"])

        # 2. Numericalize Target (Do NOT reverse this)
        trg_indices = self.target_vocab.numericalize(trg_text, self.target_vocab.tokenizer_fr)
        
        numericalized_target = [self.target_vocab.stoi["<SOS>"]]
        numericalized_target += trg_indices
        numericalized_target.append(self.target_vocab.stoi["<EOS>"])

        return torch.tensor(numericalized_source), torch.tensor(numericalized_target)

In [23]:
#Since sentences have different lengths, we cannot simply stack them into a matrix.
# We need a specific function (called collate_fn) to pad short sentences with zeros (the <PAD> token) so that every batch is rectangular.

class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        source = [item[0] for item in batch]
        target = [item[1] for item in batch]

        # Pad sequences to the max length in this batch
        source = pad_sequence(source, batch_first=False, padding_value=self.pad_idx)
        target = pad_sequence(target, batch_first=False, padding_value=self.pad_idx)

        return source, target

In [24]:
# 1. Load Data
# Select subsets: 10k for training, 1k for validation
train_subset = dataset['train'].select(range(10000))
valid_subset = dataset['validation'].select(range(1000))

print(f"Train Subset Size: {len(train_subset)}")
print(f"Valid Subset Size: {len(valid_subset)}")

# 2. Build Vocabulary
print("Building English Vocabulary...")
english_sentences = [item['translation']['en'] for item in train_subset]
vocab_en = Vocabulary(freq_threshold=1, max_size=80000) # Lowered freq_threshold for smaller dataset 10k
vocab_en.build_vocabulary(english_sentences, vocab_en.tokenizer_eng)


print("Building French Vocabulary...")
french_sentences = [item['translation']['fr'] for item in train_subset]
vocab_fr = Vocabulary(freq_threshold=1, max_size=80000) # Lowered freq_threshold for smaller dataset 10k
vocab_fr.build_vocabulary(french_sentences, vocab_fr.tokenizer_fr)

# 3. Create Dataset
train_subsetset = WMT14Dataset(train_subset, vocab_en, vocab_fr)
valid_subsetset = WMT14Dataset(valid_subset, vocab_en, vocab_fr)

# 4. Create DataLoaders
BATCH_SIZE = 32 # Reduced batch size for smaller dataset 10k
pad_idx = vocab_en.stoi["<PAD>"]

train_loader = DataLoader(
    dataset=train_subsetset,
    batch_size=BATCH_SIZE,
    num_workers=0,
    shuffle=True,
    collate_fn=MyCollate(pad_idx=pad_idx)
)

# 5. Test it
print("Testing the pipeline...")
for src_batch, trg_batch in train_loader:
    print(f"Source Shape: {src_batch.shape}") # Expect [Seq_Len, Batch_Size]
    print(f"Target Shape: {trg_batch.shape}")
    break

Train Subset Size: 10000
Valid Subset Size: 1000
Building English Vocabulary...
Building French Vocabulary...
Testing the pipeline...
Source Shape: torch.Size([80, 32])
Target Shape: torch.Size([86, 32])


In [25]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size) # dropiut after embedding corrupts the input word vectors to prevent reliance on specific features.
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p) # LSTM layer, dropout between layers prevents deeper
                                                                               #layers from co-adapting too strongly with shallower layers.

    def forward(self, x):
        # x shape : (seq_length, Batch_size) -> seq_length is the length of the input sentence, and we process the entire sequence at once
        embedding = self.dropout(self.embedding(x))
        # embedding shape : (seq_length, Batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape : (seq_length, Batch_size, hidden_size)
        # hidden shape : (num_layers, Batch_size, hidden_size)
        # cell shape : (num_layers, Batch_size, hidden_size)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc_out = nn.Linear(hidden_size, output_size) #output_size = input_size of the decoder = size of target vocabulary
        self.output_size = output_size
        
    def forward(self, x, hidden, cell):
        # x shape : (1, Batch_size)  -> we process one time step at a time
        x = x.unsqueeze(0) #that's why we added one dimention 
        # x shape : (1, Batch_size, 1)
        embedding = self.dropout(self.embedding(x))
        # embedding shape : (1, Batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape : (1, Batch_size, hidden_size)
        predictions = self.fc_out(outputs.squeeze(0)) #remove the time step dimension for the linear layer
        # predictions shape : (Batch_size, output_size)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.decoder.output_size

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # First input to the decoder is the <SOS> tokens
        x = target[0,:]  # shape: (Batch_size)

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            # Decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_force_ratio

            # Get the highest predicted token from our predictions
            best_guess = output.argmax(1) 

            # If teacher forcing, use actual next token as next input; if not, use predicted token
            x = target[t] if teacher_force else best_guess

        return outputs

In [26]:
# Helper function to initialize weights
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [28]:
# --- Architecture Specs from Paper ---
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_fr)
ENC_EMB_DIM = 1000  # Paper used 1000
DEC_EMB_DIM = 1000  # Paper used 1000
HID_DIM = 1000      # Paper used 1000
N_LAYERS = 4        # Paper used 4
DROPOUT = 0.2       # Paper implies some regularization, usually 0.2 is safe

# --- Setup ---
# Use the correct index for <PAD> from your English vocabulary
TRG_PAD_IDX = vocab_en.stoi["<PAD>"]

# Create Model
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

# Initialize Weights
model.apply(init_weights)

# --- Optimization Specs ---
BATCH_SIZE = 32    # Paper used 128
LEARNING_RATE = 0.0005 # Paper used fixed 0.7 initially
CLIP = 5            # Paper threshold for gradient norm
TOTAL_EPOCHS = 20 # Paper trained for 7.5 epochs
# Optimizer: SGD without momentum
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [31]:
# --- Training Loop ---
for epoch in range(TOTAL_EPOCHS):
    
    model.train()
    epoch_loss = 0
    
    # We need to know when we are "halfway" through
    num_batches = len(train_loader)
    halfway_point = num_batches // 2
    
    for i, (src, trg) in enumerate(train_loader):
        
        # Standard Training Step
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch: {epoch+1:02} | Loss: {epoch_loss / len(train_loader):.3f}')

# Save Model
save_path = f"One-LSTM.pt"
torch.save(model.state_dict(), save_path)
print(f"--> Saved {save_path}")

Epoch: 01 | Loss: 6.084
Epoch: 02 | Loss: 5.792
Epoch: 03 | Loss: 5.610
Epoch: 04 | Loss: 5.482
Epoch: 05 | Loss: 5.377
Epoch: 06 | Loss: 5.288
Epoch: 07 | Loss: 5.233
Epoch: 08 | Loss: 5.163
Epoch: 09 | Loss: 5.106
Epoch: 10 | Loss: 5.042
Epoch: 11 | Loss: 5.000
Epoch: 12 | Loss: 4.938
Epoch: 13 | Loss: 4.895
Epoch: 14 | Loss: 4.856
Epoch: 15 | Loss: 4.820
Epoch: 16 | Loss: 4.782
Epoch: 17 | Loss: 4.736
Epoch: 18 | Loss: 4.678
Epoch: 19 | Loss: 4.642
Epoch: 20 | Loss: 4.622
--> Saved One-LSTM.pt


In [32]:
def beam_search_decode_single(model, sentence, vocab_src, vocab_trg, beam_size=2, max_len=50, device='cpu'):
    model.eval()
    
    # 1. Prepare Source
    if isinstance(sentence, str):
        tokens = vocab_src.tokenizer_eng(sentence)
    else:
        tokens = [token.lower() for token in sentence]
        
    indices = [vocab_src.stoi.get(t, vocab_src.stoi["<UNK>"]) for t in tokens]
    indices = indices[::-1] # Reverse Input
    indices = [vocab_src.stoi["<SOS>"]] + indices + [vocab_src.stoi["<EOS>"]]
    
    src_tensor = torch.LongTensor(indices).unsqueeze(1).to(device)

    with torch.no_grad():
        encoder_hidden, encoder_cell = model.encoder(src_tensor)

        # Hypothesis: (Score, [Sequence], Hidden, Cell)
        hypotheses = [(0.0, [vocab_trg.stoi["<SOS>"]], encoder_hidden, encoder_cell)]

        for _ in range(max_len):
            all_candidates = []
            
            for score, seq, hidden, cell in hypotheses:
                if seq[-1] == vocab_trg.stoi["<EOS>"]:
                    all_candidates.append((score, seq, hidden, cell))
                    continue
                
                input_tensor = torch.LongTensor([seq[-1]]).to(device)
                
                # Predict
                prediction, new_h, new_c = model.decoder(input_tensor, hidden, cell)
                
                # prediction is [1, vocab_size]. We squeeze to make it [vocab_size]
                prediction = prediction.squeeze(0) 
                
                # Log Softmax over dimension 0 (the vocabulary)
                log_probs = F.log_softmax(prediction, dim=0)
                
                # Get Top K
                top_k_probs, top_k_ids = log_probs.topk(beam_size * 2)
                

                # Expand
                for i in range(len(top_k_ids)):
                    word_idx = top_k_ids[i].item()
                    prob = top_k_probs[i].item()
                    all_candidates.append((score + prob, seq + [word_idx], new_h, new_c))

            # Prune
            hypotheses = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]
            
            if all(h[1][-1] == vocab_trg.stoi["<EOS>"] for h in hypotheses):
                break

    best_seq = hypotheses[0][1]
    decoded_words = [vocab_trg.itos[idx] for idx in best_seq]
    
    if "<SOS>" in decoded_words: decoded_words.remove("<SOS>")
    if "<EOS>" in decoded_words: decoded_words = decoded_words[:decoded_words.index("<EOS>")]
        
    return " ".join(decoded_words)

In [45]:

def evaluate_bleu(data_subset, model, vocab_src, vocab_trg, device, beam_size=2):
    targets = []      # Ground Truths
    predictions = []  # Model Outputs
    
    print(f"Starting BLEU Evaluation on {len(data_subset)} samples (Beam={beam_size})...")
    
    for i, datum in enumerate(data_subset):
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1} sentences...")

        pair = datum['translation']
        src_text = pair['en']
        trg_text = pair['fr']
        
        # 1. Get Prediction
        pred_sentence = beam_search_decode_single(
            model,          
            src_text,       
            vocab_src, 
            vocab_trg, 
            beam_size, 
            max_len=50,
            device=device
        )
        
        # 2. Collect Data
        # Targets must be a list of lists: [['ref_sentence']]
        targets.append([trg_text])
        predictions.append(pred_sentence)

    # 3. Compute Score
    print("Computing Score...")
    score = bleu_score(predictions, targets, n_gram=1)
    return score.item() * 100

In [46]:
# Run this after training
final_score = evaluate_bleu(
    valid_subset,
    model,             
    vocab_en, 
    vocab_fr, 
    DEVICE,            
    beam_size=2
)

print(f"Final Single-Model BLEU: {final_score:.2f}")

Starting BLEU Evaluation on 1000 samples (Beam=2)...
Processed 50 sentences...
Processed 100 sentences...
Processed 150 sentences...
Processed 200 sentences...
Processed 250 sentences...
Processed 300 sentences...
Processed 350 sentences...
Processed 400 sentences...
Processed 450 sentences...
Processed 500 sentences...
Processed 550 sentences...
Processed 600 sentences...
Processed 650 sentences...
Processed 700 sentences...
Processed 750 sentences...
Processed 800 sentences...
Processed 850 sentences...
Processed 900 sentences...
Processed 950 sentences...
Processed 1000 sentences...
Computing Score...
Final Single-Model BLEU: 8.16


In [36]:
# Create a tiny subset of the TRAINING data (which the model has seen)
train_debug_subset = dataset['train'].select(range(100))

print("--- DIAGNOSIS: TRAINING SET CHECK ---")
train_score = evaluate_bleu(
    train_debug_subset, 
    model, 
    vocab_en, 
    vocab_fr, 
    DEVICE, 
    beam_size=2
)
print(f"Training Set BLEU: {train_score:.2f}")

--- DIAGNOSIS: TRAINING SET CHECK ---
Starting BLEU Evaluation on 100 samples (Beam=2)...
Processed 50 sentences...
Processed 100 sentences...
Computing Score...
Training Set BLEU: 0.00


In [41]:
model.eval()
print(f"--- DIAGNOSIS: VISUAL INSPECTION ---")

for i in range(5):
    idx = random.randint(0, len(valid_subset)-1)
    pair = valid_subset[idx]['translation']
    src = pair['en']
    trg = pair['fr']
    
    pred = beam_search_decode_single(model, src, vocab_en, vocab_fr, beam_size=12, device=DEVICE)
    
    print(f"Input:  {src}")
    print(f"Target: {trg}")
    print(f"Output: {pred}")
    print("-" * 30)

--- DIAGNOSIS: VISUAL INSPECTION ---
Input:  The buyer pays at an ATM.
Target: L'acheteur effectue le paiement sur les bornes automatiques.
Output: il ' est la question de
------------------------------
Input:  The most tragic section is the children's memorial, built in memory of 1.5 million children killed in concentration camps and gas chambers.
Target: La partie la plus tragique est le mémorial des enfants, construit à la mémoire des 1,5 million d'enfants exterminés dans les camps de concentration et les chambres à gaz.
Output: il ' est la la de la la de de la de de de , de , de , , , de de de et de de .
------------------------------
Input:  To force it to think that it feels something that it should be feeling when it sees something?
Target: Lui faire croire qu'il ressent ce qu'il devrait normalement ressentir au moment où il voit quelque chose?
Output: il ' est , une de de la , de la , de la de de la de .
------------------------------
Input:  Nor will trials of civilians will b