In [44]:
# imports

import os
import sys
import time

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from torch.utils.data import DataLoader

import spacy
from collections import Counter

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random



In [45]:
# Pick the fastest available backend: CUDA (typical on servers) first,
# then Apple-silicon MPS, and finally plain CPU as the universal fallback.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Device", device)

Device mps


In [46]:
# Reproducibility: every random number generator is driven by one shared seed.
SEED = 1234

# Seed PyTorch, NumPy and Python's built-in `random` module.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# CUDA backend (only present when the notebook runs on an NVIDIA machine):
# also ask cuDNN to use deterministic kernels.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(SEED)

# Apple-silicon MPS backend.
if torch.backends.mps.is_available():
    torch.mps.manual_seed(SEED)

In [47]:
# Fetch the WMT14 French-English corpus through Hugging Face `datasets`.
# The corpus is large, so this cell can take a few minutes.
dataset = load_dataset("wmt14", "fr-en")

# Show the available splits and their sizes as a sanity check.
print(dataset)

# Peek at the very first training pair.
print(f"First training example: {dataset['train'][0]}")

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40836715
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})
First training example: {'translation': {'en': 'Resumption of the session', 'fr': 'Reprise de la session'}}


In [48]:
# Small spaCy pipelines; their tokenizers are used to split French and English text.
spacy_fr, spacy_eng = spacy.load("fr_core_news_sm"), spacy.load("en_core_web_sm")

In [49]:
#First, we create a class to manage the mapping between words and IDs. This handles the "80k vocabulary" limit mentioned in the paper.

class Vocabulary:
    """Two-way mapping between tokens and integer ids with a capped size.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; tokens learned from data are numbered from 4 up.
    """

    def __init__(self, freq_threshold=2, max_size=80000):
        """Create a vocabulary holding only the special tokens.

        Args:
            freq_threshold: minimum number of occurrences a token needs to be kept.
            max_size: upper bound on the vocabulary size, special tokens included.
        """
        specials = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.itos = dict(enumerate(specials))
        self.stoi = {token: index for index, token in enumerate(specials)}
        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` with the module-level ``spacy_eng`` tokenizer and lower-case each token."""
        return [token.text.lower() for token in spacy_eng.tokenizer(text)]

    @staticmethod
    def tokenizer_fr(text):
        """Split ``text`` with the module-level ``spacy_fr`` tokenizer and lower-case each token."""
        return [token.text.lower() for token in spacy_fr.tokenizer(text)]

    def build_vocabulary(self, sentence_list, tokenizer):
        """Populate the mapping from the most frequent tokens of ``sentence_list``.

        Args:
            sentence_list: iterable of raw sentences.
            tokenizer: callable turning one sentence into a list of tokens.
        """
        # Tally every token over the whole corpus.
        counts = Counter()
        for sentence in sentence_list:
            counts.update(tokenizer(sentence))

        # Keep at most `max_size - 4` candidates (4 slots belong to the special
        # tokens), then drop the ones that are rarer than the threshold.
        next_index = 4
        for token, count in counts.most_common(self.max_size - 4):
            if count < self.freq_threshold:
                continue
            self.stoi[token] = next_index
            self.itos[next_index] = token
            next_index += 1

    def numericalize(self, text, tokenizer):
        """Return the list of ids for ``text``; unknown tokens map to ``<UNK>``."""
        unk_index = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_index) for token in tokenizer(text)]

In [50]:
#Now we define a PyTorch Dataset that takes the raw Hugging Face data and converts it into numbers using the Vocabulary class above.

class WMT14Dataset(Dataset):
    """Indexable dataset turning raw translation pairs into id tensors.

    The English side is the source and the French side is the target; both
    sequences are wrapped in ``<SOS>`` ... ``<EOS>``.
    """

    def __init__(self, hf_dataset, source_vocab, target_vocab):
        """
        Args:
            hf_dataset: indexable collection whose items look like
                ``{'translation': {'en': ..., 'fr': ...}}``.
            source_vocab: vocabulary used for the English text.
            target_vocab: vocabulary used for the French text.
        """
        self.hf_dataset = hf_dataset
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.hf_dataset)

    @staticmethod
    def _encode(text, vocab, tokenizer):
        """Return ``<SOS> ids(text) <EOS>`` as a 1-D tensor."""
        token_ids = vocab.numericalize(text, tokenizer)
        return torch.tensor([vocab.stoi["<SOS>"], *token_ids, vocab.stoi["<EOS>"]])

    def __getitem__(self, index):
        """Return the ``(source_tensor, target_tensor)`` pair stored at ``index``."""
        pair = self.hf_dataset[index]["translation"]
        src_text, trg_text = pair["en"], pair["fr"]

        source = self._encode(src_text, self.source_vocab, self.source_vocab.tokenizer_eng)
        target = self._encode(trg_text, self.target_vocab, self.target_vocab.tokenizer_fr)
        return source, target

In [51]:
#Since sentences have different lengths, we cannot simply stack them into a matrix.
# We need a specific function (called collate_fn) to pad short sentences with zeros (the <PAD> token) so that every batch is rectangular.

class MyCollate:
    """Collate callable that pads variable-length (source, target) pairs into batch tensors."""

    def __init__(self, pad_idx):
        """
        Args:
            pad_idx: id written into the padded positions.
        """
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Stack 1-D tensors into one ``(max_len, batch)`` tensor, filling with ``pad_idx``."""
        return pad_sequence(sequences, batch_first=False, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Return ``(source, target)``, each padded to the longest sequence in ``batch``."""
        sources = [pair[0] for pair in batch]
        targets = [pair[1] for pair in batch]
        return self._pad(sources), self._pad(targets)

In [52]:
# --- Data ---------------------------------------------------------------
# Work on a small slice of the corpus: the first 10k training pairs and
# the first 1k validation pairs.
train_subset = dataset["train"].select(range(10000))
valid_subset = dataset["validation"].select(range(1000))

print(f"Train Subset Size: {len(train_subset)}")
print(f"Valid Subset Size: {len(valid_subset)}")

# --- Vocabularies ---------------------------------------------------------
# Both vocabularies are built from the training slice only.
# freq_threshold is lowered to 1 because the 10k-sentence slice is small.
print("Building English Vocabulary...")
english_sentences = [row["translation"]["en"] for row in train_subset]
vocab_en = Vocabulary(freq_threshold=1, max_size=80000)
vocab_en.build_vocabulary(english_sentences, vocab_en.tokenizer_eng)

print("Building French Vocabulary...")
french_sentences = [row["translation"]["fr"] for row in train_subset]
vocab_fr = Vocabulary(freq_threshold=1, max_size=80000)
vocab_fr.build_vocabulary(french_sentences, vocab_fr.tokenizer_fr)

# --- Datasets -------------------------------------------------------------
# English is the source language, French the target.
train_subsetset = WMT14Dataset(train_subset, vocab_en, vocab_fr)
valid_subsetset = WMT14Dataset(valid_subset, vocab_en, vocab_fr)

# --- DataLoader -----------------------------------------------------------
BATCH_SIZE = 32  # reduced batch size for the 10k-sentence slice
pad_idx = vocab_en.stoi["<PAD>"]

train_loader = DataLoader(
    train_subsetset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=MyCollate(pad_idx=pad_idx),
)

# --- Smoke test -----------------------------------------------------------
# Pull a single batch and check that both tensors are [Seq_Len, Batch_Size].
print("Testing the pipeline...")
src_batch, trg_batch = next(iter(train_loader))
print(f"Source Shape: {src_batch.shape}")
print(f"Target Shape: {trg_batch.shape}")

Train Subset Size: 10000
Valid Subset Size: 1000
Building English Vocabulary...
Building French Vocabulary...
Testing the pipeline...
Source Shape: torch.Size([80, 32])
Target Shape: torch.Size([86, 32])


In [None]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that compresses a source sequence into its final LSTM state."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        """
        Args:
            input_size: number of rows in the embedding table (source vocabulary size).
            embedding_size: dimensionality of each token embedding.
            hidden_size: LSTM hidden/cell state size.
            num_layers: number of stacked LSTM layers.
            p: dropout probability, used after the embedding and between LSTM layers.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Dropout after the embedding corrupts the input word vectors to
        # prevent reliance on specific features.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        # Dropout between LSTM layers keeps deeper layers from co-adapting
        # too strongly with shallower ones.
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode a whole batch of source sequences in one pass.

        Args:
            x: token ids, shape (seq_length, batch_size).

        Returns:
            Tuple ``(hidden, cell)``, each of shape (num_layers, batch_size, hidden_size).
        """
        embedded = self.embedding(x)         # (seq_length, batch_size, embedding_size)
        embedded = self.dropout(embedded)
        # The per-step outputs are not needed; only the final state is passed on.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell


class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call."""

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        """
        Args:
            input_size: number of rows in the embedding table (target vocabulary size).
            embedding_size: dimensionality of each token embedding.
            hidden_size: LSTM hidden/cell state size.
            output_size: number of scores produced per step (target vocabulary size).
            num_layers: number of stacked LSTM layers.
            p: dropout probability, used after the embedding and between LSTM layers.
        """
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Stored so that callers (e.g. Seq2Seq.forward) can size their output
        # buffer via `decoder.output_size`; it was previously never set.
        self.output_size = output_size
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: current input token ids, shape (batch_size,).
            hidden: LSTM hidden state, shape (num_layers, batch_size, hidden_size).
            cell: LSTM cell state, shape (num_layers, batch_size, hidden_size).

        Returns:
            Tuple ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (batch_size, output_size) and the states keep their input shapes.
        """
        # The LSTM expects a leading time dimension; we feed exactly one step.
        x = x.unsqueeze(0)
        # x shape : (1, batch_size)
        embedding = self.dropout(self.embedding(x))
        # embedding shape : (1, batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape : (1, batch_size, hidden_size)
        # Drop the time dimension again before the linear projection.
        predictions = self.fc_out(outputs.squeeze(0))
        # predictions shape : (batch_size, output_size)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder):
        """
        Args:
            encoder: module mapping a source batch to ``(hidden, cell)``.
            decoder: module mapping ``(tokens, hidden, cell)`` to
                ``(predictions, hidden, cell)`` for one time step.
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Produce per-step vocabulary scores for ``target``.

        Args:
            source: source token ids, shape (source_len, batch_size).
            target: target token ids, shape (target_len, batch_size); row 0 is
                used as the first decoder input (the ``<SOS>`` tokens).
            teacher_force_ratio: probability, drawn independently at every step,
                of feeding the ground-truth token instead of the model's own guess.

        Returns:
            Tensor of shape (target_len, batch_size, target_vocab_size).
            Row 0 is left as zeros because no prediction is made for position 0.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Prefer an explicit `output_size` attribute on the decoder; otherwise
        # read the width of its final linear layer.
        target_vocab_size = getattr(self.decoder, "output_size", None)
        if target_vocab_size is None:
            target_vocab_size = self.decoder.fc_out.out_features

        # Allocate on the same device as the input rather than on a global.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # First input to the decoder is the <SOS> tokens
        x = target[0, :]  # shape: (batch_size,)

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            # Decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_force_ratio

            # Get the highest predicted token from our predictions
            best_guess = output.argmax(1)

            # If teacher forcing, use actual next token as next input; if not, use predicted token
            x = target[t] if teacher_force else best_guess

        return outputs