# Welcome to Colab!

In [None]:
# ============================================================
# BLOCK 1: INSTALL REQUIRED LIBRARIES
# ============================================================
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import spacy
from collections import Counter
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Reproducibility: seed Python's `random` module and PyTorch with one fixed
# value so that random draws are repeatable between runs.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load spaCy: download the small English and German pipelines with notebook
# shell commands (the leading `!`), then load them by name.
# NOTE(review): the download output asks for a runtime restart before newly
# installed packages can be loaded; confirm the two loads below succeed on a
# fresh runtime.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencie

In [None]:
# ============================================================
# BLOCK 2: LOAD DATASET AND CREATE 80/20 SPLIT
# ============================================================
from datasets import load_dataset  # also imported in Block 1; repeated so this cell can run on its own

# 1. Load the Multi30k dataset (dataset id "bentrevett/multi30k")
dataset = load_dataset("bentrevett/multi30k")

# 2. Use ONLY the dataset's 'train' split (29,000 rows in the recorded run)
#    and divide it 80/20 into new train/test portions.
#    The fixed seed=42 makes the shuffle, and therefore the split, repeatable.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# 3. Name the two new portions; later cells read train_raw and test_raw
train_raw = split_dataset["train"]
test_raw = split_dataset["test"]

# 4. Print the row counts so the 80/20 split can be checked by eye
print("--- DATASET VERIFICATION ---")
print(f"Total rows in original train split: {len(dataset['train'])}")
print(f"Training Set (80%): {len(train_raw)}")
print(f"Testing Set (20%): {len(test_raw)}")
print("----------------------------")

--- DATASET VERIFICATION ---
Total rows in original train split: 29000
Training Set (80%): 23200
Testing Set (20%): 5800
----------------------------


In [None]:
# ============================================================
# BLOCK 3: Preprocessing (Tokenization & Vocab Building)
# ============================================================
import spacy
import torch
from collections import Counter
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# DIRECT LOADING STRATEGY
# Import the installed spaCy model packages as ordinary Python modules and
# call their own load() functions instead of resolving the models by name.
# NOTE(review): the original note says this avoids an OSError raised by
# spacy.load(); that cannot be verified from this cell alone.
try:
    import en_core_web_sm
    import de_core_news_sm
    en_nlp = en_core_web_sm.load()
    de_nlp = de_core_news_sm.load()
except ImportError:
    # Fallback if the packages cannot be imported as modules yet:
    # load the pipelines by name instead.
    en_nlp = spacy.load('en_core_web_sm')
    de_nlp = spacy.load('de_core_news_sm')

def tokenize_en(text):
    """Return the lowercased spaCy tokens of the English string *text*."""
    doc = en_nlp.tokenizer(text)
    return [word.text.lower() for word in doc]
def tokenize_de(text):
    """Return the lowercased spaCy tokens of the German string *text*."""
    doc = de_nlp.tokenizer(text)
    return [word.text.lower() for word in doc]

class Vocab:
    """Token <-> index mapping built from one language side of a parallel corpus.

    Indices 0-3 are reserved, in this order, for the special tokens
    ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``. Corpus tokens that are
    frequent enough follow in first-seen order.
    """

    def __init__(self, data, tokenize_fn, is_en=True, min_freq=2):
        """Count the tokens in *data* and index the frequent ones.

        Args:
            data: iterable of mappings holding the text under ``'en'`` / ``'de'``.
            tokenize_fn: callable that turns a string into a list of tokens.
            is_en: read the ``'en'`` field when true, otherwise ``'de'``.
            min_freq: minimum number of occurrences a token needs to get its
                own index. The default of 2 reproduces the previously
                hard-coded ``freq > 1`` rule.
        """
        self.itos = ['<pad>', '<sos>', '<eos>', '<unk>']
        self.stoi = {token: i for i, token in enumerate(self.itos)}
        field = 'en' if is_en else 'de'
        counter = Counter()
        for item in data:
            counter.update(tokenize_fn(item[field]))
        for token, freq in counter.items():
            # Skip rare tokens. Also never re-register a token that is already
            # indexed: a corpus token spelled like a special token would
            # otherwise move e.g. <pad> away from its reserved index.
            if freq < min_freq or token in self.stoi:
                continue
            self.stoi[token] = len(self.itos)
            self.itos.append(token)

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.itos)

    def encode(self, tokens):
        """Map tokens to indices; unknown tokens map to the ``<unk>`` index."""
        unk = self.stoi['<unk>']
        return [self.stoi.get(t, unk) for t in tokens]

    def decode(self, indices):
        """Map indices back to their token strings."""
        return [self.itos[i] for i in indices]

# Build both vocabularies from the training split only (train_raw, created in
# Block 2); tokens missing from these vocabularies are encoded as <unk>.
vocab_en = Vocab(train_raw, tokenize_en, is_en=True)
vocab_de = Vocab(train_raw, tokenize_de, is_en=False)

def collate_fn(batch):
    """Turn a list of ``{'en': ..., 'de': ...}`` examples into padded tensors.

    Each sentence is tokenized, encoded with its vocabulary and wrapped in
    ``<sos>`` ... ``<eos>``. The sequences of each language are then padded
    with that vocabulary's ``<pad>`` index to the longest one in the batch
    and moved to ``device``.

    Returns:
        A ``(src, trg)`` pair of padded index tensors (English, German).
    """
    def _wrap(vocab, tokens):
        # <sos> + encoded sentence + <eos>, as a 1-D index tensor.
        ids = [vocab.stoi['<sos>'], *vocab.encode(tokens), vocab.stoi['<eos>']]
        return torch.tensor(ids)

    sources = [_wrap(vocab_en, tokenize_en(example['en'])) for example in batch]
    targets = [_wrap(vocab_de, tokenize_de(example['de'])) for example in batch]

    src_batch = pad_sequence(sources, padding_value=vocab_en.stoi['<pad>'])
    trg_batch = pad_sequence(targets, padding_value=vocab_de.stoi['<pad>'])
    return src_batch.to(device), trg_batch.to(device)

# Initialize DataLoaders: batches of 32 examples, collated by collate_fn.
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_raw, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_raw, batch_size=32, collate_fn=collate_fn)

# Report the vocabulary sizes as a quick sanity check for this block.
print("--- BLOCK 3 SUCCESS ---")
print(f"English Vocab Size: {len(vocab_en)}")
print(f"German Vocab Size: {len(vocab_de)}")

--- BLOCK 3 SUCCESS ---
English Vocab Size: 5314
German Vocab Size: 6808


In [None]:
# ============================================================
# BLOCK 4: LSTM MODEL DEFINITION (ENCODER-DECODER)
# ============================================================
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that summarizes a source sequence.

    Args:
        input_dim: size of the source vocabulary (number of embeddings).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM.
    """

    # The constructor must be named __init__ (with double underscores) and
    # must call nn.Module.__init__ before any sub-module is assigned;
    # a plain `init` method is never invoked on construction.
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode *src* and return the LSTM's final ``(hidden, cell)`` states.

        *src* holds token indices laid out time-major (sequence first),
        because the LSTM is built with the default ``batch_first=False``.
        The per-step LSTM outputs are discarded.
        """
        embedded = self.dropout(self.embedding(src))
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """Embedding + multi-layer LSTM + linear layer that predicts one token per call.

    Args:
        output_dim: size of the target vocabulary (embeddings and logits).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM.
    """

    # The constructor must be named __init__ (with double underscores) and
    # must call nn.Module.__init__ before any sub-module is assigned;
    # a plain `init` method is never invoked on construction.
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: 1-D tensor with one token index per batch element. The
                parameter name shadows the builtin but is kept so existing
                callers are unaffected.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            logits over the target vocabulary for each batch element.
        """
        # Add a leading length-1 time axis so the LSTM sees one time step.
        step = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Drop the time axis again before projecting to vocabulary logits.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch.
        decoder: module exposing ``output_dim`` and performing one decoding
            step per call.
        device: device on which the output tensor is allocated.
    """

    # The constructor must be named __init__ (with double underscores) and
    # must call nn.Module.__init__ before any sub-module is assigned;
    # a plain `init` method is never invoked on construction.
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return decoder logits for every target position.

        Args:
            src: source index tensor passed to the encoder.
            trg: target index tensor of shape ``[trg_len, batch_size]``; its
                first row seeds the decoder.
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead
                of the model's own prediction.

        Returns:
            Tensor of shape ``[trg_len, batch_size, output_dim]``. Row 0 is
            never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape
        outputs = torch.zeros(
            trg_len, batch_size, self.decoder.output_dim, device=self.device
        )
        hidden, cell = self.encoder(src)
        step_input = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = output
            use_truth = random.random() < teacher_forcing_ratio
            step_input = trg[t] if use_truth else output.argmax(1)
        return outputs

In [None]:
# ============================================================
# BLOCK 5: EVALUATION METRICS (LOSS, PERPLEXITY, BLEU)
# ============================================================
# 1. Install sacrebleu if it's not already installed
!pip install sacrebleu

import sacrebleu
import math
import torch

def get_metrics(model, loader, criterion):
    """Evaluate *model* on *loader*; print and return loss, perplexity and BLEU.

    The model is run without teacher forcing. The loss is averaged over
    batches, perplexity is ``exp(average loss)``, and BLEU is computed with
    SacreBLEU over sentences decoded through the module-level ``vocab_de``.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)`` that
            returns logits of shape ``[trg_len, batch_size, output_dim]``.
        loader: iterable of ``(src, trg)`` batches that supports ``len()``.
        criterion: loss taking ``(logits_2d, targets_1d)``.

    Returns:
        Tuple ``(avg_loss, ppl, bleu)``.
    """
    specials = {'<sos>', '<eos>', '<pad>'}

    def to_sentence(indices):
        # A sentence ends at its first <eos>. Anything the decoder emits
        # after that point is not part of the hypothesis and must not be
        # scored, so cut there before stripping the special tokens.
        tokens = vocab_de.decode(indices)
        if '<eos>' in tokens:
            tokens = tokens[:tokens.index('<eos>')]
        return " ".join(t for t in tokens if t not in specials)

    model.eval()
    epoch_loss = 0.0
    predictions, references = [], []

    with torch.no_grad():
        for src, trg in loader:
            # Teacher forcing off (ratio 0): the model feeds on its own predictions.
            output = model(src, trg, 0)

            # Drop position 0 (the <sos> slot) and flatten:
            # output [trg_len, batch, vocab] -> [(trg_len-1)*batch, vocab]
            # trg    [trg_len, batch]        -> [(trg_len-1)*batch]
            logits = output[1:].reshape(-1, output.shape[-1])
            gold = trg[1:].reshape(-1)
            epoch_loss += criterion(logits, gold).item()

            # Greedy token choice per position, one row per sentence.
            pred_rows = output.argmax(2).transpose(0, 1).tolist()
            ref_rows = trg.transpose(0, 1).tolist()
            for pred_ids, ref_ids in zip(pred_rows, ref_rows):
                predictions.append(to_sentence(pred_ids))
                references.append(to_sentence(ref_ids))

    avg_loss = epoch_loss / len(loader)

    # Perplexity (PPL) is the exponential of the average loss.
    ppl = math.exp(avg_loss)

    # Corpus-level BLEU with a single reference stream.
    bleu = sacrebleu.corpus_bleu(predictions, [references]).score

    print("\n" + "="*40)
    print(f"| Test Loss: {avg_loss:.4f}")
    print(f"| Test Perplexity: {ppl:.4f}")
    print(f"| BLEU Score: {bleu:.2f}")
    print("="*40)

    return avg_loss, ppl, bleu

# Padding positions carry no information, so the loss skips every target
# equal to the German vocabulary's <pad> index.
criterion = torch.nn.CrossEntropyLoss(ignore_index=vocab_de.stoi['<pad>'])

# Run metrics on the 20% test split.
# NOTE(review): `model` is not created in any cell shown here; it must be
# built (and trained) before this call.
get_metrics(model, test_loader, criterion)






| Test Loss: 3.5367
| Test Perplexity: 34.3549
| BLEU Score: 26.61


In [None]:
def get_clean_samples_full(model, loader, vocab_en, vocab_de, n=5):
    """Print up to *n* source / reference / predicted sentence triples.

    The model is run without teacher forcing. Every sentence is cut at its
    first ``<eos>`` and stripped of ``<sos>``, ``<eos>`` and ``<pad>`` before
    printing, so tokens the decoder emits after end-of-sentence are not shown.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)`` that
            returns logits of shape ``[trg_len, batch_size, output_dim]``.
        loader: iterable of ``(src, trg)`` index-tensor batches.
        vocab_en: object whose ``decode(indices)`` yields source tokens.
        vocab_de: object whose ``decode(indices)`` yields target tokens.
        n: maximum number of samples to print.
    """
    specials = {'<sos>', '<eos>', '<pad>'}

    def to_sentence(vocab, indices):
        # Keep only what precedes the first <eos>, then drop special tokens.
        tokens = vocab.decode(indices)
        if '<eos>' in tokens:
            tokens = tokens[:tokens.index('<eos>')]
        return " ".join(t for t in tokens if t not in specials)

    model.eval()
    samples_found = 0

    print("--- DETAILED SAMPLE TRANSLATIONS ---")

    # Nothing requested: skip the forward pass entirely.
    if n <= 0:
        return

    with torch.no_grad():
        for src, trg in loader:
            # Teacher forcing off (ratio 0): the model feeds on its own predictions.
            output = model(src, trg, 0)

            # One row of token indices per sentence.
            pred_rows = output.argmax(2).transpose(0, 1).tolist()
            src_rows = src.transpose(0, 1).tolist()
            ref_rows = trg.transpose(0, 1).tolist()

            for src_ids, ref_ids, pred_ids in zip(src_rows, ref_rows, pred_rows):
                src_sent = to_sentence(vocab_en, src_ids)
                ref_sent = to_sentence(vocab_de, ref_ids)
                pred_sent = to_sentence(vocab_de, pred_ids)

                print(f"\n[Sample {samples_found + 1}]")
                print(f"SOURCE (EN):  {src_sent}")
                print(f"TARGET (DE):  {ref_sent}")
                print(f"PREDICT (DE): {pred_sent}")
                print("-" * 30)

                samples_found += 1
                # Stop as soon as enough samples were printed, so no further
                # batch is pushed through the model.
                if samples_found >= n:
                    return

# Print five example translations from the test loader.
# NOTE(review): `model` is not created in any cell shown here; it must be
# built (and trained) before this call.
get_clean_samples_full(model, test_loader, vocab_en, vocab_de, n=5)

--- DETAILED SAMPLE TRANSLATIONS ---

[Sample 1]
SOURCE (EN):  a lady wearing green and white shorts and top is on the beach clapping her hands .
TARGET (DE):  eine dame mit grün-weißen shorts und oberteil ist auf dem strand und klatscht in die hände .
PREDICT (DE): eine frau in grün und weißen oberteil sitzt am strand und hat sich die hände .
------------------------------

[Sample 2]
SOURCE (EN):  a couple takes their own picture in front of the <unk> <unk> <unk> , from across the street .
TARGET (DE):  ein paar macht auf der anderen straßenseite des <unk> <unk> <unk> ein bild von sich .
PREDICT (DE): ein paar wartet vor dem <unk> der <unk> der straße der straße der straße .
------------------------------

[Sample 3]
SOURCE (EN):  two women with black dresses and red tops are standing next to a fence smiling .
TARGET (DE):  zwei frauen in schwarzen kleidern und roten oberteilen stehen lächelnd neben einem zaun .
PREDICT (DE): zwei frauen mit schwarzen kleidern und roten roten stehen 