# NMT Homework (Self-Contained): EN→DE

Train a translation model (English→German), measure perplexity and BLEU, save a checkpoint, and optionally export predictions for ML‑Arena.

Focus: experiment with architectures (LSTM w/ attention, Transformer, decoding strategies) — not boilerplate. Core evaluation functions are provided to ensure consistent scoring across students.

Data: the course staff provides `dataset_splits/` in the repo root. No additional setup is needed for data.

## 0. Setup
Use `install.sh` or `pip install -r requirements.txt` to set up.

In [32]:
# Which model variant (0, 1 or 2) the later cells build, train and checkpoint;
# change the trailing index to switch variants.
MODEL_NR = [0, 1, 2][0]
print(f'You chose model {MODEL_NR} - LOL')

You chose model 0 - LOL


In [33]:
# !pip install -r requirements.txt
import torch
import sys
import os
import math
import random
from pathlib import Path

print('PyTorch version:', torch.__version__)
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Best effort: switch stdout to line buffering. Any failure is deliberately
# ignored (NOTE(review): presumably because some notebook stdout replacements
# do not offer `reconfigure` - confirm).
try: 
    sys.stdout.reconfigure(line_buffering=True)
except Exception: 
    pass

PyTorch version: 2.8.0+cu128
Using device: cuda


In [34]:
# Run the notebook from a dedicated DLNLP25W folder below the launch directory.
HOME_DIR = Path(os.getcwd())
WORK_DIR = HOME_DIR / 'DLNLP25W'
if os.name == 'posix':
    # Create the folder only when it is missing, then make it the working
    # directory so relative paths in later cells resolve against it.
    if not os.path.exists(WORK_DIR):
        WORK_DIR.mkdir(parents=True)
    os.chdir(WORK_DIR)
print(os.listdir())

[]


## 1. Shared Utilities (no external imports)
Tokenization, vocabulary, dataset, collate, and fixed evaluation (PPL, NLL, BLEU).

In [35]:
from typing import List, Tuple, Dict, Iterable
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import math
import random

def set_seed(seed: int = 42):
    '''Seed Python's `random` module and PyTorch (CPU and all CUDA devices).'''
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seeder in seeders:
        seeder(seed)

# Surface strings of the reserved tokens, keyed by role: padding, start of
# sequence, end of sequence and out-of-vocabulary.
SPECIAL_TOKENS = {'pad': '<pad>', 'sos': '<sos>', 'eos': '<eos>', 'unk': '<unk>'}

def simple_tokenize(s: str) -> List[str]:
    '''Lowercase `s` and split it on runs of whitespace.

    Leading and trailing whitespace never produces empty tokens, so an empty
    or all-whitespace string yields an empty list.
    '''
    lowered = s.lower()
    return lowered.split()

def read_split(path: str) -> List[Tuple[List[str], List[str]]]:
    '''Read a tab-separated parallel file into (source, target) token pairs.

    Each line is split on tabs; the first two columns are tokenized with
    `simple_tokenize` and any further columns are ignored. Lines with fewer
    than two columns are skipped.
    '''
    with open(path, 'r', encoding='utf-8') as handle:
        rows = (raw.rstrip(' ').split('\t') for raw in handle)
        return [
            (simple_tokenize(cols[0]), simple_tokenize(cols[1]))
            for cols in rows
            if len(cols) >= 2
        ]

def build_vocab(seqs: Iterable[List[str]], max_size: int | None = None) -> Dict[str, int]:
    '''Build a token -> id mapping from tokenized sequences.

    Ids 0-3 are reserved for the pad, sos, eos and unk tokens. When `max_size`
    is truthy only the `max_size` most frequent tokens are added (so the
    mapping holds up to `max_size + 4` entries); otherwise every token is
    added in first-seen order.
    '''
    from collections import Counter

    counts = Counter()
    for tokens in seqs:
        counts.update(tokens)

    if max_size:
        candidates = counts.most_common(max_size)
    else:
        candidates = counts.items()

    # Reserved ids come first so they are stable across vocabularies.
    stoi = {SPECIAL_TOKENS[role]: idx for idx, role in enumerate(('pad', 'sos', 'eos', 'unk'))}
    for word, _ in candidates:
        # A corpus token that collides with a reserved string keeps the reserved id.
        stoi.setdefault(word, len(stoi))
    return stoi

def encode(tokens: List[str], stoi: Dict[str, int], add_sos_eos: bool = False) -> List[int]:
    '''Map tokens to ids, replacing unknown tokens with the unk id.

    With `add_sos_eos=True` the result is wrapped as [sos] + ids + [eos].
    '''
    ids = []
    for tok in tokens:
        ids.append(stoi.get(tok, stoi[SPECIAL_TOKENS['unk']]))
    if not add_sos_eos:
        return ids
    return [stoi[SPECIAL_TOKENS['sos']], *ids, stoi[SPECIAL_TOKENS['eos']]]

class Example:
    '''One encoded sentence pair: source ids, decoder input ids, decoder target ids.'''

    def __init__(self, s: List[int], ti: List[int], to: List[int]):
        self.src_ids, self.tgt_in_ids, self.tgt_out_ids = s, ti, to

class TranslationDataset(Dataset):
    '''In-memory dataset of encoded (source, target) sentence pairs.

    Source sentences get a trailing eos id. Targets are encoded as
    [sos] + ids + [eos] and split into the decoder input (all but the last id)
    and the decoder target (all but the first id).
    '''

    def __init__(self, pairs, src_stoi, tgt_stoi):
        self.examples: List[Example] = []
        for src_tokens, tgt_tokens in pairs:
            src_ids = encode(src_tokens, src_stoi)
            src_ids.append(src_stoi[SPECIAL_TOKENS['eos']])
            tgt_ids = encode(tgt_tokens, tgt_stoi, add_sos_eos=True)
            self.examples.append(Example(src_ids, tgt_ids[:-1], tgt_ids[1:]))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

def collate_pad(batch, pad_id_src: int, pad_id_tgt: int):
    '''Right-pad a list of examples into batch tensors.

    Returns `(src, src_l, tgt_in, tgt_out, tgt_l)`: the padded source ids, the
    unpadded source lengths, the padded decoder inputs and targets (both padded
    to the longest decoder input in the batch) and the unpadded target lengths.
    '''
    src_width = max(len(ex.src_ids) for ex in batch)
    tgt_width = max(len(ex.tgt_in_ids) for ex in batch)

    def padded(rows, width, fill):
        # Extend every row with `fill` up to the common width.
        return torch.tensor([row + [fill] * (width - len(row)) for row in rows])

    src = padded([ex.src_ids for ex in batch], src_width, pad_id_src)
    tgt_in = padded([ex.tgt_in_ids for ex in batch], tgt_width, pad_id_tgt)
    tgt_out = padded([ex.tgt_out_ids for ex in batch], tgt_width, pad_id_tgt)
    src_l = torch.tensor([len(ex.src_ids) for ex in batch])
    tgt_l = torch.tensor([len(ex.tgt_out_ids) for ex in batch])
    return src, src_l, tgt_in, tgt_out, tgt_l

def compute_perplexity(loss_sum: float, token_count: int) -> float:
    '''Return exp(loss_sum / token_count), the per-token perplexity.

    Yields infinity when there are no tokens or when the exponential overflows.
    '''
    if token_count != 0:
        try:
            return float(math.exp(loss_sum / token_count))
        except OverflowError:
            pass
    return float('inf')

def corpus_bleu(refs: List[List[str]], hyps: List[List[str]], max_order: int = 4, smooth: bool = True) -> float:
    '''Corpus-level BLEU of tokenized hypotheses against one reference each.

    Args:
        refs: reference token lists, aligned with `hyps`.
        hyps: hypothesis token lists.
        max_order: highest n-gram order; orders are weighted uniformly.
        smooth: apply add-one smoothing to every n-gram precision.

    Returns:
        The BLEU score as a float. It is 0.0 when the hypotheses contain no
        tokens at all: with nothing to match, add-one smoothing would make
        every precision 1, and the score would reduce to the brevity penalty
        alone (1.0 for a one-token reference, e for an empty corpus).
    '''
    from collections import Counter

    def ngrams(t, n):
        return Counter(tuple(t[i:i + n]) for i in range(len(t) - n + 1))

    m = [0] * max_order  # clipped n-gram matches per order
    p = [0] * max_order  # hypothesis n-grams per order
    rl = 0               # total reference length
    hl = 0               # total hypothesis length

    for r, h in zip(refs, hyps):
        rl += len(r)
        hl += len(h)
        for n in range(1, max_order + 1):
            R = ngrams(r, n)
            H = ngrams(h, n)
            m[n - 1] += sum(min(c, H[g]) for g, c in R.items())
            p[n - 1] += max(len(h) - n + 1, 0)

    if hl == 0:
        return 0.0

    prec = [
        (m[i] + 1) / (p[i] + 1) if smooth else (m[i] / p[i] if p[i] > 0 else 0.0)
        for i in range(max_order)
    ]
    # Geometric mean of the precisions; a single zero precision zeroes the score.
    geo = math.exp(sum((1 / max_order) * math.log(x) for x in prec)) if min(prec) > 0 else 0.0
    # Brevity penalty: only hypotheses no longer than the references are penalised.
    bp = 1.0 if hl > rl else math.exp(1 - rl / hl)
    return float(geo * bp)

@torch.no_grad()
def evaluate_nll(loader: DataLoader, model: nn.Module, pad_id_tgt: int, device: torch.device):
    '''Sum the token-level negative log-likelihood of `model` over `loader`.

    The model is switched to eval mode. Returns `(nll_sum, token_count)`, where
    target positions equal to `pad_id_tgt` count towards neither value.
    '''
    summed_ce = nn.CrossEntropyLoss(ignore_index=pad_id_tgt, reduction='sum')
    model.eval()
    nll_total, token_total = 0.0, 0
    for src, src_l, tgt_in, tgt_out, _ in loader:
        src, src_l, tgt_in, tgt_out = (
            t.to(device) for t in (src, src_l, tgt_in, tgt_out)
        )
        logits = model(src, src_l, tgt_in)
        vocab = logits.size(-1)
        batch_nll = summed_ce(logits.reshape(-1, vocab), tgt_out.reshape(-1))
        nll_total += float(batch_nll.item())
        token_total += int((tgt_out != pad_id_tgt).sum().item())
    return nll_total, token_total

@torch.no_grad()
def evaluate_bleu(loader: DataLoader, model: nn.Module, tgt_itos: List[str], sos_id: int, eos_id: int, device: torch.device, max_len: int = 100):
    '''Greedy-decode every batch of `loader` and score it with `corpus_bleu`.

    The model is switched to eval mode. References and hypotheses are cut at
    their first `eos_id`; id 0 is dropped from both, and `sos_id` is also
    dropped from the hypotheses, before ids are mapped to strings via `tgt_itos`.
    '''
    def until_eos(ids):
        # Keep only what precedes the first end-of-sequence id, if there is one.
        return ids[:ids.index(eos_id)] if eos_id in ids else ids

    model.eval()
    refs, hyps = [], []
    for src, src_l, tgt_in, tgt_out, tgt_l in loader:
        src, src_l = src.to(device), src_l.to(device)
        pred = model.greedy_decode(
            src, src_l, max_len=max_len, sos_id=sos_id, eos_id=eos_id
        )
        for row in range(src.size(0)):
            ref_ids = until_eos(tgt_out[row].tolist())
            hyp_ids = until_eos(pred[row].tolist())
            refs.append([tgt_itos[i] for i in ref_ids if i != 0])
            hyps.append([tgt_itos[i] for i in hyp_ids if i not in (0, sos_id)])
    return float(corpus_bleu(refs, hyps))

## 2. Paths and Hyperparameters

In [36]:
set_seed(42)

# Split locations. They are assigned unconditionally so that running this cell
# when '../data' already exists does not fail with a NameError further down.
data_dir = '../data'
train_path = '../data/train.txt'
val_path = '../data/val.txt'
public_test_path = '../data/public_test.txt'

if not os.path.exists(data_dir):
    import requests
    # Download every split before touching the disk: a failed request then
    # leaves no half-filled data directory that would suppress a retry.
    downloaded = {}
    for dataset in ['train', 'val', 'public_test']:
        url = f'https://raw.githubusercontent.com/MarkusStefan/DLNLP25W/dev/data/{dataset}.txt'
        response = requests.get(url, timeout=60)
        # Abort on HTTP errors instead of saving an error page as training data.
        response.raise_for_status()
        downloaded[dataset] = response.text
    os.makedirs(data_dir, exist_ok=True)
    for dataset, text in downloaded.items():
        with open(f'{data_dir}/{dataset}.txt', 'w', encoding='utf-8') as f:
            for line in text.strip().split('\n'):
                f.write(line + '\n')

# Fall back to the alternative file name of the public test split, if present.
if not os.path.exists(public_test_path):
    alt = '../data/test_public.txt'
    public_test_path = alt if os.path.exists(alt) else public_test_path

private_test_path = '../data/private_test.txt'

# Model and training hyperparameters.
src_vocab_size = 30000
tgt_vocab_size = 30000
emb_dim = 256
hid_dim = 512
layers = 1
dropout = 0.1
batch_size = 64
epochs = 5
lr = 3e-4
max_decode_len = 100

# Despite its name, SAVE_DIR is the checkpoint *file* for the selected model.
save_dir = 'checkpoints'
os.makedirs(save_dir, exist_ok=True)
SAVE_DIR = os.path.join(save_dir, f'checkpoint_last_{MODEL_NR}.pt')
print('Public test path:', public_test_path)

Public test path: ../data/public_test.txt


## 3. Load Data and Build Vocab

In [37]:
print('Loading splits...')
train_pairs, val_pairs, test_pairs = (
    read_split(split_path) for split_path in (train_path, val_path, public_test_path)
)
print(f'Train: {len(train_pairs):,} | Val: {len(val_pairs):,} | Public test: {len(test_pairs):,}')

# Vocabularies are built from the training split only.
src_stoi = build_vocab([src_tokens for src_tokens, _ in train_pairs], max_size=src_vocab_size)
tgt_stoi = build_vocab([tgt_tokens for _, tgt_tokens in train_pairs], max_size=tgt_vocab_size)

pad_id_src = src_stoi[SPECIAL_TOKENS['pad']]
pad_id_tgt, sos_id, eos_id = (tgt_stoi[SPECIAL_TOKENS[role]] for role in ('pad', 'sos', 'eos'))

train_ds, val_ds, test_ds = (
    TranslationDataset(split_pairs, src_stoi, tgt_stoi)
    for split_pairs in (train_pairs, val_pairs, test_pairs)
)


def collate(b):
    # Bind the pad ids so DataLoader can call this with the batch alone.
    return collate_pad(b, pad_id_src, pad_id_tgt)


def _make_loader(ds, shuffle):
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, collate_fn=collate, num_workers=0)


# Only the training split is shuffled.
train_loader = _make_loader(train_ds, True)
val_loader = _make_loader(val_ds, False)
test_loader = _make_loader(test_ds, False)

# Inverse target vocabulary: position i holds the token whose id is i.
tgt_itos = [None] * len(tgt_stoi)
for word, idx in tgt_stoi.items():
    if 0 <= idx < len(tgt_itos):
        tgt_itos[idx] = word

print('Vocab sizes — src:', len(src_stoi), 'tgt:', len(tgt_stoi))

Loading splits...
Train: 226,997 | Val: 32,428 | Public test: 32,428
Vocab sizes — src: 30004 tgt: 30004


## 4. Build Model (Your Playground)
Keep the forward/greedy_decode contract so evaluation works. Try adding attention, GRU, Transformer, etc.

In [38]:
if MODEL_NR == 0:
    class Encoder(nn.Module):
        '''Embedding + LSTM encoder over padded, variable-length source batches.'''

        def __init__(self, vocab_size, emb_dim, hid_dim, num_layers=1, dropout=0.1):
            super().__init__()
            self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
            # Dropout is only handed to the LSTM when it has more than one layer.
            lstm_dropout = dropout if num_layers > 1 else 0.0
            self.rnn = nn.LSTM(
                emb_dim,
                hid_dim,
                num_layers=num_layers,
                batch_first=True,
                dropout=lstm_dropout,
            )

        def forward(self, src, src_lens):
            '''Return `(outputs, (h, c))` for source ids `src` with lengths `src_lens`.'''
            # Pack so the LSTM skips padded positions; lengths must be on the CPU.
            packed_in = nn.utils.rnn.pack_padded_sequence(
                self.emb(src), src_lens.cpu(), batch_first=True, enforce_sorted=False
            )
            packed_out, (h, c) = self.rnn(packed_in)
            out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
            return out, (h, c)

    class Decoder(nn.Module):
        '''Embedding + LSTM decoder with a linear projection onto the vocabulary.'''

        def __init__(self, vocab_size, emb_dim, hid_dim, num_layers=1, dropout=0.1):
            super().__init__()
            self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
            # Dropout is only handed to the LSTM when it has more than one layer.
            lstm_dropout = dropout if num_layers > 1 else 0.0
            self.rnn = nn.LSTM(
                emb_dim,
                hid_dim,
                num_layers=num_layers,
                batch_first=True,
                dropout=lstm_dropout,
            )
            self.proj = nn.Linear(hid_dim, vocab_size)

        def forward(self, tgt_in, hidden):
            '''Run the decoder on `tgt_in` from state `hidden`; return `(logits, new_state)`.'''
            states, hidden = self.rnn(self.emb(tgt_in), hidden)
            logits = self.proj(states)
            return logits, hidden

    class Seq2Seq(nn.Module):
        '''Encoder-decoder wrapper exposing the `forward` / `greedy_decode` contract.'''

        def __init__(self, enc, dec):
            super().__init__()
            self.encoder = enc
            self.decoder = dec

        def forward(self, src, src_lens, tgt_in):
            '''Teacher-forced pass: decode `tgt_in` from the encoder's final state.'''
            _, h = self.encoder(src, src_lens)
            logits, _ = self.decoder(tgt_in, h)
            return logits

        @torch.no_grad()
        def greedy_decode(self, src, src_lens, max_len, sos_id, eos_id):
            '''Greedily decode up to `max_len` tokens per source sentence.

            Returns a `(batch, max_len)` tensor of token ids in which every
            position from a row's first `eos_id` onwards holds `eos_id`.
            Decoding stops as soon as every row has produced `eos_id`; the
            remaining columns are filled with `eos_id`, which is what running
            the extra steps would have produced after the end-of-sequence fill.
            '''
            B = src.size(0)
            _, h = self.encoder(src, src_lens)
            # Only the previous token is fed back, so no history is accumulated.
            step_in = torch.full((B, 1), sos_id, dtype=torch.long, device=src.device)
            finished = torch.zeros(B, dtype=torch.bool, device=src.device)
            outs = []
            for _ in range(max_len):
                logits, h = self.decoder(step_in, h)
                step_in = logits[:, -1, :].argmax(-1, keepdim=True)
                outs.append(step_in)
                finished |= step_in.squeeze(1) == eos_id
                if bool(finished.all()):
                    break

            if not outs:
                # max_len <= 0: nothing was decoded.
                return torch.empty((B, 0), dtype=torch.long, device=src.device)

            seqs = torch.cat(outs, dim=1)
            if seqs.size(1) < max_len:
                # Stopped early: restore the fixed output width.
                tail = seqs.new_full((B, max_len - seqs.size(1)), eos_id)
                seqs = torch.cat([seqs, tail], dim=1)
            # Overwrite everything from the first end-of-sequence id onwards.
            after_eos = (seqs == eos_id).cumsum(dim=1) > 0
            return seqs.masked_fill(after_eos, eos_id)

In [39]:
# MODEL 1 (placeholder): planned variant using GRU and torch.float16.
# The classes below are empty stubs, so MODEL_NR == 1 does not yet provide a
# working encoder, decoder or `greedy_decode`.
if MODEL_NR == 1:
    class Encoder(nn.Module):
        pass


    class Decoder(nn.Module):
        pass


    class Seq2Seq(nn.Module):
        pass

In [40]:
# MODEL 2 (placeholder): planned variant using GRU, torch.float16, and
# student-teacher training based on a fine-tuned BERT.
# The classes below are empty stubs, so MODEL_NR == 2 does not yet provide a
# working encoder, decoder or `greedy_decode`.
if MODEL_NR == 2:
    class Encoder(nn.Module):
        pass


    class Decoder(nn.Module):
        pass


    class Seq2Seq(nn.Module):
        pass

In [41]:
# Encoder and decoder share the embedding size, hidden size, depth and dropout.
rnn_kwargs = {'num_layers': layers, 'dropout': dropout}
encoder = Encoder(len(src_stoi), emb_dim, hid_dim, **rnn_kwargs)
decoder = Decoder(len(tgt_stoi), emb_dim, hid_dim, **rnn_kwargs)
model = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Nr of params:\t", n_trainable)

Nr of params:	 33908020


## 5. Train

In [42]:
from tqdm import tqdm

# Summed (not averaged) token-level cross-entropy; padded targets are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id_tgt, reduction='sum')

# Per-epoch metric history, one entry appended per epoch:
#   *_nll        summed negative log-likelihood over all non-pad target tokens
#   *_loss       the same quantity divided by the number of non-pad tokens
#   *_perplexity exp of the per-token loss
evals = {
    'train_loss': [],
    'val_loss': [],
    'train_perplexity': [],
    'val_perplexity': [],
    'train_nll': [],
    'val_nll': [],

}
n_samples = len(train_loader.dataset)
with tqdm(total=epochs) as pbar:
    for epoch in range(epochs):
        model.train()
        tot = 0.0
        toks = 0
        n_samples_seen = 0
        for src, src_l, tgt_in, tgt_out, tgt_l in train_loader:
            src, src_l = src.to(device), src_l.to(device)
            tgt_in, tgt_out = tgt_in.to(device), tgt_out.to(device)

            optimizer.zero_grad()
            logits = model(src, src_l, tgt_in)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
            loss.backward()

            # Clip the global gradient norm before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            tot += float(loss.item())
            toks += int((tgt_out != pad_id_tgt).sum().item())
            n_samples_seen += src.size(0)
            pbar.set_postfix(
                train_loss=f'{tot / toks if toks > 0 else float("inf"):.2f}',
                epoch_pct=f'{(n_samples_seen / n_samples) * 100:.2f}',
                overall_train_pct=f'{(epoch * n_samples + n_samples_seen) / (epochs * n_samples) * 100:.2f}'
            )

        tr_ppl = compute_perplexity(tot, toks)
        v_loss, v_toks = evaluate_nll(val_loader, model, pad_id_tgt, device)
        v_ppl = compute_perplexity(v_loss, v_toks)

        # Record this epoch's metrics so the history can be inspected afterwards.
        evals['train_nll'].append(tot)
        evals['val_nll'].append(v_loss)
        evals['train_loss'].append(tot / toks if toks > 0 else float('inf'))
        evals['val_loss'].append(v_loss / v_toks if v_toks > 0 else float('inf'))
        evals['train_perplexity'].append(tr_ppl)
        evals['val_perplexity'].append(v_ppl)

        # update tqdm summary metrics
        pbar.set_postfix(train_ppl=f'{tr_ppl:.2f}', val_ppl=f'{v_ppl:.2f}')
        pbar.update(1)

# Persist the final weights together with what is needed to rebuild the model.
torch.save({
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
    'epoch': epochs,
    'src_stoi': src_stoi,
    'tgt_stoi': tgt_stoi,
    'model_cfg': {'emb': emb_dim, 'hid': hid_dim, 'layers': layers, 'dropout': dropout}
}, SAVE_DIR)

print('Saved checkpoint:', SAVE_DIR)

100%|██████████| 5/5 [05:27<00:00, 65.59s/it, train_ppl=3.86, val_ppl=6.15]                            


Saved checkpoint: checkpoints/checkpoint_last_0.pt


## 6. Evaluate: Perplexity and BLEU (Public Test)

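BLEU score (corpus-level, as computed by `corpus_bleu`):
$$
  B(\hat{y}, y) = \beta \cdot \exp\left(\sum_{i=1}^N w_i \cdot \ln(p_i)\right)
$$
where $p_i$ is the clipped $i$-gram precision (add-one smoothed by default), $w_i = 1/N$ with $N = 4$, and $\beta$ is the brevity penalty: $\beta = 1$ if the total hypothesis length $c$ exceeds the total reference length $r$, and $\beta = \exp(1 - r/c)$ otherwise.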


In [43]:
# Decoding arguments shared by both BLEU evaluations below.
bleu_args = {
    'sos_id': sos_id,
    'eos_id': eos_id,
    'device': device,
    'max_len': max_decode_len,
}

# Perplexity on the validation and public test splits.
val_loss, val_tok = evaluate_nll(val_loader, model, pad_id_tgt, device)
val_ppl = compute_perplexity(val_loss, val_tok)
tst_loss, tst_tok = evaluate_nll(test_loader, model, pad_id_tgt, device)
tst_ppl = compute_perplexity(tst_loss, tst_tok)

# Greedy-decoding BLEU on the same two splits.
val_bleu = evaluate_bleu(val_loader, model, tgt_itos, **bleu_args)
bleu = evaluate_bleu(test_loader, model, tgt_itos, **bleu_args)

print(f'Validation perplexity:  {val_ppl:.2f}')
print(f'Public test perplexity: {tst_ppl:.2f}')
print(f'Validation BLEU:        {val_bleu * 100:.2f}')
print(f'Public test BLEU:       {bleu * 100:.2f}')

Validation perplexity:  6.15
Public test perplexity: 6.11
Validation BLEU:        25.19
Public test BLEU:       25.50


## 7. Private Test (Optional)

In [44]:
# The private split is optional: report its absence instead of failing.
if not os.path.exists(private_test_path):
    print('Private test split not found at', private_test_path)
else:
    prv_pairs = read_split(private_test_path)
    prv_ds = TranslationDataset(prv_pairs, src_stoi, tgt_stoi)
    prv_loader = DataLoader(
        prv_ds,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate,
        num_workers=0,
    )

    prv_loss, prv_tok = evaluate_nll(prv_loader, model, pad_id_tgt, device)
    prv_ppl = compute_perplexity(prv_loss, prv_tok)

    prv_bleu = evaluate_bleu(
        prv_loader,
        model,
        tgt_itos,
        sos_id=sos_id,
        eos_id=eos_id,
        device=device,
        max_len=max_decode_len,
    )

    print(f'Private test perplexity: {prv_ppl:.2f}')
    print(f'Private test BLEU:       {prv_bleu * 100:.2f}')

Private test split not found at ../data/private_test.txt


## 8. Export Predictions for ML‑Arena (Optional)

In [45]:
@torch.no_grad()
def decode_to_lines(loader: DataLoader, model: nn.Module, tgt_itos: List[str], sos_id: int, eos_id: int, device: torch.device, max_len: int) -> List[str]:
    '''Greedy-decode every batch of `loader` into space-joined token strings.

    The model is switched to eval mode first, as `evaluate_bleu` does, so the
    exported text does not depend on the mode the caller left the model in.
    Each hypothesis is cut at its first `eos_id`; id 0 and `sos_id` are dropped
    before ids are mapped to strings via `tgt_itos`. One line is returned per
    source sentence, in loader order.
    '''
    model.eval()
    lines: List[str] = []
    for src, src_l, tgt_in, tgt_out, tgt_l in loader:
        src, src_l = src.to(device), src_l.to(device)
        pred_ids = model.greedy_decode(src, src_l, max_len=max_len, sos_id=sos_id, eos_id=eos_id)
        for b in range(src.size(0)):
            hyp = pred_ids[b].tolist()
            if eos_id in hyp:
                hyp = hyp[:hyp.index(eos_id)]
            toks = [tgt_itos[i] for i in hyp if i != 0 and i != sos_id]
            lines.append(' '.join(toks))
    return lines

# export_split = 'private'
export_split = 'public'
export_format = 'tsv'
# NOTE(review): the file name says "private" even when export_split is
# 'public' - confirm which name ML-Arena expects before renaming it.
export_out = 'submissions/private_predictions.tsv'

# Fail fast on an unsupported format: otherwise no file would be written while
# the summary at the end still reports a successful export.
if export_format not in ('tsv', 'jsonl'):
    raise ValueError(f"Unsupported export_format {export_format!r}; expected 'tsv' or 'jsonl'")

os.makedirs(os.path.dirname(export_out) or '.', exist_ok=True)
pairs = read_split(public_test_path if export_split == 'public' else private_test_path)
exp_ds = TranslationDataset(pairs, src_stoi, tgt_stoi)
exp_loader = DataLoader(exp_ds, batch_size=batch_size, shuffle=False, collate_fn=collate, num_workers=0)

preds = decode_to_lines(exp_loader, model, tgt_itos, sos_id=sos_id, eos_id=eos_id, device=device, max_len=max_decode_len)

# Each record carries the 0-based position of the sentence in the split.
if export_format == 'tsv':
    with open(export_out, 'w', encoding='utf-8') as f:
        for i, h in enumerate(preds):
            f.write(f'{i}\t{h}\n')
elif export_format == 'jsonl':
    import json
    with open(export_out, 'w', encoding='utf-8') as f:
        for i, h in enumerate(preds):
            f.write(json.dumps({'id': i, 'hyp': h}, ensure_ascii=False) + '\n')

print(f'Wrote {len(preds)} predictions to {export_out}')
print('Adjust if ML‑Arena requires a different schema.')

Wrote 32428 predictions to submissions/private_predictions.tsv
Adjust if ML‑Arena requires a different schema.
