In [1]:
# ==============================================================================
# 1. Imports and Setup
# ==============================================================================
import sys
import os
import torch
import torch.nn as nn
import math
from typing import List, Optional
from datasets import load_dataset
from tqdm import tqdm

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

from modelling.model import TransformerModel
from data_utils import clean_text_pair

# Device configuration (supports CUDA, MPS for Apple Silicon, or CPU)
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps


In [None]:
# ==============================================================================
# 2. Load WMT17 German-English Dataset
# ==============================================================================
print("Loading WMT17 dataset...")
dataset = load_dataset("wmt17", "de-en", trust_remote_code=True)
print(f"✓ Dataset loaded")

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'wmt17' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Loading WMT17 dataset...


Generating train split: 100%|██████████| 5906184/5906184 [00:01<00:00, 3771059.50 examples/s]
Generating validation split: 100%|██████████| 2999/2999 [00:00<00:00, 1114838.05 examples/s]
Generating test split: 100%|██████████| 3004/3004 [00:00<00:00, 1242940.63 examples/s]


In [None]:
# ==============================================================================
# 2b. Clean Test Data for Evaluation
# ==============================================================================
test_data = dataset['test']['translation']
NUM_TEST_SAMPLES = 100  # Use subset for faster evaluation

cleaned_pairs = []
for item in tqdm(test_data[:NUM_TEST_SAMPLES * 2], desc="Cleaning test data"):
    result = clean_text_pair(item['de'], item['en'])
    if result is not None:
        cleaned_pairs.append(result)
    if len(cleaned_pairs) >= NUM_TEST_SAMPLES:
        break

print(f"✓ Cleaned {len(cleaned_pairs)} test pairs")
print(f"\nExample pair:")
print(f"  German:  {cleaned_pairs[0][0]}")
print(f"  English: {cleaned_pairs[0][1]}")

Cleaning test data:  54%|█████▍    | 108/200 [00:00<00:00, 32356.06it/s]

✓ Cleaned 100 test pairs

Example pair:
  German: 28-jähriger koch in san francisco mall tot aufgefunden
  English: 28-year-old chef found dead at san francisco mall





In [6]:
# ==============================================================================
# 3. Train Tokenizer on WMT17 Data (if not already trained)
# ==============================================================================
from tokenizer import GPT2BPETokenizer
import os

TOKENIZER_ARTIFACTS_DIR = '../tokenizer_artifacts'

# Check if tokenizer already exists
if os.path.exists(os.path.join(TOKENIZER_ARTIFACTS_DIR, 'vocab.json')):
    print("✓ Tokenizer already trained, loading from artifacts...")
    from transformers import GPT2Tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(
        TOKENIZER_ARTIFACTS_DIR,
        unk_token="<unk>", pad_token="<pad>", bos_token="<s>", eos_token="</s>"
    )
else:
    print("Training tokenizer on WMT17 data...")
    # Collect training corpus (use subset for faster training)
    train_data = dataset['train']['translation']
    corpus = []
    for item in tqdm(train_data[:50000], desc="Preparing corpus"):
        corpus.append(item['de'])
        corpus.append(item['en'])
    
    # Train YOUR GPT2BPETokenizer
    gpt2_tokenizer = GPT2BPETokenizer(
        corpus=corpus,
        vocab_size=32000,
        special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
        artifact_dir=TOKENIZER_ARTIFACTS_DIR
    )
    tokenizer = gpt2_tokenizer.tokenizer
    print("✓ Tokenizer trained and saved")

# Special token IDs
PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.bos_token_id  
EOS_ID = tokenizer.eos_token_id
VOCAB_SIZE = len(tokenizer)

print(f"\n✓ YOUR GPT2BPETokenizer ready")
print(f"  Vocab size: {VOCAB_SIZE}")
print(f"  PAD={PAD_ID}, BOS={BOS_ID}, EOS={EOS_ID}")

Training tokenizer on WMT17 data...


Preparing corpus: 100%|██████████| 50000/50000 [00:00<00:00, 6202022.83it/s]





✓ Tokenizer trained and saved

✓ YOUR GPT2BPETokenizer ready
  Vocab size: 32000
  PAD=1, BOS=2, EOS=3


In [13]:
# ==============================================================================
# 4. Create or Load Trained Translation Model
# ==============================================================================
CHECKPOINT_PATH = '../checkpoints/translation_model.pt'
MAX_SEQ_LEN = 256  # Maximum sequence length for both source and target

# Model configuration for translation
MODEL_CONFIG = {
    'vocab_size': VOCAB_SIZE,
    'd_model': 256,
    'n_heads': 8,
    'num_encoder_layers': 3,
    'num_decoder_layers': 3,
    'dim_feedforward': 1024,
    'dropout': 0.1,
    'max_len': MAX_SEQ_LEN  # Increased to handle longer sequences
}

# Create model with YOUR custom transformer implementation
model = TransformerModel(
    vocab_size=MODEL_CONFIG['vocab_size'],
    d_model=MODEL_CONFIG['d_model'],
    n_heads=MODEL_CONFIG['n_heads'],
    num_encoder_layers=MODEL_CONFIG['num_encoder_layers'],
    num_decoder_layers=MODEL_CONFIG['num_decoder_layers'],
    dim_feedforward=MODEL_CONFIG['dim_feedforward'],
    dropout=MODEL_CONFIG['dropout'],
    max_len=MODEL_CONFIG['max_len']
)

# Load trained weights if available
if os.path.exists(CHECKPOINT_PATH):
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"✓ Loaded trained translation model from {CHECKPOINT_PATH}")
else:
    print(f"⚠ No trained model found at {CHECKPOINT_PATH}")
    print("  Model initialized with random weights - train it first!")

model = model.to(DEVICE)
model.eval()
print(f"✓ Model ready on {DEVICE}")

⚠ No trained model found at ../checkpoints/translation_model.pt
  Model initialized with random weights - train it first!
✓ Model ready on mps


In [14]:
# ==============================================================================
# 4b. Prepare Training Data
# ==============================================================================
from dataset import TranslationDataset
from data_utils import collate_batch
from torch.utils.data import DataLoader

# Clean training data (use subset for faster training - increase for better results)
NUM_TRAIN_SAMPLES = 50000  # Increase this for better model quality
train_data = dataset['train']['translation']

train_pairs = []
for item in tqdm(train_data[:NUM_TRAIN_SAMPLES * 2], desc="Cleaning training data"):
    result = clean_text_pair(item['de'], item['en'])
    if result is not None:
        train_pairs.append(result)
    if len(train_pairs) >= NUM_TRAIN_SAMPLES:
        break

# Clean validation data
val_data = dataset['validation']['translation']
val_pairs = []
for item in tqdm(val_data[:1000], desc="Cleaning validation data"):
    result = clean_text_pair(item['de'], item['en'])
    if result is not None:
        val_pairs.append(result)

print(f"✓ Training pairs: {len(train_pairs)}")
print(f"✓ Validation pairs: {len(val_pairs)}")

# Create datasets using YOUR TranslationDataset class
train_dataset = TranslationDataset(train_pairs, tokenizer)
val_dataset = TranslationDataset(val_pairs, tokenizer)

# Create dataloaders with YOUR collate_batch function
BATCH_SIZE = 32

def collate_fn(batch):
    return collate_batch(batch, pad_idx=PAD_ID, bos_idx=BOS_ID, eos_idx=EOS_ID)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

print(f"✓ Train batches: {len(train_loader)}")
print(f"✓ Val batches: {len(val_loader)}")

Cleaning training data:  56%|█████▋    | 56250/100000 [00:01<00:01, 43634.48it/s]
Cleaning validation data: 100%|██████████| 1000/1000 [00:00<00:00, 52377.73it/s]

✓ Training pairs: 50000
✓ Validation pairs: 942
✓ Train batches: 1563
✓ Val batches: 30





In [17]:
# ==============================================================================
# 4c. Train the Translation Model
# ==============================================================================
from modelling.trainer import Trainer
from modelling.scheduler import get_scheduler
from modelling.loss import get_loss_function

# Training configuration
EPOCHS = 5
LEARNING_RATE = 1e-4
WARMUP_STEPS = 1000
GRAD_CLIP = 1.0

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# Initialize scheduler using YOUR NoamScheduler
scheduler = get_scheduler(
    optimizer, 
    scheduler_type='noam',
    d_model=MODEL_CONFIG['d_model'],
    warmup_steps=WARMUP_STEPS
)

# Initialize loss function using YOUR loss function (with label smoothing)
criterion = get_loss_function('label_smoothing', smoothing=0.1, ignore_index=PAD_ID)

# Create trainer using YOUR Trainer class
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=DEVICE,
    grad_clip=GRAD_CLIP
)

print(f"✓ Trainer initialized")
print(f"  Epochs: {EPOCHS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Warmup steps: {WARMUP_STEPS}")
print(f"  d_model: {MODEL_CONFIG['d_model']}")

✓ Trainer initialized
  Epochs: 5
  Learning rate: 0.0001
  Warmup steps: 1000
  d_model: 256


In [18]:
# ==============================================================================
# 4d. Train using YOUR Trainer.train() method
# ==============================================================================

# Train with YOUR trainer's built-in train method
best_val_loss = trainer.train(
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    num_epochs=EPOCHS,
    save_path=CHECKPOINT_PATH,
    save_extra={'config': MODEL_CONFIG}
)

print(f"\n✓ Training complete! Best val loss: {best_val_loss:.4f}")


Epoch 1/5


Epoch 1: 100%|██████████| 1563/1563 [12:14<00:00,  2.13it/s, loss=3.7030]


Train Loss: 4.3607


Validating: 100%|██████████| 30/30 [00:04<00:00,  6.03it/s]


Val Loss: 4.9755
  ✓ Saved best model (val_loss: 4.9755)
Learning Rate: 0.001580

Epoch 2/5


Epoch 2: 100%|██████████| 1563/1563 [32:58<00:00,  1.27s/it, loss=3.5654]   


Train Loss: 3.6579


Validating: 100%|██████████| 30/30 [00:05<00:00,  5.49it/s]


Val Loss: 4.8349
  ✓ Saved best model (val_loss: 4.8349)
Learning Rate: 0.001118

Epoch 3/5


Epoch 3:   1%|▏         | 22/1563 [00:12<14:25,  1.78it/s, loss=3.5586]


KeyboardInterrupt: 

In [19]:
# ==============================================================================
# 4e. Reload Best Model for Evaluation
# ==============================================================================
checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
print(f"✓ Loaded best model from epoch {checkpoint['epoch']}")
print(f"  Val loss: {checkpoint['val_loss']:.4f}")

✓ Loaded best model from epoch 2
  Val loss: 4.8349


## Exercise 1: Greedy Decoding for Translation

Implement autoregressive generation:
1. Encode the source sentence with the encoder
2. Start decoder with BOS token
3. At each step, take the token with highest probability (greedy)
4. Stop when EOS is generated or max length is reached

In [20]:
# ==============================================================================
# 5. Greedy Decoding Function for Translation
# ==============================================================================

@torch.no_grad()
def greedy_decode(
    model: TransformerModel,
    src_ids: List[int],
    bos_id: int,
    eos_id: int,
    max_length: int = 100,
    device: str = 'cpu'
) -> List[int]:
    """
    Generate a translation using greedy decoding.
    
    Autoregressive generation procedure:
    1. Encode the source sentence using the encoder
    2. Initialize decoder with BOS token
    3. Generate tokens one at a time, selecting highest probability (greedy)
    4. Stop when EOS is generated or max length is reached
    
    Args:
        model: The trained transformer model
        src_ids: Source sentence token IDs
        bos_id: Beginning of sequence token ID
        eos_id: End of sequence token ID
        max_length: Maximum generation length
        device: Device to run on
    
    Returns:
        List of generated token IDs
    """
    model.eval()
    
    # Prepare source tensor
    src = torch.tensor([src_ids], dtype=torch.long, device=device)
    src_mask = torch.ones(1, len(src_ids), dtype=torch.long, device=device)
    
    # Step 1: Encode source sequence once
    encoder_output = model.encode(src, src_mask)
    
    # Step 2: Initialize decoder input with BOS token
    decoder_input = torch.tensor([[bos_id]], dtype=torch.long, device=device)
    generated_ids = []
    
    # Step 3 & 4: Generate tokens autoregressively
    for _ in range(max_length):
        # Create decoder mask
        tgt_mask = torch.ones(1, decoder_input.size(1), dtype=torch.long, device=device)
        
        # Decode: get output from decoder
        decoder_output = model.decode(decoder_input, encoder_output, tgt_mask, src_mask)
        
        # Project to vocabulary
        logits = model.output_projection(decoder_output)
        
        # Get the last token's logits and find the most probable next token (GREEDY)
        next_token_logits = logits[0, -1, :]
        next_token_id = torch.argmax(next_token_logits).item()
        
        # Append to generated sequence
        generated_ids.append(next_token_id)
        
        # Stop if EOS is generated
        if next_token_id == eos_id:
            break
        
        # Update decoder input for next iteration
        decoder_input = torch.cat([
            decoder_input,
            torch.tensor([[next_token_id]], dtype=torch.long, device=device)
        ], dim=1)
    
    return generated_ids


def translate(
    model: TransformerModel,
    source_text: str,
    tokenizer,
    max_length: int = 100,
    device: str = 'cpu'
) -> str:
    """
    Translate a source sentence (German) to target language (English).
    """
    # Encode source text
    src_ids = tokenizer.encode(source_text)
    
    # Generate translation using greedy decoding
    generated_ids = greedy_decode(
        model, src_ids, 
        bos_id=tokenizer.bos_token_id,
        eos_id=tokenizer.eos_token_id,
        max_length=max_length, 
        device=device
    )
    
    # Decode to text
    translation = tokenizer.decode(generated_ids, skip_special_tokens=True)
    
    return translation

print("✓ Greedy decoding function defined")

✓ Greedy decoding function defined


In [21]:
# ==============================================================================
# 6. Test Translation on a Few Examples
# ==============================================================================

print("Testing translation on sample sentences:\n")
print("=" * 80)

for i, (src, ref) in enumerate(cleaned_pairs[:5]):
    translation = translate(model, src, tokenizer, max_length=100, device=DEVICE)
    
    print(f"Example {i+1}:")
    print(f"  Source (DE):     {src}")
    print(f"  Reference (EN):  {ref}")
    print(f"  Translation:     {translation}")
    print("-" * 80)

Testing translation on sample sentences:

Example 1:
  Source (DE):     28-jähriger koch in san francisco mall tot aufgefunden
  Reference (EN):  28-year-old chef found dead at san francisco mall
  Translation:     thecommissionhasnottobeaveryimportantway.
--------------------------------------------------------------------------------
Example 2:
  Source (DE):     ein 28-jähriger koch, der vor kurzem nach san francisco gezogen ist, wurde im treppenhaus eines örtlichen einkaufzentrums tot aufgefunden.
  Reference (EN):  a 28-year-old chef who had recently moved to san francisco was found dead in the stairwell of a local mall this week.
  Translation:     thecommissionhastobeabletobeabletobeabletobeabletobeabletobeabletobeabletobeabletobeabletobeabletobeabletobeaveryimportantway.
--------------------------------------------------------------------------------
Example 3:
  Source (DE):     der bruder des opfers sagte aus, dass er sich niemanden vorstellen kann, der ihm schaden wollen wür

## Exercise 2 & 3: Generate Translations and Compute BLEU Score

Generate translations for the test set and evaluate using BLEU score from HuggingFace `evaluate`.

In [None]:
# ==============================================================================
# 7. Generate Translations for Test Set
# ==============================================================================

print("Generating translations for test set...")

predictions = []
references = []

for src, ref in tqdm(cleaned_pairs, desc="Translating"):
    # Generate translation
    translation = translate(model, src, tokenizer, max_length=100, device=DEVICE)
    predictions.append(translation)
    references.append([ref])  # BLEU expects list of references

print(f"✓ Generated {len(predictions)} translations")

In [None]:
# ==============================================================================
# 8. Compute BLEU Score
# ==============================================================================
import evaluate

# Load BLEU metric
bleu = evaluate.load("bleu")

# Compute BLEU score
results = bleu.compute(predictions=predictions, references=references)

print("=" * 60)
print("BLEU Score Results:")
print("=" * 60)
print(f"  BLEU Score:       {results['bleu']:.4f}")
print(f"  Precisions:       {[f'{p:.4f}' for p in results['precisions']]}")
print(f"  Brevity Penalty:  {results['brevity_penalty']:.4f}")
print(f"  Length Ratio:     {results['length_ratio']:.4f}")
print("=" * 60)

## Exercise 4: Analyze Translation Quality

Evaluate some translations and identify common errors made by the model.

In [None]:
# ==============================================================================
# 9. Analyze Individual Translations
# ==============================================================================

print("Detailed Analysis of Sample Translations:")
print("=" * 80)

# Compute individual BLEU scores
for i in range(min(10, len(predictions))):
    src, ref = cleaned_pairs[i]
    pred = predictions[i]
    
    # Individual BLEU
    individual_bleu = bleu.compute(predictions=[pred], references=[[ref]])
    
    print(f"\n--- Example {i+1} (BLEU: {individual_bleu['bleu']:.4f}) ---")
    print(f"Source (DE):    {src}")
    print(f"Reference (EN): {ref}")
    print(f"Prediction:     {pred}")
    
    # Simple error analysis
    ref_words = set(ref.lower().split())
    pred_words = set(pred.lower().split())
    
    missing = ref_words - pred_words
    extra = pred_words - ref_words
    
    if missing:
        print(f"Missing words:  {', '.join(list(missing)[:5])}")
    if extra:
        print(f"Extra words:    {', '.join(list(extra)[:5])}")

In [None]:
# ==============================================================================
# 10. Error Pattern Summary
# ==============================================================================

print("\n" + "=" * 80)
print("Common Error Patterns Analysis:")
print("=" * 80)

# Analyze error patterns
repetition_count = 0
too_short = 0
too_long = 0
empty_translations = 0

for pred, (src, ref) in zip(predictions, cleaned_pairs):
    # Check for repetitions
    words = pred.split()
    if len(words) > 2:
        for j in range(len(words) - 2):
            if words[j] == words[j+1] == words[j+2]:
                repetition_count += 1
                break
    
    # Length analysis
    ref_len = len(ref.split())
    pred_len = len(pred.split())
    
    if pred_len == 0:
        empty_translations += 1
    elif pred_len < ref_len * 0.5:
        too_short += 1
    elif pred_len > ref_len * 1.5:
        too_long += 1

print(f"  Translations with repetitions: {repetition_count}/{len(predictions)}")
print(f"  Too short translations:        {too_short}/{len(predictions)}")
print(f"  Too long translations:         {too_long}/{len(predictions)}")
print(f"  Empty translations:            {empty_translations}/{len(predictions)}")
print("=" * 80)

print("\n✓ Practical 10 Complete!")