In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
from datasets import load_dataset
import torch
from utils.translation_transformer import TransformerConfig

DEVICE = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Using device: {DEVICE}")


ds = load_dataset("wmt/wmt14", "de-en")

vocab_size = 40_000
vocab_path = "./data/bpe_tokenizer_40k.json"

training_samples = len(ds["train"])
batch_size = 64
dataset_max_sample_len = 50

sharedVocab = True
# bpe_v3_ep12
configMid = TransformerConfig(
    d_model=512,
    nhead=8,
    num_encoder_layers=4,
    num_decoder_layers=4,
    dim_feedforward=2048,
    dropout=0.1,
    max_len=dataset_max_sample_len+2,  # +2 for BOS and EOS tokens
)
# base model according to the paper 'Attention is all you need'
# big_3.8770loss
configBig = TransformerConfig(
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=2048,
    dropout=0.1,
    max_len=150,
)

# training
num_steps = 20_000
warmup_steps = 2_000
eval_iters = 10
patience = 1_000

label_smoothing = 0.1

# optimizer
start_lr = 3e-4
betas = (0.9, 0.98)
epsilon = 1e-9

Using device: mps


In [47]:
from tokenizers import Tokenizer as HFTokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Metaspace
from tokenizers.processors import TemplateProcessing
from utils.tokenization_vocab import HFTokenizerWrapper, Tokenizer, BPETokenizer
from pathlib import Path

bpe_tokenizer = HFTokenizer(BPE(unk_token=Tokenizer.UNK_TOKEN))
trainer = BpeTrainer(
    special_tokens=[
        Tokenizer.PAD_TOKEN,
        Tokenizer.SOS_TOKEN,
        Tokenizer.EOS_TOKEN,
        Tokenizer.UNK_TOKEN,
    ],
    vocab_size=vocab_size,
    show_progress=True,
)

bpe_tokenizer.pre_tokenizer = Metaspace()
bpe_tokenizer.decoder = decoders.Metaspace()

pretrained = True  # Set to True if you want to load a previously saved tokenizer

Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)

if Path(vocab_path).is_file():
    pretrained = True

if pretrained:
    bpe_tokenizer = HFTokenizer.from_file(vocab_path)
    
    bpe_tokenizer.post_processor = TemplateProcessing(
    single=f"{Tokenizer.SOS_TOKEN} $A {Tokenizer.EOS_TOKEN}",
    special_tokens=[
        (Tokenizer.SOS_TOKEN, bpe_tokenizer.token_to_id(Tokenizer.SOS_TOKEN)),
        (Tokenizer.EOS_TOKEN, bpe_tokenizer.token_to_id(Tokenizer.EOS_TOKEN)),
    ],
)
else:
    bpe_tokenizer.train(
        [
            "./datasets/wmt14_translate_de-en_test.csv",
            "./datasets/wmt14_translate_de-en_train.csv",
            "./datasets/wmt14_translate_de-en_validation.csv",
        ],
        trainer=trainer,
    )

    bpe_tokenizer.save(vocab_path)


#tokenizer = HFTokenizerWrapper(bpe_tokenizer)
tokenizer = BPETokenizer(vocab_path)
print(f"Vocab size: {len(tokenizer)}")

Vocab size: 40000


In [48]:
from utils.parallel_corpus import (
    TranslationDataset,
    DataLoaderFactory,
    LazyTranslationPairs,
)
from utils.tokenization_vocab import HFTokenizerWrapper
import os

# Create lazy wrappers - no materialization into lists!
train_src = LazyTranslationPairs(ds["train"], src_lang="de", tgt_lang="en", mode="src")
train_tgt = LazyTranslationPairs(ds["train"], src_lang="de", tgt_lang="en", mode="tgt")

test_src = LazyTranslationPairs(ds["test"], src_lang="de", tgt_lang="en", mode="src")
test_tgt = LazyTranslationPairs(ds["test"], src_lang="de", tgt_lang="en", mode="tgt")

# Create datasets with lazy loading (processes on-the-fly, no upfront preprocessing)
train_ds = TranslationDataset(
    source_sentences=train_src,
    target_sentences=train_tgt,
    source_tokenizer=tokenizer,
    target_tokenizer=tokenizer,
    max_length=dataset_max_sample_len,
    lazy=True,  # Enable lazy loading!
)

test_ds = TranslationDataset(
    source_sentences=test_src,
    target_sentences=test_tgt,
    source_tokenizer=tokenizer,
    target_tokenizer=tokenizer,
    max_length=dataset_max_sample_len,
    lazy=True,
)

# Optimize num_workers based on CPU cores
optimal_workers = min(8, os.cpu_count() or 4)

train_loader = DataLoaderFactory.create_dataloader(
    dataset=train_ds,
    batch_size=batch_size,
    pad_idx=tokenizer.PAD_IDX,
    num_workers=optimal_workers,
    shuffle=True,  # Shuffle for training
    persistent_workers=True,  # Keep workers alive between epochs
    prefetch_factor=4,  # Prefetch more batches
)

test_loader = DataLoaderFactory.create_dataloader(
    dataset=test_ds,
    batch_size=batch_size,
    pad_idx=tokenizer.PAD_IDX,
    num_workers=0,
    shuffle=False,  # No shuffle for testing
    persistent_workers=True,
    prefetch_factor=4,
)

print(f"✓ Lazy loading enabled - no memory materialization!")
print(f"✓ Using {optimal_workers} workers for parallel processing")
print(f"Train samples: {len(train_ds):,}, Test samples: {len(test_ds):,}")
print(f"Train batches: {len(train_loader):,}, Test batches: {len(test_loader):,}")

Initialized lazy dataset with 4508785 sentence pairs
Initialized lazy dataset with 3003 sentence pairs
✓ Lazy loading enabled - no memory materialization!
✓ Using 8 workers for parallel processing
Train samples: 4,508,785, Test samples: 3,003
Train batches: 70,450, Test batches: 47


In [31]:
# Import the TranslationTransformer
from utils.translation_transformer import (
    TranslationTransformer,
    TranslationTransformerPytorch,
)

# Initialize the model with larger max_len to handle max_length + special tokens
model = TranslationTransformer(
    src_vocab_size=len(tokenizer),
    tgt_vocab_size=len(tokenizer),
    config=configMid,
    padding_idx=tokenizer.PAD_IDX,
    sharedVocab=sharedVocab,
)

print(f"Model initialized!")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

Model initialized!
Total parameters: 49,871,872


### Load model from checkpoint

In [32]:
checkpoint = torch.load("./models/best_model_entire_ds.pt", map_location=DEVICE)
print(checkpoint['model_config'])
state_dict = checkpoint["model_state_dict"]
new_state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
model.load_state_dict(new_state_dict)
model.eval()

{'d_model': 512, 'nhead': 8, 'num_encoder_layers': 4, 'num_decoder_layers': 4, 'dim_feedforward': 2048, 'dropout': 0.1, 'max_len': 52}


TranslationTransformer(
  (src_embedding): WordEmbedding(
    (embedding): Embedding(40000, 512, padding_idx=0)
  )
  (tgt_embedding): WordEmbedding(
    (embedding): Embedding(40000, 512, padding_idx=0)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-3): 4 x TransformerEncoderLayer(
      (self_attn): MultiHeadAttention(
        (qkv_proj): Linear(in_features=512, out_features=1536, bias=False)
        (out_proj): Linear(in_features=512, out_features=512, bias=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=2048, out_features=512, bias=True)
      )
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2

In [35]:
# Apply PyTorch optimizations
import torch

# 1. Enable TF32 for faster matmul on Ampere+ GPUs (A100, RTX 3090, etc.)
# This provides ~2x speedup for matrix multiplications with minimal accuracy loss
# torch.set_float32_matmul_precision('high')  # Options: 'highest', 'high', 'medium'
# torch.backends.fp32_precision = 'tf32'

# 2. For MPS (Apple Silicon), ensure we're using optimal settings
if DEVICE.type == "mps":
    # MPS backend is already optimized, but we can ensure memory efficiency
    torch.mps.empty_cache()  # Clear any cached memory
elif DEVICE.type == "cuda":
    # Enable TF32 for cuDNN convolutions as well
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
else:
    print("✓ Running on CPU (no GPU optimizations)")

model_compiled = torch.compile(
    model, mode="default"
)  # Options: 'default', 'reduce-overhead', 'max-autotune'

# Move model to device (GPU if available)
model = model.to(DEVICE)
model.train()

print(f"Using device: {DEVICE}")
print(f"Model moved to {DEVICE}")

Using device: mps
Model moved to mps


In [None]:
import torch.nn as nn
import torch.optim as optim
from utils.train import train
from torch.optim.lr_scheduler import LambdaLR


def lr_lambda(step, warmup_steps=4000):
    """Learning rate schedule with warmup and decay."""
    step = max(step, 1)
    return configBig.d_model ** (-0.5) * min(step**-0.5, step * warmup_steps**-1.5)


# Loss function and optimizer
criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.PAD_IDX, label_smoothing=label_smoothing
)
optimizer = optim.Adam(model.parameters(), lr=1, betas=betas, eps=epsilon)

scheduler = LambdaLR(optimizer, lambda step: lr_lambda(step, warmup_steps))

# Training
train_losses, best_loss = train(
    model=model_compiled,
    config=configBig,
    train_loader=train_loader,
    test_loader=test_loader,
    dataset_size=len(train_ds),
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    device=DEVICE,
    num_steps=num_steps,
    eval_iters=eval_iters,
    patience=patience,
)

In [55]:
from utils.train import estimate_loss
import torch.nn as nn

criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.PAD_IDX, label_smoothing=label_smoothing
)

estimate_loss(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
    device=DEVICE,
    eval_iters=len(test_loader),
    print_enabled=True,
)

Eval batch 47/47, Loss: 2.9148


2.9334366981019366

In [61]:
sentence = "Ich liebe meinen Freund über alles."
input_sequence = torch.tensor([tokenizer.encode(sentence)], device=DEVICE) # Exclude SOS token

In [63]:
from utils.inference import beam_search

tgt_seq = beam_search(
    model=model,
    input_sequence=input_sequence,
    sos=tokenizer.SOS_IDX,
    eos=tokenizer.EOS_IDX,
    beam_width=3,
    length_penalty=1.0,
    repetition_penalty=1.0,
    device=DEVICE,
)

print(tokenizer.decode(tgt_seq))

i love my friend about everything .


In [62]:
from utils.inference import greedy_translate

tgt_seq = greedy_translate(
    model=model,
    input_sequence=input_sequence,
    src_tokenizer=tokenizer,
    tgt_tokenizer=tokenizer,
    max_len=dataset_max_sample_len,
    device=DEVICE
)
print(tokenizer.decode(tgt_seq))

i love my friend about everything .


In [None]:
from torch import nn

criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.PAD_IDX, label_smoothing=label_smoothing
)

# Test with a sample German sentence from the dataset
total_loss = 0.0
eval_iters = len(test_loader)
print_enabled = True
samples = 0

for k, batch in enumerate(test_loader):
    if k >= eval_iters:
        break

    batch_de, batch_en, _, _ = batch
    samples += len(batch_de)

    for idx, (src, tgt) in enumerate(zip(batch_de, batch_en)):
        sample_de = tokenizer.decode_to_text(src.tolist())
        sample_en = tokenizer.decode_to_text(tgt.tolist())

        tgt_seq = torch.tensor(greedy_translate(
            model,
            input_sequence=src.unsqueeze(0).to(DEVICE),
            src_tokenizer=tokenizer,
            tgt_tokenizer=tokenizer,
            max_len=dataset_max_sample_len,
            device=DEVICE,
        ))
        
        output = tgt_seq.reshape(-1, tgt_seq.shape[-1])
        tgt_output = tgt.reshape(-1)

        # Calculate loss
        loss = criterion(output, tgt_output)
        total_loss += loss.item()

        if print_enabled:
            print(f"{k*len(batch_de)+idx+1} / {len(batch_de)*eval_iters}\r", end="")

print(f"\nAverage Loss over {samples} samples: {(total_loss / samples):.4f}")

In [56]:
len(test_loader)

47

In [57]:
# Test translation on sample input
import sacrebleu

# Test with a sample German sentence from the dataset
total_bleu = 0.0
eval_iters = len(test_loader)
print_enabled = True
samples = 0
total_samples = min(len(test_ds), eval_iters * batch_size)

for k, batch in enumerate(test_loader):
    if k >= eval_iters:
        break

    batch_de, batch_en, _, _ = batch
    samples += len(batch_de)

    for idx, (de, en) in enumerate(zip(batch_de, batch_en)):
        sample_de = tokenizer.decode_to_text(de.tolist())
        sample_en = tokenizer.decode_to_text(en.tolist())

        tgt_seq = greedy_translate(
            model,
            input_sequence=torch.tensor(
                [tokenizer.encode(sample_de)], device=DEVICE
            ),
            src_tokenizer=tokenizer,
            tgt_tokenizer=tokenizer,
            max_len=dataset_max_sample_len,
            device=DEVICE,
        )

        translation = tokenizer.decode_to_text(tgt_seq)
        BLEUscore = sacrebleu.corpus_bleu([translation], [[sample_en]])
        total_bleu += BLEUscore.score

        if print_enabled:
            print(f"{k*len(batch_de)+idx+1} / {total_samples}: bleu={BLEUscore.score:.4f}\r", end="")

print(f"\nAverage BLEU Score over {samples} samples: {(total_bleu / samples):.4f}")

7 / 3003: bleu=10.6951

KeyboardInterrupt: 

In [None]:
translation, _ = translate_sample(
    "Ich liebe meinen Jungen Freund über alles.",
    model,
    src_tokenizer=tokenizer,
    tgt_tokenizer=tokenizer,
    max_len=dataset_max_sample_len,
    device=DEVICE,
)
translation

NameError: name 'translate_sample' is not defined