In [1]:
import os
from google.colab import drive

# Project folder on Google Drive; it becomes the working directory so that
# relative paths used later (e.g. ./data, ./checkpoints) resolve inside it.
notebook_dir = "/content/drive/MyDrive/DeepLearningMachineTranslation"

# The Drive mount has to exist before we can change into the folder.
drive.mount('/content/drive')
os.chdir(notebook_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import torch

from datasets import load_dataset
import os

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Sequence, NFD, Lowercase
from tokenizers.pre_tokenizers import Whitespace

from utils.gru_tokenizer import BPETokenizer
from utils.gru_dataset import TranslationDataset, create_dataloader
from utils.checkpoint_manager import CheckpointManager
from utils.gru_train import train_model, translate_examples
from utils.evalutation import evaluate_model

from models.gru import GRUSeq2Seq

from sklearn.model_selection import train_test_split

# Preprocessing

In [3]:
# Target vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 40000

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# WMT14 German-English corpus, cached under ./data/wmt14.
ds = load_dataset("wmt/wmt14", "de-en", cache_dir="./data/wmt14")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Tokenizer

In [None]:
def prepare_tokenizer_data(dataset):
    """Gather the raw sentences used to train the tokenizer.

    Args:
        dataset: Mapping from split name to an iterable of examples, where
            each example exposes ``example['translation']['en']`` and
            ``example['translation']['de']``.

    Returns:
        A flat list of strings. For every example in the ``'train'`` split the
        English sentence is appended first, immediately followed by the German
        one. The list is empty when the split is absent.
    """
    collected = []

    for split_name in ('train',):
        if split_name not in dataset:
            continue

        split_data = dataset[split_name]
        print(f"Processing {split_name} split ({len(split_data):,} examples)...")

        for seen, record in enumerate(split_data, start=1):
            pair = record['translation']
            collected.extend((pair['en'], pair['de']))

            # Periodic progress line; the full split has millions of rows.
            if seen % 50000 == 0:
                print(f"  Processed {seen:,} examples...")

    print(f"Total text segments: {len(collected):,}")
    return collected

print("INIT")

# Trainer settings: registers the four special tokens, caps the vocabulary at
# VOCAB_SIZE, and marks word-internal pieces with the "##" prefix.
trainer = BpeTrainer(
    special_tokens=["[PAD]", "[SOS]", "[EOS]", "[UNK]"],
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    show_progress=True,
    continuing_subword_prefix="##",
)

bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Use the `Sequence` class imported from tokenizers.normalizers. The module
# name `normalizers` itself is never imported in this notebook, so the former
# `normalizers.Sequence(...)` spelling raised NameError.
bpe_tokenizer.normalizer = Sequence([
    NFD(),
    Lowercase(),
])
bpe_tokenizer.pre_tokenizer = Whitespace()

# English and German sentences are trained together into one shared vocabulary.
train_texts = prepare_tokenizer_data(ds)

print(f"\nTraining tokenizer on {len(train_texts):,} text segments")
print(f"Target vocabulary size: {VOCAB_SIZE}")

bpe_tokenizer.train_from_iterator(train_texts, trainer=trainer)

# Persist the trained tokenizer to Drive so later cells can reload it from disk.
save_dir = "/content/drive/MyDrive/DeepLearningMachineTranslation"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "tokenizer.json")
bpe_tokenizer.save(save_path)

print(f"Tokenizer saved to: {save_path}")

print("TESTING TOKENIZER")

test_samples = [
    "Hello world!",
    "Guten Tag!",
    "The quick brown fox jumps over the lazy dog.",
    "Der schnelle braune Fuchs springt über den faulen Hund.",
]


def clean_for_comparison(text):
    """Canonicalise text so a sentence and its decoded round trip can be compared.

    The text is first passed through the tokenizer's own normalizer (when one
    is set) so that both sides use the same Unicode form and casing; otherwise
    a precomposed character such as "ü" in the original would never match its
    normalised counterpart in the decoded text. Spaces and "##" continuation
    markers are then stripped and the result is lowercased.
    """
    normalizer = bpe_tokenizer.normalizer
    if normalizer is not None:
        text = normalizer.normalize_str(text)
    return text.replace(' ', '').replace('##', '').lower()


for sample in test_samples:
    print(f"\nOriginal: '{sample}'")

    encoding = bpe_tokenizer.encode(sample)
    print(f"Tokens: {encoding.tokens}")
    print(f"Token IDs: {encoding.ids}")

    decoded = bpe_tokenizer.decode(encoding.ids)
    print(f"Decoded: '{decoded}'")

    orig_clean = clean_for_comparison(sample)
    dec_clean = clean_for_comparison(decoded)

    if orig_clean == dec_clean:
        print("Content preserved perfectly!")
    else:
        print("Content DIFFERENCE:")
        print(f"Original: {orig_clean}")
        print(f"Decoded:  {dec_clean}")

        for i, (o, d) in enumerate(zip(orig_clean, dec_clean)):
            if o != d:
                # Clamp the window start: a negative start index would slice
                # from the end of the string and show the wrong context.
                start = max(0, i - 10)
                print(f"First diff at position {i}: '{o}' vs '{d}'")
                print(f"Context: ...{orig_clean[start:i+10]}... vs ...{dec_clean[start:i+10]}...")
                break
        else:
            # zip() stops at the shorter string, so reaching this branch means
            # one string is a strict prefix of the other.
            print(f"No differing character in the common prefix; lengths are {len(orig_clean)} vs {len(dec_clean)}")

print("CORRECTED TOKENIZER VERIFICATION")

# Report how many entries the trained vocabulary actually contains.
vocab_size = bpe_tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size:,}")

# Look up each special token; token_to_id returns None for unknown tokens.
print("\nSpecial tokens (CORRECTED):")
for token in ("[PAD]", "[SOS]", "[EOS]", "[UNK]"):
    token_id = bpe_tokenizer.token_to_id(token)
    status = "MISSING" if token_id is None else f"ID {token_id}"
    print(f"  {token}: {status}")

# Spot-check how a single word is segmented.
sample = "jumps"
encoding = bpe_tokenizer.encode(sample)
print(f"'{sample}' tokens: {encoding.tokens}")

INIT
Preparing training data...
Processing train split (4,508,785 examples)...
  Processed 50,000 examples...
  Processed 100,000 examples...
  Processed 150,000 examples...
  Processed 200,000 examples...
  Processed 250,000 examples...
  Processed 300,000 examples...
  Processed 350,000 examples...
  Processed 400,000 examples...
  Processed 450,000 examples...
  Processed 500,000 examples...
  Processed 550,000 examples...
  Processed 600,000 examples...
  Processed 650,000 examples...
  Processed 700,000 examples...
  Processed 750,000 examples...
  Processed 800,000 examples...
  Processed 850,000 examples...
  Processed 900,000 examples...
  Processed 950,000 examples...
  Processed 1,000,000 examples...
  Processed 1,050,000 examples...
  Processed 1,100,000 examples...
  Processed 1,150,000 examples...
  Processed 1,200,000 examples...
  Processed 1,250,000 examples...
  Processed 1,300,000 examples...
  Processed 1,350,000 examples...
  Processed 1,400,000 examples...
  Proces

### Dataloader

In [4]:
# Project wrapper around the tokenizer file saved on Drive.
tokenizer = BPETokenizer("/content/drive/MyDrive/DeepLearningMachineTranslation/tokenizer.json")

print("Preparing training data...")

# German is the source side and English the target side of each pair.
source_sentences, target_sentences = [], []
for record in ds['train']:
    pair = record['translation']
    source_sentences.append(pair['de'])
    target_sentences.append(pair['en'])


print(f"Created {len(source_sentences):,} translation pairs")
print(f"Sample source: {source_sentences[0][:50]}...")
print(f"Sample target: {target_sentences[0][:50]}...")

# Hold out 10% of the pairs for validation; the fixed seed keeps the split
# identical across runs.
split_parts = train_test_split(
    source_sentences,
    target_sentences,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)
train_src, val_src, train_tgt, val_tgt = split_parts

print(f"Training samples: {len(train_src):,}")
print(f"Validation samples: {len(val_src):,}")
print("\nCreating datasets...")

# Both datasets use the same tokenizer and the same max_length.
dataset_options = {"tokenizer": tokenizer, "max_length": 100}

train_dataset = TranslationDataset(
    source_sentences=train_src,
    target_sentences=train_tgt,
    **dataset_options,
)
val_dataset = TranslationDataset(
    source_sentences=val_src,
    target_sentences=val_tgt,
    **dataset_options,
)

# Both loaders share batch size, padding index and max_length; only the
# training loader shuffles.
loader_options = {
    "batch_size": 128,
    "pad_idx": tokenizer.pad_id,
    "max_length": 100,
}

train_loader = create_dataloader(dataset=train_dataset, shuffle=True, **loader_options)
val_loader = create_dataloader(dataset=val_dataset, shuffle=False, **loader_options)

Preparing training data...
Created 4,508,785 translation pairs
Sample source: Wiederaufnahme der Sitzungsperiode...
Sample target: Resumption of the session...
Training samples: 4,057,906
Validation samples: 450,879

Creating datasets...




# Models

### GRU

In [None]:
# Backend flags: cuDNN benchmark mode on, and TF32 selected as the fp32
# precision for CUDA matmuls and cuDNN convolutions.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.fp32_precision = 'tf32'
torch.backends.cudnn.conv.fp32_precision = 'tf32'

checkpoint_manager = CheckpointManager()

# Architecture settings. vocab_size is the same value (40000) that VOCAB_SIZE
# holds for tokenizer training; the special-token ids come from the tokenizer.
model_config = dict(
    vocab_size=40000,
    embedding_dim=300,
    hidden_size=1024,
    num_layers=3,
    dropout=0.4,
    pad_idx=tokenizer.pad_id,
    sos_idx=tokenizer.sos_id,
    eos_idx=tokenizer.eos_id,
)
model = GRUSeq2Seq(**model_config)

# Optimisation settings; resume_from='latest' asks train_model to continue
# from the most recent checkpoint.
training_config = dict(
    num_epochs=25,
    learning_rate=0.0001,
    teacher_forcing_ratio=0.6,
    clip_grad=5.0,
    patience=5,
    resume_from='latest',
    use_amp=False,
)

trained_model, history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    tokenizer=tokenizer,
    checkpoint_manager=checkpoint_manager,
    **training_config,
)

# Print a handful of sample translations from the validation set.
translate_examples(trained_model, val_loader, tokenizer, num_examples=5)

Using device: cuda
Using standard precision training
Resuming from latest checkpoint: ./checkpoints/epoch_002.pth
  ✓ Loaded checkpoint from epoch 2
Resuming training from epoch 2/25
Epoch 3/25 (Resumed from epoch 2)




Epoch 3 Training:   0%|          | 0/31703 [00:01<?, ?batch/s]

In [None]:
# Export the current in-memory model weights (this cell saves; it does not load).
# The file written here is a bare state_dict, which is a different format from
# the checkpoint dict (keys 'model_state_dict' and 'epoch') that the evaluation
# cell reads from checkpoints/epoch_002.pth. Writing to that same path would
# overwrite the checkpoint and make those key lookups fail, so the weights go
# to their own file instead.
weights_path = "/content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002_weights.pth"
torch.save(model.state_dict(), weights_path)

# Evaluation

In [12]:
save_path = "/content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002.pth"

# Resolve the device once and use it both for loading and for evaluation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The checkpoint is a dict; 'model_state_dict' and 'epoch' are read below.
checkpoint = torch.load(save_path, map_location=device)

# Rebuild the network with the same hyperparameters used for training so the
# saved weights fit.
model = GRUSeq2Seq(
    vocab_size=40000,
    embedding_dim=300,
    hidden_size=1024,
    num_layers=3,
    dropout=0.4,
    pad_idx=tokenizer.pad_id,
    sos_idx=tokenizer.sos_id,
    eos_idx=tokenizer.eos_id
)

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# A freshly constructed module starts in training mode, where dropout is
# active. Switch to eval mode explicitly so scoring is deterministic;
# evaluate_model may already do this, in which case the call is a no-op.
model.eval()

print(f"Loaded model from {save_path}")
print(f"Model was trained for {checkpoint['epoch'] + 1} epochs")

bleu_scores = evaluate_model(
    model,
    val_loader,
    tokenizer,
    max_len=50,
    device=device,
    show_examples=5  # Show 5 examples
)

Loaded model from /content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002.pth
Model was trained for 2 epochs




DEBUG - First batch type: <class 'dict'>
DEBUG - First batch structure: {'src_tokens': tensor([[ 9641,  6197,  6280,  ...,  6565,  6426,     4],
        [ 6281, 15042,  5988,  ...,     0,     0,     0],
        [ 5906, 10754,  5913,  ...,     0,     0,     0],
        ...,
        [ 5957,  9217,  6115,  ...,     0,     0,     0],
        [ 5906,  9282,  6009,  ...,     0,     0,     0],
        [11238,  6278,  5957,  ...,     0,     0,     0]]), 'tgt_input': tensor([[    1,  6848, 13198,  ...,  5886, 10691,  5980],
        [    1,  6473,  6108,  ...,     0,     0,     0],
        [    1,  5886,  9438,  ...,     0,     0,     0],
        ...,
        [    1,  6026,  6213,  ...,     0,     0,     0],
        [    1,  5886,  7748,  ...,     0,     0,     0],
        [    1,  6026,  6956,  ...,     0,     0,     0]]), 'tgt_output': tensor([[ 6848, 13198,  6162,  ..., 10691,  5980,     2],
        [ 6473,  6108,  8778,  ...,     0,     0,     0],
        [ 5886,  9438,  5904,  ...,     0,  