# Intro

This document contains the preprocessing and training of the GRU model. Most of the source code is in the given packages.
The structure of the training and the ideas behind it are described in the report.

### Structure
1) Preprocessing: Contains the "training" of the BPE tokenizer and the setup of the dataset.
2) Model: Model architecture and training setup
3) Evaluation: BLEU score and sample data

*I only realized on the last day that the dataset already contains a split, which is why I made a split myself.

In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab runtime and switch the working directory
# to the project folder, so that relative paths used in later cells
# (e.g. "./tokenizer.json", "./data/wmt14") resolve inside the Drive folder.
drive.mount('/content/drive')
notebook_dir = "/content/drive/MyDrive/DeepLearningMachineTranslation"
os.chdir(notebook_dir)

In [1]:
import torch

from datasets import load_dataset
import os

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Sequence, NFD, Lowercase
from tokenizers.pre_tokenizers import Whitespace

from utils.gru_tokenizer import BPETokenizer
from utils.gru_dataset import TranslationDataset, create_dataloader
from utils.checkpoint_manager import CheckpointManager
from utils.gru_train import train_model, translate_examples
from utils.evalutation import evaluate_model

from models.gru import GRUSeq2Seq

from sklearn.model_selection import train_test_split

# Preprocessing

In [2]:
# Pick the compute device in order of preference: CUDA, then Apple MPS,
# then CPU as the fallback.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# Size of the BPE vocabulary the tokenizer is trained with.
VOCAB_SIZE = 40000

# WMT14 German-English corpus; cached under ./data/wmt14 so repeated runs
# can reuse the cached copy.
ds = load_dataset("wmt/wmt14", "de-en", cache_dir="./data/wmt14")

### Tokenizer

This sets up the BPE tokenizer; it only needs to be run if an adequate tokenizer is not already available.

In [2]:
def prepare_tokenizer_data(dataset):
    """Collect every English and German sentence of the training split.

    Args:
        dataset: Mapping from split name to an iterable of examples. Each
            example is indexed as ``example['translation']['en']`` and
            ``example['translation']['de']``.

    Returns:
        A flat list of strings: for each example, the English sentence
        followed by the German sentence. Empty if there is no ``'train'``
        split.
    """
    corpus = []

    for split_name in ('train',):
        # Nothing to collect when the split is absent.
        if split_name not in dataset:
            continue

        split_data = dataset[split_name]
        print(f"Processing {split_name} split ({len(split_data):,} examples)...")

        for count, record in enumerate(split_data, start=1):
            pair = record['translation']
            corpus.extend((pair['en'], pair['de']))

            # Progress report every 50,000 examples.
            if count % 50000 == 0:
                print(f"  Processed {count:,} examples...")

    print(f"Total text segments: {len(corpus):,}")
    return corpus

# Trainer configuration for the BPE vocabulary: the four special tokens are
# registered up front, the vocabulary is capped at VOCAB_SIZE entries, and
# "##" is used as the continuing-subword prefix.
trainer = BpeTrainer(
    special_tokens=["[PAD]", "[SOS]", "[EOS]", "[UNK]"],
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    show_progress=True,
    continuing_subword_prefix="##",
)

bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Normalize text with Unicode NFD followed by lowercasing.
# `Sequence` is imported directly from `tokenizers.normalizers`; the module
# name `normalizers` itself is never imported, so it must not be used as a
# qualifier here (doing so raises NameError).
bpe_tokenizer.normalizer = Sequence([
    NFD(),
    Lowercase(),
])
bpe_tokenizer.pre_tokenizer = Whitespace()

# Both languages share one tokenizer, so the training corpus is the union of
# the English and German sentences.
train_texts = prepare_tokenizer_data(ds)

print(f"\nTraining tokenizer on {len(train_texts):,} text segments")
print(f"Target vocabulary size: {VOCAB_SIZE}")

bpe_tokenizer.train_from_iterator(train_texts, trainer=trainer)

# Persist the trained tokenizer as JSON inside the Drive project folder.
save_dir = "/content/drive/MyDrive/DeepLearningMachineTranslation"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "tokenizer.json")
bpe_tokenizer.save(save_path)

print(f"Tokenizer saved to: {save_path}")

NameError: name 'BpeTrainer' is not defined

### Dataloader

Needs to be run before every training as the dataloaders are not saved.

1) This sets up the tokenizer from the previous json file. 
2) Splits the data into train and test.
3) And then sets up standard datasets and dataloaders from PyTorch.

In [3]:
# Load the previously trained BPE tokenizer from its JSON file.
tokenizer = BPETokenizer("./tokenizer.json")

print("Preparing training data...")

source_sentences = []
target_sentences = []

# Single pass over the training split: German is the source language,
# English the target.
for example in ds['train']:
    pair = example['translation']
    source_sentences.append(pair['de'])
    target_sentences.append(pair['en'])


print(f"Created {len(source_sentences):,} translation pairs")
print(f"Sample source: {source_sentences[0][:50]}...")
print(f"Sample target: {target_sentences[0][:50]}...")

# Hold out 10% of the pairs for validation; the fixed seed keeps the split
# reproducible across runs.
train_src, val_src, train_tgt, val_tgt = train_test_split(
    source_sentences, target_sentences,
    test_size=0.1, random_state=42, shuffle=True,
)

print(f"Training samples: {len(train_src):,}")
print(f"Validation samples: {len(val_src):,}")
print("\nCreating datasets...")

# Settings shared by the training and validation datasets.
dataset_options = {"tokenizer": tokenizer, "max_length": 100}

train_dataset = TranslationDataset(
    source_sentences=train_src,
    target_sentences=train_tgt,
    **dataset_options,
)
val_dataset = TranslationDataset(
    source_sentences=val_src,
    target_sentences=val_tgt,
    **dataset_options,
)

# Settings shared by both dataloaders; only the training loader shuffles.
loader_options = {
    "batch_size": 2,
    "pad_idx": tokenizer.pad_id,
    "max_length": 100,
}

train_loader = create_dataloader(dataset=train_dataset, shuffle=True, **loader_options)
val_loader = create_dataloader(dataset=val_dataset, shuffle=False, **loader_options)

Preparing training data...
Created 4,508,785 translation pairs
Sample source: Wiederaufnahme der Sitzungsperiode...
Sample target: Resumption of the session...
Training samples: 4,057,906
Validation samples: 450,879

Creating datasets...


# Model

### GRU

This is the "training loop" for the model. As training takes longer than most online runtimes allow, the model weights and training state can be saved with the checkpoint manager, which may need to be adjusted: the standard checkpoint manager set up here saves the models under ./checkpoints/epoch_XXX.pth and loads the last save from there.

*Not sure if mixed precision training is working: it does not throw any errors, but it also does not speed up the training.

In [None]:
# Checkpoint manager with its default settings (per the notes in this
# notebook: saves under ./checkpoints/epoch_XXX.pth and reloads the last save).
checkpoint_manager = CheckpointManager()

# Request TF32 for float32 matmuls and cuDNN convolutions, and let cuDNN
# benchmark kernels for the observed input shapes.
# NOTE(review): the `fp32_precision` attributes exist only in recent PyTorch
# releases — confirm the installed version supports them.
torch.backends.cuda.matmul.fp32_precision = 'tf32'
torch.backends.cudnn.conv.fp32_precision = 'tf32'
torch.backends.cudnn.benchmark = True

# Sequence-to-sequence GRU model. vocab_size equals the VOCAB_SIZE (40000)
# used when training the tokenizer; the special-token ids come from the
# loaded tokenizer.
# NOTE(review): the evaluation cell in this notebook builds the model with
# hidden_size=1024 while this cell uses 512 — confirm which value the saved
# checkpoint was trained with.
model = GRUSeq2Seq(
    vocab_size=40000,
    embedding_dim=300,
    hidden_size=512,
    num_layers=3,
    dropout=0.4,
    pad_idx=tokenizer.pad_id,
    sos_idx=tokenizer.sos_id,
    eos_idx=tokenizer.eos_id
)

# Run training. resume_from='latest' asks train_model to continue from the
# most recent checkpoint (the recorded output shows it falling back to
# training from scratch when none exists).
# NOTE(review): use_amp=False means mixed precision is switched off in this
# configuration (the log prints "Using standard precision training").
trained_model, history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    tokenizer=tokenizer,
    checkpoint_manager=checkpoint_manager,
    num_epochs=25,
    learning_rate=0.0001,
    teacher_forcing_ratio=0.6,
    clip_grad=5.0,
    patience=5,
    resume_from='latest',
    use_amp=False,
)

Using device: cuda
Using standard precision training
No latest checkpoint found. Starting from scratch.
Failed to load checkpoint: stat: path should be string, bytes, os.PathLike or integer, not NoneType
Starting training from scratch...
Epoch 1/25


Epoch 1 Training:   0%|          | 0/2028953 [00:01<?, ?batch/s]

KeyboardInterrupt: 

In [None]:
# Manually dump the current model weights to Drive.
# NOTE(review): this stores only the raw state_dict (parameter name -> tensor)
# and overwrites whatever file already exists at this path; any loader that
# expects a dict with 'model_state_dict' / 'epoch' keys must cope with this
# format.
save_path = "/content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002.pth"
torch.save(model.state_dict(), save_path)

# Evaluation

Loads the last save made by the checkpoint manager, evaluates it with BLEU[1-4], and outputs some example translations.

In [12]:
# Checkpoint to evaluate.
save_path = "/content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002.pth"

# Map tensors to the GPU when one is available, otherwise to the CPU.
checkpoint = torch.load(save_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')

# The architecture must match the one the checkpoint was trained with.
# NOTE(review): the training cell in this notebook builds the model with
# hidden_size=512 while this cell uses 1024 — confirm which value the saved
# run actually used, otherwise load_state_dict fails with a size mismatch.
model = GRUSeq2Seq(
    vocab_size=40000,
    embedding_dim=300,
    hidden_size=1024,
    num_layers=3,
    dropout=0.4,
    pad_idx=tokenizer.pad_id,
    sos_idx=tokenizer.sos_id,
    eos_idx=tokenizer.eos_id
)

# Two on-disk formats can live at this path: a wrapped checkpoint dict with
# 'model_state_dict' (and optionally 'epoch'), or a bare state_dict as
# written by `torch.save(model.state_dict(), save_path)`. Accept both instead
# of raising KeyError on the bare format.
if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
    state_dict = checkpoint['model_state_dict']
    last_epoch = checkpoint.get('epoch')
else:
    state_dict = checkpoint
    last_epoch = None

model.load_state_dict(state_dict)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f"Loaded model from {save_path}")
if last_epoch is not None:
    # The stored epoch index is zero-based, hence the +1.
    print(f"Model was trained for {last_epoch + 1} epochs")
else:
    print("Checkpoint does not record the number of trained epochs")

# Compute BLEU on the validation loader and print a few sample translations.
bleu_scores = evaluate_model(
    model,
    val_loader,
    tokenizer,
    max_len=50,
    device=device,
    show_examples=5
)

Loaded model from /content/drive/MyDrive/DeepLearningMachineTranslation/checkpoints/epoch_002.pth
Model was trained for 2 epochs




DEBUG - First batch type: <class 'dict'>
DEBUG - First batch structure: {'src_tokens': tensor([[ 9641,  6197,  6280,  ...,  6565,  6426,     4],
        [ 6281, 15042,  5988,  ...,     0,     0,     0],
        [ 5906, 10754,  5913,  ...,     0,     0,     0],
        ...,
        [ 5957,  9217,  6115,  ...,     0,     0,     0],
        [ 5906,  9282,  6009,  ...,     0,     0,     0],
        [11238,  6278,  5957,  ...,     0,     0,     0]]), 'tgt_input': tensor([[    1,  6848, 13198,  ...,  5886, 10691,  5980],
        [    1,  6473,  6108,  ...,     0,     0,     0],
        [    1,  5886,  9438,  ...,     0,     0,     0],
        ...,
        [    1,  6026,  6213,  ...,     0,     0,     0],
        [    1,  5886,  7748,  ...,     0,     0,     0],
        [    1,  6026,  6956,  ...,     0,     0,     0]]), 'tgt_output': tensor([[ 6848, 13198,  6162,  ..., 10691,  5980,     2],
        [ 6473,  6108,  8778,  ...,     0,     0,     0],
        [ 5886,  9438,  5904,  ...,     0,  