In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import copy
import random
import sentencepiece as spm
import torch
import pickle

from data import data_preparation
from models import models_path
from evaluation import scoring
from training import lstm, rnn_attention
from generation import generation_strategies
from datafiles import datafiles_path

# Get the data
## Doesn't need to be run if you already have the datafiles and tokenizer

In [6]:
# Download the raw parallel news commentary archive from the wmt16 server
# and save the English and German files.
# This might take about 30 seconds.
# Per the section heading, this cell can be skipped when the data files are
# already present.
data_preparation.download_data()

Downloading https://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz
Extracting news-commentary-v11.de-en.de
Extracting news-commentary-v11.de-en.en


In [13]:
# Shuffle and split the datasets into train, dev and test subsets.
# A fixed seed is passed, presumably so the split is reproducible across
# runs — TODO confirm that split_data uses it for the shuffle.
data_preparation.split_data(seed=42)

In [14]:
# Peek at the first ten English training sentences; the slice keeps the
# displayed output short.
data_preparation.get_data(language="en", split="train")[:10]

array(['A new currency was introduced and pegged to the US dollar at a one-to-one exchange rate',
       "Today's market fundamentalist creed fails to recognize that financial markets are inherently unstable",
       'Finally, Koreans must relearn the entrepreneurialism that built the chaebol, the family-owned industrial conglomerates that powered the economy’s development',
       'In fact, the Spanish economy is a classic case of a defective growth pattern followed by a predictable, policy-assisted recovery that is driven (with a delay) mostly by the tradable sector',
       'Nowhere is this better illustrated than in America’s current debate over illegal immigration',
       'Across the región, Chávez’s influence was strong',
       "By foregoing prices in allocating healthcare, the Dutch have taken the economic incentives for extending life away from the country's physicians",
       'It is not', 'So who will step aside for whom remains unclear',
       'Since the 1960s, there has 

In [15]:
# Display the German training sentences (the array repr is abbreviated in
# the recorded output).
data_preparation.get_data(language="de", split="train")

array(['Eine neue Währung wurde eingeführt und mit einem festen Wechselkurs im Verhältnis 1:1 an den Dollar gekoppelt',
       'Heutige Marktfundamentalisten verfehlen die Einsicht, das Finanzmärkte von sich aus instabil sind',
       'Schließlich müssen die Koreaner jenen Unternehmergeist wiederfinden, der den Aufbau der Chaebol ermöglichte, jener in Familienbesitz stehender Unternehmensnetzwerke, die die wirtschaftliche Entwicklung des Landes vorantrieben',
       ..., 'Dabei handelt es sich um sehr wichtige Fragen',
       'Viele von ihnen haben Jobs, mit denen sie zusätzliche Kredite aufnehmen müssen, nur um sich über Wasser zu halten',
       'Europa kann einfach nicht auf die Briten zählen, zumindest für eine Weile'],
      dtype='<U1499')

In [10]:
# Fit a single SentencePiece BPE tokenizer on the English and German training
# splits together (English first, then German).
# This might take about 20 seconds.
tokenizer_path = data_preparation.train_tokenizer(
    data=np.concatenate(
        [
            data_preparation.get_data(language=language, split="train")
            for language in ("en", "de")
        ]
    ),
    vocab_size=8_000,
)

In [26]:
# Encode both training splits to token ids with the freshly trained tokenizer;
# BOS and EOS markers are requested for every sentence.
token_ids_en_train = data_preparation.encode(
    language="en",
    split="train",
    tokenizer_path=tokenizer_path,
    add_bos=True,
    add_eos=True,
)
token_ids_de_train = data_preparation.encode(
    language="de",
    split="train",
    tokenizer_path=tokenizer_path,
    add_bos=True,
    add_eos=True,
)

In [27]:
# Round-trip check: decode the first three encoded English sentences back
# into text.
data_preparation.decode(
    token_ids_en_train[:3],
    tokenizer_path=tokenizer_path,
)

['A new currency was introduced and pegged to the US dollar at a one-to-one exchange rate',
 "Today's market fundamentalist creed fails to recognize that financial markets are inherently unstable",
 'Finally, Koreans must relearn the entrepreneurialism that built the chaebol, the family-owned industrial conglomerates that powered the economy’s development']

# Task 1

In [28]:
# Point at the tokenizer model file stored under models_path, so this task
# can be run without executing the tokenizer-training cell in this session.
tokenizer_path = models_path / "tokenizer" / "en_de_8000.model"

## a)
Shuffle all tokens

In [29]:
# German test split as token ids, with no BOS/EOS markers requested; used as
# the BLEU references below.
token_ids_de_test = data_preparation.encode(
    language="de",
    split="test",
    tokenizer_path=tokenizer_path,
    add_bos=False,
    add_eos=False,
)

In [30]:
# Hypotheses for task a): every reference sentence with its tokens in random
# order. The deep copy keeps the in-place shuffle away from the reference
# token ids, which are scored against afterwards.
# A dedicated, seeded generator (seed 42, as used elsewhere in this notebook)
# makes the cell reproducible and idempotent on re-run instead of depending on
# the state of the global `random` module.
shuffle_rng = random.Random(42)
token_ids_de_test_shuffeled = copy.deepcopy(token_ids_de_test)
for row in token_ids_de_test_shuffeled:
    shuffle_rng.shuffle(row)

In [39]:
# Corpus BLEU of the shuffled hypotheses against the untouched references.
scoring.corpus_bleu_from_token_ids(
    references=token_ids_de_test,
    hypotheses=token_ids_de_test_shuffeled,
    tokenizer_path=tokenizer_path,
)

0.012694

## b)
Replace tokens randomly

In [46]:
# Encode the German test split again (same call as in part a)); these ids are
# the references for the replacement experiment.
token_ids_de_test = data_preparation.encode(
    language="de",
    split="test",
    tokenizer_path=tokenizer_path,
    add_bos=False,
    add_eos=False,
)

In [59]:
# Hypotheses for task b): a copy of every reference sentence in which each
# token is swapped for a uniformly drawn vocabulary id with probability
# REPLACE_PROB.
REPLACE_PROB = 0.01

sp = spm.SentencePieceProcessor()
sp.Load(str(tokenizer_path))
vocab_size = sp.vocab_size()  # loop-invariant, so look it up only once

# Seeded local generator: reproducible, idempotent on re-run, and independent
# of the state of the global `random` module.
replace_rng = random.Random(42)

# The comprehension builds brand-new lists, so the reference ids are never
# mutated and no deep copy is needed.
# NOTE(review): randint is inclusive on both ends and draws from the whole
# vocabulary, so a draw can return the original id or (presumably) a special
# token id — confirm that this is intended.
token_ids_de_test_replaced = [
    [
        (
            replace_rng.randint(0, vocab_size - 1)
            if replace_rng.random() < REPLACE_PROB
            else token_id
        )
        for token_id in row
    ]
    for row in token_ids_de_test
]

In [60]:
# Corpus BLEU of the randomly-replaced hypotheses against the references.
scoring.corpus_bleu_from_token_ids(
    references=token_ids_de_test,
    hypotheses=token_ids_de_test_replaced,
    tokenizer_path=tokenizer_path,
)

0.975197

# Task 2

In [12]:
# Tokenizer model file under models_path; the BLEU cells of this task pass it
# as tokenizer_path.
tokenizer_path = models_path / "tokenizer" / "en_de_8000.model"

Validation loss comparison of the models.\
Orange: RNN-Attention\
Blue: Unidirectional LSTM\
Green: Bidirectional LSTM\
<img src="images/comparison.png" alt="drawing" width="600"/>

## Unidirectional LSTM

Training took 7h 15min and was done by calling:\
```python ./training/train_lstm.py --model_name="encoder-decoder" --num_epochs=20 --batch_size=32 --lr=1e-4 --embedding_dimension=512 --hidden_size=700 --bidirectional=False --val_steps=1000 --early_stopping=3```

<img src="images/unidirectional_val_loss.png" alt="drawing" width="600"/>
<img src="images/unidirectional_train_batch_loss.png" alt="drawing" width="600"/>
<img src="images/unidirectional_train_avg_loss.png" alt="drawing" width="600"/>

Load the model

In [13]:
# Rebuild the architecture with the same hyperparameters as the training
# command above (embedding 512, hidden 700, unidirectional), then restore the
# trained weights.
lstm_model = lstm.LSTMEncoderDecoder(
    num_embeddings=8000,
    embedding_dimension=512,
    hidden_size=700,
    bidirectional=False,
)
# map_location="cpu" lets the checkpoint load on machines that lack the device
# it was saved from; load_state_dict then copies the values into the model's
# own parameters.
state_dict = torch.load(models_path / "encoder-decoder.pt", map_location="cpu")
lstm_model.load_state_dict(state_dict)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
lstm_model.eval()

LSTMEncoderDecoder(
  (embedding): Embedding(8000, 512)
  (encoder): LSTM(512, 700, batch_first=True)
  (decoder): LSTM(512, 700, batch_first=True)
  (final_linear): Linear(in_features=700, out_features=8000, bias=True)
)

Generate translations for two samples

In [14]:
# Argmax generation for two English sample sentences with the unidirectional
# LSTM; at most 30 tokens are generated per sentence.
sample_texts = [
    "A new currency was introduced and pegged to the US dollar at a one-to-one exchange rate",
    "Today's market fundamentalist creed fails to recognize that financial markets are inherently unstable",
]
generator = generation_strategies.Generator(model=lstm_model)
generator.generate_argmax_from_strings(
    texts=sample_texts,
    tokenizer_path=models_path / "tokenizer" / "en_de_8000.model",
    max_generation_length=30,
)

100%|██████████| 1/1 [00:00<00:00,  4.19it/s]


['2013 wird die Inflation in den USA und Großbritannien nicht mehr als 20 Prozent der Weltbevölkerung leben',
 'Der Preissteigerungen ist nicht nur ein Problem, sondern auch für die Zukunft der Welt']

Calculate BLEU score for the test set\
Predictions were generated using `python ./training/test_predictions.py --model_name="encoder-decoder" --batch_size=64`

In [15]:
# Load the test-split predictions of the unidirectional LSTM (generated with
# test_predictions.py, see the note above).
# NOTE: pickle.load can execute arbitrary code — only open prediction files
# that were produced locally or come from a trusted source.
predictions_file = datafiles_path / "predictions" / "encoder-decoder_predictions_test_split.pkl"
with open(predictions_file, "rb") as f:
    prediction_tokens = pickle.load(f)
# German reference token ids for the test split, no BOS/EOS markers requested.
token_ids_de_test = data_preparation.encode(
    language="de",
    split="test",
    tokenizer_path=tokenizer_path,
    add_bos=False,
    add_eos=False,
)

In [16]:
# Corpus BLEU of the unidirectional LSTM predictions on the test split.
scoring.corpus_bleu_from_token_ids(
    references=token_ids_de_test,
    hypotheses=prediction_tokens,
    tokenizer_path=tokenizer_path,
)

0.012927

## Bidirectional LSTM

Training took 9h 15min and was done by calling:\
```python ./training/train_lstm.py --model_name="encoder-decoder-bidirectional" --num_epochs=20 --batch_size=32 --lr=1e-4 --embedding_dimension=512 --hidden_size=700 --bidirectional=True --val_steps=1000 --early_stopping=3```

<img src="images/bidirectional_val_loss.png" alt="drawing" width="600"/>
<img src="images/bidirectional_train_batch_loss.png" alt="drawing" width="600"/>
<img src="images/bidirectional_train_avg_loss.png" alt="drawing" width="600"/>

Load the model

In [17]:
# Rebuild the architecture with the same hyperparameters as the training
# command above (embedding 512, hidden 700, bidirectional encoder), then
# restore the trained weights.
lstm_bidirectional_model = lstm.LSTMEncoderDecoder(
    num_embeddings=8000,
    embedding_dimension=512,
    hidden_size=700,
    bidirectional=True,
)
# map_location="cpu" lets the checkpoint load on machines that lack the device
# it was saved from; load_state_dict then copies the values into the model's
# own parameters.
state_dict = torch.load(
    models_path / "encoder-decoder-bidirectional.pt",
    map_location="cpu",
)
lstm_bidirectional_model.load_state_dict(state_dict)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
lstm_bidirectional_model.eval()

LSTMEncoderDecoder(
  (embedding): Embedding(8000, 512)
  (encoder): LSTM(512, 700, batch_first=True, bidirectional=True)
  (hidden_projection): Linear(in_features=1400, out_features=700, bias=True)
  (cell_projection): Linear(in_features=1400, out_features=700, bias=True)
  (decoder): LSTM(512, 700, batch_first=True)
  (final_linear): Linear(in_features=700, out_features=8000, bias=True)
)

Generate translations for two samples

In [18]:
# Argmax generation for two English sample sentences with the bidirectional
# LSTM; at most 30 tokens are generated per sentence.
sample_texts = [
    "A new currency was introduced and pegged to the US dollar at a one-to-one exchange rate",
    "Today's market fundamentalist creed fails to recognize that financial markets are inherently unstable",
]
generator = generation_strategies.Generator(model=lstm_bidirectional_model)
generator.generate_argmax_from_strings(
    texts=sample_texts,
    tokenizer_path=models_path / "tokenizer" / "en_de_8000.model",
    max_generation_length=30,
)

100%|██████████| 1/1 [00:00<00:00,  5.48it/s]


['technungsrophetanehrtenteils der Markennung von Dollars und einer Verzerrung von Grundlagen, die die',
 'Wieensammlungographiemandungseigenssenalen USA müssen']

Calculate BLEU score for the test set\
Predictions were generated using `python ./training/test_predictions.py --model_name="encoder-decoder-bidirectional" --batch_size=64`

In [19]:
# Load the test-split predictions of the bidirectional LSTM (generated with
# test_predictions.py, see the note above).
# NOTE: pickle.load can execute arbitrary code — only open prediction files
# that were produced locally or come from a trusted source.
predictions_file = (
    datafiles_path / "predictions" / "encoder-decoder-bidirectional_predictions_test_split.pkl"
)
with open(predictions_file, "rb") as f:
    prediction_tokens = pickle.load(f)
# German reference token ids for the test split, no BOS/EOS markers requested.
token_ids_de_test = data_preparation.encode(
    language="de",
    split="test",
    tokenizer_path=tokenizer_path,
    add_bos=False,
    add_eos=False,
)

In [20]:
# Corpus BLEU of the bidirectional LSTM predictions on the test split.
scoring.corpus_bleu_from_token_ids(
    references=token_ids_de_test,
    hypotheses=prediction_tokens,
    tokenizer_path=tokenizer_path,
)

0.026404

## RNNAttention
(Bahdanau et al., 2014; https://arxiv.org/abs/1409.0473)

Training was done by calling:\
```python ./training/train_rnn_attention.py --model_name="rnn-attention" --num_epochs=10 --batch_size=4 --lr=1e-6 --teacher_forching_prob=0.5 --embedding_dimension=512 --hidden_size=512 --attention_dim=256 --maxout_size=256 --val_steps=8000 --early_stopping=3```\
Training was stopped after 2d 16h 10min

<img src="images/attention_val_loss.png" alt="drawing" width="600"/>
<img src="images/attention_train_batch_loss.png" alt="drawing" width="600"/>
<img src="images/attention_train_avg_loss.png" alt="drawing" width="600"/>

Load the model

In [25]:
# Rebuild the attention model with the same hyperparameters as the training
# command above (embedding 512, hidden 512, attention 256, maxout 256), then
# restore the trained weights.
rnn_attention_encoder = rnn_attention.RNNAttentionEncoder(
    num_embeddings=8000,
    embedding_dimension=512,
    hidden_size=512,
)
rnn_attention_model = rnn_attention.RNNAttention(
    encoder=rnn_attention_encoder,
    decoder=rnn_attention.RNNAttentionDecoder(
        # The decoder is handed the encoder's embedding layer rather than
        # building its own.
        embedding_layer=rnn_attention_encoder.embedding,
        hidden_size=512,
        attention_dim=256,
        maxout_size=256,
    ),
    random_seed=42,
)
# map_location="cpu" lets the checkpoint load on machines that lack the device
# it was saved from; load_state_dict then copies the values into the model's
# own parameters.
state_dict = torch.load(models_path / "rnn-attention.pt", map_location="cpu")
rnn_attention_model.load_state_dict(state_dict)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
rnn_attention_model.eval()

RNNAttention(
  (encoder): RNNAttentionEncoder(
    (embedding): Embedding(8000, 512)
    (gru): GRU(512, 512, batch_first=True, bidirectional=True)
    (linear): Linear(in_features=512, out_features=512, bias=True)
  )
  (decoder): RNNAttentionDecoder(
    (attention): Alignment(
      (hidden_projection): Linear(in_features=512, out_features=256, bias=True)
      (outputs_projection): Linear(in_features=1024, out_features=256, bias=True)
      (energy_scale): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(8000, 512)
    (gru): GRU(1536, 512, batch_first=True)
    (linear_weighted_values): Linear(in_features=1024, out_features=512, bias=True)
    (linear_hidden): Linear(in_features=512, out_features=512, bias=True)
    (linear_embeddings): Linear(in_features=512, out_features=512, bias=True)
    (linear_prediction): Linear(in_features=256, out_features=8000, bias=True)
  )
)

Generate translations for two samples

In [26]:
# Argmax generation for two English sample sentences with the RNN-attention
# model; at most 30 tokens are generated per sentence.
sample_texts = [
    "A new currency was introduced and pegged to the US dollar at a one-to-one exchange rate",
    "Today's market fundamentalist creed fails to recognize that financial markets are inherently unstable",
]
generator = generation_strategies.Generator(model=rnn_attention_model)
generator.generate_argmax_from_strings(
    texts=sample_texts,
    tokenizer_path=models_path / "tokenizer" / "en_de_8000.model",
    max_generation_length=30,
)

100%|██████████| 1/1 [00:00<00:00, 12.26it/s]


['Eine gilt Problem Obwohl und und und und und und und und und und und und und und und und und und und und und und und und und und',
 'Die zweite der USA der USA der USA und der USA und der USA und der USA und der USA und der USA und der USA und der Welt –']

Calculate BLEU score for the test set\
Predictions were generated using `python ./training/test_predictions.py --model_name="rnn-attention" --batch_size=8`

In [27]:
# Load the test-split predictions of the RNN-attention model (generated with
# test_predictions.py, see the note above).
# NOTE: pickle.load can execute arbitrary code — only open prediction files
# that were produced locally or come from a trusted source.
predictions_file = datafiles_path / "predictions" / "rnn-attention_predictions_test_split.pkl"
with open(predictions_file, "rb") as f:
    prediction_tokens = pickle.load(f)
# German reference token ids for the test split, no BOS/EOS markers requested.
token_ids_de_test = data_preparation.encode(
    language="de",
    split="test",
    tokenizer_path=tokenizer_path,
    add_bos=False,
    add_eos=False,
)

In [28]:
# Corpus BLEU of the RNN-attention predictions on the test split.
scoring.corpus_bleu_from_token_ids(
    references=token_ids_de_test,
    hypotheses=prediction_tokens,
    tokenizer_path=tokenizer_path,
)

0.000771