## Preparaing the Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

import warnings
warnings.simplefilter('ignore')

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Create Tokenizers

In [3]:
# https://stackoverflow.com/questions/62822737/oserror-e050-cant-find-model-de-it-doesnt-seem-to-be-a-shortcut-link-a

import spacy.cli 
spacy.cli.download("en_core_web_sm")
spacy.cli.download("de_core_news_sm")

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)




[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting de_core_news_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz (14.9 MB)
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py): started
  Building wheel for de-core-news-sm (setup.py): finished with status 'done'
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.3.0-py3-none-any.whl size=14907580 sha256=2d72db2b648098d7e6be189b27b874b92ab8cea921f03cba3289632f08bba58a
  Stored in directory: /root/.cache/pip/wheels/75/30/c3/ea1c6002eede7f49c8ab017ce62a2981a87b1cd39fab6e6a65
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')




In [4]:
# Creating tokenizers
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [5]:
# Tokenizer functions for German and English

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens)
    and reverses it
    
    :param text: the german sentence
    """

    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)

    :param text: the english sentence
    """

    return [tok.text for tok in spacy_en.tokenizer(text)]


In [6]:
# Source is German and Target is English

SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)


In [7]:
# Download and load the train, valid and test data

train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'),
    fields = (SRC, TRG)
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 547kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 167kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 160kB/s]


In [8]:
# Verifying

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

print(vars(train_data.examples[0]))

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
# building vocabulary
# min_freq => min threshold to include the word in the vocab

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [10]:
# Unique tokens
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7854
Unique tokens in target (en) vocabulary: 5893


## Creating iterators

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)

## Building the Seq2Seq Model

### Encoder

In [13]:
class Encoder(nn.Module):

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        :param input_dim: It is the size/dimensionality of the one-hot vectors that will be input to the encoder. 
                        This is equal to the input (source) vocabulary size.
        :param emb_dim: It is the dimensionality of the embedding layer. 
                        This layer converts the one-hot vectors into dense vectors with emb_dim dimensions.
        :param hid_dim: It is the dimensionality of the hidden and cell states
        :param n_layers: It is the number of layers in the RNN.
        :param dropout: It is the amount of dropout to use. 
                        This is a regularization parameter to prevent overfitting.
        """

        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        # src.shape => (src_len, batch_size)

        # example, src will be like:

            # | I ,     Good,       How,    ... |
            # | have,   morning,    are,    ... |
            # | a,      <eos>,      you,    ... |
            # | dog,    <pad>,      <eos>,  ... |
            # | <eos>,  <pad>,      <pad>,  ... |

            # In this number of rows = src_len or max_len
            # and number of columns = batch_size

        embedded = self.dropout(self.embedding(src))
        # embedded.shape => (src_len, batch_size, emb_dim)

        outputs, (hidden, cell) = self.rnn(embedded) # by default, batch_first = False

        # n_direction = 1 in this case as we are not using bi-directional rnn
        # outputs.shape => (src_len, batch_size, hid_dim * n_direction)

        # hidden.shape => (n_layers * n_directions, batch_size, hid_dim)
        # cell.shape => (n_layers * n_directions, batch_size, hid_dim)

        # Refer Notes:
        # hidden, cell are forwarded to next time steps
        # In this case, they are the final hidden, cell for the 
        # final time step/ layer
        # n_directions = 2 if we use bi-directional LSTMs

        return hidden, cell

### Decoder

In [14]:
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        :param output_dim: It is the size/dimensionality of the one-hot vectors that will be input to the decoder. 
                        This is equal to the output (target) vocabulary size.
                        It also used to output the word in the Linear layer.
        :param emb_dim: It is the dimensionality of the embedding layer. 
                        This layer converts the one-hot vectors into dense vectors with emb_dim dimensions.
        :param hid_dim: It is the dimensionality of the hidden and cell states
        :param n_layers: It is the number of layers in the RNN.
        :param dropout: It is the amount of dropout to use. 
                        This is a regularization parameter to prevent overfitting.
        """

        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):

        # input.shape => (batch_size)
        # hidden.shape => (n_layers * n_directions, batch_size, hid_dim)
        # cell.shape => (n_layers * n_directions, batch_size, hid_dim)

        # n_directions in the decoder will both always be 1, therefore:
        # hidden.shape => (n_layers, batch_size, hid_dim)
        # cell.shape => (n_layers, batch_size, hid_dim)

        input = input.unsqueeze(0)
        # input.shape => (1, batch_size)
        # NOTE: the src_len for the input is going to be as we will give just
        # the 1st word. Also, we will get just the next word (prediction).
        # Hence, output seq_len will also be 1.

        embedded = self.dropout(self.embedding(input))
        # embedded.shape => (1, batch_size, emb_dim)

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell)) # by default, batch_first = False

        # output.shape => (seq_len, batch_size, hid_dim * n_directions)

        # hidden.shape => (n_layers * n_directions, batch_size, hid_dim)
        # cell.shape => (n_layers * n_directions, batch_size, hid_dim)

        # seq_len and n_directions will always be 1 in the decoder, therefore:

        # output.shape => (1, batch_size, hid_dim)
        # hidden.shape => (n_layers, batch_size, hid_dim)
        # cell.shape => (n_layers, batch_size, hid_dim)

        prediction = self.fc_out(output.squeeze(0))

        # prediction.shape => (batch_size, output_dim)

        return prediction, hidden, cell

### Seq2Seq model

In [15]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, device):

        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal numebr of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):

        """
        :param src: the source sentence sequence
                    src.shape => (src_len, batch_size)
        :param trg: the target sentence sequence
                    trg.shape => (trg_len, batch_size)
        :param teacher_forcing_ratio: the ratio, used for training. It tells 
                    by how much probability, we should use the decoded token or
                    the original token for training
                    e.g. if teacher_forcing_ratio is 0.75, 
                    we use ground-truth inputs 75% of the time.
        """

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden state of the encoder is used as the initial 
        # hidden state of the decoder
        hidden, cell = self.encoder(src)

        # first input to the decoder is the <sos> tokens
        input = trg[0, :]

        for t in range(1, trg_len):

            # insert input token embedding, previous hidden and previous
            # cell states, and
            # recieve output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)

            # place the predictions in a tensor holding predictions for each token
            outputs[t] = output

            # decide if we are going to use tracher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # else, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs # outputs.shape => (trg_len, batch_size, output_dim)
            

## Training the Seq2Seq Model

### Init the model

In [16]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

### Initializing weights of the model.
* In the paper they state they initialize all weights from a uniform distribution between -0.08 and +0.08, i.e. $\mathcal{U}(-0.08, 0.08)$.

In [17]:
def init_weights(m):
    """
        initializes the weights of the model.
    """
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7854, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [18]:
def count_parameters(model):
    """
        calculates the number of trainable parameters.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,898,757 trainable parameters


In [19]:
# defining the optimizer
optimizer = optim.Adam(model.parameters())

In [20]:
# loss function
# NOTE: Our loss function calculates the average loss per token, 
# however by passing the index of the <pad> token as the ignore_index argument 
# we ignore the loss whenever the target token is a padding token.

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

### Training Function

In [21]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):

        src = batch.src
        trg = batch.trg

        optimizer.zero_grad

        output = model(src, trg)

        # trg.shape => (trg_len, batch_size)
        # output.shape => (trg_len, batch_size, output_dim)

        output_dim = output.shape[-1]

        # ignoring the 1st token
        output = output[1:].view(-1, output_dim) # in this case, it's 0
        trg = trg[1:].view(-1) # in this case it's <sos>

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

### Evaluate Function

In [22]:
def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) # turn off teacher forcing

            # trg.shape => (trg_len, batch_size)
            # output.shape => (trg_len, batch_size, output_dim)

            output_dim = output.shape[-1]

            # ignoring the 1st token
            output = output[1:].view(-1, output_dim) # in this case, it's 0
            trg = trg[1:].view(-1) # in this case it's <sos>

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

### TRAINING

In [24]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)

    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 38s
	Train Loss: 5.179 | Train PPL: 177.458
	 Val. Loss: 5.049 |  Val. PPL: 155.855
Epoch: 02 | Time: 0m 38s
	Train Loss: 4.663 | Train PPL: 105.972
	 Val. Loss: 5.208 |  Val. PPL: 182.655
Epoch: 03 | Time: 0m 37s
	Train Loss: 4.511 | Train PPL:  90.976
	 Val. Loss: 5.270 |  Val. PPL: 194.484
Epoch: 04 | Time: 0m 38s
	Train Loss: 4.395 | Train PPL:  81.021
	 Val. Loss: 5.275 |  Val. PPL: 195.375
Epoch: 05 | Time: 0m 37s
	Train Loss: 4.289 | Train PPL:  72.859
	 Val. Loss: 5.076 |  Val. PPL: 160.159
Epoch: 06 | Time: 0m 38s
	Train Loss: 4.246 | Train PPL:  69.820
	 Val. Loss: 5.038 |  Val. PPL: 154.220
Epoch: 07 | Time: 0m 37s
	Train Loss: 4.204 | Train PPL:  66.972
	 Val. Loss: 5.094 |  Val. PPL: 162.979
Epoch: 08 | Time: 0m 37s
	Train Loss: 4.157 | Train PPL:  63.908
	 Val. Loss: 4.973 |  Val. PPL: 144.426
Epoch: 09 | Time: 0m 37s
	Train Loss: 4.076 | Train PPL:  58.897
	 Val. Loss: 5.064 |  Val. PPL: 158.215
Epoch: 10 | Time: 0m 37s
	Train Loss: 4.098 | Train PPL

## TESTING

In [25]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.935 | Test PPL: 139.122 |
