In [62]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm

In [63]:
import evaluate

In [64]:
# Fix every random number generator used in this notebook so that reruns
# start from the same state.
seed = 1234

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

## Dataset
The dataset has ~30,000 parallel English and German sentences

In [65]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [66]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [67]:
train_data, valid_data, test_data = (
    dataset['train'],
    dataset['validation'],
    dataset['test']
)

In [68]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

## Tokenizers
We will download the pre-trained tokenizers for English and German (`en_core_web_sm` and `de_core_news_sm` respectively) from the `spacy` library with the following commands:
```bash
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```
and then load them in the code as follows:

In [69]:
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

In [70]:
test_string = "What a lovely day !!"

[token.text for token in en_nlp.tokenizer(test_string)]

['What', 'a', 'lovely', 'day', '!', '!']

Helper functions to apply the tokenizers on each example in the dataset

In [71]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentence of one dataset example.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp: object whose `.tokenizer(text)` yields tokens with a `.text`
            attribute; used for the English sentence.
        de_nlp: same as `en_nlp`, used for the German sentence.
        max_length: maximum number of tokens kept per sentence (the
            start/end markers are added afterwards and are not counted).
        lower: if truthy, every token is lower-cased.
        sos_token: marker prepended to each token list.
        eos_token: marker appended to each token list.

    Returns:
        A dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """

    def _prepare(nlp, text):
        # Tokenize first, then truncate, so max_length counts tokens, not characters.
        words = [tok.text for tok in nlp.tokenizer(text)]
        words = words[:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [72]:
# Tokenisation settings shared by all three splits.
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Add the "en_tokens" / "de_tokens" columns to every split.
train_data, valid_data, test_data = [
    split.map(tokenize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
]

In [73]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

## Vocabulary

1. We will build the vocabulary for the English and German languages using the `build_vocab_from_iterator` function, provided by `torchtext`.
2. The vocabulary is used to associate each unique token in our dataset with an index (an integer), e.g. "hello" = 1, "world" = 2, "bye" = 3, "hates" = 4, etc

We have various kinds of special tokens that we need to add to our vocabulary, such as:
- `<sos>`: Start of sentence token
- `<eos>`: End of sentence token
- `<unk>`: Unknown token for out-of-vocabulary words
- `<pad>`: Padding token to make all the sentences in the same batch have the same length

In [74]:
import torchtext.vocab


# Tokens seen fewer than `min_freq` times in the training split are left out
# of the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [unk_token, sos_token, eos_token, pad_token]

# Both vocabularies are built from the training split only, with the same
# special tokens passed in the same order.
en_vocab, de_vocab = [
    torchtext.vocab.build_vocab_from_iterator(
        train_data[column],
        min_freq=min_freq,
        specials=special_tokens,
    )
    for column in ('en_tokens', 'de_tokens')
]

In [75]:
en_vocab.get_itos()[:11] # itos = int to string

['<unk>', '<sos>', '<eos>', '<pad>', 'a', '.', 'in', 'the', 'on', 'man', 'is']

In [76]:
en_vocab.get_itos()[9]

'man'

In [77]:
de_vocab.get_itos()[:10]

['<unk>', '<sos>', '<eos>', '<pad>', '.', 'ein', 'einem', 'in', 'eine', ',']

In [78]:
en_vocab.get_stoi()["the"]

7

In [79]:
en_vocab["the"]

7

In [80]:
len(en_vocab), len(de_vocab)

(5893, 7853)

In [81]:
"the" in en_vocab

True

In [82]:
"The" in en_vocab

False

Check that both our vocabularies have the same index for the unknown and padding tokens, as this simplifies some code later on.

In [83]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

We use the `set_default_index` method so that any token missing from the vocabulary is mapped to the index of `<unk>` (0).

In [84]:
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [85]:
en_vocab['The']

0

In [86]:
en_vocab['the']

7

In [87]:
en_vocab.get_itos()[0]

'<unk>'

In [88]:
tokens = ["i", "love", "watching", "crime", "shows"]
print(en_vocab.lookup_indices(tokens))
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

[956, 2169, 173, 0, 821]


['i', 'love', 'watching', '<unk>', 'shows']

In [89]:
def numericalize_example(example, en_vocab, de_vocab):
    """Map the token lists of one example to vocabulary indices.

    Args:
        example: mapping holding the token lists under "en_tokens" and "de_tokens".
        en_vocab: object exposing `lookup_indices(tokens)` for English tokens.
        de_vocab: object exposing `lookup_indices(tokens)` for German tokens.

    Returns:
        A dict with the keys "en_ids" and "de_ids" holding the looked-up indices.
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [90]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs= fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs= fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs= fn_kwargs)

In [91]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [1, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 2],
 'de_ids': [1, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 2]}

Convert the ids from int to tensor

In [92]:
data_type = "torch"
format_columns = ['en_ids', 'de_ids']

# Apply the same formatting to every split; `output_all_columns=True` keeps
# the remaining columns in the returned examples.
train_data, valid_data, test_data = [
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, valid_data, test_data)
]

In [93]:
train_data[0]

{'en_ids': tensor([   1,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            2]),
 'de_ids': tensor([   1,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    2]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

# Data Loaders
- the `collate_fn` is a function that defines how to combine a batch of samples into a single batch tensor that can be fed into the model for training or inference.


In [94]:
def get_collate_fn(pad_index):
    """Build a collate function that pads a batch of examples to equal length.

    Args:
        pad_index: value used to fill the positions added by padding.

    Returns:
        A function taking a list of examples (each holding "en_ids" and
        "de_ids" tensors) and returning a dict with the same two keys, each
        mapped to the result of `pad_sequence` on that column (with the
        default `batch_first=False`, i.e. shape [longest sequence, batch size]).
    """

    def collate_fn(batch):
        padded = {}
        for column in ("en_ids", "de_ids"):
            sequences = [example[column] for example in batch]
            padded[column] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

The `DataLoader` internally handles the process of passing the batches to the `collate_fn` function

Shuffling of data makes training more stable and potentially improves the final performance of the model, however only needs to be done on the training set

In [95]:
import torch.utils.data.dataloader


def get_data_loader(dataset, batch_size, pad_index, shuffle = False):
    """Wrap a dataset in a `DataLoader` that pads each batch with `pad_index`.

    Args:
        dataset: dataset handed to the `DataLoader`.
        batch_size: number of examples per batch.
        pad_index: padding value forwarded to `get_collate_fn`.
        shuffle: whether the loader reshuffles the data (off by default).

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [96]:
batch_size = 32

# Only the training split is shuffled; validation and test keep their order.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader, test_data_loader = [
    get_data_loader(split, batch_size, pad_index)
    for split in (valid_data, test_data)
]

# Building the Model
We will build our model in three parts:
- **Encoder**
- **Decoder**
- **Seq2Seq**
--- 
## Encoder
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92/assets/seq2seq2.png)
- Two layer LSTM
- An LSTM has a hidden state $h$ and a cell state $c$; we can think of the cell state as another type of hidden state.
- $h_0$ and $c_0$ will be initialized to zeros
- context vector $z$ is the final hidden state and the final cell state of the LSTM $z = (h_T, c_T)$
- Only the hidden state from the first layer is passed as input to the second layer, and not the cell state.

So we will get:
$$({h_t}^1, {c_t}^1) = \text{EncoderLSTM}^1 (e(x_t), ({h_{t-1}}^1, {c_{t-1}}^1))$$
$$({h_t}^2, {c_t}^2) = \text{EncoderLSTM}^2 ({h_t}^1, ({h_{t-1}}^2, {c_{t-1}}^2))$$

In [97]:
src_test = [
    [1, 2, 0, 0, 0],
    [3, 4, 5, 0, 0],
    [1, 3, 4, 0, 0]
]
torch.Tensor(src_test).size()
# - src length is 5 (the maximum sequence length after padding)
# - batch size is 3 (the number of sequences in the batch)

torch.Size([3, 5])

The RNN returns:
- `outputs`: (seq_len, batch_size, hidden_dim) - the top-layer hidden state for each time-step
- `hidden`: (n_layers, batch_size, hidden_dim) - the final hidden state for each layer, $h_T$, stacked on top of each other
- `cell`: (n_layers, batch_size, hidden_dim) - the final cell state for each layer, $c_T$, stacked on top of each other

We will return `hidden` and `cell` only as we only need the final hidden and cell states (to make our **`context vector`**) for the decoder.

In [98]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a multi-layer LSTM.

    Args:
        input_dim: number of rows of the embedding table (source vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        droupout: dropout probability (parameter name kept as spelled, since
            it is part of the signature); used on the embeddings and passed
            to the LSTM as its `dropout` argument.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, droupout):
        super().__init__()
        # Read by Seq2Seq to check that encoder and decoder are compatible.
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=droupout,
        )
        self.dropout = nn.Dropout(p=droupout)

    def forward(self, src):
        """Encode `src` and return the LSTM's final `(hidden, cell)` states.

        The per-step outputs of the LSTM are discarded; only the final
        states are returned.
        """
        token_vectors = self.dropout(self.embedding(src))
        _, final_state = self.rnn(token_vectors)
        final_hidden, final_cell = final_state
        return final_hidden, final_cell

## Decoder

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92//assets/seq2seq3.png)

$({s_t}^1, {c_t}^1) = \text{DecoderLSTM}^1(d(y_t), ({s_{t-1}}^1, {c_{t-1}}^1))$  
$({s_t}^2, {c_t}^2) = \text{DecoderLSTM}^2({s_t}^1, ({s_{t-1}}^2, {c_{t-1}}^2))$

The initial hidden and cell states of our decoder are our context vectors, i.e. the final hidden and cell states of our encoder from the same layer:
$({s_0}^l, {c_0}^l) = z^l = ({h_T}^l, {c_T}^l)$

How to make a prediction of the next token in the sequence:
- We pass the hidden state from the top layer of the RNN through a linear layer, $f$, to make a prediction of what the next token in the sequence should be, $\hat{y}_{t+1} = f({s_t}^L)$
---
### Forward Pass
- Within the forward method, we accept a batch of `input` tokens, `previous hidden` states and `previous cell` states

In [113]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: target vocabulary size (embedding rows and number of
            output scores).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability used on the embeddings and passed to
            the LSTM as its `dropout` argument.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Read by Seq2Seq (output size and compatibility checks).
        self.output_dim = output_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: tensor of token ids for the current step; a leading
                dimension of size 1 is added before embedding.
            hidden: previous hidden state fed to the LSTM.
            cell: previous cell state fed to the LSTM.

        Returns:
            `(prediction, hidden, cell)`: the output of `fc_out` for this
            step and the updated LSTM states.
        """
        # The LSTM expects a sequence dimension, so wrap the step in length-1 sequence.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        # Drop the length-1 sequence dimension again before projecting.
        logits = self.fc_out(rnn_out.squeeze(0))
        return logits, hidden, cell

## Seq2Seq
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92//assets/seq2seq4.png)

In [114]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module returning `(hidden, cell)` for a source batch; must
            expose `hidden_dim` and `n_layers`.
        decoder: module called as `decoder(input, hidden, cell)` and
            returning `(prediction, hidden, cell)`; must expose `hidden_dim`,
            `n_layers` and `output_dim`.
        device: device on which the tensor of outputs is allocated.

    Raises:
        AssertionError: if encoder and decoder disagree on `hidden_dim` or
            `n_layers`.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode `trg` step by step, starting from the encoding of `src`.

        Args:
            src: source batch passed to the encoder.
            trg: target batch indexed as [trg length, batch size]; `trg[0]`
                is the first decoder input.
            teacher_forcing_ratio: probability, per time step, of feeding
                the ground-truth token instead of the model's own prediction
                as the next decoder input.

        Returns:
            Tensor of shape [trg length, batch size, decoder.output_dim].
            Row 0 is never written and stays all zeros.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the target device instead of creating the
        # tensor on the CPU and copying it over on every forward pass.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=self.device
        )

        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input is the first row of the target batch.
        input = trg[0, :]

        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # One draw per time step, shared by the whole batch.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs


# Training the model
### Model Initialization
- Input and output dimensions are the size of the vocabulary for the source and target languages
- The embedding dimensions and dropout for the encoder and decoder can be different
- The number of layers and the size of the hidden/cell states must be the same for both the encoder and the decoder

In [115]:
# German is the source language and English the target, so the encoder is
# sized by the German vocabulary and the decoder by the English one.
input_dim = len(de_vocab)
output_dim = len(en_vocab)

encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_dropout = 0.5
decoder_dropout = 0.5

# Shared by encoder and decoder (Seq2Seq asserts that they match).
hidden_dim = 512
n_layers = 2

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

encoder = Encoder(input_dim, encoder_embedding_dim, hidden_dim, n_layers, encoder_dropout)
decoder = Decoder(output_dim, decoder_embedding_dim, hidden_dim, n_layers, decoder_dropout)
model = Seq2Seq(encoder, decoder, device)

### Weight Initialization

In [116]:
def init_weights(m):
    """Initialise the parameters owned directly by `m` from U(-0.08, 0.08).

    Intended to be used as `model.apply(init_weights)`. `Module.apply`
    already calls this function on every sub-module, so only the parameters
    registered on `m` itself are touched here (`recurse=False`). Recursing
    again would re-initialise each parameter once per ancestor module.

    Args:
        m: the module currently visited by `Module.apply`.
    """
    for param in m.parameters(recurse=False):
        # `nn.init` functions run without gradient tracking, so the
        # parameter can be passed directly.
        nn.init.uniform_(param, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [117]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`.

    Parameters with `requires_grad` set to False are not counted.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,898,501 trainable parameters


### Optimizer

In [110]:
optimizer = optim.Adam(model.parameters())

### Loss Function
- The `CrossEntropyLoss` function calculates both the log softmax as well as the negative log-likelihood of our predictions.
- We ignore the loss whenever the target token is a padding token.
- The loss function, such as `nn.CrossEntropyLoss` in PyTorch, expects the following input shapes:
    - The output tensor should be a 2D tensor of shape (N, C), where N is the total number of elements in the flattened sequence, and C is the number of classes (in this case, the size of the output vocabulary).
    - The trg tensor should be a 1D tensor of shape (N,), where each element represents the true target class for each corresponding element in the flattened output tensor.



In [105]:
criterion = nn.CrossEntropyLoss(ignore_index= pad_index)

### Training Loop

At each iteration:
- Get the source and target sentences from the batch, $X$ and $Y$
- Zero the gradients calculated from the last batch
- Feed the source and target into the model to get the output $\hat{Y}$
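- As the loss function only works on 2D inputs with 1D targets, we need to flatten each of them with `.view`
    - We slice off the first time step of the output and target tensors: the decoder loop starts at $t = 1$, so `outputs[0]` is all zeros and the `<sos>` target at position 0 is never predicted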
- Calculate the gradients with `loss.backward()`
- Clip the gradients to prevent them from exploding
- Update the parameters of our model by doing an optimizer step
- Sum the loss value to a running total
- Finally, return the loss that is averaged over all batches

In [118]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: called as `model(src, trg, teacher_forcing_ratio)`; its output
            is indexed as [trg length, batch size, vocab size].
        data_loader: iterable of batches holding "de_ids" (source) and
            "en_ids" (target) tensors; must support `len()`.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking (N, vocab size) scores and (N,) targets.
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        teacher_forcing_ratio: forwarded unchanged to the model.
        device: device the model, criterion and batches are moved to.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    model, criterion = model.to(device), criterion.to(device)
    running_loss = 0

    for batch in data_loader:
        source = batch['de_ids'].to(device)
        target = batch['en_ids'].to(device)

        optimizer.zero_grad()
        logits = model(source, target, teacher_forcing_ratio)
        vocab_size = logits.shape[-1]

        # Skip position 0 of both tensors and merge the time and batch
        # dimensions so the criterion receives 2-D scores and 1-D targets.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(data_loader)


### Evaluation Loop

In [123]:
def evaluation_fn(model, data_loader, criterion, device):
    """Compute the average loss of `model` over `data_loader` without training.

    The model is put in eval mode, gradients are disabled, and the model is
    called with a teacher-forcing ratio of 0.

    Args:
        model: called as `model(src, trg, 0)`; its output is indexed as
            [trg length, batch size, vocab size].
        data_loader: iterable of batches holding "de_ids" (source) and
            "en_ids" (target) tensors; must support `len()`.
        criterion: loss taking (N, vocab size) scores and (N,) targets.
        device: device the model, criterion and batches are moved to.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    model, criterion = model.to(device), criterion.to(device)
    total_loss = 0

    with torch.no_grad():
        for batch in data_loader:
            source = batch['de_ids'].to(device)
            target = batch['en_ids'].to(device)

            logits = model(source, target, 0)
            vocab_size = logits.shape[-1]

            # Skip position 0 and flatten, as in the training loop.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            total_loss += criterion(flat_logits, flat_target).item()

    return total_loss / len(data_loader)

### Model Training

In [None]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

# Start at +infinity so the first epoch always counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train_fn(
        model, train_data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
    )
    valid_loss = evaluation_fn(model, valid_data_loader, criterion, device)

    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch number: {epoch}')
    print(f'\tTrain Loss: {train_loss: 7.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\tValid Loss: {valid_loss: 7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}')


### Evaluating the Model

In [125]:
# Restore the checkpoint saved by the training loop, then score the test split.
best_state = torch.load('tut1-model.pt', map_location= torch.device(device))
model.load_state_dict(best_state)

test_loss = evaluation_fn(model, test_data_loader, criterion, device)
print(f'| Test Loss: {test_loss: 7.3f} | Test PPL: {np.exp(test_loss):7.3f} |')

| Test Loss:   3.787 | Test PPL:  44.139 |
