In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import evaluate

In [3]:
# Fix every random number generator used in this notebook so reruns are repeatable.
seed = 1234

random.seed(seed)  # Python's built-in RNG
np.random.seed(seed)  # NumPy's global RNG
torch.manual_seed(seed)  # PyTorch RNG
torch.cuda.manual_seed(seed)  # PyTorch RNG on the current CUDA device
torch.backends.cudnn.deterministic = True  # ask cuDNN to pick deterministic algorithms

## Dataset
The dataset has ~30,000 parallel English and German sentences

In [4]:
# Load the Multi30k English-German dataset from the Hugging Face Hub (or the local cache).
dataset = datasets.load_dataset("bentrevett/multi30k")

Using the latest cached version of the dataset since bentrevett/multi30k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Mohammed\.cache\huggingface\datasets\bentrevett___multi30k\default\0.0.0\4589883f3d09d4ef6361784e03f0ead219836469 (last modified on Mon Jun  3 12:31:00 2024).


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [6]:
# Give each split of the DatasetDict its own short name.
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

In [7]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

## Tokenizers
We will download the pre-trained tokenizers for English and German ("en_core_web_sm" and "de_core_news_sm" respectively) from the `spacy` library with the following commands:
```bash
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```
and then load them in the code as follows:

In [8]:
# Load the spaCy pipelines downloaded above; the cells below only use their `.tokenizer`.
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

In [9]:
# Quick sanity check of the English tokenizer on a hand-written sentence.
test_string = "What a lovely day !!"

# `.tokenizer(...)` yields token objects; `.text` is the token's string.
[token.text for token in en_nlp.tokenizer(test_string)]

['What', 'a', 'lovely', 'day', '!', '!']

Helper functions to apply the tokenizers on each example in the dataset

In [10]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize one English/German sentence pair.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp: object whose `.tokenizer(text)` yields tokens with a `.text` attribute,
            used for the English sentence.
        de_nlp: same as `en_nlp`, used for the German sentence.
        max_length: maximum number of tokens kept per sentence (the start/end
            markers are added afterwards and do not count towards it).
        lower: when truthy, every token is lower-cased.
        sos_token: marker placed at the start of each token list.
        eos_token: marker placed at the end of each token list.

    Returns:
        dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """

    def _prepare(text, nlp):
        # Tokenize, truncate, optionally lower-case, then wrap in the markers.
        words = [piece.text for piece in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(example["en"], en_nlp),
        "de_tokens": _prepare(example["de"], de_nlp),
    }

In [11]:
# Tokenization settings applied identically to all three splits.
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

# Extra keyword arguments that `Dataset.map` forwards to `tokenize_example`.
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Adds the `en_tokens` / `de_tokens` columns to every example of each split.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 1014/1014 [00:01<00:00, 934.72 examples/s] 
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1099.19 examples/s]


In [12]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

## Vocabulary

1. We will build the vocabulary for the English and German languages using the `build_vocab_from_iterator` function, provided by `torchtext`.
2. The vocabulary is used to associate each unique token in our dataset with an index (an integer), e.g. "hello" = 1, "world" = 2, "bye" = 3, "hates" = 4, etc

We have various kinds of special tokens that we need to add to our vocabulary, such as:
- `<sos>`: Start of sentence token
- `<eos>`: End of sentence token
- `<unk>`: Unknown token for out-of-vocabulary words
- `<pad>`: Padding token to make all the sentences in the same batch have the same length

In [13]:
import torchtext.vocab


# Tokens seen fewer than `min_freq` times in the training split are left out
# of the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# The specials are listed in the same order for both languages.
special_tokens = [unk_token, sos_token, eos_token, pad_token]

# Identical settings for the two vocabularies; only the token column differs.
vocab_kwargs = {"min_freq": min_freq, "specials": special_tokens}

# Vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"], **vocab_kwargs
)
de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"], **vocab_kwargs
)



In [14]:
en_vocab.get_itos()[:11] # itos = index to string: the token stored at each index

['<unk>', '<sos>', '<eos>', '<pad>', 'a', '.', 'in', 'the', 'on', 'man', 'is']

In [15]:
en_vocab.get_itos()[9]

'man'

In [16]:
de_vocab.get_itos()[:10]

['<unk>', '<sos>', '<eos>', '<pad>', '.', 'ein', 'einem', 'in', 'eine', ',']

In [17]:
en_vocab.get_stoi()["the"]

7

In [18]:
en_vocab["the"]

7

In [19]:
len(en_vocab), len(de_vocab)

(5893, 7853)

In [20]:
"the" in en_vocab

True

In [21]:
"The" in en_vocab

False

Check that both our vocabularies have the same index for the unknown and padding tokens, as this simplifies some code later on.

In [22]:
# Both vocabularies were built with the same `specials` list; fail loudly if the
# unknown/padding tokens do not share an index across the two languages.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

# Shared indices, valid for either vocabulary thanks to the checks above.
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

We use the `set_default_index` function so that any token missing from the vocabulary is mapped to the index of the unknown token (0).

In [23]:
# From here on, looking up a token that is not in a vocabulary returns `unk_index`.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [24]:
en_vocab['The']

0

In [25]:
en_vocab['the']

7

In [26]:
en_vocab.get_itos()[0]

'<unk>'

In [27]:
# Round trip: tokens -> indices -> tokens. A word that is not in the vocabulary
# maps to the unknown index and therefore comes back as the unknown token.
tokens = ["i", "love", "watching", "crime", "shows"]
print(en_vocab.lookup_indices(tokens))
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

[956, 2169, 173, 0, 821]


['i', 'love', 'watching', '<unk>', 'shows']

In [28]:
def numericalize_example(example, en_vocab, de_vocab):
    """Map the token lists of one example to vocabulary indices.

    Args:
        example: mapping holding token lists under "en_tokens" and "de_tokens".
        en_vocab: object whose `lookup_indices(tokens)` returns the English indices.
        de_vocab: object whose `lookup_indices(tokens)` returns the German indices.

    Returns:
        dict with the keys "en_ids" and "de_ids" holding the looked-up indices.
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [29]:
# Keyword arguments that `Dataset.map` forwards to `numericalize_example`.
fn_kwargs = dict(en_vocab=en_vocab, de_vocab=de_vocab)

# Adds the `en_ids` / `de_ids` columns to every example of each split.
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 1014/1014 [00:01<00:00, 867.45 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1823.91 examples/s]


In [30]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [1, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 2],
 'de_ids': [1, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 2]}

Convert the ids from int to tensor

In [31]:
data_type = "torch"
format_columns = ['en_ids', 'de_ids']

# Return the id columns as torch tensors while still exposing the other columns.
# All three splits go through the same `collate_fn`, which hands the id columns
# to `pad_sequence`; that function needs tensors, so the validation and test
# splits must be formatted too, not only the training split.
train_data = train_data.with_format(
    type= data_type,
    columns= format_columns,
    output_all_columns= True
)

valid_data = valid_data.with_format(
    type= data_type,
    columns= format_columns,
    output_all_columns= True
)

test_data = test_data.with_format(
    type= data_type,
    columns= format_columns,
    output_all_columns= True
)

In [32]:
train_data[0]

{'en_ids': tensor([   1,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            2]),
 'de_ids': tensor([   1,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    2]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

# Data Loaders
- the `collate_fn` is a function that defines how to combine a batch of samples into a single batch tensor that can be fed into the model for training or inference.


In [33]:
def get_collate_fn(pad_index):
    """Build the collate function used by the data loaders.

    Args:
        pad_index: value used to fill the shorter sequences of a batch.

    Returns:
        A function that takes a list of examples (each holding 1-D tensors under
        "en_ids" and "de_ids") and returns a dict with the same two keys, where
        each value is the padded batch produced by `pad_sequence`. With
        `pad_sequence`'s default layout this is (longest sequence, batch size).
    """

    def collate_fn(batch):
        padded = {}
        for column in ("en_ids", "de_ids"):
            sequences = [example[column] for example in batch]
            padded[column] = nn.utils.rnn.pad_sequence(
                sequences, padding_value=pad_index
            )
        return padded

    return collate_fn

The `DataLoader` internally handles the process of passing the batches to the `collate_fn` function

Shuffling of data makes training more stable and potentially improves the final performance of the model, however only needs to be done on the training set

In [34]:
import torch.utils.data.dataloader


def get_data_loader(dataset, batch_size, pad_index, shuffle = False):
    """Wrap a dataset in a `DataLoader` that pads every batch.

    Args:
        dataset: dataset whose examples provide "en_ids" and "de_ids".
        batch_size: number of examples per batch.
        pad_index: padding value forwarded to `get_collate_fn`.
        shuffle: whether the loader reshuffles the data; off by default.

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [35]:
batch_size = 32

# Only the training loader shuffles; validation and test keep their original order.
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

# Building The Model
We will build our model in three parts:
- **Encoder**
- **Decoder**
- **Seq2Seq**
--- 
## Encoder
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92/assets/seq2seq2.png)
- Two layer LSTM
- LSTM have a hidden state $h$ AND a cell state $c$. we can think of the cell state as another type of hidden state.
- $h_0$ and $c_0$ will be initialized to zeros
- context vector $z$ is the final hidden state and the final cell state of the LSTM $z = (h_T, c_T)$
- Only the hidden state from the first layer is passed as input to the second layer, and not the cell state.

So we will get:
$$({h_t}^1, {c_t}^1) = \text{EncoderLSTM}^1 (e(x_t), ({h_{t-1}}^1, {c_{t-1}}^1))$$
$$({h_t}^2, {c_t}^2) = \text{EncoderLSTM}^2 ({h_t}^1, ({h_{t-1}}^2, {c_{t-1}}^2))$$

In [36]:
# Toy batch of index sequences; the trailing 0s presumably stand in for padding
# in this example only (the notebook's real padding value is `pad_index`).
src_test = [
    [1, 2, 0, 0, 0],
    [3, 4, 5, 0, 0],
    [1, 3, 4, 0, 0]
]
torch.Tensor(src_test).size()
# Reading each row as one sequence:
# - src length is 5 (the maximum sequence length after padding)
# - batch size is 3 (the number of sequences in the batch)
# NOTE(review): this toy tensor is laid out (batch size, src length), whereas
# `Seq2Seq.forward` reads `trg.shape[0]` as the length and `trg.shape[1]` as the
# batch size - i.e. the opposite order. Confirm which layout this cell is meant to show.

torch.Size([3, 5])

The RNN returns:
- `outputs`: (seq_len, batch_size, hidden_dim) - the top-layer hidden state for each time-step
- `hidden`: (n_layers, batch_size, hidden_dim) - the final hidden state for each layer, $h_T$, stacked on top of each other
- `cell`: (n_layers, batch_size, hidden_dim) - the final cell state for each layer, $c_T$, stacked on top of each other

We will return `hidden` and `cell` only as we only need the final hidden and cell states (to make our **`context vector`**) for the decoder.

In [37]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that summarizes a source batch into its final states.

    Args:
        input_dim: number of rows of the embedding table (source vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        droupout: dropout probability, applied to the embeddings and between
            LSTM layers. The parameter keeps its original spelling so that
            existing keyword callers continue to work.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, droupout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(
            num_embeddings=input_dim, embedding_dim=embedding_dim
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=droupout,
        )
        self.dropout = nn.Dropout(p=droupout)

    def forward(self, src):
        """Encode `src` and return the final `(hidden, cell)` states of the LSTM.

        The per-step outputs of the LSTM are discarded; only the final states
        (one slice per layer) are returned.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

## Decoder

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92//assets/seq2seq3.png)

$({s_t}^1, {c_t}^1) = \text{DecoderLSTM}^1(d(y_t), ({s_{t-1}}^1, {c_{t-1}}^1))$  
$({s_t}^2, {c_t}^2) = \text{DecoderLSTM}^2({s_t}^1, ({s_{t-1}}^2, {c_{t-1}}^2))$

The initial hidden and cell states of our decoder are our context vectors, i.e. the final hidden and cell states of our encoder from the same layer:
$({s_0}^l, {c_0}^l) = z^l = ({h_T}^l, {c_T}^l)$

How to make a prediction of the next token in the sequence:
- We pass the hidden state from the top layer of the RNN through a linear layer, $f$, to make a prediction of what the next token in the sequence should be, $\hat{y}_{t+1} = f({s_t}^L)$
---
### Forward Pass
- Within the forward method, we accept a batch of `input` tokens, `previous hidden` states and `previous cell` states

In [38]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    Args:
        output_dim: size of the target vocabulary; used both as the number of
            rows of the embedding table and as the size of each prediction.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # Token indices -> dense vectors. This must be an embedding table, not
        # an LSTM: `forward` feeds it integer token indices.
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # The recurrent core that `forward` refers to as `self.rnn`.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout= dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: 1-D tensor of token indices, one per sequence in the batch.
            hidden: previous hidden states, (n_layers, batch size, hidden_dim).
            cell: previous cell states, (n_layers, batch size, hidden_dim).

        Returns:
            Tuple `(prediction, hidden, cell)`, where `prediction` has shape
            (batch size, output_dim) and `hidden` / `cell` are the updated states.
        """
        # The LSTM expects a leading sequence dimension; we decode one step at
        # a time, so that dimension has length 1.
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Drop the length-1 sequence dimension again so the prediction is
        # (batch size, output_dim), which is what `Seq2Seq.forward` stores
        # per time-step and takes `argmax(1)` over.
        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden, cell

## Seq2Seq
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/b3cd54c72cd6e4e63f672d334c795b4fe744ef92//assets/seq2seq4.png)

In [39]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together into one translation model.

    Args:
        encoder: module called as `encoder(src)` that returns `(hidden, cell)`.
        decoder: module called as `decoder(input, hidden, cell)` that returns
            `(output, hidden, cell)` and exposes an `output_dim` attribute.
        device: device on which the tensor of collected outputs is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forching_ratio):
        """Decode `trg` step by step, starting from the encoder's final states.

        Args:
            src: source batch passed to the encoder unchanged.
            trg: target batch; dimension 0 is the time-step, dimension 1 the batch.
            teacher_forching_ratio: probability, per time-step, of feeding the
                ground-truth token to the decoder instead of its own best guess.

        Returns:
            Tensor of shape (target length, batch size, decoder.output_dim).
            Row 0 is never written and therefore stays all zeros.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # One slot per target position; filled from position 1 onwards.
        outputs = torch.zeros(trg_length, batch_size, vocab_size).to(self.device)

        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)

        # The first decoder input is the first row of the target batch.
        decoder_input = trg[0, :]

        for step in range(1, trg_length):
            step_output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[step] = step_output

            # One random draw per step decides between ground truth and prediction.
            use_ground_truth = random.random() < teacher_forching_ratio
            best_guess = step_output.argmax(1)

            if use_ground_truth:
                decoder_input = trg[step]
            else:
                decoder_input = best_guess

        return outputs




# Training the model