In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single seed shared by every random number generator used in this notebook.
seed = 1234

# Seed Python's `random`, NumPy and PyTorch (CPU and current CUDA device) in turn.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

## Dataset
The dataset has ~30,000 parallel English and German sentences.

In [3]:
# Load the "bentrevett/multi30k" dataset through the `datasets` library.
dataset = datasets.load_dataset("bentrevett/multi30k")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [5]:
# Pull the three splits out of the DatasetDict, one name per split.
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

In [68]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

## Tokenizers
We download the pre-trained `spacy` tokenizers for English and German (`en_core_web_sm` and `de_core_news_sm`, respectively) with the following commands:
```bash
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```
and then load them in the code as follows:

In [6]:
# Load the spaCy pipelines; only their `.tokenizer` is used further down.
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

In [7]:
test_string = "What a lovely day !!"

[token.text for token in en_nlp.tokenizer(test_string)]

['What', 'a', 'lovely', 'day', '!', '!']

Helper functions to apply the tokenizers on each example in the dataset

In [8]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentence of one example.

    Args:
        example: Mapping holding the raw sentences under the "en" and "de" keys.
        en_nlp: Object whose ``tokenizer`` is called on the English sentence.
        de_nlp: Object whose ``tokenizer`` is called on the German sentence.
        max_length: Maximum number of tokens kept per sentence; the limit is
            applied before the start/end markers are added.
        lower: When truthy, every token is lowercased.
        sos_token: Marker placed in front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        A dict with the token lists under "en_tokens" and "de_tokens".
    """

    def _prepare(nlp, text):
        # Tokenize, truncate, optionally lowercase, then wrap in the markers.
        pieces = [tok.text for tok in nlp.tokenizer(text)][:max_length]
        if lower:
            pieces = [piece.lower() for piece in pieces]
        return [sos_token, *pieces, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [9]:
# Tokenization settings shared by all three splits.
max_length = 1_000  # upper bound on tokens kept per sentence
lower = True  # lowercase every token
sos_token = "<sos>"
eos_token = "<eos>"

# Extra keyword arguments forwarded to `tokenize_example` by `Dataset.map`.
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token
}

# Adds the "en_tokens" and "de_tokens" columns to every split.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

## Vocabulary

1. We will build the vocabulary for the English and German languages using the `build_vocab_from_iterator` function, provided by `torchtext`.
2. The vocabulary is used to associate each unique token in our dataset with an index (an integer), e.g. "hello" = 1, "world" = 2, "bye" = 3, "hates" = 4, etc

We have various kinds of special tokens that we need to add to our vocabulary, such as:
- `<sos>`: Start of sentence token
- `<eos>`: End of sentence token
- `<unk>`: Unknown token for out-of-vocabulary words
- `<pad>`: Padding token to make all the sentences in the same batch have the same length

In [12]:
import torchtext.vocab


# Tokens seen fewer than `min_freq` times in the training split are left out
# of the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# Special tokens passed to both vocabularies, in this order.
special_tokens = [
    unk_token,
    sos_token,
    eos_token,
    pad_token
]

# Both vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data['en_tokens'],
    min_freq= min_freq,
    specials= special_tokens
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data['de_tokens'],
    min_freq= min_freq,
    specials= special_tokens
)

Check that both vocabularies have the same index for the unknown and padding tokens, as this simplifies some code later on.

In [13]:
# Both vocabularies must agree on the <unk> and <pad> indices, because a
# single `unk_index` / `pad_index` is used for both languages below.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

Use the `set_default_index` method so that any token missing from the vocabulary is mapped to the index of `<unk>` (0, since `<unk>` is listed first among the special tokens).

In [14]:
# Make lookups of out-of-vocabulary tokens return the <unk> index.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [15]:
def numericalize_example(example, en_vocab, de_vocab):
    """Map the token lists of one example to vocabulary indices.

    Args:
        example: Mapping with token lists under "en_tokens" and "de_tokens".
        en_vocab: Vocabulary whose ``lookup_indices`` converts English tokens.
        de_vocab: Vocabulary whose ``lookup_indices`` converts German tokens.

    Returns:
        A dict with the index lists under "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example['en_tokens']),
        "de_ids": de_vocab.lookup_indices(example['de_tokens']),
    }

In [16]:
# Extra keyword arguments forwarded to `numericalize_example` by `Dataset.map`.
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

# Adds the "en_ids" and "de_ids" columns to every split.
train_data = train_data.map(numericalize_example, fn_kwargs= fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs= fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs= fn_kwargs)

Convert the ids from int to tensor

In [17]:
data_type = "torch"
format_columns = ['en_ids', 'de_ids']

# The same formatting options apply to every split: the id columns are
# returned as torch tensors while all other columns stay available.
format_kwargs = {
    "type": data_type,
    "columns": format_columns,
    "output_all_columns": True,
}

train_data = train_data.with_format(**format_kwargs)
valid_data = valid_data.with_format(**format_kwargs)
test_data = test_data.with_format(**format_kwargs)

## Data Loaders
- The `collate_fn` is a function that defines how to combine a list of samples into a single batch tensor that can be fed into the model for training or inference.


In [18]:
def get_collate_fn(pad_index):
    """Create a collate function that pads every batch with ``pad_index``.

    Args:
        pad_index: Value used to fill the shorter sequences of a batch.

    Returns:
        A function taking a list of examples (each with "en_ids" and "de_ids"
        tensors) and returning a dict with the same two keys, where each value
        is the padded stack of that column. ``pad_sequence`` is used with its
        default layout, so the sequence dimension comes first.
    """

    def collate_fn(batch):
        def _pad_column(key):
            sequences = [example[key] for example in batch]
            return nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)

        return {
            "en_ids": _pad_column("en_ids"),
            "de_ids": _pad_column("de_ids"),
        }

    return collate_fn

The `DataLoader` internally handles the process of passing the batches to the `collate_fn` function

Shuffling of data makes training more stable and potentially improves the final performance of the model, however only needs to be done on the training set

In [19]:
import torch.utils.data.dataloader


def get_data_loader(dataset, batch_size, pad_index, shuffle = False):
    """Wrap ``dataset`` in a ``DataLoader`` that pads each batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to ``get_collate_fn``.
        shuffle: Whether the loader shuffles the examples (default ``False``).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [21]:
batch_size = 128

# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index) 
test_data_loader = get_data_loader(test_data, batch_size, pad_index) 

- If `encoder_hidden_dim` is the size of the hidden state in one direction, the concatenated hidden state from both directions has a size of `encoder_hidden_dim * 2`.
- `hidden[-2, :, :]` is the final hidden state of the forward RNN.
- `hidden[-1, :, :]` is the final hidden state of the backward RNN.


In [22]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the two final directional hidden states into one vector of size
    ``decoder_hidden_dim``.
    """

    def __init__(self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout):
        """
        Args:
            input_dim: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            encoder_hidden_dim: Hidden size of the GRU, per direction.
            decoder_hidden_dim: Output size of the projection layer.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` (token ids, sequence dimension first).

        Returns:
            ``(outputs, hidden)``: the GRU outputs for every position, and the
            tanh-activated projection of the concatenated final hidden states.
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, final_states = self.rnn(embedded)

        # The last two entries are the final forward and backward states.
        forward_last = final_states[-2, :, :]
        backward_last = final_states[-1, :, :]
        both_directions = torch.cat((forward_last, backward_last), dim=1)

        hidden = torch.tanh(self.fc(both_directions))
        return outputs, hidden

**Additive Attention:** $e_i = v^T \tanh(W_a [h_i; s_j])$
- $v$ = self.v_fc
- $W_a$ = self.attn_fc
- $h_i$ is the i-th encoder hidden state
- $s_j$ is the j-th decoder hidden state (repeated `src_length` times)
- $[h_i; s_j]$ is the concatenation of the encoder hidden state and the decoder hidden state

In [23]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores every source position against the current decoder hidden state and
    returns the scores normalised with a softmax over the source positions.
    """

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        """
        Args:
            encoder_hidden_dim: Per-direction hidden size of the encoder; the
                encoder outputs are expected to have twice this many features.
            decoder_hidden_dim: Size of the decoder hidden state.
        """
        super().__init__()
        self.attn_fc = nn.Linear(encoder_hidden_dim * 2 + decoder_hidden_dim, decoder_hidden_dim)
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights.

        Args:
            hidden: Decoder hidden state, batch dimension first.
            encoder_outputs: Encoder outputs, sequence dimension first.

        Returns:
            Tensor with one row per batch element and one column per source
            position; every row sums to 1.
        """
        src_length = encoder_outputs.shape[0]

        # Tile the decoder state so there is one copy per source position.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        # Bring the encoder outputs to batch-first layout to match.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        features = torch.cat((tiled_hidden, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn_fc(features))

        # Drop the trailing singleton dimension left by the scoring layer.
        scores = self.v_fc(energy).squeeze(2)

        return torch.softmax(scores, dim=1)


In [24]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention.

    One call to ``forward`` consumes one target token per batch element and
    produces the scores for the next token, the updated hidden state and the
    attention weights used for this step.
    """

    def __init__(self, output_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout, attention):
        """
        Args:
            output_dim: Size of the target vocabulary (number of output scores).
            embedding_dim: Size of each target embedding vector.
            encoder_hidden_dim: Per-direction hidden size of the encoder; the
                encoder outputs have twice this many features.
            decoder_hidden_dim: Hidden size of the decoder GRU.
            dropout: Dropout probability applied to the embeddings.
            attention: Module called as ``attention(hidden, encoder_outputs)``
                that returns one weight per source position.
        """
        super().__init__()
        self.output_dim = output_dim
        # The attribute keeps its original (misspelled) name on purpose:
        # renaming it would change the state_dict keys ("atteention.*") and
        # break loading of checkpoints saved with this class.
        self.atteention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Current target token ids, one per batch element.
            hidden: Previous decoder hidden state, batch dimension first.
            encoder_outputs: Encoder outputs, sequence dimension first.

        Returns:
            ``(prediction, hidden, attention)``: the scores over the target
            vocabulary, the new hidden state, and the attention weights over
            the source positions (all batch dimension first).
        """
        # Add a sequence dimension of length 1 in front.
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        # Attention weights, with a singleton dimension inserted at index 1 so
        # they can be batch-multiplied with the encoder outputs.
        a = self.atteention(hidden, encoder_outputs)
        a = a.unsqueeze(1)

        # Weighted sum of the encoder outputs (the context vector), computed
        # batch-first and then moved back to sequence-first layout.
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # With a single step through a single-layer, unidirectional GRU the
        # output and the final hidden state hold the same values.
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        # Use the context vector here (not the embedding) so the concatenated
        # features match the input size of `fc_out`.
        weighted = weighted.squeeze(0)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))

        return prediction, hidden.squeeze(0), a.squeeze(1)

In [25]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: Module called as ``encoder(src)`` returning
                ``(encoder_outputs, hidden)``.
            decoder: Module called as ``decoder(input, hidden, encoder_outputs)``
                returning ``(scores, hidden, attention)``; it must expose
                ``output_dim``.
            device: Device on which the output tensor is allocated.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce decoder scores for every target position after the first.

        Args:
            src: Source token ids, sequence dimension first.
            trg: Target token ids, sequence dimension first.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor indexed as [target position, batch element, vocabulary
            entry]; position 0 is never written and stays all zeros.
        """
        trg_length = trg.shape[0]
        batch_size = trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_length, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the first row of the target.
        decoder_input = trg[0, :]

        for step in range(1, trg_length):
            scores, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[step] = scores

            use_ground_truth = random.random() < teacher_forcing_ratio
            best_guess = scores.argmax(1)

            if use_ground_truth:
                decoder_input = trg[step]
            else:
                decoder_input = best_guess

        return outputs


In [27]:
# The encoder reads German ids and the decoder emits English ids, so the
# vocabulary sizes are taken from `de_vocab` and `en_vocab` respectively.
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_hidden_dim = 512
decoder_hidden_dim = 512
encoder_dropout = 0.5
decoder_dropout = 0.5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout, 
    attention
)

# Assemble the full model and move its parameters to `device`.
model = Seq2Seq(
    encoder,
    decoder,
    device
).to(device)

### Weight Initialization

In [28]:
def init_weights(m):
    """Re-initialise the parameters of ``m`` in place.

    Parameters whose name contains "weight" are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; every other
    parameter is set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (atteention): Attention(
      (attn_fc): Linear(in_features=1536, out_features=512, bias=True)
      (v_fc): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [29]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,518,405 trainable parameters


### Optimizer

In [30]:
# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

### Loss Function
- The `CrossEntropyLoss` function calculates both the log softmax as well as the negative log-likelihood of our predictions.
- We ignore the loss whenever the target token is a padding token.
- The loss function, such as `nn.CrossEntropyLoss` in PyTorch, expects the following input shapes:
    - The output tensor should be a 2D tensor of shape (N, C), where N is the total number of elements in the flattened sequence, and C is the number of classes (in this case, the size of the output vocabulary).
    - The trg tensor should be a 1D tensor of shape (N,), where each element represents the true target class for each corresponding element in the flattened output tensor.



In [31]:
# Target positions equal to `pad_index` do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index= pad_index)

### Training Loop

At each iteration:
- Get the source and target sentences from the batch, $X$ and $Y$
- Zero the gradients calculated from the last batch
- Feed the source and target into the model to get the output $\hat{Y}$
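- As the loss function only works on 2d inputs with 1d targets, we need to flatten each of them with `.view`
    - We slice off the first time step of the output and target tensors: the decoding loop starts at position 1, so `outputs[0]` is never filled in, and the first target token is `<sos>`, which the model is never asked to predict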
- Calculate the gradients with `loss.backward()`
- Clip the gradients to prevent them from exploding
- Update the parameters of our model by doing an optimizer step
- Sum the loss value to a running total
- Finally, return the loss that is averaged over all batches

In [118]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches with "de_ids" (source) and "en_ids"
            (target) tensors; must support ``len``.
        optimizer: Optimizer updating the model parameters.
        criterion: Loss taking flattened scores and flattened targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: Forwarded unchanged to the model.
        device: Device the model, loss and batches are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    model = model.to(device)
    criterion = criterion.to(device)

    running_loss = 0
    for batch in data_loader:
        src = batch['de_ids'].to(device)
        trg = batch['en_ids'].to(device)

        optimizer.zero_grad()

        # Scores indexed as [target position, batch element, vocabulary entry].
        output = model(src, trg, teacher_forcing_ratio)
        vocab_size = output.shape[-1]

        # Drop position 0 and flatten: one row of scores per target token.
        flat_scores = output[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(data_loader)


### Evaluation Loop

In [123]:
def evaluation_fn(model, data_loader, criterion, device):
    """Compute the mean loss of ``model`` over ``data_loader`` without training.

    Args:
        model: Module called as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches with "de_ids" (source) and "en_ids"
            (target) tensors; must support ``len``.
        criterion: Loss taking flattened scores and flattened targets.
        device: Device the model, loss and batches are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    model = model.to(device)
    criterion = criterion.to(device)

    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            src = batch['de_ids'].to(device)
            trg = batch['en_ids'].to(device)

            # Teacher forcing is switched off: the model feeds on its own output.
            output = model(src, trg, 0)
            vocab_size = output.shape[-1]

            # Drop position 0 and flatten: one row of scores per target token.
            flat_scores = output[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            total_loss += criterion(flat_scores, flat_targets).item()

    return total_loss / len(data_loader)

### Model Training

In [None]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; starting at +inf means the first epoch
# with a finite validation loss is always checkpointed.
best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train_fn(
        model, train_data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
    )
    valid_loss = evaluation_fn(model, valid_data_loader, criterion, device)

    # Save the weights only when the validation loss improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch number: {epoch}')
    print(f'\tTrain Loss: {train_loss: 7.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\tValid Loss: {valid_loss: 7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}')


### Evaluating the Model

In [None]:
# Restore the checkpoint written by the training loop (the epoch with the
# lowest validation loss), mapping its tensors onto the current device.
model.load_state_dict(torch.load('tut1-model.pt', map_location= torch.device(device)))
test_loss = evaluation_fn(model, test_data_loader, criterion, device)
print(f'| Test Loss: {test_loss: 7.3f} | Test PPL: {np.exp(test_loss):7.3f} |')

In [131]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one German sentence into English tokens.

    Args:
        sentence: Either a string (tokenized with ``de_nlp.tokenizer``) or an
            iterable of already-split tokens.
        model: Model exposing ``encoder`` and ``decoder`` sub-modules.
        en_nlp: Not used by this function.
        de_nlp: Object whose ``tokenizer`` splits a German string.
        en_vocab: Target vocabulary (``lookup_indices``, ``lookup_tokens`` and
            item access are used).
        de_vocab: Source vocabulary (``lookup_indices`` is used).
        lower: When truthy, source tokens are lowercased.
        sos_token: Start marker added to the source and used as first decoder input.
        eos_token: End marker added to the source; decoding stops when it is predicted.
        device: Device the input tensors are moved to.
        max_output_length: Maximum number of decoding steps.

    Returns:
        ``(en_tokens, de_tokens, attentions)``: the generated tokens (starting
        with ``sos_token``), the source tokens including both markers, and the
        attention weights of the decoding steps that were run.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            de_tokens = [tok.text for tok in de_nlp.tokenizer(sentence)]
        else:
            de_tokens = list(sentence)
        if lower:
            de_tokens = [tok.lower() for tok in de_tokens]
        de_tokens = [sos_token, *de_tokens, eos_token]

        # Source ids as a column: one sequence, batch size 1.
        src_ids = de_vocab.lookup_indices(de_tokens)
        src_tensor = torch.LongTensor(src_ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden = model.encoder(src_tensor)

        generated_ids = en_vocab.lookup_indices([sos_token])
        attentions = torch.zeros(max_output_length, 1, len(src_ids))

        for step in range(max_output_length):
            # Feed the most recently generated token back into the decoder.
            last_token = torch.LongTensor([generated_ids[-1]]).to(device)
            scores, hidden, step_attention = model.decoder(
                last_token, hidden, encoder_outputs
            )
            attentions[step] = step_attention

            next_id = scores.argmax(-1).item()
            generated_ids.append(next_id)
            if next_id == en_vocab[eos_token]:
                break

        en_tokens = en_vocab.lookup_tokens(generated_ids)
    return en_tokens, de_tokens, attentions[: len(en_tokens) - 1]

In [None]:
def plot_attention(sentence, translation, attention):
    """Show the attention weights as a heat map.

    Args:
        sentence: Labels for the x axis, one per column of the heat map.
        translation: Generated tokens; the first one is skipped and the rest
            label the y axis.
        attention: Tensor of attention weights; its dimension at index 1 is
            squeezed away before plotting.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    weights = attention.squeeze(1).numpy()
    ax.matshow(weights, cmap="bone")

    x_positions = np.arange(len(sentence))
    ax.set_xticks(ticks=x_positions, labels=sentence, rotation=90, size=15)

    # Skip the first generated token for the row labels.
    row_labels = translation[1:]
    y_positions = np.arange(len(row_labels))
    ax.set_yticks(ticks=y_positions, labels=row_labels, size=15)

    plt.show()
    plt.close()

In [134]:
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

['<sos>', 'a', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

### Calculating the BLEU score

In [135]:
# Translate every German test sentence. Each element of the list is the full
# return value of `translate_sentence`: (en_tokens, de_tokens, attentions).
translation = [
    translate_sentence(
    example['de'],
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    )
    for example in test_data
]

In [None]:
bleu = evaluate.load("bleu")

In [138]:
# Every entry of `translation` is the (en_tokens, de_tokens, attentions) tuple
# returned by `translate_sentence`. Only the English tokens are scored, so
# unpack them first; slicing the tuple itself with [1:-1] would yield
# `(de_tokens,)` and make `" ".join` fail with a TypeError.
# The [1:-1] slice on the token list drops its first and last entries
# (the <sos> marker and, when decoding stopped on it, the <eos> marker).
predictions = [" ".join(en_tokens[1:-1]) for en_tokens, _, _ in translation]

# One list of reference strings per prediction (here a single reference each).
references = [[example["en"]] for example in test_data]

In [None]:
predictions[0], references[0]

In [140]:
def get_tokenizer_fn(nlp, lower):
    """Build a function that splits a string into a list of token strings.

    Args:
        nlp: Object whose ``tokenizer`` is called on the input string.
        lower: When truthy, the returned tokens are lowercased.

    Returns:
        A one-argument function mapping a string to its list of tokens.
    """

    def tokenizer_fn(s):
        texts = [token.text for token in nlp.tokenizer(s)]
        return [text.lower() for text in texts] if lower else texts

    return tokenizer_fn

In [141]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [None]:

tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

In [143]:
# Score the predictions against the references, using `tokenizer_fn` to split
# both sides into tokens.
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [None]:
results