# CentraleSupelec - Natural language processing
# Practical session n°7

## Translation with RNN and attention

In this lab session, we are going to tackle the task of translation using recurrent neural networks. We are going to implement a model similar to the one described in the paper "Neural Machine Translation by Jointly Learning to Align and Translate" (Bahdanau et al., 2014) (https://arxiv.org/abs/1409.0473).

In [None]:
import os
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import time
import random
import math

# Report the installed PyTorch version (handy when debugging API differences).
print(torch.__version__)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Loading data

First, let's load the data. We are going to use a toy corpus provided with the PyTorch tutorials. It contains 135842 French sentences along with their aligned English translations. I provide the code to preprocess the data, build the dictionaries, convert sentences into index tensors, and create a dataloader of padded input sentences.

**Question**: Read and execute the following cells. Try to understand how the dataloader works. In particular, understand the function pack_padded_sequence (https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html) that we use to speed up the computation of the encoder network.

In [None]:
# Download and unpack the tutorial corpus only when the "data" directory is missing.
# NOTE: the "!" lines are IPython/Jupyter shell escapes, not plain Python.
if not os.path.exists("data"):
    !wget https://download.pytorch.org/tutorial/data.zip
    !unzip data.zip

In [None]:
lang1, lang2 = 'eng', 'fra'
# Read the whole corpus, one sentence pair per line. The context manager
# closes the file handle as soon as the content is in memory (a bare open()
# would leave the handle open until garbage collection).
with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

In [None]:
# number of corpus lines to keep (lower it to reduce computation)
num_examples = 50000

# split each retained line on tabs: one list of fields per sentence pair
original_word_pairs = [line.split('\t') for line in lines[:num_examples]]

In [None]:
data = pd.DataFrame(original_word_pairs, columns=["eng", "fra"])

In [None]:
data.head(5)

In [None]:
# Strips diacritics so that accented letters map to their plain base letter
def unicode_to_ascii(s):
    """
    Decompose *s* with Unicode NFD normalization and drop every combining
    mark (category 'Mn'), e.g. "é" becomes "e".
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """
    Normalize one raw sentence and wrap it with the sequence delimiters.

    The sentence is lower-cased, stripped of accents, its punctuation is
    separated from the words, and every other non-letter character is
    replaced by a space. The result is surrounded by '<start>' and '<end>'.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # isolate punctuation marks: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)

    # anything that is not a letter or one of ". ? ! , ¿" becomes a space
    cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

    # collapse the runs of spaces created above and trim both ends
    cleaned = re.sub(r'[" "]+', " ", cleaned).strip()

    # the delimiters tell the model when to start and when to stop predicting
    return '<start> ' + cleaned + ' <end>'

In [None]:
# Now we do the preprocessing using pandas and lambdas
data["eng"] = data.eng.apply(lambda w: preprocess_sentence(w))
data["fra"] = data.fra.apply(lambda w: preprocess_sentence(w))
data.sample(10)

In [None]:
# Bidirectional vocabulary of one language: word -> index (e.g. "dad" -> 5)
# and index -> word (e.g. 5 -> "dad").
class LanguageIndex():
    """
    Vocabulary built from a list of space-separated phrases.

    Attributes set by create_index():
        vocab: sorted list of the distinct tokens
        word2idx: token -> index, with '<pad>' mapped to 0
        idx2word: index -> token (inverse of word2idx)
    """

    def __init__(self, lang):
        """ lang is the list of phrases of one language """
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        # gather every distinct token of every phrase, then fix an order
        self.vocab.update(*(phrase.split(' ') for phrase in self.lang))
        self.vocab = sorted(self.vocab)

        # index 0 is reserved for the padding token, real words start at 1
        self.word2idx['<pad>'] = 0
        self.word2idx.update(
            (word, rank) for rank, word in enumerate(self.vocab, start=1)
        )

        # reverse mapping
        self.idx2word.update(
            (rank, word) for word, rank in self.word2idx.items()
        )

In [None]:
# Build one vocabulary per language: French is the input (source) language,
# English the target language.
inp_lang = LanguageIndex(data["fra"].values.tolist())
targ_lang = LanguageIndex(data["eng"].values.tolist())
# Vectorize the input and target languages: every sentence becomes the list
# of the indices of its space-separated tokens.
source_tensor = [[inp_lang.word2idx[s] for s in fra.split(' ')]  for fra in data["fra"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["eng"].values.tolist()]
# Notebook display of the first ten encoded source sentences.
source_tensor[:10]

**Question**: complete the code of the function *print_sent(corpus, dict_lang, n)* so that it prints the first $n$ sentences of a corpus of index tensors $corpus$, using a dictionary $dict\_lang$.

Ex: print_sent(source_tensor, inp_lang, 10)

outputs:

In [None]:
def print_sent(corpus, dict_lang, n):
    """Print the first *n* index sequences of *corpus* next to their decoded words.

    dict_lang must expose an idx2word mapping (index -> word).
    """
    ############## Start coding here #####################
    for encoded in corpus[:n]:
        words = list(map(dict_lang.idx2word.__getitem__, encoded))
        print(f"The tensor {encoded} is the encoding of {words}")
    ############## Stop coding here #####################

In [None]:
# Show the first ten encoded sentences of each language with their decoded words.
print("#"*50, " source sentences ", "#"*50)
print_sent(source_tensor, inp_lang, 10)
print("#"*50, " target sentences ", "#"*50)
print_sent(target_tensor, targ_lang, 10)

In [None]:
def max_length(tensor):
    """Return the length of the longest sequence contained in *tensor*."""
    return max(map(len, tensor))

In [None]:
# calculate the max_length of input and output tensor
# (these lengths are the padding sizes used for the source and target sentences)
max_length_inp, max_length_tar = max_length(source_tensor), max_length(target_tensor)
# only the source-side maximum is printed
print(max_length_inp)

In [None]:
def pad_sequences(x, max_len):
    """Return *x* as an int64 array of exactly *max_len* entries.

    Shorter sequences are right-padded with zeros, longer ones are truncated.
    """
    padded = np.zeros((max_len), dtype=np.int64)
    keep = min(len(x), max_len)
    padded[:keep] = x[:keep]
    return padded

In [None]:
# pad every sentence with zeros up to the longest sentence of its language
# (the names are rebound to new lists of numpy arrays; nothing is modified in place)
source_tensor = [pad_sequences(x, max_length_inp) for x in source_tensor]
target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
len(target_tensor)

In [None]:
# Creating training and validation sets using an 80-20 split
# NOTE(review): no random_state is passed, so the split differs from one run to the next.
source_tensor_train, source_tensor_val, target_tensor_train, target_tensor_val = train_test_split(source_tensor, target_tensor, test_size=0.2)

# Show length
len(source_tensor_train), len(target_tensor_train), len(source_tensor_val), len(target_tensor_val)

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# Wrap the padded data in a Dataset so that a DataLoader can
# iterate over it batch by batch

class MyData(Dataset):
    """Dataset of (source, target, source length) triples.

    The source length is the number of entries of the source sequence that
    differ from 0, the padding index.
    """

    def __init__(self, X, y):
        self.data = X
        self.target = y
        # number of non-padding positions of every source sequence
        self.length = [np.sum(np.not_equal(seq, 0)) for seq in X]

    def __getitem__(self, index):
        return self.data[index], self.target[index], self.length[index]

    def __len__(self):
        return len(self.data)

In [None]:
# Dataset-level constants.
# NOTE(review): N_BATCH, embedding_dim and units are not referenced anywhere
# else in the visible part of the notebook -- possibly leftovers.
BUFFER_SIZE = len(source_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 512 #256
units = 1024
# vocabulary sizes, padding token included
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

train_dataset = MyData(source_tensor_train, target_tensor_train)
val_dataset = MyData(source_tensor_val, target_tensor_val)

# training batches are shuffled and the last incomplete batch is dropped
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

# validation batches keep the dataset order and include the last partial batch
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, 
                     shuffle=False)

## The model

### Encoder

Let's first define the encoder part of our model. It consists of a bidirectional Gated Recurrent Unit (GRU), which is similar to a long short-term memory (LSTM) network but with different (and simpler) gates, taking word embeddings as input. For each input, the GRU outputs a forward hidden state $\overrightarrow{h}$ and a backward one $\overleftarrow{h}$. The concatenation of the two is the annotation of the word: the motivation is to include both the preceding and the following words in the annotation of one word. The final forward and backward hidden states, concatenated and passed through a linear layer, form the encoder state used to initialize the decoder.

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder over embedded source tokens."""

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        # projects the two concatenated final states to the decoder hidden size
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        """
        Arguments:
            src: source token indices, [src len, batch size]
            src_len: number of non-padding tokens of each sentence, [batch size]
        Returns:
            outputs: hidden states of every position, [src len, batch size, enc hid dim * 2]
            hidden: initial decoder hidden state, [batch size, dec hid dim]
        """
        token_vectors = self.dropout(self.embedding(src))
        # token_vectors = [src len, batch size, emb dim]

        # packing lets the GRU skip the padding; the lengths must live on the cpu
        packed_input = nn.utils.rnn.pack_padded_sequence(token_vectors, src_len.to('cpu'))
        packed_states, final_states = self.rnn(packed_input)
        # final_states = [n layers * num directions, batch size, hid dim],
        #   taken at the last non-padded element of every sentence

        # back to a regular tensor; positions that held a pad token are all zeros
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_states)
        # outputs = [src len, batch size, enc hid dim * 2]

        # final_states[0] is the last state of the forward RNN,
        # final_states[1] the last state of the backward RNN
        both_directions = torch.cat((final_states[0], final_states[1]), dim = 1)
        hidden = torch.tanh(self.fc(both_directions))
        # hidden = [batch size, dec hid dim]

        return outputs, hidden

### Decoder

The decoder network is also a recurrent neural network but with an additional attention mechanism.

Given a source sentence $x=[x_1,x_2,\dots,x_n]$ and a target sentence  $y=[y_1,y_2,\dots,y_m]$, the decoder hidden states are obtained using the function $f(s_{t-1}, y_{t-1}, c_t)$.

$c_t$ is called the context and is a sum of the encoder hidden states $h_i$ of the input sequence, weighted by alignment scores $\alpha_{t,i}$

$$c_t = \sum_{i=1}^{n} \alpha_{t,i} h_i$$

and

$$\alpha_{t,i} = align(y_t,x_i) = softmax_i(score(s_{t-1}, h_i))$$

The alignment model assigns a score $\alpha_{t,i}$ to the pair $(y_t,x_i)$ of the input at position $i$ and the output at position $t$, based on how well they match. The set $\{\alpha_{t,i}\}$ contains the weights defining how much of each source hidden state should be considered for each output.

During the lecture, we saw a basic dot-product attention alignment score in which we simply take the dot product between $s_{t-1}$ and $h_i$. This implies that the encoder and decoder dimensions are the same. 

In this practical session, we are going to implement the attention score proposed in Bahdanau's paper (https://arxiv.org/abs/1409.0473). The score is parametrized by a feed-forward neural network that is trained jointly with the other parts of the model. Using $tanh$ as the non-linear activation function, the score function has the following form:

$$score(s_{t-1},h_i) = v_a^\top \tanh(W_a[s_{t-1};h_i])$$

where $[s_{t-1};h_i]$ denotes the concatenation of the two vectors, and the weight matrix $W_a$ and the weight vector $v_a$ are learned in the alignment model.

**Question**: Read and understand the code in the class $Decoder$. Complete the code in the class $Attention$ to implement Bahdanau's version of the attention.  

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder attending over the encoder outputs.

    One call to forward consumes one target token per sentence and returns
    the scores of the next token.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # the GRU reads the token embedding concatenated with the attention context
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        # the output layer sees the GRU output, the context and the embedding
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """
        Arguments:
            input: indices of the previous words y_{t-1}, [batch size]
            hidden: hidden representations s_{t-1}, [batch size, dec hid dim]
            encoder_outputs: hidden representations of the entire source
                sequences, [src len, batch size, enc hid dim * 2]
            mask: binary mask to identify padding (True: word, False: padding),
                [batch size, src len]
        Returns:
            prediction: scores over the target vocabulary, [batch size, output dim]
            hidden: new hidden representations s_t, [batch size, dec hid dim]
            attention weights, [batch size, src len]
        """
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))
        # embedded = [1, batch size, emb dim]

        attn_weights = self.attention(hidden, encoder_outputs, mask)
        # attn_weights = [batch size, src len]

        # context = attention-weighted sum of the encoder states, computed as a
        # batched product [batch, 1, src len] x [batch, src len, enc hid dim * 2]
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.permute(1, 0, 2))
        context = context.permute(1, 0, 2)
        # context = [1, batch size, enc hid dim * 2]

        rnn_input = torch.cat((embedded, context), dim = 2)
        # rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # one time step, one layer, one direction:
        #   output = hidden = [1, batch size, dec hid dim]
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim = 1
        )
        prediction = self.fc_out(features)
        # prediction = [batch size, output dim]

        return prediction, hidden.squeeze(0), attn_weights

In [None]:
class Attention(nn.Module):
    """Additive attention: score(s, h) = v^T tanh(W [s; h])."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):
        """
        Arguments:
            hidden: hidden representations s_{t-1}, [batch size, dec hid dim]
            encoder_outputs: hidden representations of the entire source
                sequences, [src len, batch size, enc hid dim * 2]
            mask: binary mask to identify padding (True: word, False: padding)
        Returns:
            attention weights, [batch size, src len]; every row sums to 1
        """
        ####################### Start coding here #########################
        src_len = encoder_outputs.shape[0]

        # pair the decoder state with every source position
        states = hidden.unsqueeze(1).expand(-1, src_len, -1)
        annotations = encoder_outputs.permute(1, 0, 2)
        # states = [batch size, src len, dec hid dim]
        # annotations = [batch size, src len, enc hid dim * 2]

        energy = torch.tanh(self.attn(torch.cat((states, annotations), dim = 2)))
        # energy = [batch size, src len, dec hid dim]

        scores = self.v(energy).squeeze(2)
        # scores = [batch size, src len]

        # padding positions get a score of -1e10 so that the softmax gives
        # them a (numerically) zero weight
        scores = scores.masked_fill(mask == 0, -1e10)

        return scores.softmax(dim = 1)
        ####################### Stop coding here #########################

In [None]:
##### Sanity Check ########
B, num_dec, num_enc, src_len = 12, 23, 14, 8

att = Attention(num_enc, num_dec)
hidden = torch.randn(B, num_dec)
encoder_outputs = torch.randn(src_len, B, 2*num_enc)
# Boolean padding mask (True: word, False: padding) so that the masking branch
# is really exercised: sentence b has (b % src_len) + 1 real tokens.
# (A mask of random floats almost never contains a 0, so nothing was masked.)
lengths = torch.arange(B) % src_len + 1
mask = torch.arange(src_len).unsqueeze(0) < lengths.unsqueeze(1)
alphas = att(hidden, encoder_outputs, mask)
assert(alphas.shape == torch.Size((B, src_len)))
# attention weights are a distribution over the source positions...
assert torch.allclose(alphas.sum(dim=1), torch.ones(B))
# ...and padding positions must not receive any weight
assert (alphas[~mask] < 1e-6).all()
print("caution: this check only assert that your model is forwarding. I does not check that your implementation is correct!")
print("check passed")

### Regrouping the encoder and the decoder

We finally regroup the encoder and the decoder in a single class. Note that we use teacher forcing during training. The training procedure applies the following steps:

1. Pass the input through the encoder which return encoder output and the encoder hidden state.
2. The encoder output, encoder hidden state and the decoder input (which is the start token) is passed to the decoder.
3. The decoder returns the predictions and the decoder hidden state.
4. For each word in the target sequence:
    1. Randomly choose if we are using teacher forcing. 
        1. If yes, the target word is passed as the next input to the decoder, along with the previous decoder hidden state and the encoder hidden states 
        2. else the most likely word, as predicted by the current decoder, is passed as the next input to the decoder, along with the previous decoder hidden state and the encoder hidden states
    2. The decoder returns the predictions and the decoder hidden state.   

Note that we also add a mask to force the attention to be computed only over the non-padding elements of the input.

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target step by step,
    with optional teacher forcing."""

    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        """Return a [batch size, src len] mask that is True on non-padding tokens."""
        return (src != self.src_pad_idx).permute(1, 0)

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
        """
        Arguments:
            src: source indices, [src len, batch size]
            src_len: source lengths, [batch size]
            trg: target indices, [trg len, batch size]
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (instead of the predicted one) as next decoder input,
                e.g. 0.75 means teacher forcing is used 75% of the time
        Returns:
            decoder scores, [trg len, batch size, output dim]; row 0 stays zero
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]

        # one slot of vocabulary scores per target position
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)

        # encoder_outputs: all forward/backward hidden states of the source
        # hidden: final states passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        mask = self.create_mask(src)
        # mask = [batch size, src len]

        # decoding starts from the first target token (the start token)
        step_input = trg[0, :]

        for t in range(1, trg_len):
            # one decoding step: scores of token t and the updated hidden state
            output, hidden, _ = self.decoder(step_input, hidden, encoder_outputs, mask)
            outputs[t] = output

            # next input: ground truth (teacher forcing) or the best prediction
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = output.argmax(1)

        return outputs

### Training the model

Let's define the meta-parameters of the model and create our model.

In [None]:
# Vocabulary sizes: +1 accounts for the padding token, which is not part of `vocab`.
INPUT_DIM = len(inp_lang.vocab)+1
OUTPUT_DIM = len(targ_lang.vocab)+1
# Embedding sizes, hidden sizes and dropout rates of the encoder / decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Index of the source padding token, used to build the attention mask.
SRC_PAD_IDX = inp_lang.word2idx["<pad>"]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def init_weights(m):
    """Re-initialize the parameters of *m* in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all the others (e.g. biases) are set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
#model.apply(init_weights)

In [None]:
def count_parameters(model):
    """Return the total number of scalar entries of the trainable parameters of *model*."""
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# We don't want to train the network for the padding target. 
# For that, we use the option ignore_index that specifies a target
# value that is ignored and does not contribute to the input gradient.
# See https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html 
# for more details.

TRG_PAD_IDX = targ_lang.word2idx["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
### sort a batch by decreasing source length, as required by pack_padded_sequence
def sort_batch(X, y, lengths):
    """Reorder a batch by decreasing length and switch it to time-major layout.

    Arguments:
        X: source indices, [batch size, padded src len]
        y: target indices, [batch size, trg len]
        lengths: number of non-padding tokens of each source sentence, [batch size]
    Returns:
        X cropped to the longest sentence of the batch as [src len, batch size],
        y as [trg len, batch size], and the sorted lengths.
    """
    longest = lengths.max().item()
    lengths, order = lengths.sort(dim=0, descending=True)
    # reorder the sentences and drop the columns that are padding for all of them
    X = X[order][:, :longest]
    y = y[order]
    # (batch x seq) -> (seq x batch)
    return X.transpose(0, 1), y.transpose(0, 1), lengths

**Question**: Complete the training function in order to:

1. Forward a batch of training samples (use the provided function *sort_batch*)
2. Compute the loss
3. Backpropagate the gradient of the loss
4. Update the weights using the optimizer

In [None]:
from tqdm import tqdm

def train(model, dataset, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Arguments:
        model: sequence-to-sequence model, called as model(src, src_len, trg)
        dataset: sized iterable of (inp, targ, inp_len) batches
        optimizer: optimizer updating the parameters of the model
        criterion: loss called as criterion(scores, target indices)
        clip: maximum gradient norm, passed to clip_grad_norm_
    """
    model.train()

    epoch_loss = 0

    # Wrap the dataloader itself rather than enumerate(dataset): tqdm can then
    # read its length and display a real progress bar (total and ETA). The
    # batch index was never used.
    for inp, targ, inp_len in tqdm(dataset):

        ################### Start coding here ################
        src, trg, src_len = sort_batch(inp, targ, inp_len)
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        output = model(src, src_len, trg)

        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        # position 0 holds the start token and is never predicted: drop it,
        # then flatten the time and batch dimensions for the loss
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        loss.backward()
        # rescale the gradients so that their global norm does not exceed clip
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        ################### Stop coding here ################

        epoch_loss += loss.item()

    return epoch_loss / len(dataset)

**Question**: Complete the evaluate function in order to:

1. Forward a batch of validation samples (use the provided function *sort_batch*)
2. Compute the loss

**Caution**: don't forget to turn off teacher forcing when forwarding the model

In [None]:
def evaluate(model, dataset, criterion):
    """Return the mean loss per batch of *model* over *dataset*.

    The model is put in eval mode, gradients are disabled and teacher forcing
    is turned off, so the decoder is always fed its own predictions.
    """
    model.eval()

    total_loss = 0

    with torch.no_grad():

        for inp, targ, inp_len in dataset:

            ################## Start coding here ################
            src, trg, src_len = sort_batch(inp, targ, inp_len)
            src, trg = src.to(device), trg.to(device)

            # a teacher forcing ratio of 0 turns teacher forcing off
            scores = model(src, src_len, trg, 0)
            # trg = [trg len, batch size]
            # scores = [trg len, batch size, output dim]

            vocab_size = scores.shape[-1]

            # skip position 0 (start token) and flatten time and batch:
            #   scores -> [(trg len - 1) * batch size, output dim]
            #   trg    -> [(trg len - 1) * batch size]
            loss = criterion(scores[1:].view(-1, vocab_size), trg[1:].reshape(-1))
            ################## Stop coding here ################

            total_loss += loss.item()

    return total_loss / len(dataset)

In [None]:
def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (in seconds) into
    whole minutes and remaining whole seconds."""
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover = elapsed - 60 * whole_minutes
    return whole_minutes, int(leftover)

### Let's train our model!!!

In [None]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm used by train()


# lowest validation loss seen so far; drives the checkpointing below
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep only the weights of the epoch with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    # PPL (perplexity) is the exponential of the loss
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

### Testing the model

Now that the model is trained, you can test it using the provided functions:

- $translate\_sentence()$ translates an input sentence by forwarding the source sentence through the encoder and then decoding the translation step by step with the decoder. The function also returns an alignment matrix: the $t^{th}$ row corresponds to the alignment scores $\alpha_{t,i}$ of the target word $y_t$ with every source word $x_i$.
- $display\_attention()$ allows to visualize the attention matrix.

You can either test your own model or test a model trained for 8 epochs using 500000 sentences (FYI, it took 1:30 hours).

    torch.load('/mounts/Datasets1/NLP_Course/tp7/big_model.save')
    
Note: If you want to use the pre-trained model, you must run all the cells of this notebook (except the one training the model) with the parameter $num\_examples$ set to 500000. Otherwise, the dictionary sizes might not match those of the pre-trained model, leading to erratic behavior.

In [None]:
# NOTE(review): the loaded object is only displayed as the cell output; it is
# never assigned nor applied to `model`. Presumably the intent is to restore
# the pre-trained weights (e.g. model.load_state_dict(...) if the file holds a
# state dict) -- confirm the file format before changing this.
torch.load('/mounts/Datasets1/NLP_Course/tp7/big_model.save')

In [None]:
def translate_sentence(sentence, targ_lang, model, device, max_len = 50):
    """Greedily translate one source sentence.

    Arguments:
        sentence: source word indices without padding (list or array of ints)
        targ_lang: target vocabulary exposing word2idx and idx2word
        model: model exposing eval(), encoder, decoder and create_mask
        device: torch device on which the tensors are created
        max_len: maximum number of decoding steps
    Returns:
        tokens: the generated target words, without "<start>" and including
            "<end>" when it was produced within max_len steps
        attentions: attention weights of each generated word,
            [number of generated words, 1, src len]
    """
    model.eval()

    # Encode the `sentence` argument. The previous body ignored this parameter
    # and read a notebook-level global named `src` instead, so the function
    # only worked when that global happened to hold the sentence.
    src_tensor = torch.as_tensor(sentence, dtype=torch.long).unsqueeze(1).to(device)
    # src_tensor = [src len, 1]

    src_len = torch.LongTensor([len(sentence)])

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor, src_len)

    mask = model.create_mask(src_tensor)

    attentions = torch.zeros(max_len, 1, len(sentence)).to(device)

    end_idx = targ_lang.word2idx["<end>"]
    trg_indexes = [targ_lang.word2idx["<start>"]]

    for i in range(max_len):

        # feed the last generated token back into the decoder
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)

        attentions[i] = attention

        # greedy decoding: keep the most likely token
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == end_idx:
            break

    trg_tokens = [targ_lang.idx2word[i] for i in trg_indexes]

    # drop the leading "<start>" and the unused rows of the attention buffer
    return trg_tokens[1:], attentions[:len(trg_tokens)-1]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def display_attention(sentence, translation, attention):
    """
    Plot an attention matrix as a heat map.

    Arguments:
        sentence: source tokens, used (lower-cased) as x-axis labels
        translation: list of predicted target tokens, used as y-axis labels
        attention: tensor of attention weights; its dimension 1 is squeezed
            away before plotting (presumably [trg len, 1, src len] as returned
            by translate_sentence -- TODO confirm)
    """
    
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    
    # move the weights to the cpu as a numpy matrix for matplotlib
    attention = attention.squeeze(1).cpu().detach().numpy()
    
    cax = ax.matshow(attention, cmap='bone')
   
    ax.tick_params(labelsize=15)
    
    # an empty label is prepended to both axes
    # NOTE(review): presumably this absorbs an extra leading tick once the
    # locators below put one tick per cell -- verify with the installed
    # matplotlib version
    x_ticks = [''] + [t.lower() for t in sentence]
    y_ticks = [''] + translation
     
    ax.set_xticklabels(x_ticks, rotation=45)
    ax.set_yticklabels(y_ticks)

    # one tick per integer position on both axes
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()



In [None]:
example_idx = 10

# val_dataset[i] returns (source indices, target indices, source length)
src = val_dataset[example_idx] #getting example
src = src[0][:src[2]] #removing padding
# map the remaining indices back to source-language words
src_tokens = [inp_lang.idx2word[i] for i in src]
print(src_tokens)

In [None]:
translation, attention = translate_sentence(src, targ_lang, model, device)

print(f'predicted trg = {translation}')

In [None]:
display_attention(src_tokens, translation, attention)