# LAB 6: Machine Translation. Italian to English with Seq2Seq RNNs with attention mechanism.

### Objectives:
1. Understand the seq2seq model architecture.
2. Understand the attention mechanism.
3. Implement a seq2seq model with attention mechanism for machine translation.
4. Train the model on an Italian to English translation dataset.
5. Translate Italian sentences to English.
6. Evaluate the model using quantitative and qualitative evaluation methods.

#### Importing Required Libraries


In [14]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
%matplotlib inline

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data and SetUp

#### Set Up Wandb for Experiment Tracking

In [15]:
# Start a Weights & Biases run for experiment tracking.
# NOTE(review): no wandb.log(...) call is visible in this notebook, so the run
# may record nothing beyond its metadata - confirm whether logging is intended.
import wandb
wandb.init(project="Machine_Translation", entity="nlp_2024", name="trial_1")

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112506263371971, max=1.0…

#### Data

Data can be found in our [github](https://github.com/Neilus03/NLP-2024/blob/main/data/eng-ita.txt).

The dataset is a txt file that looks like this:

```
I haven't eaten pizza recently. Io non ho mangiato della pizza di recente.
I haven't figured that out yet.	Non l'ho ancora capito.
Hello! Ciao!
I saw you in the park with Tom.	L'ho vista al parco con Tom.
```



#### Creating the Lang Class

In [16]:


# Define the start-of-sequence (SOS) and end-of-sequence (EOS) tokens
SOS_token = 0
EOS_token = 1

# Define the language class
class Lang:
    '''
    Vocabulary of a single language.

    Keeps three lookup tables in sync: word -> index, index -> word and
    word -> number of occurrences. Indices 0 and 1 are reserved for the
    "SOS" and "EOS" markers, so real words are numbered from 2 upwards.
    '''
    def __init__(self, language):
        self.language = language  # name of the language, e.g. 'eng'
        self.word2index = {}  # word -> integer id
        self.word2count = {}  # word -> how many times it has been added
        self.index2word = {0: "SOS", 1: "EOS"}  # integer id -> word
        self.n_words = 2  # next free id (SOS and EOS are already taken)

    def addSentence(self, sentence):
        '''Register every space-separated token of `sentence`.'''
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        '''Register one occurrence of `word`, assigning a new id on first sight.'''
        if word in self.word2index:
            # Known word: only its frequency changes
            self.word2count[word] += 1
            return

        # New word: it receives the next free id
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

#### Preprocessing the Data a little bit

In [17]:
def unicodeToAscii(unicode_string):
    # Strip accents: NFD decomposition splits an accented letter into its base
    # letter plus combining marks (category 'Mn'), and the marks are dropped.
    decomposed = unicodedata.normalize('NFD', unicode_string)
    return ''.join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )


def normalizeString(s):
    # Produce a lowercase, accent-free sentence made only of letters, '!' and '?'.

    # Lowercase, trim surrounding whitespace and remove accents
    cleaned = unicodeToAscii(s.lower().strip())
    # Put a space in front of '.', '!' and '?' so they stand apart from words
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Replace every run of characters other than letters, '!' and '?' with one
    # space (this also removes '.', digits and apostrophes)
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", cleaned)
    # Drop any leading/trailing space left by the substitutions
    return cleaned.strip()


#### Create read_data function, which reads the data from the file and returns the input and target language pairs.

In [20]:
def read_data(lang1, lang2, reverse=True, verbose=False):
    '''
    Read the file 'data/<lang1>-<lang2>.txt' and return its sentence pairs.

    Each line is split on tabs and every field is normalized with
    normalizeString. The `reverse` flag is used to translate from italian
    to english instead of the default english to italian: when it is True
    each pair is reversed and the two Lang instances are swapped.

    Returns (input_lang, output_lang, pairs). The Lang instances are still
    empty; words are added later by the caller.
    '''
    if verbose:
        print("Opening Data")

    # Read the file and split into lines; the context manager guarantees the
    # file handle is closed even if reading fails
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    if reverse:
        # Swap source and target: both the sentences and the vocabularies
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    # Return the language vocabulary and the pairs
    return input_lang, output_lang, pairs

#### Example usage of the read_data function, and how the data looks like.

In [21]:
# Example usage of read_data in both directions.
# reverse=True: pairs are flipped, so the input language is 'ita' and the output 'eng'
input_lang, output_lang, pairs = read_data('eng', 'ita', reverse=True, verbose=False)
print(random.choice(pairs), input_lang.language, output_lang.language)
# reverse=False: pairs keep the file order, so the input language is 'eng' and the output 'ita'
input_lang, output_lang, pairs = read_data('eng', 'ita', False, verbose=False)
print(random.choice(pairs), input_lang.language, output_lang.language)

['io sono entusiasta', 'i m enthusiastic'] ita eng
['they re now alone', 'loro sono da soli adesso'] eng ita


#### Trim the dataset to contain only 100000 examples and only phrases with fewer than 10 words.
This is done to reduce the training time and to make the model learn faster.
If you have a better GPU, you can increase the number of examples and the length of the phrases,
sadly we are GPU-Poor hahaha, maybe Dani and Joan can grab some from the CVC.

In [22]:
MAX_LENGTH = 10  # sentences must have fewer than this many space-separated words
MAX_PAIRS = 100000  # upper bound on the number of pairs kept after filtering

def filterPair(p):
    '''
    Return True when BOTH sentences of the pair have fewer than MAX_LENGTH words.

    Each side is compared on its own: `(len(a) and len(b)) < MAX_LENGTH`
    evaluates `len(a) and len(b)` first, which yields only len(b), so the
    first sentence would never be checked.
    '''
    return (len(p[0].split(' ')) < MAX_LENGTH and
            len(p[1].split(' ')) < MAX_LENGTH)


def trim_dataset(pairs):
    '''
    Keep the pairs accepted by filterPair and return a random sample of them.

    At most MAX_PAIRS pairs are returned; when fewer pairs survive the filter
    all of them are returned (in random order) instead of raising ValueError.
    '''
    pairs = [pair for pair in pairs if filterPair(pair)]
    return random.sample(pairs, min(MAX_PAIRS, len(pairs)))


The full process for preparing the data is:

-   Read text file and split into lines, split lines into pairs
-   Normalize text, filter by length and content
-   Make word lists from sentences in pairs


In [27]:
def prepare_data(lang1, lang2, reverse=False):
    '''
    Load, trim and index the dataset.

    Reads the pairs with read_data, reduces them with trim_dataset and then
    feeds every remaining sentence to the matching Lang instance so that the
    vocabularies are populated.
    Returns the input and output language instances and the processed pairs.
    '''
    # Load the raw (normalized) pairs together with two empty vocabularies
    input_lang, output_lang, pairs = read_data(lang1, lang2, reverse, verbose=True)
    print("Read all the %s sentence pairs in the dataset" % len(pairs))

    # Drop long sentences and sub-sample
    pairs = trim_dataset(pairs)
    print("Trimmed to %s sentence pairs using the trim_dataset function" % len(pairs))

    # Element 0 of each pair belongs to the input language, element 1 to the output language
    for sentence_pair in pairs:
        input_lang.addSentence(sentence_pair[0])
        output_lang.addSentence(sentence_pair[1])
    print(f"There are {input_lang.n_words} words in {input_lang.language} and {output_lang.n_words} words in {output_lang.language}")

    return input_lang, output_lang, pairs

# Build the vocabularies and the trimmed pair list with the default direction
# (reverse=False, so each pair keeps the file order: 'eng' first, 'ita' second).
input_lang, output_lang, pairs = prepare_data('eng', 'ita')
# Print one random pair as a sanity check
print(random.choice(pairs))

Opening Data
Read 345244 sentence pairs
Trimmed to 100000 sentence pairs
Counting words...
Counted words:
eng 9236
ita 17105
['i brush my teeth twice a day', 'io mi lavo i denti due volte al giorno']


#### Train, Test and Validation Splits

In [28]:
# Divide data into training, validation and test sets
# NOTE(review): get_dataloader (defined later) calls prepare_data again and does
# not take these lists as input, so the splits appear unused by training - confirm.
from sklearn.model_selection import train_test_split

# First split: 70% train, 30% held out; fixed random_state for reproducibility
train_pairs, test_val_pairs = train_test_split(pairs, test_size=0.3, random_state=42) #70% train, 30% test & val
# Second split: the held-out 30% is halved into test and validation
test_pairs, val_pairs = train_test_split(test_val_pairs, test_size=0.5, random_state=42) #15% test, 15% val

#Check the amount of data in each set
print(f"The number of training pairs is {len(train_pairs)}\n \
        The number of validation pairs is {len(val_pairs)}\n \
        The number of test pairs is {len(test_pairs)}")

#Sample an example from the training set, validation set and test set
print(f"Training Pair: {random.choice(train_pairs)}")
print(f"Validation Pair: {random.choice(val_pairs)}")
print(f"Test Pair: {random.choice(test_pairs)}")

The number of training pairs is 70000
         The number of validation pairs is 15000
         The number of test pairs is 15000
Training Pair: ['i was very busy yesterday', 'io ero molto impegnato ieri']
Validation Pair: ['please give me a piece of bread', 'per favore dammi un pezzo di pane']
Test Pair: ['you re unambitious', 'lei e priva di ambizioni']


## The Seq2Seq Model

The model is composed of an encoder and a decoder. The encoder reads the input sequence and outputs a context vector for each word in the input sequence. The decoder reads the context vector and generates the output sequence, hopefully translating the input sequence to the output sequence.

![](https://www.guru99.com/images/1/111318_0848_seq2seqSequ1.png)

### Encoder
The encoder of a seq2seq network is a RNN, in our case we'll use a GRU for the sake of simplicity.
This GRU encoder outputs a vector and a hidden state, and uses the hidden state for the
next input word.


In [30]:
class EncoderRNN(nn.Module):
    '''
    Sequence encoder: token ids -> embeddings -> dropout -> single-layer GRU.

    `input_size` is the vocabulary size of the input language and
    `hidden_size` is used both as embedding width and as GRU state width.
    '''
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Lookup table turning each word id into a hidden_size vector
        self.embedding = nn.Embedding(input_size, hidden_size)

        # Recurrent layer; batch_first=True means inputs are [batch, seq, feature]
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

        # Regularization applied to the embeddings
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        '''
        Encode a batch of id sequences.

        Returns the pair (output, hidden) produced by the GRU: the per-step
        outputs and the final hidden state.
        '''
        token_vectors = self.embedding(input)
        regularized = self.dropout(token_vectors)
        return self.gru(regularized)

#### Example usage of the EncoderRNN

In [None]:
#Hidden size (embedding width and GRU state width of the demo encoder)
hidden_size = 256

#Create an instance of the EncoderRNN sized to the input vocabulary
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)

#input tensor, it has a batch size of 1 and a sequence length of 10
input_tensor = torch.randint(0, input_lang.n_words, (1, 10)).to(device) #random word ids in [0, input_lang.n_words), shape [1, 10]

#output and hidden state of the encoder after passing the input tensor
#(these two names are reused by the decoder example further down)
output, hidden = encoder(input_tensor)

#Print the shapes and values to inspect what the encoder returns
print(f"Input Tensor Shape: {input_tensor.shape}")
print("-"*100)
print(f"Output Shape: {output.shape}")
print(f"Encoder Output: {output}")
print("-"*100)
print(f"Hidden Shape: {hidden.shape}")
print(f"Encoder Hidden: {hidden}")

### Attention Decoder

The decoder is another RNN that takes the encoder output vector(s) and
outputs a sequence of words to create the translation, in this case we will use an Attention Decoder. In sequence-to-sequence models, an attention decoder lets the decoder focus on different parts of the encoder's output using a calculated set of "attention weights". These weights help create a weighted combination of encoder outputs (attn_applied), enhancing the decoder's ability to select accurate output words based on the input sequence's relevant parts. Here we use Bahdanau attention mechanism. The process involves training a feed-forward layer to calculate these weights, adjusted for varying sentence lengths.

In [42]:
class BahdanauAttention(nn.Module):
    '''
    Additive (Bahdanau) attention.

    score = Va(tanh(Wa(query) + Ua(keys))), softmax over the sequence axis,
    then the weights are used to average the keys into a context vector.
    Returns (context, weights).
    '''
    def __init__(self, hidden_size):
        super().__init__()
        # Projections of the query and of the keys, and the scoring vector
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        # query is expected as [batch, 1, hidden] and keys as [batch, seq_len, hidden];
        # the sum broadcasts the query over the sequence axis
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)

        # tanh keeps the combined features in (-1, 1) -> [batch, seq_len, hidden]
        energy = torch.tanh(projected_query + projected_keys)

        # One score per source position: [batch, seq_len, 1]
        scores = self.Va(energy)
        # Move the sequence axis last: [batch, seq_len, 1] -> [batch, 1, seq_len]
        scores = scores.squeeze(2)
        scores = scores.unsqueeze(1)

        # Normalize so the weights of each sentence add up to 1
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of the keys (batched matmul): [batch, 1, hidden]
        context = torch.bmm(weights, keys)
        return context, weights

class AttnDecoderRNN(nn.Module):
    '''
    Attention Decoder class to decode the encoder outputs and the hidden
    state into the target language, hopefully making a good translation.
    It uses an embedding layer, an attention layer and a GRU layer
    (you could add more layers if you want to improve the model)
    '''
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True) #2 * hidden_size because we concatenate the embeddings and context vector
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None): #encoder outputs and encoder hidden are returned by the encoder
        '''
        Decode MAX_LENGTH steps, one word at a time, using forward_step.

        encoder_outputs / encoder_hidden: the two values returned by the encoder.
        target_tensor: optional ground-truth ids; when given, column i is fed as
            the input of step i + 1 (teacher forcing), so it needs at least
            MAX_LENGTH columns. When None the decoder feeds back its own
            most likely word.

        Returns (decoder_outputs, decoder_hidden, attentions) where
        decoder_outputs are log-probabilities over the output vocabulary and
        attentions are the per-step attention weights, both concatenated
        along the step dimension.
        '''
        #Get the batch size
        batch_size = encoder_outputs.size(0)

        #Initialize the decoder input with the SOS token for each sentence in the batch.
        #It is created on the device of the encoder outputs (not on a global device)
        #so the decoder works wherever the model and its inputs live.
        decoder_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long,
                                   device=encoder_outputs.device)

        #Initialize the decoder hidden state with the encoder hidden state
        decoder_hidden = encoder_hidden

        #Per-step decoder outputs and attention maps
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            #Make a forward step in the decoder, get the output, hidden state and attention weights for the decoder
            decoder_output, decoder_hidden, attn_weights = self.forward_step(decoder_input, decoder_hidden, encoder_outputs)

            #Append the decoder output and attention weights to their respective lists
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            #If there is a target tensor, use it as the next input, otherwise use the decoder output
            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1) #Get the index of the maximum value
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input, gradient is not computed for the input.

        #Concatenate the decoder outputs and attention weights along the sequence length dimension
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) #Apply a log softmax to the decoder outputs
        attentions = torch.cat(attentions, dim=1)

        #Return the decoder outputs and attention weights
        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        '''
        forward_step takes care of the forward pass of the decoder for a single time step (i.e. a single word)

        Returns (output, hidden, attn_weights): the unnormalized vocabulary
        scores for this step, the new GRU hidden state and the attention weights.
        '''
        #Get the embeddings of the input with the applied dropout
        embedded =  self.dropout(self.embedding(input))

        #Permute the hidden state to have the batch size as the first dimension
        query = hidden.permute(1, 0, 2) #from [1, batch_size, hidden_size] to [batch_size, 1, hidden_size]

        #Get the context vector and attention weights by applying the attention mechanism (Bahdanau)
        context, attn_weights = self.attention(query, encoder_outputs)

        #Concatenate the embeddings and context vector along the hidden size dimension
        input_gru = torch.cat((embedded, context), dim=2) #Concatenation shape is [batch_size x 1 x 2 * hidden_size]

        #Pass the concatenated tensor through the GRU layer, get the output and hidden state
        output, hidden = self.gru(input_gru, hidden)

        #Pass the output through a linear layer to get the decoder output
        output = self.out(output) #Now shape is [batch_size x 1 x output_size]
                                  #where output_size is the number of words in the output language (i.e. Vocabulary size)

        #Return the decoder output, hidden state and attention weights
        return output, hidden, attn_weights

#### Example usage of the AttnDecoderRNN

In [43]:
#Example usage of the AttnDecoderRNN

#Create an instance of the AttnDecoderRNN sized to the output vocabulary
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

#Feed the encoder's output and hidden state (from the encoder example) to the decoder;
#no target tensor is given, so the decoder feeds back its own predictions
decoder_outputs, decoder_hidden, attentions = decoder(output, hidden)

#Print the shapes of what the decoder returns
print(f"Decoder Outputs Shape: {decoder_outputs.shape}")
print(f"Decoder Hidden Shape: {decoder_hidden.shape}")
print(f"Attentions Shape: {attentions.shape}")

Decoder Outputs Shape: torch.Size([1, 10, 17105])
Decoder Hidden Shape: torch.Size([1, 1, 256])
Attentions Shape: torch.Size([1, 10, 10])



## Training



### Preparing Training Data

To train, for each pair we will need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the
EOS token to both sequences.

In [11]:
def indexesFromSentence(lang, sentence):
    # Translate each space-separated word into its vocabulary id.
    # A word missing from lang.word2index raises KeyError.
    vocabulary = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(vocabulary[token])
    return indexes

def tensorFromSentence(lang, sentence):
    # Word ids of the sentence followed by the end-of-sequence marker,
    # returned as a single-row long tensor on the global device.
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    return torch.tensor([token_ids], dtype=torch.long, device=device)

def tensorsFromPair(pair):
    # Convert a (source, target) sentence pair into a pair of tensors using the
    # module-level input_lang / output_lang vocabularies.
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

def get_dataloader(batch_size):
    # Build the Italian -> English dataset (reverse=True) and wrap it in a
    # randomly sampled DataLoader.
    # Returns (input_lang, output_lang, train_dataloader).
    input_lang, output_lang, pairs = prepare_data('eng', 'ita', True)

    # One row per pair, MAX_LENGTH columns; unused positions stay 0
    # (the same value as SOS_token)
    table_shape = (len(pairs), MAX_LENGTH)
    input_ids = np.zeros(table_shape, dtype=np.int32)
    target_ids = np.zeros(table_shape, dtype=np.int32)

    for row, (source_sentence, target_sentence) in enumerate(pairs):
        # Ids of both sentences, each terminated by EOS
        source_row = indexesFromSentence(input_lang, source_sentence)
        target_row = indexesFromSentence(output_lang, target_sentence)
        source_row.append(EOS_token)
        target_row.append(EOS_token)
        # Left-align the ids in their row
        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    inputs = torch.LongTensor(input_ids).to(device)
    targets = torch.LongTensor(target_ids).to(device)
    train_data = TensorDataset(inputs, targets)

    # Batches are drawn in random order
    train_dataloader = DataLoader(
        train_data,
        sampler=RandomSampler(train_data),
        batch_size=batch_size,
    )
    return input_lang, output_lang, train_dataloader

Training the Model
==================

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the `<SOS>` token as its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next
input. Using teacher forcing causes it to converge faster but [when the
trained network is exploited, it may exhibit
instability](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.378.4095&rep=rep1&type=pdf).

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can
"pick up" the meaning once the teacher tells it the first few words,
but it has not properly learned how to create the sentence from the
translation in the first place.

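Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
`teacher_forcing_ratio` up to use more of it. Note that the `train_epoch`
function below always passes the target tensor to the decoder, i.e. it
always uses teacher forcing; a `teacher_forcing_ratio` switch is not
implemented in this notebook.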


In [12]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    # Run one pass over `dataloader`, updating encoder and decoder after every
    # batch, and return the mean batch loss.
    running_loss = 0
    for input_tensor, target_tensor in dataloader:
        # Clear the gradients left by the previous batch
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Forward pass; the targets are handed to the decoder (teacher forcing)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten batch and time so every position counts as one prediction
        vocab_size = decoder_outputs.size(-1)
        flat_predictions = decoder_outputs.view(-1, vocab_size)
        flat_targets = target_tensor.view(-1)
        loss = criterion(flat_predictions, flat_targets)

        # Backward pass and parameter update
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.


In [13]:
import time
import math

def asMinutes(s):
    # Format a duration in seconds as '<minutes>m <seconds>s'.
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    # Return 'elapsed (- remaining)', where `since` is a time.time() start stamp
    # and `percent` is the completed fraction (must be non-zero).
    elapsed = time.time() - since
    # Linear extrapolation of the total duration from the progress so far
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

The whole training process looks like this:

-   Start a timer
-   Initialize optimizers and criterion
-   Create set of training pairs
-   Start empty losses array for plotting

Then we call `train_epoch` once per epoch and occasionally print the progress (%
of epochs, time so far, estimated time) and average loss.


In [14]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    '''
    Train encoder and decoder for `n_epochs` epochs with Adam and NLLLoss.

    Every `print_every` epochs the elapsed/remaining time, the epoch number,
    the progress percentage and the average loss since the last print are
    printed. Every `plot_every` epochs the average loss is recorded, and the
    recorded values are plotted with showPlot at the end. Returns nothing.
    '''
    # Make sure dropout is active: if the models were switched to eval mode
    # (e.g. after an evaluation run) and training is resumed, they would
    # otherwise keep training with dropout disabled.
    encoder.train()
    decoder.train()

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # One optimizer per network, both with the same learning rate
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    # The decoder outputs log-probabilities, hence negative log-likelihood loss
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
================

Plotting is done with matplotlib, using the array of loss values
`plot_losses` saved while training.


In [15]:
import matplotlib.pyplot as plt
# NOTE(review): this switches the backend after `%matplotlib inline` was set in
# the first cell; with 'agg' figures presumably will not be displayed inline -
# confirm this is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    '''
    Plot the recorded loss values `points` as a line chart.

    A single figure is created with plt.subplots(); the previous extra
    plt.figure() call only left an additional empty figure behind.
    '''
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.


In [16]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    # Translate one sentence without tracking gradients.
    # Returns (decoded_words, decoder_attn): the predicted words (ending with
    # '<EOS>' when the model emitted it) and the decoder's attention weights.
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        # No target tensor: the decoder feeds back its own predictions
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Most likely word id at every decoding step
        _, best_ids = decoder_outputs.topk(1)
        decoded_ids = best_ids.squeeze()

        decoded_words = []
        for step_id in decoded_ids:
            word_index = step_id.item()
            if word_index == EOS_token:
                # Stop at the first end-of-sequence marker
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[word_index])
    return decoded_words, decoder_attn

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:


In [17]:
def evaluateRandomly(encoder, decoder, n=10, eval_pairs=None):
    '''
    Translate `n` randomly chosen pairs and print source, reference and output.

    eval_pairs: optional list of [source, target] pairs to sample from (for
        example a validation or test split). When None, the module-level
        `pairs` list is used, which is the previous behaviour.
    Each pair is printed as '> source', '= reference', '< model output'.
    '''
    candidates = pairs if eval_pairs is None else eval_pairs
    for _ in range(n):
        pair = random.choice(candidates)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

Training and Evaluating
=======================

With all these helper functions in place (it looks like extra work, but
it makes it easier to run multiple experiments) we can actually
initialize a network and start training.

Remember that the input sentences were heavily filtered. For this small
dataset we can use a relatively small network of 128 hidden nodes and a
single GRU layer. After about 40 minutes on a MacBook CPU we'll get
some reasonable results.

<div style="background-color: #54c7ec; color: #fff; font-weight: 700; padding-left: 10px; padding-top: 5px; padding-bottom: 5px"><strong>NOTE:</strong></div>
<div style="background-color: #f3f4f7; padding-left: 10px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px">
<p>If you run this notebook you can train, interrupt the kernel, evaluate, and continue training later. Comment out the lines where the encoder and decoder are initialized and run <code>train</code> again.</p>
</div>


In [18]:
# Model width and mini-batch size for the real training run
hidden_size = 128
batch_size = 32

# Rebuild the dataset in the Italian -> English direction and get the loader.
# NOTE(review): get_dataloader calls prepare_data again and does not return its
# pairs, so the global `pairs` still holds the earlier eng -> ita sample while
# input_lang/output_lang are replaced here. evaluateRandomly reads that global
# and may look up words missing from these vocabularies - confirm.
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

# Fresh encoder/decoder sized to the new vocabularies
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

# 80 epochs, reporting and recording the average loss every 5 epochs
train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

Reading lines...
Read 345244 sentence pairs
Trimmed to 32643 sentence pairs
Counting words...
Counted words:
ita 5466
eng 3219
2m 51s (- 42m 45s) (5 6%) 0.8493
5m 48s (- 40m 41s) (10 12%) 0.1683
8m 46s (- 38m 0s) (15 18%) 0.0803


Set dropout layers to `eval` mode


In [None]:
# Switch both networks to evaluation mode (disables dropout) before translating
encoder.eval()
decoder.eval()
# Print 10 random source / reference / prediction triples
evaluateRandomly(encoder, decoder)

Visualizing Attention
=====================

A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run `plt.matshow(attentions)` to see attention output
displayed as a matrix. For a better viewing experience we will do the
extra work of adding axes and labels:


In [None]:
def showAttention(input_sentence, output_words, attentions):
    # Draw the attention weights as a matrix image: one row per output word,
    # one column per input word (plus '<EOS>').
    # `attentions` must be a tensor; it is moved to the CPU and converted to numpy.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # The leading '' is an extra label in front of the real ones.
    # NOTE(review): it presumably pairs with a tick placed before column/row 0
    # by the MultipleLocator below - the label/tick alignment depends on the
    # matplotlib version; confirm.
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # Translate `input_sentence` with the module-level encoder/decoder, print the
    # result and plot the attention weights of the produced words.
    translated_words, attention_maps = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
    print('input =', input_sentence)
    print('output =', ' '.join(translated_words))
    # Keep the first batch element and only as many decoding steps as words produced
    n_steps = len(translated_words)
    showAttention(input_sentence, translated_words, attention_maps[0, :n_steps, :])


# Translate a few hand-written Italian sentences and plot their attention maps.
# Every word must already exist in the input vocabulary, otherwise the lookup
# in indexesFromSentence raises KeyError.
evaluateAndShowAttention('io sono troppo stanco per guidare')

evaluateAndShowAttention('tu sei un bravo ragazzo')

evaluateAndShowAttention('ci sono facendo gli spaghetti')

evaluateAndShowAttention('lui e molto carino')

Exercises
=========

-   Try with a different dataset
    -   Another language pair
    -   Human → Machine (e.g. IOT commands)
    -   Chat → Response
    -   Question → Answer
-   Replace the embeddings with pretrained word embeddings such as
    `word2vec` or `GloVe`
-   Try with more layers, more hidden units, and more sentences. Compare
    the training time and results.
-   If you use a translation file where pairs have two of the same
    phrase (`I am test \t I am test`), you can use this as an
    autoencoder. Try this:
    -   Train as an autoencoder
    -   Save only the Encoder network
    -   Train a new Decoder for translation from there
