# Welcome to Seq2Seq Translation!

As part of understanding LSTMs, we will build a Seq2Seq translator for translating French to English.

In this module, we seek to understand and implement an attention mechanism for learning Seq2Seq translation.

Just as in the previous assignments, please follow along the instructions provided in the cells/comments...

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
use_cuda = torch.cuda.is_available()

# Define a Language Class

Parameters:
1. word2index dict
2. index2word dict
3. word2count dict - to filter-out infrequent words
4. n_words variable - for counting the number of unique words in the language (including the SOS and EOS tokens)

Input:

1. Name of the Language -> French/English etc

Methods:

1. addSentence using addWord
2. addWord

In [2]:
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Indices 0 and 1 are reserved for the start-of-sentence and end-of-sentence
    tokens, so the first real word added receives index 2.

    Attributes:
        name: label of the language (e.g. 'eng', 'fra').
        word2index: word -> integer index (reserved tokens are not listed).
        index2word: integer index -> word, including the reserved tokens.
        word2count: word -> number of times it was passed to addWord.
        n_words: number of indices in use, reserved tokens included.
    """

    def __init__(self,name):
        self.name = name
        self.word2index = {}
        # Register the reserved tokens so that every index below n_words can
        # be decoded; without them a predicted SOS index raises KeyError.
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.word2count = {}
        self.n_words = 2 #Including SOS and EOS token for each language

    def addSentence(self,sentence):
        """Add every space-separated token of `sentence` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self,word):
        """Register `word` under the next free index, or bump its count if known."""
        if word not in self.word2index:
            self.word2count[word] = 1
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Utilities for String Preprocessing

This module is already completed. This is basic string processing in python using regex and inbuilt utility functions

1. Convert string to lowerCase
2. remove leading or trailing spaces - use strip()
3. Convert all UniCode to ASCII Characters

In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining marks (accents etc.) removed.

    The string is NFD-decomposed, which splits an accented letter into its
    base letter plus combining marks, and every character of Unicode category
    'Mn' is then dropped. Characters that have no such decomposition are
    returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    """Canonicalise a raw sentence for vocabulary building.

    Steps, in order: lowercase, strip surrounding whitespace, remove accents
    via unicodeToAscii, insert a space in front of each '.', '!' or '?', and
    replace every run of characters other than letters and those three
    punctuation marks with a single space. The result is not stripped again,
    so it may still start or end with a space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    # Detach sentence punctuation so it becomes its own token
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

# Load data

Note: The data is present in eng-fra.txt file. You need to load the data using the following steps

1. Load data from file, removing spaces and splitting by new-line
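2. Each line has 2 parts, separated by a tab (the code reverses each pair afterwards so that the French sentence comes first):
    1. The English text
    2. Tab separation '\t'
    3. The French translation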
    
3. Normalize each of the strings and create pairs of sentences.
4. Create a Lang object (vocabulary) for each language
5. Return input_lang,output_lang and the pairs

In [4]:
def readLangs(lang1,lang2):
    """Load the sentence pairs and create an empty vocabulary per language.

    Reads 'data/LSTM/eng-fra.txt', where each line holds tab-separated
    sentences, normalises every sentence and reverses the order of each pair.

    Args:
        lang1: name given to the output (target) vocabulary.
        lang2: name given to the input (source) vocabulary.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang objects and the list
        of reversed, normalised sentence pairs.
    """
    # The context manager closes the file handle as soon as it has been read
    with open('data/LSTM/eng-fra.txt', encoding='utf-8') as data_file:
        lines = data_file.read().strip().split("\n")

    pairs = [[normalizeString(p) for p in line.split('\t')] for line in lines]

    # Swap the column order so the second column of the file becomes the input
    pairs = [list(reversed(p)) for p in pairs]
    input_lang = Lang(lang2)
    output_lang = Lang(lang1)

    return input_lang,output_lang,pairs

# A sentence is kept only if it has fewer than this many space-separated tokens
MAX_LENGTH = 10

# Openers the second sentence of a pair must start with to be kept.
# NOTE(review): "he is", "she is", "she s", "you are", "we are" and "they are"
# have no trailing space, so e.g. "she s" also matches "she said" - confirm
# that this is intended.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True to keep the pair `p`, False to discard it.

    A pair is kept when both sentences have fewer than MAX_LENGTH
    space-separated tokens and the second sentence starts with one of
    eng_prefixes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list with the pairs accepted by filterPair, order preserved."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept

def prepareData(lang1,lang2):
    """Load, filter and index the corpus, printing a short summary.

    Calls readLangs, drops the pairs rejected by filterPairs, then feeds the
    first sentence of every remaining pair to the input vocabulary and the
    second sentence to the output vocabulary.

    Returns:
        (input_lang, output_lang, pairs) with both vocabularies populated and
        `pairs` holding only the retained sentence pairs.
    """
    source_vocab, target_vocab, all_pairs = readLangs(lang1, lang2)
    kept_pairs = filterPairs(all_pairs)

    for entry in kept_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])

    print("Trimmed to %s sentence pairs" % len(kept_pairs))
    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)

    return source_vocab, target_vocab, kept_pairs

# Build both vocabularies and the filtered sentence pairs. readLangs swaps the
# two names, so input_lang is the 'fra' vocabulary and output_lang the 'eng' one.
input_lang,output_lang,pairs = prepareData('eng','fra')
# Print one randomly chosen pair as a quick sanity check of the preprocessing
print(random.choice(pairs))

Trimmed to 10853 sentence pairs
Counted words:
fra 4489
eng 2925
['tu es un idiot .', 'you re an idiot .']


# Encoder

<img src ="images/lstm2.png">

We are going to simulate the network shown above. It consists of 2 stages. One is the encoding phase and other is the decoding phase..

The Encoder module follows the network structure given below. The purpose of the encoder is two-fold:

1. Output some value for every word in the input sequence
2. For every input word, output a vector and a hidden state.

This hidden state is used as "input" for the next word of the input sequence.

<img src = "images/lstm1.png" >

Here, we are using a different type of RNN cell, called the GRU (Gated Recurrent Unit). It is a very popular variant of the LSTM cell.

Please fill in the code below to build the encoder network. You need to fill in the RHS of each line

In [5]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: width of both the embeddings and the GRU hidden state.
        n_layers: how many times the same single-layer GRU is applied to a
            token in forward (default 1).
    """

    def __init__(self,input_size,hidden_size,n_layers=1):
        super(EncoderRNN,self).__init__()

        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # Token index -> vector of length hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Embedding and hidden state share one width, hence hidden -> hidden
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self,input,hidden):
        """Encode one token.

        The embedded token is reshaped to (1, 1, -1) - sequence length 1 and
        batch size 1 - and passed through the GRU n_layers times, each pass
        reusing the same weights and the hidden state of the previous pass.

        Returns:
            (output, hidden) as returned by the last GRU pass.
        """
        output = self.embedding(input).view(1, 1, -1)
        for _ in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return Variable(torch.zeros(1, 1, self.hidden_size))

# Attention Decoder

<img src="images/lstm2.png">

The decoder is another RNN that takes the encoder's output vector(s) and maps them to a sequence of words (the translation).

It takes the "context vector" from the encoder (the encoder's last hidden state) as its initial hidden state. At every step, it is given an input token and a hidden state. The initial input token is the <SOS> token and the initial hidden state is the context vector from the encoder.

# Attention Overview
<img src = "images/lstm5.png">

Please read this link - it is the best explanation of attention I found online.

Link: https://distill.pub/2016/augmented-rnns/

To give an overview, attention mainly deals with "focusing" on the important parts of the input. How does attention work?

# Attention Module

<img src = "images/lstm4.png">

In [6]:
class AttentionDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embeddings and of the GRU hidden state.
        output_size: size of the output vocabulary.
        n_layers: how many times the same single-layer GRU is applied per
            decoding step (default 1).
        dropout_percent: dropout probability applied to the embedded input.
        max_length: number of encoder positions the attention weights span;
            `encoder_outputs` passed to forward must have this many rows.
    """

    def __init__(self,hidden_size,output_size,n_layers=1,dropout_percent=.25,max_length=10):
        super(AttentionDecoderRNN,self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_percent = dropout_percent
        self.max_length = max_length

        # Output-token index -> vector of length hidden_size
        self.embedding = nn.Embedding(output_size,hidden_size)
        # [hidden ; embedded] -> one attention score per encoder position
        self.attn = nn.Linear(hidden_size*2,self.max_length)
        # [embedded ; attended context] -> GRU input
        self.attn_combine = nn.Linear(self.hidden_size*2,self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_percent)
        self.gru = nn.GRU(hidden_size,hidden_size)
        # GRU output -> one score per output-vocabulary word
        self.out = nn.Linear(hidden_size,output_size)

    def forward(self,input,hidden,encoder_output,encoder_outputs):
        """Run one decoding step.

        Args:
            input: index of the previous output token.
            hidden: current decoder hidden state.
            encoder_output: accepted for interface compatibility; not used.
            encoder_outputs: stacked encoder outputs, one row per position.

        Returns:
            (output, hidden, attn_weights): log-probabilities over the output
            vocabulary, the new hidden state, and the attention weights.
        """
        # Sequence length 1, batch size 1
        embedded = self.embedding(input).view(1,1,-1)
        embedded = self.dropout(embedded)

        # The scores are 2-D (1, max_length); normalise across the positions.
        # dim is spelled out because the implicit choice is deprecated.
        attn_scores = self.attn(torch.cat((hidden[0],embedded[0]),1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs (batch matrix multiply, batch of 1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0],attn_applied[0]),1)
        output = self.attn_combine(output).unsqueeze(0)

        for _ in range(self.n_layers):
            output = F.relu(output)
            output,hidden = self.gru(output,hidden)

        # (1, output_size) scores -> log-probabilities across the vocabulary
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output,hidden, attn_weights

# Preparing Training Data

You have pairs [French_sentence, English_sentence]. We now need an index representation of each of these sentences (one vocabulary index per word), w.r.t. their own respective vocabularies. This module will focus on preparing your training data.


In [7]:
# given a sentence from language convert to a index-vector
def indexesFromSentence(lang,sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Raises:
        KeyError: if a token is missing from lang.word2index.
    """
    vocab = lang.word2index
    tokens = sentence.split(' ')
    return [vocab[token] for token in tokens]

# convert the index-vector to tensor
def variableFromSentence(lang,sentence):
    """Encode `sentence` as a column of token indices terminated by EOS_token.

    Returns:
        A Variable wrapping a LongTensor of shape (number_of_tokens + 1, 1).
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    # view(-1, 1) turns the flat index list into a column vector
    column = torch.LongTensor(token_ids).view(-1, 1)
    return Variable(column)

def variableFromPair(pair):
    """Convert a sentence pair into an (input, target) pair of index columns.

    pair[0] is encoded with the module-level input_lang vocabulary and
    pair[1] with output_lang.
    """
    return (
        variableFromSentence(input_lang, pair[0]),
        variableFromSentence(output_lang, pair[1]),
    )

In [8]:
# Convert one pair (index 1000) into index columns - presumably a smoke test
# of the conversion helpers.
input_variable,target_variable = variableFromPair(pairs[1000])

# Training the Model

Now that we have defined our model, we need to train it to be able to perform seq2seq translation. The overview of the training process is as follows:

1. Initialize hidden_layers with zeros
2. Zero grad optimizers for both Encoder and Decoder RNN (Remember - there are 2 RNNs for each iteration)
3. Define input and target lengths
4. Define loss = 0

## EncoderRNN Training

1. For each word in the input sentence, pass it through the encoder
2. The hidden state from each time-step is fed into the next time-step, and the output of every time-step is saved for the attention decoder


<img src="images/lstm2.png">

Please complete the below module as per the steps given below.

In [9]:
def train(input_variable,target_variable,encoder,decoder,encoder_optimizer,decoder_optimizer, criterion,max_length=10,teacher_forcing_ratio=.5):
    """Run one optimisation step on a single sentence pair.

    The input is encoded token by token, then the target is decoded either
    with teacher forcing (the ground-truth token is fed back) or with the
    decoder's own greedy prediction, chosen at random per call.

    Args:
        input_variable: column of input token indices, one row per token.
        target_variable: column of target token indices, one row per token.
        encoder, decoder: the two networks being trained.
        encoder_optimizer, decoder_optimizer: optimizers of those networks.
        criterion: loss applied to each decoder output and its target token.
        max_length: number of rows allocated for the encoder outputs; the
            input must not have more tokens than this.
        teacher_forcing_ratio: probability of using teacher forcing.

    Returns:
        The summed step losses divided by the target length, as a Python
        number. The divisor is the full target length even when greedy
        decoding stopped early at EOS.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    loss = 0
    # One row per input position; rows beyond input_length stay zero
    encoder_outputs = Variable(torch.zeros(max_length,encoder.hidden_size))

    for ei in range(input_length):
        # The hidden state is carried from one input token to the next
        encoder_output,encoder_hidden = encoder(input_variable[ei],encoder_hidden)
        encoder_outputs[ei]=encoder_output[0][0]

    # Decoding starts from the SOS token and the encoder's final hidden state
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output,decoder_hidden, decoder_attention = decoder(decoder_input,decoder_hidden,encoder_output,encoder_outputs)
            loss += criterion(decoder_output,target_variable[di])
            # Feed the ground-truth token as the next input
            decoder_input = target_variable[di]
    else:
        for di in range(target_length):
            decoder_output,decoder_hidden, decoder_attention = decoder(decoder_input,decoder_hidden,encoder_output,encoder_outputs)
            topv,topi = decoder_output.data.topk(1)
            # topk may hand back a 0-dim tensor instead of a plain int
            # depending on the PyTorch release; int() normalises both.
            next_index = int(topi[0][0])
            # Feed the decoder's own best guess as the next input
            decoder_input = Variable(torch.LongTensor([[next_index]]))
            # Accumulate only; backprop happens once decoding is complete
            loss += criterion(decoder_output,target_variable[di])
            if next_index==EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Newer PyTorch rejects indexing a 0-dim loss with [0]; use .item() where
    # it exists and keep the old accessor as a fallback.
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return loss_value/target_length
                                 

In [None]:
def trainIters(encoder,decoder,n_iter,print_every=1000,learning_rate = .01):
    """Train both networks for `n_iter` single-pair steps and plot the loss.

    Each iteration trains on one sentence pair drawn at random (with
    replacement) from the module-level `pairs`. Every `print_every`
    iterations the average loss since the previous report is printed and
    recorded; the recorded averages are plotted at the end.

    Args:
        encoder, decoder: networks to train.
        n_iter: number of training iterations.
        print_every: reporting interval, in iterations.
        learning_rate: learning rate of both SGD optimizers.
    """
    plot_loss = []
    print_loss = 0

    encoder_optimizer = optim.SGD(encoder.parameters(),lr = learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(),lr = learning_rate)

    # Draw and convert all training examples up front
    training_set = [variableFromPair(random.choice(pairs)) for _ in range(n_iter)]
    criterion = nn.NLLLoss()

    for i in range(1,n_iter+1):
        input_variable, target_variable = training_set[i-1]
        loss = train(input_variable,target_variable,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion)
        print_loss += loss

        if i%print_every==0:
            avg_loss = print_loss/print_every
            print ("Iteration %d and Loss %f" % (i,avg_loss))
            print_loss = 0
            plot_loss.append(avg_loss)

    # subplots() already creates the figure; a preceding plt.figure() would
    # only leave an extra empty figure behind.
    _fig, ax = plt.subplots()
    ax.plot(plot_loss)
    plt.show()

In [None]:
# Width shared by the embeddings and the GRU hidden state of both networks
hidden_size = 256
# The vocabulary sizes come from the Lang objects returned by prepareData
encoder = EncoderRNN(input_lang.n_words,hidden_size)
decoder = AttentionDecoderRNN(hidden_size,output_lang.n_words)

# One randomly drawn sentence pair per iteration
trainIters(encoder,decoder,n_iter=75000)

In [None]:
def evaluatePair(encoder,decoder,sentence,max_length = 10):
    """Greedily translate `sentence` and collect the attention weights.

    Args:
        encoder, decoder: trained networks.
        sentence: normalised input sentence; every token must be present in
            the module-level input_lang vocabulary.
        max_length: maximum number of decoding steps, and the number of rows
            allocated for the encoder outputs.

    Returns:
        (decoder_words, attentions): the decoded words (ending with '<EOS>'
        if the decoder emitted it within max_length steps) and one row of
        attention weights per decoding step performed.
    """
    # NOTE(review): the modules are not switched to eval mode here, so any
    # dropout inside them stays active if they are in training mode - confirm
    # whether that is intended.
    input_variable = variableFromSentence(input_lang,sentence)
    input_length = input_variable.size()[0]

    encoder_hidden = encoder.initHidden()
    encoder_outputs = Variable(torch.zeros(max_length,encoder.hidden_size))

    for ei in range(input_length):
        encoder_output,encoder_hidden = encoder(input_variable[ei],encoder_hidden)
        encoder_outputs[ei] = encoder_output[0][0]

    # Decoding starts from the SOS token and the encoder's final hidden state
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_hidden = encoder_hidden

    decoder_words = []
    decoder_att = torch.zeros(max_length,max_length)
    for di in range(max_length):
        decoder_output,decoder_hidden, decoder_attention = decoder(decoder_input,decoder_hidden,encoder_output,encoder_outputs)
        topv, topi = decoder_output.data.topk(1)
        # Convert to a plain int: a 0-dim tensor is not a usable dictionary
        # key, so index2word[next_index] would raise KeyError with it.
        next_index = int(topi[0][0])
        decoder_att[di] = decoder_attention.data
        if next_index==EOS_token:
            decoder_words.append('<EOS>')
            break
        else:
            decoder_words.append(output_lang.index2word[next_index])

        decoder_input = Variable(torch.LongTensor([[next_index]]))

    # Keep only the attention rows of the steps that were actually decoded
    return decoder_words,decoder_att[:di+1]

def evaluate(encoder,decoder,n=10):
    """Translate `n` randomly chosen pairs and display the results.

    For each pair the input sentence, the reference translation and the
    model's translation are printed, followed by the attention matrix, which
    is also shown as an image.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print ("Input: ", sample[0])
        print ("Ground Truth: ", sample[1])

        translated_words, attn_matrix = evaluatePair(encoder, decoder, sample[0])
        print ("Translation: ", ' '.join(translated_words))
        print ("\n\n")

        print (attn_matrix)
        plt.matshow(attn_matrix.numpy())
        plt.show()

In [None]:
# Print and plot the translations of five randomly chosen pairs
evaluate(encoder,decoder,n=5)

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map with word labels.

    Args:
        input_sentence: source sentence; its space-separated words plus
            '<EOS>' label the x axis.
        output_words: decoded words, used to label the y axis.
        attentions: tensor of attention weights, converted with .numpy().
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Both label lists start with an empty string, as in the original layout
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    y_labels = [''] + output_words
    axes.set_xticklabels(x_labels, rotation=90)
    axes.set_yticklabels(y_labels)

    # One tick, and therefore one label, per matrix row and column
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate `input_sentence` with the module-level encoder and decoder,
    print the input and the translation, and plot the attention weights."""
    translated_words, attn_matrix = evaluatePair(encoder, decoder, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(translated_words))
    showAttention(input_sentence, translated_words, attn_matrix)


# Translate a few hand-picked French sentences and plot their attention maps.
# Every word must already be in the input vocabulary: indexesFromSentence does
# a plain dictionary lookup and raises KeyError for an unknown word.
evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")