[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)

# Attention for seq2seq

This notebook has been adapted from the PyTorch tutorial [NLP FROM SCRATCH: TRANSLATION WITH A SEQUENCE TO SEQUENCE NETWORK AND ATTENTION](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) by Sean Robertson; the adaptation was made by Marc Lelarge for the [deep learning course](https://dataflowr.github.io/website/).

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Where the data is stored.
# Local setup: the dataset is looked up under "<home directory>/data/".
# On Colab, comment out this local setup (down to the data_path assignment).
import os
from pathlib import Path

ROOT_DIR = Path.home()
data_path = os.path.join(ROOT_DIR,'data/')

# On Colab, uncomment the three lines below instead: they download and unzip
# the dataset into the working directory and point data_path at it.
#!wget https://download.pytorch.org/tutorial/data.zip
#!unzip data.zip
#data_path = './'

In [None]:
def running_mean(x, N=100):
    """Return the moving average of `x` over a sliding window of `N` samples.

    Uses the cumulative-sum trick: the sum of each window is the difference
    between two prefix sums. The result has ``len(x) - N + 1`` entries.
    """
    # Prepend a zero so that prefix_sums[k] is the sum of the first k samples.
    zero_padded = np.insert(x, 0, 0)
    prefix_sums = np.cumsum(zero_padded)
    window_sums = prefix_sums[N:] - prefix_sums[:-N]
    return window_sums / float(N)

# data preprocessing

This code is taken directly from the PyTorch tutorial. It creates the corpus of French–English sentence pairs as well as the French and English tokenizers.

In [None]:
# Reserved vocabulary indices: start-of-sentence and end-of-sentence markers.
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of one language: word <-> index maps and word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
    `index2word`, so the first real word that is added gets index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of `word`; a new word gets the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        fresh_index = self.n_words
        self.word2index[word] = fresh_index
        self.word2count[word] = 1
        self.index2word[fresh_index] = word
        self.n_words = fresh_index + 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with its diacritics removed.

    The string is decomposed (NFD) so that accents become separate combining
    characters, and every character of category 'Mn' (non-spacing mark) is
    dropped. Characters without a decomposition are left as they are.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    """Normalize one raw sentence into space-separated lowercase tokens.

    Steps: lowercase and strip, remove diacritics, put a space in front of
    '.', '!' and '?', then collapse every run of other non-letter characters
    into a single space. The result is stripped again so that a sentence
    starting or ending with digits/punctuation does not keep a stray space,
    which would otherwise yield empty-string tokens when split on ' '.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

def readLangs(lang1, lang2, reverse=False):
    """Load the tab-separated corpus for `lang1`-`lang2` and build empty vocabularies.

    The file ``data/<lang1>-<lang2>.txt`` is looked up under the notebook-level
    `data_path`. Each line is split on tabs and every field is normalized.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: if True, reverse each pair and swap the two languages.

    Returns:
        (input_lang, output_lang, pairs): two empty `Lang` objects and the
        list of normalized sentence pairs.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager guarantees the
    # file handle is closed even if reading fails.
    with open(data_path+'data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

# Length threshold, in space-separated tokens, used by the pair filters below
# and as the default number of decoding steps in evaluate().
MAX_LENGTH = 10

# Sentence openings (after normalization, where apostrophes have become
# spaces) that a pair's second sentence must start with to be kept.
# Entries without a trailing space ("he is", "you are", ...) also match any
# sentence whose text merely continues after those characters.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    """Tell whether pair `p` is short enough and starts with an allowed prefix.

    Both sentences must have fewer than MAX_LENGTH space-separated tokens and
    the second one must start with one of `eng_prefixes`.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPair_test(p):
    """Tell whether pair `p` is a long pair that starts with an allowed prefix.

    Mirror image of filterPair: both sentences must have more than MAX_LENGTH
    space-separated tokens and the second one must start with one of
    `eng_prefixes`.
    """
    if len(p[0].split(' ')) <= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) <= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    """Return the list of pairs accepted by filterPair, in their original order."""
    return list(filter(filterPair, pairs))

def filterPairs_test(pairs):
    """Return the list of pairs accepted by filterPair_test, in their original order."""
    return list(filter(filterPair_test, pairs))

def prepareData(lang1, lang2, reverse=False):
    """Read the corpus, keep the short pairs and build both vocabularies.

    Returns:
        (input_lang, output_lang, pairs): the two populated `Lang` objects and
        the filtered list of sentence pairs.
    """
    source_vocab, target_vocab, sentence_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(sentence_pairs))

    sentence_pairs = filterPairs(sentence_pairs)
    print("Trimmed to %s sentence pairs" % len(sentence_pairs))

    print("Counting words...")
    for sentence_pair in sentence_pairs:
        # Element 0 feeds the input vocabulary, element 1 the output one.
        source_vocab.addSentence(sentence_pair[0])
        target_vocab.addSentence(sentence_pair[1])

    print("Counted words:")
    print(source_vocab.name, source_vocab.n_words)
    print(target_vocab.name, target_vocab.n_words)
    return source_vocab, target_vocab, sentence_pairs

# reverse=True swaps the columns of the 'eng-fra' file, so each pair becomes
# [French, English]: French is the input language, English the output.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Show one random pair as a sanity check.
print(random.choice(pairs))

In [None]:
# Hold out 20% of the sentence pairs for validation; the fixed random_state
# makes the split reproducible.
pairs_train, pairs_val = train_test_split(pairs, test_size=0.2, random_state=42)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Raises:
        KeyError: if a token is not in ``lang.word2index``.
    """
    tokens = sentence.split(' ')
    token_ids = []
    for token in tokens:
        token_ids.append(lang.word2index[token])
    return token_ids

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of word indices ending with EOS.

    Returns a long tensor of shape (number of tokens + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorsFromPair(pair):
    """Encode a sentence pair as an (input tensor, target tensor) tuple.

    Element 0 is encoded with `input_lang`, element 1 with `output_lang`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    encoded_source = tensorFromSentence(input_lang, source_sentence)
    encoded_target = tensorFromSentence(output_lang, target_sentence)
    return (encoded_source, encoded_target)

In [None]:
# Encode the whole validation split once, so it can be reused at every evaluation.
val_pairs = [tensorsFromPair(pv) for pv in pairs_val]

# Seq2seq

Here we follow the PyTorch tutorial and implement [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215v3). The only modification in the code is that the encoder takes a full sentence (and no hidden state) in the forward pass and outputs all the corresponding hidden states. As a result, there is no need for a for loop over the encoder inputs. Still, to keep things simple, we do not deal with batches; if you want to deal with batches, have a look at [Batches with sequences in Pytorch](https://dataflowr.github.io/website/modules/11c-batches-with-sequences/).

We also train on a training set and compute the loss on a validation set (see the split done on the corpus above).

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that processes a whole sentence in a single forward call.

    Args:
        input_size: size of the input vocabulary (number of embeddings).
        hidden_size: dimension of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input):
        """Encode a sentence.

        Args:
            input: long tensor of word indices of shape (L, 1), i.e. one
                sentence of L tokens with a batch dimension of 1.

        Returns:
            Tensor of shape (L, 1, hidden_size) holding the GRU hidden state
            at every position; the final GRU state is discarded because it
            is already the last row of this output.
        """
        embedded = self.embedding(input)
        output, _ = self.gru(embedded, self.initHidden())
        return output

    def initHidden(self):
        """Return a zero initial state of shape (1, 1, hidden_size).

        The state is created on the module's own device and dtype rather
        than on a notebook-level global, so the module keeps working after
        being moved with ``.to(...)``.
        """
        reference = self.embedding.weight
        return torch.zeros(1, 1, self.hidden_size,
                           device=reference.device, dtype=reference.dtype)

In [None]:
# A handful of randomly chosen encoded pairs, used below to try the models
# on concrete inputs.
n_iters = 4
training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]

In [None]:
training_pairs[0][0]

In [None]:
training_pairs[0][1]

In [None]:
# hidden_size is a notebook-level variable: it is reassigned in later cells
# and read by the cells that reshape the encoder output.
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)

In [None]:
# Encode one whole sentence in a single call: `out` holds one hidden state
# per input position.
one_input = training_pairs[0][0]
out = encoder(one_input)
# For comparison, here is the per-token loop used in the original tutorial:
#encoder_hidden = encoder.initHidden()
#for c in one_input:
#    out, encoder_hidden = encoder(c,encoder_hidden)

In [None]:
out.shape

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention that emits one token per call.

    Args:
        hidden_size: dimension of both the embeddings and the GRU state.
        output_size: size of the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: long tensor holding a single word index (any shape with
                one element; it is reshaped to (1, 1, hidden_size) after
                embedding).
            hidden: previous GRU state of shape (1, 1, hidden_size).

        Returns:
            (output, hidden): log-probabilities over the output vocabulary,
            shape (1, output_size), and the new GRU state.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the length-1 sequence dimension before projecting.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return a zero initial state of shape (1, 1, hidden_size).

        The state is created on the module's own device and dtype rather
        than on a notebook-level global, so the module keeps working after
        being moved with ``.to(...)``.
        """
        reference = self.out.weight
        return torch.zeros(1, 1, self.hidden_size,
                           device=reference.device, dtype=reference.dtype)

In [None]:
# Decoder over the output vocabulary, using the same hidden size as the encoder.
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

In [None]:
# Decoding starts from the SOS token ...
decoder_input = torch.tensor([[SOS_token]], device=device)
# ... and from the encoder state at the last input position, reshaped to the
# (1, 1, hidden_size) layout used for the decoder hidden state.
decoder_hidden = out[-1,:,:].view(1,1,hidden_size)

In [None]:
decoder_hidden.shape

In [None]:
output, hidden = decoder(decoder_input, decoder_hidden)

In [None]:
def train_onepair(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.5):
    """Run one optimisation step on a single sentence pair (no attention).

    Args:
        input_tensor: encoded source sentence, shape (L_in, 1).
        target_tensor: encoded target sentence, shape (L_out, 1).
        encoder, decoder: the two networks; both are put in train mode.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output and target token.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the decoder's own prediction) as the next input.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder.train()
    decoder.train()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    target_length = target_tensor.size(0)
    loss = 0
    encoder_outputs = encoder(input_tensor)
    decoder_input = torch.tensor([[SOS_token]], device=device)
    # Seed the decoder with the encoder state at the last position. The size
    # is inferred (-1) from the tensor itself instead of being read from the
    # notebook-level `hidden_size`, which other cells reassign.
    decoder_hidden = encoder_outputs[-1].view(1, 1, -1)

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the model predicts EOS; the loss is still
            # normalised by the full target length below.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def eval_onepair(input_tensor, target_tensor, encoder, decoder, criterion, teacher_forcing_ratio=0.5):
    """Compute the loss on a single sentence pair without updating the models.

    Mirrors train_onepair (including the random choice between teacher
    forcing and free-running decoding) but performs no backward pass and no
    optimizer step.

    Args:
        input_tensor: encoded source sentence, shape (L_in, 1).
        target_tensor: encoded target sentence, shape (L_out, 1).
        encoder, decoder: the two networks; both are put in eval mode.
        criterion: loss applied to each decoder output and target token.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            as the next decoder input.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder.eval()
    decoder.eval()
    target_length = target_tensor.size(0)
    loss = 0

    # No gradients are needed here: disabling autograd avoids building a
    # computation graph for every validation pair.
    with torch.no_grad():
        encoder_outputs = encoder(input_tensor)
        decoder_input = torch.tensor([[SOS_token]], device=device)
        # Size inferred from the tensor rather than from the notebook-level
        # `hidden_size`, which other cells reassign.
        decoder_hidden = encoder_outputs[-1].view(1, 1, -1)

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                loss += criterion(decoder_output, target_tensor[di])
                decoder_input = target_tensor[di]  # Teacher forcing

        else:
            # Without teacher forcing: use its own predictions as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()
                loss += criterion(decoder_output, target_tensor[di])
                if decoder_input.item() == EOS_token:
                    break

    return loss.item() / target_length

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01, teacher_forcing_ratio=0.9):
    """Train on `n_iters` randomly drawn training pairs, validating periodically.

    Every `print_every` iterations the average loss over the whole validation
    set is computed and a progress line is printed.

    Returns:
        (plot_losses, plot_losses_val): the training loss of every iteration
        and the validation loss of every reporting point.
    """
    train_history = []
    val_history = []
    running_total = 0  # training loss accumulated since the last report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Draw (with replacement) and encode all the training examples up front.
    training_pairs = [tensorsFromPair(random.choice(pairs_train))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        step_loss = train_onepair(input_tensor, target_tensor, encoder,
                                  decoder, encoder_optimizer, decoder_optimizer,
                                  criterion, teacher_forcing_ratio)
        running_total += step_loss
        train_history.append(step_loss)

        if step % print_every != 0:
            continue

        # Reporting point: average the loss over the full validation set.
        loss_val = 0
        for val_input, val_target in val_pairs:
            loss_val += eval_onepair(val_input, val_target, encoder,
                                     decoder, criterion, teacher_forcing_ratio)
        loss_val = loss_val/len(val_pairs)
        val_history.append(loss_val)

        train_avg = running_total / print_every
        running_total = 0
        print('(%d %d%%) loss train %.4f and val %.4f' % (step, step / n_iters * 100, train_avg, loss_val))
    return train_history, val_history

In [None]:
# Smoke test: run a single optimisation step on one pair with the encoder
# and decoder built above, and display the length-normalised loss it returns.
learning_rate=0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
train_onepair(training_pairs[0][0],training_pairs[0][1],encoder,decoder,encoder_optimizer,decoder_optimizer,criterion)

In [None]:
# Full training run with fresh models.
hidden_size = 128
# Despite its name, n_epochs is passed to trainIters as n_iters: it is the
# number of single-pair training iterations.
n_epochs = 50000
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)
# The validation loss is computed and reported every 2500 iterations.
plot_losses,plot_losses_val = trainIters(encoder,decoder,n_epochs,print_every=2500)

In [None]:
# Training loss, smoothed with a moving average (default window of 100 iterations).
plt.plot(running_mean(plot_losses))
# The validation loss was measured at iterations 2500, 5000, ..., n_epochs
# (print_every=2500 above), so each point is placed at the iteration where
# it was actually computed instead of one interval earlier.
plt.plot(list(range(2500, n_epochs + 1, 2500)), plot_losses_val)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the encoder/decoder pair (no attention).

    Args:
        encoder, decoder: trained networks.
        sentence: normalized source sentence; every token must belong to the
            input vocabulary.
        max_length: maximum number of decoding steps.

    Returns:
        The list of decoded words; it ends with '<EOS>' when the decoder
        produced the end-of-sentence token within `max_length` steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_outputs = encoder(input_tensor)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        # Seed the decoder with the encoder state at the last position; the
        # size is inferred from the tensor rather than read from the
        # notebook-level `hidden_size`, which other cells reassign.
        decoder_hidden = encoder_outputs[-1].view(1, 1, -1)

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)

            # Greedy decoding: keep the most likely word at each step.
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random validation pairs together with the model's translation.

    For each pair: '>' is the source sentence, '=' the reference translation
    and '<' the model output.
    """
    for _ in range(n):
        source_sentence, reference = random.choice(pairs_val)[:2]
        print('>', source_sentence)
        print('=', reference)
        predicted_words = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
evaluateRandomly(encoder, decoder)

# Attention mechanism

Here we implement the attention mechanism from the paper [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473). The code below is significantly different from the original PyTorch tutorial...

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with an attention mechanism over the encoder outputs.

    The body of `forward` is intentionally left as an exercise: as written,
    calling it raises a NameError because `output` and `attn_weights` are
    never assigned.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        # NOTE(review): padding_idx=0 keeps the embedding of index 0 at zero
        # and excludes it from gradient updates; index 0 is also SOS_token in
        # this notebook - confirm this is intended.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size, padding_idx=0)
        # Attention scoring layers: attn_w maps a 2H vector to H and attn_v
        # maps H to a scalar (presumably an additive score computed from the
        # decoder state concatenated with one encoder output - to be
        # confirmed by the implementation of forward).
        self.attn_w = nn.Linear(2 * self.hidden_size, self.hidden_size)
        self.attn_v = nn.Linear(self.hidden_size, 1)
        # Both the GRU and the output projection take 2H-dimensional inputs
        # (presumably an H-dimensional vector concatenated with the
        # H-dimensional attention context - TODO confirm).
        self.gru = nn.GRU(self.hidden_size * 2, self.hidden_size)
        self.out = nn.Linear(self.hidden_size * 2, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step with attention (to be implemented).

        The callers in this notebook feed `output` to nn.NLLLoss together
        with a single target index, and read the attention weights with
        ``attn_weights[:, 0, 0]``; this suggests `output` holds
        log-probabilities of shape (1, output_size) and `attn_weights` has
        shape (L, 1, 1) - confirm when implementing.
        """
        # B = 1 batch size
        # encoder_outputs (L,B,H)
        seq_len, _, _ = encoder_outputs.shape
        # hidden (1,B,H)
        #
        # your code here
        #
        return output, hidden, attn_weights

    def initHidden(self):
        # Zero initial state of shape (1, 1, hidden_size), allocated on the
        # notebook-level `device`.
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# Fresh encoder (hidden size 256) and the encoded version of one sample
# sentence, used to try the attention decoder below.
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
one_input = training_pairs[0][0]
out = encoder(one_input)

In [None]:
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

In [None]:
# With attention, the decoder starts from a zero hidden state and receives
# all the encoder outputs, instead of being seeded with the last one.
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = decoder.initHidden()
encoder_outputs = out

In [None]:
decoder_input.shape

In [None]:
output, hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_outputs)

In [None]:
attn_weights.shape

In [None]:
torch.sum(attn_weights,dim=0)

In [None]:
def train_onepair(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.5):
    """Run one optimisation step on a single sentence pair (attention decoder).

    Args:
        input_tensor: encoded source sentence, shape (L_in, 1).
        target_tensor: encoded target sentence, shape (L_out, 1).
        encoder, decoder: the two networks; both are put in train mode. The
            decoder is called with (input, hidden, encoder_outputs) and must
            return (output, hidden, attention weights).
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output and target token.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the decoder's own prediction) as the next input.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder.train()
    decoder.train()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    target_length = target_tensor.size(0)
    loss = 0
    encoder_outputs = encoder(input_tensor)
    decoder_input = torch.tensor([[SOS_token]], device=device)
    # The decoder starts from its own initial state: the source sentence
    # reaches it only through `encoder_outputs`.
    decoder_hidden = decoder.initHidden()

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the model predicts EOS; the loss is still
            # normalised by the full target length below.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def eval_onepair(input_tensor, target_tensor, encoder, decoder, criterion, teacher_forcing_ratio=0.5):
    """Compute the loss on a single sentence pair without updating the models.

    Attention counterpart of the training step: same decoding procedure
    (including the random choice between teacher forcing and free-running
    decoding) but no backward pass and no optimizer step.

    Args:
        input_tensor: encoded source sentence, shape (L_in, 1).
        target_tensor: encoded target sentence, shape (L_out, 1).
        encoder, decoder: the two networks; both are put in eval mode.
        criterion: loss applied to each decoder output and target token.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            as the next decoder input.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder.eval()
    decoder.eval()
    target_length = target_tensor.size(0)
    loss = 0

    # No gradients are needed here: disabling autograd avoids building a
    # computation graph for every validation pair.
    with torch.no_grad():
        encoder_outputs = encoder(input_tensor)
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = decoder.initHidden()

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, _ = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(decoder_output, target_tensor[di])
                decoder_input = target_tensor[di]  # Teacher forcing

        else:
            # Without teacher forcing: use its own predictions as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, _ = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()
                loss += criterion(decoder_output, target_tensor[di])
                if decoder_input.item() == EOS_token:
                    break

    return loss.item() / target_length

In [None]:
# Smoke test of the attention model: one optimisation step on a single pair,
# displaying the length-normalised loss it returns.
learning_rate=0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
train_onepair(training_pairs[0][0],training_pairs[0][1],encoder,decoder,encoder_optimizer,decoder_optimizer,criterion)

In [None]:
# Full training run of the attention model with fresh networks.
hidden_size = 128
# Despite its name, n_epochs is passed to trainIters as n_iters: it is the
# number of single-pair training iterations.
n_epochs = 50000
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
# trainIters looks up train_onepair / eval_onepair by name when it runs, so
# it uses the most recently executed definitions of those functions.
plot_losses,plot_losses_val = trainIters(encoder,decoder,n_epochs,print_every=2500)

In [None]:
# Training loss, smoothed with a moving average (default window of 100 iterations).
plt.plot(running_mean(plot_losses))
# The validation loss was measured at iterations 2500, 5000, ..., n_epochs
# (print_every=2500 above), so each point is placed at the iteration where
# it was actually computed instead of one interval earlier.
plt.plot(list(range(2500, n_epochs + 1, 2500)), plot_losses_val)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` and record the attention weights.

    Args:
        encoder, decoder: trained networks; the decoder is called with
            (input, hidden, encoder_outputs) and returns
            (output, hidden, attention weights).
        sentence: normalized source sentence; every token must belong to the
            input vocabulary.
        max_length: maximum number of decoding steps.

    Returns:
        (decoded_words, attentions): the decoded words (ending with '<EOS>'
        when the end-of-sentence token was produced) and a CPU tensor of
        shape (number of decoding steps, number of input tokens incl. EOS)
        with the attention weights of each step.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]

        encoder_outputs = encoder(input_tensor)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = decoder.initHidden()

        decoded_words = []
        # One row per decoding step, one column per input position. Sizing
        # the columns by the actual input length (rather than max_length)
        # keeps this working for sources longer than max_length tokens.
        decoder_attentions = torch.zeros(max_length, input_length)
        n_steps = 0  # number of decoding steps actually performed

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # Keep the weights over the input positions for this step.
            decoder_attentions[di] = decoder_attention[:, 0, 0]
            n_steps = di + 1
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:n_steps]

In [None]:
# Sample French sentence to translate (the trailing comment keeps an
# alternative example at hand).
francais = "c est un jeune directeur plein de talent ."#"elle a cinq ans de moins que moi ."
output_words, attentions = evaluate(encoder, decoder, francais)

In [None]:
attentions.shape

In [None]:
torch.sum(attentions,dim=1)

In [None]:
plt.matshow(attentions.numpy())

In [None]:
indexesFromSentence(input_lang, francais)

In [None]:
tensorFromSentence(input_lang, francais)

In [None]:
output_words

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random validation pairs together with the model's translation.

    For each pair: '>' is the source sentence, '=' the reference translation
    and '<' the model output. The attention weights returned by evaluate are
    not displayed here.
    """
    for _ in range(n):
        source_sentence, reference = random.choice(pairs_val)[:2]
        print('>', source_sentence)
        print('=', reference)
        predicted_words, _ = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
evaluateRandomly(encoder, decoder)

In [None]:
import warnings
warnings.filterwarnings("ignore")
def showAttention(input_sentence, output_words, attentions):
    """Display the attention matrix as a heat map with word labels.

    Args:
        input_sentence: source sentence; its space-separated tokens followed
            by '<EOS>' label the columns.
        output_words: decoded words; they label the rows.
        attentions: CPU tensor of attention weights with one row per output
            word and one column per input token.
    """
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes: fix the tick positions explicitly (one per row/column)
    # before attaching the labels, so every label sits on its own cell and
    # no placeholder label is needed for an off-image tick.
    x_labels = input_sentence.split(' ') + ['<EOS>']
    ax.set_xticks(list(range(len(x_labels))))
    ax.set_xticklabels(x_labels, rotation=90)
    ax.set_yticks(list(range(len(output_words))))
    ax.set_yticklabels(output_words)

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate `input_sentence` with the current models and plot its attention.

    Uses the notebook-level `encoder` and `decoder`, prints the source and
    the translation, then displays the attention heat map.
    """
    translation, attention_map = evaluate(encoder, decoder, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(translation))
    showAttention(input_sentence, translation, attention_map)

In [None]:
evaluateAndShowAttention("elle a cinq ans de moins que moi .")

In [None]:
evaluateAndShowAttention("elle est trop petite .")

In [None]:
evaluateAndShowAttention("je ne crains pas de mourir .")

In [None]:
evaluateAndShowAttention("c est un jeune directeur plein de talent .")

[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)