In [1]:
import json
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import math
import matplotlib.ticker as ticker
import numpy as np
import time
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.utils.data

In [2]:
MAX_LENGTH = 50

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p

        self.word_embedding = nn.Embedding(input_size, hidden_size)
        self.pos_embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.linear = nn.Linear(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded_words = self.word_embedding(input).view(1, 1, -1)
        embedded_pos = self.pos_embedding(input).view(1, 1, -1)
        output = self.dropout(embedded_words + embedded_pos)
        
        output, hidden = self.linear(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

    
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [3]:
def read_data(file_name):
    """
    Reads the data and returns it in a list.
    """
    
    f = open(file_name, "r")
    return [line.strip().split() for line in f.readlines()]


def word_to_index(file_name):
    """
    Obtains the vocabulary of a file and returns it 
    in a dictionary to be able to use w2i.
    """
    
    file = open(file_name) 
    w2i = json.load(file)
    w2i["sos"] = len(w2i)
    return w2i


def index_to_word(dictionary):
    """
    Reverses the dictionary such that i2w can be used.
    """
    
    reversed_dict = {}
    
    for word, index in dictionary.items():
        reversed_dict[index] = word
    reversed_dict[index + 1] = "sos" 
    return reversed_dict


def sentence_to_indices(w2i, sentence):
    """
    Returns the indices of the words in a sentence in a list.
    """
    
    return [w2i[word] for word in sentence]


def sentence_to_tensor(w2i, sentence):
    """
    Returns the tensor of a sentence.
    """
    
    indices = sentence_to_indices(w2i, sentence)
    indices.append(EOS_token)
    return torch.tensor(indices, dtype=torch.long).view(-1, 1)

train_english = read_data("data/train_preprocessed.en")
train_french = read_data("data/train_preprocessed.fr")

w2i_french = word_to_index("data/train_preprocessed.fr.json")
w2i_english = word_to_index("data/train_preprocessed.en.json")

i2w_french = index_to_word(w2i_french)
i2w_english = index_to_word(w2i_english)

EOS_token = w2i_english["eos"]
SOS_token = w2i_english["sos"]
teacher_forcing_ratio = 0.5
encoder = EncoderRNN(len(i2w_english), 256)
decoder = AttnDecoderRNN(256, len(i2w_french))

In [None]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))    
    

def train(input_sentence, target_sentence, w2i_english, 
          w2i_french, encoder, decoder, encoder_optimizer, 
          decoder_optimizer, criterion, minibatch_size, 
          max_length=MAX_LENGTH):
    """
    Does one iteration of training.
    """
    
    loss = 0     
    input_tensor = sentence_to_tensor(w2i_english, input_sentence)
    target_tensor = sentence_to_tensor(w2i_french, target_sentence)

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    if input_length > MAX_LENGTH: input_length = MAX_LENGTH
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]])
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break         
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()/target_length


def train_iterations(w2i_english, w2i_french, train_english, train_french,
                     encoder, decoder, minibatch_size, n_iters, print_every=100, 
                     plot_every=100, learning_rate=0.01):
    """
    Trains the Encoder-Decoder model for a certain amount of iterations.
    """
    
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        input_sentence = train_english[iter-1]
        target_sentence = train_french[iter-1]
        loss = train(input_sentence, target_sentence, w2i_english, w2i_french,
                     encoder, decoder, encoder_optimizer, decoder_optimizer, 
                     criterion, minibatch_size)        
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter/n_iters),
                                         iter, float(iter)/n_iters*100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    showPlot(plot_losses)
    
    
trainloader_english = torch.utils.data.DataLoader(train_english, 
                                                  batch_size=32, shuffle=False, 
                                                  num_workers=8)
trainloader_french = torch.utils.data.DataLoader(train_french,
                                                batch_size=32, shuffle=False,
                                                num_workers=8)
# testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False, num_workers=8)
    
train_iterations(w2i_english, w2i_french, train_english, train_french,
                 encoder, decoder, 10, 29000)

0m 9s (- 46m 36s) (100 0%) 4.0013
0m 19s (- 47m 3s) (200 0%) 4.0461
0m 28s (- 45m 54s) (300 1%) 4.0258
0m 38s (- 45m 18s) (400 1%) 3.8602
0m 48s (- 45m 37s) (500 1%) 4.2006
0m 57s (- 45m 28s) (600 2%) 3.9255
1m 7s (- 45m 35s) (700 2%) 3.9667
1m 17s (- 45m 29s) (800 2%) 3.8481
1m 28s (- 45m 47s) (900 3%) 3.8421
1m 41s (- 47m 10s) (1000 3%) 3.7804
1m 50s (- 46m 48s) (1100 3%) 3.6539
2m 2s (- 47m 10s) (1200 4%) 3.9055
2m 12s (- 47m 0s) (1300 4%) 3.6942
2m 23s (- 47m 12s) (1400 4%) 3.7904
2m 33s (- 46m 59s) (1500 5%) 3.8079
2m 43s (- 46m 37s) (1600 5%) 3.5527
2m 55s (- 46m 50s) (1700 5%) 3.5719
3m 6s (- 46m 56s) (1800 6%) 3.6230
3m 16s (- 46m 45s) (1900 6%) 3.6656
3m 26s (- 46m 31s) (2000 6%) 3.6212
3m 36s (- 46m 14s) (2100 7%) 3.5914
3m 47s (- 46m 6s) (2200 7%) 3.6264
3m 56s (- 45m 45s) (2300 7%) 3.5148
4m 7s (- 45m 42s) (2400 8%) 3.6877
4m 17s (- 45m 24s) (2500 8%) 3.6142
4m 27s (- 45m 14s) (2600 8%) 3.4633
4m 37s (- 45m 4s) (2700 9%) 3.5433
4m 48s (- 45m 1s) (2800 9%) 3.5948
4m 57s (- 4

35m 16s (- 11m 39s) (21800 75%) 3.0539
35m 25s (- 11m 29s) (21900 75%) 2.8515
35m 34s (- 11m 19s) (22000 75%) 3.0535
35m 45s (- 11m 9s) (22100 76%) 2.9054
35m 55s (- 11m 0s) (22200 76%) 2.9969
36m 5s (- 10m 50s) (22300 76%) 3.0142
36m 17s (- 10m 41s) (22400 77%) 3.1089
