In [15]:
from __future__ import unicode_literals, print_function, division

import os
import random
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using {device} device")

Using cuda device


In [16]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20


class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [17]:
def normalize_string(df, lang):
    sentence = df[lang].str.lower()
    sentence = sentence.str.replace("[^A-Za-z\s]+", " ")
    sentence = sentence.str.normalize("NFD")
    sentence = sentence.str.encode("ascii", errors="ignore").str.decode("utf-8")
    return sentence


def read_sentence(df, lang_1, lang_2):
    sentence1 = normalize_string(df, lang_1)
    sentence2 = normalize_string(df, lang_2)
    return sentence1, sentence2


def read_file(loc, lang_1, lang_2):
    return pd.read_csv(loc, delimiter="\t", header=None, names=[lang_1, lang_2])


def process_data(lang_1, lang_2):
    df = read_file("../data/nlp/%s-%s.txt" % (lang_1, lang_2), lang_1, lang_2)
    sentence1, sentence2 = read_sentence(df, lang_1, lang_2)

    in_lang = Lang()
    out_lang = Lang()

    _pairs = []
    for i in range(len(df)):
        if len(sentence1[i].split()) < MAX_LENGTH and len(sentence2[i].split()) < MAX_LENGTH:
            full = [sentence1[i], sentence2[i]]
            in_lang.add_sentence(sentence1[i])
            out_lang.add_sentence(sentence2[i])
            _pairs.append(full)

    return in_lang, out_lang, _pairs

In [18]:
def index_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensor_from_sentence(lang, sentence):
    indexes = index_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensor_from_pair(in_lang, out_lang, pair):
    input_tensor = tensor_from_sentence(in_lang, pair[0])
    output_tensor = tensor_from_sentence(out_lang, pair[1])
    return input_tensor, output_tensor

In [19]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, n_layers):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = n_layers

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers)

    def forward(self, src):
        embedded = self.embedding(src)
        output, hidden = self.gru(embedded)
        return output, hidden

In [20]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embed_dim, n_layers):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = n_layers

        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, _input, hidden):
        _input = _input.view(1, -1)
        embedded = functional.relu(self.embedding(_input))
        output, hidden = self.gru(embedded, hidden)
        prediction = self.softmax(self.fc(output[0]))
        return prediction, hidden

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, _encoder, _decoder, _device, max_len=MAX_LENGTH):
        super(Seq2Seq, self).__init__()

        self.MAX_LENGTH = max_len
        self.encoder = _encoder
        self.decoder = _decoder
        self.device = _device

    def forward(self, input_tensor, target_tensor, _teacher_forcing_ratio=0.5):
        input_length = input_tensor.size(0)
        batch_size = target_tensor.shape[1]
        target_length = target_tensor.shape[0]

        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)

        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(input_tensor[i])

        # noinspection PyUnboundLocalVariable
        decoder_hidden = encoder_hidden.unsqueeze(0).to(device)
        decoder_input = torch.tensor([SOS_token], device=device)

        use_teacher_forcing = True if random.random() < _teacher_forcing_ratio else False

        if use_teacher_forcing:
            for t in range(target_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                outputs[t] = decoder_output
                decoder_input = target_tensor[t]

        else:
            for t in range(target_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                outputs[t] = decoder_output
                top_v, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

                if decoder_input.item() == EOS_token:
                    break

        return outputs

In [22]:
teacher_forcing_ratio = 0.5


def create_model(_model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()

    loss = 0
    output = _model(input_tensor, target_tensor)
    num_iter = output.size(0)

    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()

    epoch_loss = loss.item() / num_iter
    return epoch_loss

In [23]:
def train_model(_model, in_lang, out_lang, _paris, n_iter=20000):
    if not os.path.exists('./ckpt'):
        os.makedirs('./ckpt')

    # if a model exists, load and print iteration
    if os.path.isfile('./ckpt/nlp.pt'):
        _model.load_state_dict(torch.load('./ckpt/nlp.pt'))
        print("Model loaded")

    res = input("Do you want to train the model? [y/N]").lower().strip()
    if res != 'y':
        return _model

    _model.train()
    optimizer = optim.SGD(_model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0

    training_pairs = [tensor_from_pair(in_lang, out_lang, random.choice(_paris))
                      for _ in range(n_iter)]

    for i in range(1, n_iter + 1):
        training_pair = training_pairs[i - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = create_model(_model, input_tensor, target_tensor, optimizer, criterion)
        total_loss_iterations += loss

        if i % 5000 == 0:
            average_loss = total_loss_iterations / 5000
            total_loss_iterations = 0
            print('%d %.4f' % (i, average_loss))

            # save
            torch.save(_model.state_dict(), './ckpt/nlp.pt')
            print(f"Model saved at {i} iteration")

    print("Model saved at last iteration")
    torch.save(_model.state_dict(), './ckpt/nlp.pt')

    return _model

In [24]:
def evaluate(_model, in_lang, out_lang, sentences):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(in_lang, sentences[0])
        output_tensor = tensor_from_sentence(out_lang, sentences[1])
        decoded_words = []
        output = _model(input_tensor, output_tensor)

        for ot in range(output.size(0)):
            top_v, topi = output[ot].topk(1)

            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(out_lang.index2word[topi[0].item()])

    return decoded_words


def evaluate_randomly(_model, in_lang, out_lang, _pairs, n=10):
    for i in range(n):
        pair = random.choice(_pairs)
        print('input {}'.format(pair[0]))
        print('output {}'.format(pair[1]))
        output_words = evaluate(_model, in_lang, out_lang, pair)
        output_sentence = ' '.join(output_words)
        print('predicted {}'.format(output_sentence))

In [25]:
lang1 = 'eng'
lang2 = 'fra'
input_lang, output_lang, pairs = process_data(lang1, lang2)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

input_size = input_lang.n_words
output_size = output_lang.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 75000

encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

print(encoder)
print(decoder)

model = train_model(model, input_lang, output_lang, pairs, num_iteration)

random sentence ['i got one for christmas.', "j'en ai eu une pour noel."]
Input : 23194 Output : 39389
Encoder(
  (embedding): Embedding(23194, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(39389, 256)
  (gru): GRU(256, 512)
  (fc): Linear(in_features=512, out_features=39389, bias=True)
  (softmax): LogSoftmax(dim=1)
)
Model loaded
5000 5.1440
Model saved at 5000 iteration
10000 5.0352
Model saved at 10000 iteration
15000 4.9320
Model saved at 15000 iteration
20000 4.8429
Model saved at 20000 iteration
25000 4.8087
Model saved at 25000 iteration
30000 4.7591
Model saved at 30000 iteration
35000 4.7412
Model saved at 35000 iteration
40000 4.6842
Model saved at 40000 iteration
45000 4.6849
Model saved at 45000 iteration
50000 4.6109
Model saved at 50000 iteration
55000 4.6569
Model saved at 55000 iteration
60000 4.6267
Model saved at 60000 iteration
65000 4.5694
Model saved at 65000 iteration
70000 4.5376
Model saved at 70000 iteration
75000 4.5083
Model saved at 75000 

In [26]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, _hidden_size, _output_size, dropout=0.5, max_len=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()

        self.hidden_size = _hidden_size
        self.output_size = _output_size
        self.dropout = dropout
        self.max_len = max_len

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_len)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, _input, hidden, encoder_outputs):
        embedded = self.embedding(_input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = functional.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = functional.relu(output)
        output, hidden = self.gru(output, hidden)

        output = functional.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

In [27]:
def train_iters(_encoder, _decoder, n_iters, print_every=1000, plot_every=1000, lr=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(_encoder.parameters(), lr=lr)
    decoder_optimizer = optim.SGD(_decoder.parameters(), lr=lr)

    training_pairs = [tensor_from_pair(input_lang, output_lang, random.choice(pairs))
                      for _ in range(n_iters)]

    criterion = nn.NLLLoss()

    for i in range(1, n_iters + 1):
        training_pair = training_pairs[i - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = create_model(model, input_tensor, target_tensor, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%d %.4f' % (i, print_loss_avg))

        if i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

In [None]:
embed_size = 256
hidden_size = 512
num_layers = 1
input_size = input_lang.n_words
output_size = output_lang.n_words

encoder1 = Encoder(input_size, hidden_size, embed_size, num_layers).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_size, dropout=0.1).to(device)

print(encoder1)
print(attn_decoder1)

attn_model = train_iters(encoder1, attn_decoder1, 75000, print_every=5000, plot_every=100, lr=0.1)

Encoder(
  (embedding): Embedding(23194, 256)
  (gru): GRU(256, 512)
)
AttnDecoderRNN(
  (embedding): Embedding(39389, 512)
  (attn): Linear(in_features=1024, out_features=20, bias=True)
  (attn_combine): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(512, 512)
  (out): Linear(in_features=512, out_features=39389, bias=True)
)
5000 4.6863
10000 4.6931
