In [1]:
from __future__ import unicode_literals, print_function, division

import time

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as functional


import os, re, random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using {device} device")

Using cuda device


In [2]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20

class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)
            
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word]  = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def normalize_string(df, lang):
    sentence = df[lang].str.lower()
    sentence = sentence.str.replace("[^A-Za-z\s]+", " ")
    sentence = sentence.str.normalize("NFD")
    sentence = sentence.str.encode("ascii", errors="ignore").str.decode("utf-8")
    return sentence

def read_sentence(df, lang1, lang2):
    sentence1 = normalize_string(df, lang1)
    sentence2 = normalize_string(df, lang2)
    return sentence1, sentence2

def read_file(loc, lang1, lang2):
    return pd.read_csv(loc, delimiter="\t", header=None, names=[lang1, lang2])

def process_data(lang1, lang2):
    df = read_file("../data/nlp/%s-%s.txt" % (lang1, lang2), lang1, lang2)
    sentence1, sentence2 = read_sentence(df, lang1, lang2)

    in_lang = Lang()
    out_lang = Lang()

    _pairs = []
    for i in range(len(df)):
        if len(sentence1[i].split()) < MAX_LENGTH and len(sentence2[i].split()) < MAX_LENGTH:
            full = [sentence1[i], sentence2[i]]
            in_lang.add_sentence(sentence1[i])
            out_lang.add_sentence(sentence2[i])
            _pairs.append(full)

    return in_lang, out_lang, _pairs

In [4]:
def index_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensor_from_sentence(lang, sentence):
    indexes = index_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensor_from_pair(in_lang, out_lang, pair):
    input_tensor = tensor_from_sentence(in_lang, pair[0])
    output_tensor = tensor_from_sentence(out_lang, pair[1])
    return input_tensor, output_tensor

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, num_layers):
        super(Encoder, self).__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        
        self.embedding= nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers)
        
    def forward(self, src):
        embedded = self.embedding(src)
        output, hidden = self.gru(embedded)
        return output, hidden

In [6]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embed_dim, num_layers):
        super(Decoder, self).__init__()
        
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        input = input.view(1, -1)
        embedded = functional.relu(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        prediction = self.softmax(self.fc(output[0]))
        return prediction, hidden

In [7]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, _device, max_len=MAX_LENGTH):
        super().__init__()

        self.MAX_LENGTH = max_len
        self.encoder = encoder
        self.decoder = decoder
        self.device = _device

    def forward(self, input_tensor, target_tensor, teacher_forcing_ratio=0.5):
        input_length = input_tensor.size(0)
        batch_size = target_tensor.shape[1]
        target_length = target_tensor.shape[0]

        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)

        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(input_tensor[i])

        decoder_hidden = encoder_hidden.unsqueeze(0).to(device)
        decoder_input = torch.tensor([SOS_token], device=device)

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

        if use_teacher_forcing:
            for t in range(target_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                outputs[t] = decoder_output
                decoder_input = target_tensor[t]

        else:
            for t in range(target_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                outputs[t] = decoder_output
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

                if decoder_input.item() == EOS_token:
                    break

        return outputs

In [8]:
teacher_forcing_ratio = 0.5

def Model(_model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()

    loss = 0
    output = _model(input_tensor, target_tensor)
    num_iter = output.size(0)

    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()

    epoch_loss = loss.item() / num_iter
    return epoch_loss

In [9]:
def train_model(_model, input_lang, output_lang, pairs, num_iteration=20000):
    if not os.path.exists('./ckpt'):
        os.makedirs('./ckpt')
        
    # if a model exists, load and print iteration
    if os.path.isfile('./ckpt/nlp.pt'):
        _model.load_state_dict(torch.load('./ckpt/nlp.pt'))
        print("Model loaded")
        
    _model.train()
    optimizer = optim.SGD(_model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0

    training_pairs = [tensor_from_pair(input_lang, output_lang, random.choice(pairs))
                      for _ in range(num_iteration)]

    for iter in range(1, num_iteration+1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = Model(_model, input_tensor, target_tensor, optimizer, criterion)
        total_loss_iterations += loss

        if iter % 5000 == 0:
            average_loss = total_loss_iterations / 5000
            total_loss_iterations = 0
            print('%d %.4f' % (iter, average_loss ))
            
            # save
            torch.save(_model.state_dict(), './ckpt/nlp.pt')
            print(f"Model saved at {iter} iteration")

    print("Model saved at last iteration")    
    torch.save(_model.state_dict(), './ckpt/nlp.pt')
    
    return _model

In [10]:
def evaluate(_model, input_lang, output_lang, sentences):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentences[0])
        output_tensor = tensor_from_sentence(output_lang, sentences[1])
        decoded_words = []
        output = _model(input_tensor, output_tensor)

        for ot in range(output.size(0)):
            top_v, topi = output[ot].topk(1)

            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])

    return decoded_words

def evaluate_randomly(_model, input_lang, output_lang, pairs, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('input {}'.format(pair[0]))
        print('output {}'.format(pair[1]))
        output_words = evaluate(_model, input_lang, output_lang, pair)
        output_sentence = ' '.join(output_words)
        print('predicted {}'.format(output_sentence))

In [11]:
lang1 = 'eng'
lang2 = 'fra'
input_lang, output_lang, pairs = process_data(lang1, lang2)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

input_size = input_lang.n_words
output_size = output_lang.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 75000

encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

print(encoder)
print(decoder)

model = train_model(model, input_lang, output_lang, pairs, num_iteration)

random sentence ["i don't want to shoot you.", 'je ne veux pas te tirer dessus.']
Input : 23194 Output : 39389
Encoder(
  (embedding): Embedding(23194, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(39389, 256)
  (gru): GRU(256, 512)
  (fc): Linear(in_features=512, out_features=39389, bias=True)
  (softmax): LogSoftmax(dim=1)
)
Model loaded
5000 5.3314
Model saved at 5000 iteration


KeyboardInterrupt: 

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.5, max_len=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.max_len = max_len
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_len)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weights = functional.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = functional.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = functional.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

In [None]:
def train_iters(encoder, decoder, n_iters, print_every=1000, plot_every=1000):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=lr)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=lr)
    
    training_pairs = [tensor_from_pair(input_lang, output_lang, random.choice(pairs))
                      for _ in range(n_iters)]
    
    criterion = nn.NLLLoss()
    
    for iter in range(1, n_iters+1):
        training_pair = training_pairs[iter-1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        
        loss = Model(encoder, decoder, input_tensor, target_tensor, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%d %.4f' % (iter, print_loss_avg))
            
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0