In [1]:
#https://github.com/spro/practical-pytorch
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import numpy as np
import math
from mosestokenizer import *
from collections import Counter

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()

In [2]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 30

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {"UNK": 0}
        self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence, topnwords):
        for word in sentence:
            if word in topnwords:
                self.addWord(word)
            else:
                self.word2count["UNK"] += 1

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def readLangs(datasets, lang1, lang2, reverse=False):
    print("Reading lines...")

    pairs = []
    tokened_pairs = []
    tokenizer1 = MosesTokenizer(lang1)
    tokenizer2 = MosesTokenizer(lang2)
    cnt = 0
    for dataset in datasets:
        # Read the file and split into lines
        lines1 = open('dataset/' + dataset + '/' + dataset + '.' + lang1, encoding='utf-8').read().strip().split('\n')
        lines2 = open('dataset/' + dataset + '/' + dataset + '.' + lang2, encoding='utf-8').read().strip().split('\n')

        # Split every line into pairs and normalize
        for line1, line2 in zip(lines1, lines2):
            pair1 = tokenizer1(line1)
            pair2 = tokenizer2(line2)
            if (len(pair1) < MAX_LENGTH) and (len(pair2) < MAX_LENGTH):
                pairs.append([line1, line2])            
                tokened_pairs.append([pair1, pair2])                
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs, tokened_pairs

def topnwords(pairs, lang, n):
        # Read the file and split into lines
    lang_list = [word for pair in pairs for word in pair[lang]]          
    top_n = Counter(lang_list).most_common(n)
    
    return [i[0] for i in top_n]

def prepareData(datasets, lang1, lang2, reverse=False):
    input_lang, output_lang, pairs, tokened_pairs = readLangs(datasets, lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    input_lang.topnwords = topnwords(tokened_pairs, 0, 30000)
    output_lang.topnwords = topnwords(tokened_pairs, 1, 30000)
    print("Counting words...")
    for pair in tokened_pairs:
        input_lang.addSentence(pair[0], input_lang.topnwords)
        output_lang.addSentence(pair[1], output_lang.topnwords)
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs, tokened_pairs
    
def indexesFromSentence(lang, sentence):   
    var = [lang.word2index[word] if word in lang.topnwords else 2 for word in sentence ]
    var.append(EOS_token)
    return var

def variablesFromPairs(pairs):
    variables = []
    for i, pair in enumerate(pairs):
        var_pair = [indexesFromSentence(input_lang, pair[0]), indexesFromSentence(output_lang, pair[1])]
        variables.append(var_pair)          
    return variables
    
# datasets = ['commoncrawl', 'europarl', 'un']
datasets = ['europarl']

# input_lang, output_lang, pairs, tokened_pairs = prepareData(datasets, 'en', 'fr', True)
# with open('pairs_30.pkl', 'wb') as file:
#     pickle.dump((input_lang, output_lang, pairs, tokened_pairs), file, protocol = pickle.HIGHEST_PROTOCOL)
# with open('pairs_30.pkl', 'rb') as file:
#     input_lang, output_lang, pairs, tokened_pairs = pickle.load(file)
    
# variables = variablesFromPairs(tokened_pairs)
#     with open('pairs_30_var.pkl', 'wb') as file:
#         pickle.dump((input_lang, output_lang, variables), file, protocol = pickle.HIGHEST_PROTOCOL)   
# with open('pairs_30_var.pkl', 'rb') as file:
#    input_lang, output_lang, variables = pickle.load(file)

# print(random.choice(pairs))    

In [3]:
class DataLoader():
    def __init__(self, batch_size, mini_batch_size, is_training):
        self.batch_size = batch_size
        self.minibatch_size = mini_batch_size
        self.token_stream = []
        self.is_training = is_training
        self.create_batches()

    def create_batches(self):
        input_lang, output_lang, pairs = pickle.load(open('pairs_30_var.pkl', 'rb'))
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.pairs = np.array(pairs)
        self.data_num = len(pairs)
        indices = np.arange(self.data_num)
        np.random.shuffle(indices)
        self.indices = indices
        self.num_batch = int(self.data_num / self.batch_size)
        self.num_minibatch = int(self.batch_size / self.minibatch_size)
        self.batch_pointer = 0
        self.minibatch_pointer = 0
        
    def next_batch(self):
        self.batch_pointer = (self.batch_pointer + 1) % self.num_batch
        start_pos = self.batch_pointer * self.batch_size
        idx = self.indices[start_pos:(start_pos+self.batch_size)]
        self.batch = np.array(sorted(self.pairs[idx], key = lambda pair: len(pair[0])))
        
    def next_minibatch(self):
        self.minibatch_pointer = (self.minibatch_pointer + 1) % self.num_minibatch
        start_pos = self.minibatch_pointer * self.minibatch_size
        input_sentences = self.batch[start_pos:(start_pos+self.minibatch_size), 0]
        target_sentences = self.batch[start_pos:(start_pos+self.minibatch_size), 1]
        return input_sentences, target_sentences
    
    def reset_pointer(self):
        self.batch_pointer = 0
        self.minibatch_pointer = 0
        np.random.shuffle(self.indices)


In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size, embedding):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embed_size = embed_size

        self.embedding = embedding
        self.gru = nn.GRU(self.embed_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).unsqueeze(0)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

In [5]:
# https://github.com/pytorch/pytorch/issues/805 erogol
class Maxout(nn.Module):
    def __init__(self, d_in, d_out, pool_size):
        super().__init__()
        self.d_in, self.d_out, self.pool_size = d_in, d_out, pool_size
        self.lin = nn.Linear(d_in, d_out * pool_size)

    def forward(self, inputs):
        shape = list(inputs.size())
        shape[-1] = self.d_out
        shape.append(self.pool_size)
        max_dim = len(shape) - 1
        out = self.lin(inputs)
        m, i = out.view(*shape).max(max_dim)
        return m    

# https://github.com/keon/seq2seq/blob/master/model.py
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 3, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size, maxout_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.maxout_size = maxout_size
        
        self.embed = nn.Embedding(output_size, embed_size)
        self.attention = Attention(hidden_size)
        self.maxout = nn.Linear(hidden_size * 3 + embed_size, maxout_size * 2)
        self.gru = nn.GRU(hidden_size * 2 + embed_size , hidden_size)
        self.maxout = Maxout(hidden_size * 3 + embed_size, maxout_size, 2)
        self.out = nn.Linear(maxout_size, output_size)
        
    def forward(self, input, last_hidden, encoder_outputs):
        # Get the embedding of the current input word (last output word)
        encoder_outputs = encoder_outputs.transpose(0, 1) 
        embedded = self.embed(input)  # 1 B m
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attention(last_hidden[-1], encoder_outputs) # t
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
        context = context.transpose(0, 1)  # (1,B,N)
        rnn_input = torch.cat([embedded, context], 2)
        out, hidden = self.gru(rnn_input, last_hidden)
        maxout_input = torch.cat([last_hidden, embedded, context], 2)
        output = self.maxout(maxout_input).squeeze(0)    
        output = self.out(output)
        output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights
        

In [6]:
def train(input_variables, target_variables, encoder1, encoder2, decoder, encoder1_optimizer, encoder2_optimizer, decoder_optimizer, transition, criterion, max_length=MAX_LENGTH):
    encoder1_optimizer.zero_grad()
    encoder2_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    minibatch_size = len(input_variables)
    input_maxlength = len(max(input_variables, key = lambda inp: len(inp)))
    target_maxlength = len(max(target_variables, key = lambda tar: len(tar)))
    
    
    for i in range(minibatch_size):
        n = input_maxlength - len(input_variables[i])
        if n != 0:
            input_variables[i].extend([0] * n) 
        m = target_maxlength - len(target_variables[i])
        if m != 0:
            target_variables[i].extend([0] * m) 
            
    input_variables = np.array(list(input_variables))
    target_variables = np.array(list(target_variables))
    input_variables = Variable(torch.LongTensor(input_variables))
    input_variables = input_variables.cuda() if use_cuda else input_variables
    target_variables = Variable(torch.LongTensor(target_variables))
    target_variables = target_variables.cuda() if use_cuda else target_variables
        
    encoder1_hidden = Variable(torch.zeros(1, minibatch_size, encoder1.hidden_size))
    encoder1_hidden = encoder1_hidden.cuda() if use_cuda else encoder1_hidden
    encoder2_hidden = Variable(torch.zeros(1, minibatch_size, encoder2.hidden_size))
    encoder2_hidden = encoder2_hidden.cuda() if use_cuda else encoder2_hidden
    
    encoder_outputs = Variable(torch.zeros(minibatch_size, max_length, encoder1.hidden_size * 2))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
    
    loss = 0
    for ei in range(input_maxlength):
        encoder_output, encoder1_hidden = encoder1(
            input_variables[:, ei], encoder1_hidden)
        encoder_outputs[:, ei, :encoder1.hidden_size] = encoder_output
    for ei in reversed(range(input_maxlength)):
        encoder_output, encoder2_hidden = encoder2(
            input_variables[:, ei], encoder2_hidden)
        encoder_outputs[:, ei, encoder2.hidden_size:] = encoder_output
        
    Ws_input = encoder_output
    decoder_hidden = torch.tanh(transition(Ws_input))     
    decoder_input = Variable(torch.LongTensor([[SOS_token]*minibatch_size]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        
    for di in range(target_maxlength):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.data.topk(1)
        decoder_input = Variable(topi.transpose(0, 1))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        loss += criterion(decoder_output, target_variables[:, di])

    loss.backward()

    encoder1_optimizer.step()
    encoder2_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / (target_maxlength * minibatch_size)

In [7]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))  

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

%matplotlib inline
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
batch = 1600
mini_batch = 80
learning_rate=1
rho= 0.95
eps = 1e-06
weight_decay = 1e-03
     
def trainIters(encoder1, encoder2, decoder, transition, epoch=1, print_every=10, plot_every=50):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder1_optimizer = optim.Adadelta(encoder1.parameters(), lr=learning_rate, rho=rho, eps=eps, weight_decay=weight_decay)
    encoder2_optimizer = optim.Adadelta(encoder2.parameters(), lr=learning_rate, rho=rho, eps=eps, weight_decay=weight_decay)
    decoder_optimizer = optim.Adadelta(decoder.parameters(), lr=learning_rate, rho=rho, eps=eps, weight_decay=weight_decay)
    
    dataloader = DataLoader(batch, mini_batch, is_training=True)
    criterion = nn.NLLLoss()

    for ep in range(1, epoch + 1):
        dataloader.reset_pointer()
        for bt in range(1, dataloader.batch_size+1):
            dataloader.next_batch()
            for mt in range(dataloader.minibatch_size):
                input_variables, target_variables = dataloader.next_minibatch()
                loss = train(input_variables, target_variables, encoder1, encoder2,
                             decoder, encoder1_optimizer, encoder2_optimizer, decoder_optimizer, transition, criterion)
                print_loss_total += loss
                plot_loss_total += loss

            if bt % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, bt / batch), bt, bt / batch * 100, print_loss_avg))

            if bt % plot_every == 0:
                torch.save(encoder1, 'encoder1.pt')
                torch.save(encoder2, 'encoder2.pt')
                torch.save(decoder, 'decoder.pt')
                torch.save(transition, 'transition.pt')
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                showPlot(plot_losses)

In [None]:
hidden_size = 1000
embed_size = 620
maxout_size = 500

# lang1_embedding = nn.Embedding(input_lang.n_words, embed_size)
# encoder1 = EncoderRNN(hidden_size, embed_size, lang1_embedding)
# encoder2 = EncoderRNN(hidden_size, embed_size, lang1_embedding)
# decoder = DecoderRNN(hidden_size, embed_size, maxout_size, output_lang.n_words)
# transition = nn.Linear(encoder1.hidden_size, encoder1.hidden_size)
encoder1 = torch.load('encoder1.pt')
encoder2 = torch.load('encoder2.pt')
decoder = torch.load('decoder.pt')
transition = torch.load('transition.pt')

if use_cuda:
    encoder1 = encoder1.cuda()
    encoder2 = encoder2.cuda()    
    decoder = decoder.cuda()
    transition = transition.cuda()

trainIters(encoder1, encoder2, decoder, transition, 1, print_every=1, plot_every = 1)


1m 2s (- 1658m 57s) (1 0%) 3.4472


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


1m 58s (- 1574m 14s) (2 0%) 3.2220


In [None]:
def evaluate(encoder1, encoder2,decoder, transition, sentence, max_length=MAX_LENGTH):
    
    input_variable = indexesFromSentence(input_lang, sentence)
    input_variable = Variable(torch.LongTensor(input_variable))
    input_variable = input_variable.cuda() if use_cuda else input_variable
    input_length = input_variable.size()[0]
    
    encoder1_hidden = Variable(torch.zeros(1, 1, encoder1.hidden_size))
    encoder1_hidden = encoder1_hidden.cuda() if use_cuda else encoder1_hidden
    encoder2_hidden = Variable(torch.zeros(1, 1, encoder2.hidden_size))
    encoder2_hidden = encoder2_hidden.cuda() if use_cuda else encoder2_hidden
    
    encoder_outputs = Variable(torch.zeros(1, max_length, encoder1.hidden_size * 2))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
    
    for ei in range(input_length):
        encoder_output, encoder1_hidden = encoder1(input_variable[ei], encoder1_hidden)
        encoder_outputs[:, ei, :encoder1.hidden_size] = encoder_output

    for ei in reversed(range(input_length)):
        encoder_output, encoder2_hidden = encoder2(input_variable[ei], encoder2_hidden)
        encoder_outputs[:, ei, encoder2.hidden_size:] = encoder_output
        
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))  # SOS
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    Ws_input = encoder_output
    decoder_hidden = torch.tanh(transition(Ws_input)) 

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni])

        decoder_input = Variable(torch.LongTensor([[ni]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words, decoder_attentions[:di + 1]

def evaluateRandomly(encoder1, encoder2, decoder, transition, n=10):
    input_lang, output_lang, pairs, tokened_pairs = pickle.load(open('pairs_30.pkl', 'rb'))
    for i in range(n):
        pair = random.choice(tokened_pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder1, encoder2, decoder, transition, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
encoder1 = torch.load('encoder1.pt')
encoder2 = torch.load('encoder2.pt')
decoder = torch.load('decoder.pt')
transition = torch.load('transition.pt')

if use_cuda:
    encoder1 = encoder1.cuda()
    encoder2 = encoder2.cuda()    
    decoder = decoder.cuda()
    transition = transition.cuda()

evaluateRandomly(encoder1, encoder2, decoder, transition, 1)

In [None]:
# Attempt for beam search
#     decoder_output, decoder_hidden, decoder_attention = decoder(
#             decoder_input, decoder_hidden, encoder_outputs)    
#     topv, topi = decoder_output.data.topk(3)
#     decoder_input = Variable(topi)
#     decoder_input = decoder_input.cuda() if use_cuda else decoder_input
#     top_prob = torch.exp(decoder_output)    
#     decoder_hidden = decoder_hidden.repeat(3, 1, 1).transpose(0,1)
# #     encoder_outputs = encoder_outputs.transpose(0,1)
# #     encoder_outputs = encoder_outputs.repeat(3, 1, 1, 1).squeeze(1)
# #     print(encoder_outputs.size())
#     for di in range(1, max_length):
#         print(decoder_input.size(), decoder_hidden.size())
#         decoder_output, decoder_hidden, decoder_attention = decoder(
#             decoder_input, decoder_hidden, encoder_outputs)
#         print(decoder_input.size(), decoder_hidden.size())
#         top_prob = top_prob * torch.exp(decoder_output).squeeze(0)
#         topv, topi = top_prob.data.topk(3)
# #         print(decoder_hidden)
#         print(topv, topi)

# top_prob = Variable(LongTensor([1, 1, 1])).cuda()
#     for di in range(max_length):
#         decoder_output, decoder_hidden, decoder_attention = decoder(
#             decoder_input, decoder_hidden, encoder_outputs)
#         top_prob = top_prob * decoder_output
#         topv, topi = decoder_output.data.topk(3)
#         decoder_input = Variable(topi.transpose(0, 1))

#         decoder_hidden = 

#         ni = topi[0][0]

#         topv, topi = decoder_output.data.topk(1)
#         decoder_input = decoder_input.cuda() if use_cuda else decoder_input
#         loss += criterion(decoder_output, target_variables[:, di])        
        
#         if ni == EOS_token:
#             decoded_words.append('<EOS>')
#             break
#         else:
#             decoded_words.append(output_lang.index2word[ni])

#         decoder_input = Variable(torch.LongTensor([[ni]]))
#         decoder_input = decoder_input.cuda() if use_cuda else decoder_input

#     return decoded_words, decoder_attentions[:di + 1]