In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle # to dump and load pretrained glove vectors 
import copy   # to make deepcopy of python lists and dictionaries
import operator
import numpy as np
from pandas import DataFrame # to visualize the glove word embeddings in form of DataFrame
from tqdm import tqdm
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 500
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # output = self.out(output[0])
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [3]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    # print(pair)
    topic_tensor = tensorFromSentence(topic_lang, pair[0])
    category_tensor = tensorFromSentence(category_lang, pair[1])
    subtopic_tensor = tensorFromSentence(subtopic_lang, pair[2])
    return (topic_tensor, category_tensor, subtopic_tensor)
    

def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub("_", " ", s)
    return s

In [4]:
class Input:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
lines = open('/wikipedia/dataset/wikipedia_dataset.txt').read().strip().split('\n')
len(lines)

23512

In [6]:
topic_lang = Input("topic")
category_lang = Input("category")
subtopic_lang = Input("subtopic")

In [7]:
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
for pair in pairs:
    topic_lang.addSentence(pair[0])
    category_lang.addSentence(pair[1])
    subtopic_lang.addSentence(pair[2])

In [8]:
def train(topic_tensor, cat_tensor, ss_tensor, encoder1, decoder1, encoder1_optimizer, decoder1_optimizer, \
encoder2, decoder2, encoder2_optimizer, decoder2_optimizer, criterion, max_length=MAX_LENGTH):
    encoder1_hidden = encoder1.initHidden()

    encoder1_optimizer.zero_grad()
    decoder1_optimizer.zero_grad()

    input_length = topic_tensor.size(0)
    target_length = cat_tensor.size(0)
    # print("input_length: ", input_length)
    # print("target_length: ", target_length)

    encoder1_outputs = torch.zeros(max_length, encoder1.hidden_size, device=device)

    # print("encoder1_outputs: ", encoder1_outputs.shape)

    loss = 0

    for ei in range(input_length):
        encoder1_output, encoder1_hidden = encoder1(
            topic_tensor[ei], encoder1_hidden)
        encoder1_outputs[ei] = encoder1_output[0, 0]

    decoder1_input = torch.tensor([[SOS_token]], device=device)

    decoder1_hidden = encoder1_hidden

    decoder1_cat_output = []

    for di in range(target_length):
        decoder1_output, decoder1_hidden, decoder1_attention = decoder1(
            decoder1_input, decoder1_hidden, encoder1_outputs)
        topv, topi = decoder1_output.topk(1)
        decoder1_input = topi.squeeze().detach()  # detach from history as input
        decoder1_cat_output.append(decoder1_input)

        loss += criterion(decoder1_output, cat_tensor[di])
        if decoder1_input.item() == EOS_token:
            break

    loss.backward()
    decoder1_cat_output = torch.tensor(decoder1_cat_output, device=device).unsqueeze(1)

    encoder1_optimizer.step()
    decoder1_optimizer.step()
    # print("decoder1_cat_output: ", decoder1_cat_output)

    ##############################

    encoder2_hidden = encoder2.initHidden()

    encoder2_optimizer.zero_grad()
    decoder2_optimizer.zero_grad()

    input_length = decoder1_cat_output.size(0)
    target_length = ss_tensor.size(0)
    # print("cat_tensor: ", decoder1_cat_output)
    # print("input_length: ", input_length)
    # print("target_length: ", target_length)

    encoder2_outputs = torch.zeros(max_length, encoder2.hidden_size, device=device)

    # print("encoder2_outputs: ", encoder2_outputs.shape)

    loss = 0

    for ei in range(input_length):
        encoder2_output, encoder2_hidden = encoder2(
            decoder1_cat_output[ei], encoder2_hidden)
        encoder2_outputs[ei] = encoder2_output[0, 0]

    decoder2_input = torch.tensor([[SOS_token]], device=device)

    decoder2_hidden = encoder2_hidden

    for di in range(target_length):
        decoder2_output, decoder2_hidden, decoder2_attention = decoder2(
            decoder2_input, decoder2_hidden, encoder2_outputs)
        topv, topi = decoder2_output.topk(1)
        decoder2_input = topi.squeeze().detach()  # detach from history as input

        loss += criterion(decoder2_output, ss_tensor[di])
        if decoder2_input.item() == EOS_token:
            break

    loss.backward()

    encoder2_optimizer.step()
    decoder2_optimizer.step()
    # print()

    return loss.item() / target_length

In [9]:
def trainIters(encoder1, decoder1, encoder2, decoder2, n_iters, print_every=1000, plot_every=100, learning_rate=0.1):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder1_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
    decoder1_optimizer = optim.SGD(decoder1.parameters(), lr=learning_rate)
    encoder2_optimizer = optim.SGD(encoder2.parameters(), lr=learning_rate)
    decoder2_optimizer = optim.SGD(decoder2.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        topic_tensor = training_pair[0]
        cat_tensor = training_pair[1]
        ss_tensor = training_pair[2]        
        # print("topic_tensor: ", topic_tensor.shape)
        # print("cat_tensor: ", cat_tensor.shape)
        # print("ss_tensor: ", ss_tensor.shape)

        loss = train(topic_tensor, cat_tensor, ss_tensor, encoder1,
                     decoder1, encoder1_optimizer, decoder1_optimizer, encoder2,
                     decoder2, encoder2_optimizer, decoder2_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(' (%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))
            #evaluateRandomly(encoder1, decoder1, 1)

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

In [13]:
encoder1 = EncoderRNN(topic_lang.n_words, 256).to(device)
decoder1 = AttnDecoderRNN(256, category_lang.n_words, dropout_p=0.1).to(device)

encoder2 = EncoderRNN(category_lang.n_words, 256).to(device)
decoder2 = AttnDecoderRNN(256, subtopic_lang.n_words, dropout_p=0.1).to(device)

In [14]:
trainIters(encoder1, decoder1, encoder2, decoder2,  100000, print_every=1000)

 (1000 1%) 39.4421
 (2000 2%) 53.5185
 (3000 3%) 68.1471
 (4000 4%) 76.4245
 (5000 5%) 81.6633
 (6000 6%) 77.3267
 (7000 7%) 83.4819
 (8000 8%) 83.1531
 (9000 9%) 79.4628
 (10000 10%) 83.0867
 (11000 11%) 79.0641
 (12000 12%) 76.5327
 (13000 13%) 70.9089
 (14000 14%) 71.0919


IndexError: index 500 is out of bounds for dimension 0 with size 500