In [336]:
import numpy as np
import scipy.sparse.csgraph as csg
from joblib import Parallel, delayed
import multiprocessing
import networkx as nx
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import math

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [337]:
# Distortion calculations

def entry_is_good(h, h_rec):
    """Tell whether a (reference, recovered) pair can be used for distortion.

    The pair is rejected when ``h_rec`` is NaN or infinite, or when either
    ``h_rec`` or ``h`` equals zero.
    """
    # Non-finite recovered values are rejected with a plain ``False``.
    if torch.isnan(h_rec) or torch.isinf(h_rec):
        return False
    # ``h`` is only inspected when ``h_rec`` is non-zero (short-circuit ``and``).
    return h_rec != 0 and h != 0

def distortion_entry(h,h_rec,me,mc):
    """Fold one (reference, recovered) pair into the running distortion statistics.

    Args:
        h: reference value; denominator of the relative error.
        h_rec: recovered value compared against ``h``.
        me: largest ``h_rec / h`` ratio seen so far.
        mc: largest ``h / h_rec`` ratio seen so far.

    Returns:
        ``(avg, me, mc)`` where ``avg`` is ``|h_rec - h| / h`` for this pair
        and ``me`` / ``mc`` are the running maxima, replaced only when this
        pair's ratio is strictly larger.
    """
    relative_error = abs(h_rec - h) / h
    expansion = h_rec / h
    contraction = h / h_rec
    worst_expansion = expansion if expansion > me else me
    worst_contraction = contraction if contraction > mc else mc
    return (relative_error, worst_expansion, worst_contraction)

def distortion_row(H1, H2, n, row):
    """Accumulate distortion statistics over one row of two matrices.

    Args:
        H1: row of reference values (passed as ``h`` to the per-entry helpers).
        H2: row of recovered values (passed as ``h_rec``).
        n: number of leading entries of the row to visit.
        row: index of this row; the entry at that index is skipped.

    Returns:
        ``(mc, me, avg, bad)``: the largest ``h / h_rec`` ratio, the largest
        ``h_rec / h`` ratio, the mean relative error over the usable entries,
        and ``n - 1`` minus the number of usable entries.
    """
    worst_contraction, worst_expansion = 0, 0
    error_sum, usable = 0, 0
    for col in range(n):
        if col == row:
            continue
        if not entry_is_good(H1[col], H2[col]):
            continue
        (entry_error, worst_expansion, worst_contraction) = distortion_entry(
            H1[col], H2[col], worst_expansion, worst_contraction)
        usable += 1
        error_sum += entry_error
    # With no usable entry the sum is still 0; dividing by 1.0 avoids 0/0.
    error_sum /= usable if usable > 0 else 1.0
    return (worst_contraction, worst_expansion, error_sum, n - 1 - usable)

def distortion(H1, H2, n, jobs=16):
    """Average relative distortion between two matrices, row by row.

    Row ``i`` of ``H1`` and ``H2`` is handed to ``distortion_row``; the
    per-row mean relative errors are summed and divided by ``n``.

    Args:
        H1: matrix of reference values (indexed as ``H1[i, :]``).
        H2: matrix of recovered values (indexed as ``H2[i, :]``).
        n: number of rows (and leading columns) to evaluate.
        jobs: number of joblib workers.

    Returns:
        A 0-dim tensor holding the averaged distortion.
    """
    # Threads rather than processes: process-based workers receive pickled
    # copies of the tensors and send pickled results back, so the returned
    # values carry no autograd history leading to the caller's tensors and
    # a later ``backward()`` cannot reach the model. Threads share the graph.
    rows = Parallel(n_jobs=jobs, backend="threading")(
        delayed(distortion_row)(H1[i, :], H2[i, :], n, i) for i in range(n))
    row_avgs = [stats[2] for stats in rows]

    # distortion_row yields a plain Python 0.0 for a row without any usable
    # entry, which torch.stack cannot combine with tensors. Align everything
    # to the dtype/device of the first real tensor; as_tensor returns existing
    # tensors unchanged, so their autograd history is preserved.
    reference = next((avg for avg in row_avgs if torch.is_tensor(avg)), None)
    if reference is None:
        reference = torch.zeros(())
    row_avgs = [torch.as_tensor(avg, dtype=reference.dtype, device=reference.device)
                for avg in row_avgs]
    return torch.stack(row_avgs).sum() / n


# Loading the graph and computing the distance matrix.

def load_graph(file_name, directed=False):
    """Read a whitespace-separated edge list into a NetworkX graph.

    Each data line is ``u v`` or ``u v w``: ``u`` and ``v`` are parsed as
    integer node labels and the optional ``w`` as a float stored under the
    ``weight`` edge attribute. Blank lines and lines whose first token starts
    with ``#`` are skipped.

    Args:
        file_name: path of the edge-list file.
        directed: build an ``nx.DiGraph`` instead of an ``nx.Graph``.

    Returns:
        The populated graph.
    """
    G = nx.DiGraph() if directed else nx.Graph()
    with open(file_name, "r") as f:
        for line in f:
            tokens = line.split()
            # A trailing blank line or a '#' comment would otherwise crash
            # on tokens[0] / int(); neither describes an edge.
            if not tokens or tokens[0].startswith("#"):
                continue
            u = int(tokens[0])
            v = int(tokens[1])
            if len(tokens) > 2:
                G.add_edge(u, v, weight=float(tokens[2]))
            else:
                G.add_edge(u, v)
    return G


def compute_row(i, adj_mat):
    """Shortest-path distances from node ``i`` to every node of the graph.

    Edge weights are ignored (``unweighted=True``) and edges are treated as
    bidirectional (``directed=False``).

    Returns:
        The ``csg.dijkstra`` result for the single-element source list ``[i]``.
    """
    sources = [i]
    hop_counts = csg.dijkstra(adj_mat, indices=sources, unweighted=True, directed=False)
    return hop_counts

def get_dist_mat(G):
    """All-pairs shortest-path matrix of ``G``, one source row per worker task.

    The adjacency matrix is built with ``nodelist=[0, ..., n-1]``, so the
    graph's nodes are expected to carry exactly those integer labels.

    Args:
        G: NetworkX graph.

    Returns:
        NumPy array stacking the rows returned by ``compute_row`` for sources
        ``0 .. n-1``.
    """
    n = G.order()
    # ``to_scipy_sparse_matrix`` was removed in NetworkX 3.0; use its
    # replacement when present and fall back to the old name otherwise.
    to_sparse = getattr(nx, "to_scipy_sparse_array", None) or nx.to_scipy_sparse_matrix
    adj_mat = to_sparse(G, nodelist=list(range(n)))

    num_cores = multiprocessing.cpu_count()
    rows = Parallel(n_jobs=num_cores)(delayed(compute_row)(i, adj_mat) for i in range(n))
    return np.vstack(rows)


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # If `percent` of the job took `elapsed`, the whole job takes elapsed / percent.
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))


def showPlot(points):
    """Draw ``points`` as a line chart with major y-axis ticks every 0.2.

    Args:
        points: sequence of values to plot against their index.
    """
    # Imported here because no cell of the notebook imports matplotlib, so on
    # a fresh kernel ``plt`` / ``ticker`` would otherwise be undefined.
    import matplotlib.pyplot as plt
    import matplotlib.ticker as ticker

    # A single subplots() call is enough; an extra plt.figure() beforehand
    # only leaves an empty figure behind.
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [338]:
class Vocab:
    """Bidirectional word/index mapping with occurrence counts."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}   # word -> integer id, assigned in first-seen order
        self.word2count = {}   # word -> number of times it was added
        self.index2word = {}   # integer id -> word
        self.n_words = 0       # number of distinct words registered so far

    def addSentence(self, sentence):
        """Register the ``'form'`` entry of every token in ``sentence``."""
        for token in sentence:
            self.addWord(token['form'])

    def addWord(self, word):
        """Count ``word``, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            

def unicodeToAscii(s):
    """Strip combining marks from ``s``.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is ``Mn`` is dropped; all other characters are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters.

    After lowercasing, stripping and ``unicodeToAscii``, a space is inserted
    before each ``.``, ``!`` or ``?``, and every run of characters other than
    ASCII letters and ``.!?`` is replaced by a single space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

In [339]:
from conllu import parse_tree, parse_tree_incr, parse, parse_incr
from io import open
import scipy.sparse.csgraph as csg
import networkx as nx
from collections import defaultdict
import json
import string


def unroll(node, G):
    """Add every parent-child link of the tree rooted at ``node`` to ``G``.

    An edge joins ``token['id']`` of a node with ``token['id']`` of each of
    its children; edges are added in depth-first, pre-order sequence.

    Returns:
        ``G`` itself, after the edges have been added.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and their edges added) in their original order.
    pending = [(node, child) for child in reversed(node.children)]
    while pending:
        parent, current = pending.pop()
        G.add_edge(parent.token['id'], current.token['id'])
        pending.extend((current, grandchild) for grandchild in reversed(current.children))
    return G

# Parse every sentence of the CoNLL-U training file into memory.
sentences = []
# The ``with`` block closes the file handle as soon as parsing is finished
# (parse_incr reads lazily, so it must be consumed while the file is open).
with open("UD_English-EWT/en_ewt-ud-train.conllu", "r", encoding="utf-8") as data_file:
    sentences.extend(parse_incr(data_file))
    
# A sentence is kept only when its space-separated word count lies strictly
# between these two bounds.
MIN_LENGTH = 10
MAX_LENGTH = 50

def check_length(sentence):
    """Return True when the sentence text has more than MIN_LENGTH and fewer
    than MAX_LENGTH pieces after splitting ``metadata['text']`` on single spaces."""
    word_count = len(sentence.metadata['text'].split(' '))
    return MIN_LENGTH < word_count < MAX_LENGTH

def filterSentences(sentences):
    """Return, in their original order, the sentences accepted by ``check_length``."""
    return list(filter(check_length, sentences))

# Keep only the sentences that pass the length filter, then build the
# vocabulary and the list of raw sentence texts from them.
filtered_sentences = filterSentences(sentences)

input_vocab = Vocab("ewt_train_trimmed")
for sent in filtered_sentences:
    input_vocab.addSentence(sent)

# Raw text of each kept sentence, index-aligned with filtered_sentences.
sentences_text = [kept.metadata['text'] for kept in filtered_sentences]
    
import os  # only needed here, to create the output folders

# The edge lists are written into these folders; create them up front so the
# cell also works in a fresh checkout where they do not exist yet.
for out_dir in ("train", "train_reordered"):
    os.makedirs(out_dir, exist_ok=True)

# Maps sentence index -> edge list of the degree-reordered dependency tree.
dev_dict  = {}
for idx in range(len(filtered_sentences)):
    curr_tree = filtered_sentences[idx].to_tree()
    G_curr = unroll(curr_tree, nx.Graph())
    # Shift every node label down by one before saving
    # (presumably turning 1-based token ids into 0-based ones - confirm).
    G = nx.relabel_nodes(G_curr, lambda x: x-1)
    nx.write_edgelist(G, "train/"+str(idx)+".edges", data=False)
    # Second copy: nodes renumbered as consecutive integers by decreasing degree.
    G_final = nx.convert_node_labels_to_integers(G_curr, ordering = "decreasing degree")
    nx.write_edgelist(G_final, "train_reordered/"+str(idx)+".edges", data=False)
    dev_dict[idx] = list(G_final.edges)


In [340]:
def indexesFromSentence(vocab, sentence):
    """Look up the vocabulary index of each token's ``'form'`` in ``sentence``.

    Raises:
        KeyError: if a form is not present in ``vocab.word2index``.
    """
    lookup = vocab.word2index
    indexes = []
    for token in sentence:
        indexes.append(lookup[token['form']])
    return indexes

def tensorFromSentence(vocab, sentence):
    """Encode ``sentence`` as a column tensor of vocabulary indices.

    Returns:
        A ``torch.long`` tensor with one row per token and a single column,
        created on the module-level ``device``.
    """
    token_ids = indexesFromSentence(vocab, sentence)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def pairfromidx(idx):
    """Assemble the training example for the ``idx``-th filtered sentence.

    Returns:
        ``(input_tensor, target_tensor, n, text)``: the sentence's token-index
        tensor, the float distance matrix of the graph stored in
        ``train/<idx>.edges`` (moved to ``device``), the number of nodes of
        that graph, and the sentence's raw text.
    """
    token_tensor = tensorFromSentence(input_vocab, filtered_sentences[idx])
    tree = load_graph("train/"+str(idx)+".edges")
    distances = torch.from_numpy(get_dist_mat(tree)).float().to(device)
    node_count = tree.order()
    return (token_tensor, distances, node_count, sentences_text[idx])


In [341]:
class EncoderLSTM(nn.Module):
    """Single-layer recurrent encoder fed one token index per call.

    Despite the class name, the recurrent unit is an ``nn.GRU`` (stored as
    ``self.gru``); the name is kept so existing callers keep working.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: width of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Advance the GRU by one step.

        Args:
            input: index tensor whose embedding is reshaped to ``(1, 1, -1)``.
            hidden: previous GRU state.

        Returns:
            ``(output, hidden)`` exactly as produced by ``nn.GRU``.
        """
        # Reshape to (sequence length 1, batch size 1, features) for the GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial state of shape ``(1, 1, hidden_size)``.

        The state is created with the dtype and device of the model's own
        weights rather than a module-level global, so it always matches the
        model even after ``.to(...)`` or ``.double()``.
        """
        weight = self.embedding.weight
        return torch.zeros(1, 1, self.hidden_size, dtype=weight.dtype, device=weight.device)

In [342]:
def train(input_tensor, ground_truth, n, encoder, encoder_optimizer, max_length=MAX_LENGTH):
    """Run one optimisation step of ``encoder`` on a single sentence.

    The sentence is fed to the encoder one token at a time, the per-token
    outputs are collected, and their distortion against ``ground_truth`` is
    back-propagated.

    Args:
        input_tensor: token-index tensor; its first dimension is the sentence length.
        ground_truth: target matrix passed to ``distortion``.
        n: size argument forwarded to ``distortion``.
        encoder: model providing ``initHidden()``, ``hidden_size`` and a
            one-token forward call.
        encoder_optimizer: optimizer stepping the encoder's parameters.
        max_length: accepted for interface compatibility; not used here.

    Returns:
        The loss value divided by the sentence length, as a Python float.
    """
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    encoder_outputs = torch.zeros(input_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # NOTE(review): encoder_outputs holds one hidden_size-wide vector per
    # token, while ground_truth comes from a node-by-node distance matrix, so
    # this compares raw embedding coordinates with graph distances and puts
    # the encoder output in the "reference" (first) slot of distortion().
    # Presumably a pairwise-distance matrix of the embeddings was meant to be
    # the second argument - confirm before relying on the reported loss.
    loss = distortion(encoder_outputs, ground_truth, n)
    loss.backward()
    encoder_optimizer.step()

    return loss.item() / input_length

In [343]:
def trainIters(encoder, n_iters=2000, print_every=500, plot_every=100, learning_rate=1.0):
    """Train ``encoder`` with SGD on the first ``n_iters`` training examples.

    Args:
        encoder: model handed to ``train`` for every example.
        n_iters: number of examples; built via ``pairfromidx(0 .. n_iters-1)``.
        print_every: print the running mean loss every this many steps.
        plot_every: record the running mean loss every this many steps.
        learning_rate: learning rate of the ``optim.SGD`` optimizer.

    The recorded means are passed to ``showPlot`` once training is finished.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset after every printed report
    plot_loss_total = 0   # reset after every recorded plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    training_pairs = [pairfromidx(idx) for idx in range(n_iters)]

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor, target_matrix, n = training_pair[0], training_pair[1], training_pair[2]

        loss = train(input_tensor, target_matrix, n, encoder, encoder_optimizer)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
# Build an encoder over the training vocabulary, move it to the selected
# device, and train it with trainIters' default settings.
hidden_size = 100
encoder = EncoderLSTM(input_size=input_vocab.n_words, hidden_size=hidden_size)
encoder = encoder.to(device)
trainIters(encoder)