In [1]:
import os
import numpy as np
import scipy.sparse.csgraph as csg
from joblib import Parallel, delayed
import multiprocessing
import networkx as nx
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import math

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import learning_util as lu

In [2]:
# Distortion calculations

def acosh(x):
    return torch.log(x + torch.sqrt(x**2-1))

def dist_h(u,v):
    z  = 2*torch.norm(u-v,2)**2
    uu = 1. + torch.div(z,((1-torch.norm(u,2)**2)*(1-torch.norm(v,2)**2)))
    return acosh(uu)

def distance_matrix_euclidean(input):
    row_n = input.shape[0]
    mp1 = torch.stack([input]*row_n)
    mp2 = torch.stack([input]*row_n).transpose(0,1)
    dist_mat = torch.sum((mp1-mp2)**2,2).squeeze()
    return dist_mat

def distance_matrix_hyperbolic(input):
    row_n = input.shape[0]
    dist_mat = torch.zeros(row_n, row_n, device=device)
    for row in range(row_n):
        for i in range(row_n):
            if i != row:
                dist_mat[row, i] = dist_h(input[row,:], input[i,:])
    return dist_mat

def entry_is_good(h, h_rec): return (not torch.isnan(h_rec)) and (not torch.isinf(h_rec)) and h_rec != 0 and h != 0

def distortion_entry(h,h_rec):
    avg = abs(h_rec - h)/h
    return avg

def distortion_row(H1, H2, n, row):
    avg, good = 0, 0
    for i in range(n):
        if i != row and entry_is_good(H1[i], H2[i]):
            _avg = distortion_entry(H1[i], H2[i])
            good        += 1
            avg         += _avg
    if good > 0:
        avg /= good 
    else:
        avg, good = torch.tensor(0., device=device, requires_grad=True), torch.tensor(0., device=device, requires_grad=True)
    return (avg, good)

def distortion(H1, H2, n, jobs=16):
#     dists = Parallel(n_jobs=jobs)(delayed(distortion_row)(H1[i,:],H2[i,:],n,i) for i in range(n))
    dists = (distortion_row(H1[i,:],H2[i,:],n,i) for i in range(n))
    to_stack = [tup[0] for tup in dists]
    avg = torch.stack(to_stack).sum()/n
    return avg


#Loading the graph and getting the distance matrix.

def load_graph(file_name, directed=False):
    G = nx.DiGraph() if directed else nx.Graph()
    with open(file_name, "r") as f:
        for line in f:
            tokens = line.split()
            u = int(tokens[0])
            v = int(tokens[1])
            if len(tokens) > 2:
                w = float(tokens[2])
                G.add_edge(u, v, weight=w)
            else:
                G.add_edge(u,v)
    return G


def compute_row(i, adj_mat): 
    return csg.dijkstra(adj_mat, indices=[i], unweighted=True, directed=False)

def get_dist_mat(G):
    n = G.order()
    adj_mat = nx.to_scipy_sparse_matrix(G, nodelist=list(range(G.order())))
    t = time.time()
    
    num_cores = multiprocessing.cpu_count()
    dist_mat = Parallel(n_jobs=num_cores)(delayed(compute_row)(i,adj_mat) for i in range(n))
    dist_mat = np.vstack(dist_mat)
    return dist_mat


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [3]:
#Riemannian SGD

import glob
from torch.optim import Optimizer

class RiemannianSGD(Optimizer):
    """Riemannian stochastic gradient descent.
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
    """

    def __init__(self, params, lr):
        # if lr is not required and lr < 0.0:
        #     raise ValueError("Invalid learning rate: {}".format(lr))
        defaults = dict(lr=lr)
        super(RiemannianSGD, self).__init__(params, defaults)

    def step(self):
        """Performs a single optimization step.
        Arguments:
            lr (float): learning rate for the current update.
        """
        loss = None

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                lr = group['lr']
            
            p.data.add_(hyperbolic_step(p.data, d_p, lr))
            p.data.clamp_(min=-10000.0, max=10000.0)
            p.grad.clamp_(min=-10000.0, max=10000.0)

        return loss

def batch_dot(u, v):
    return torch.sum(u * v, dim=-1, keepdim=True)

def natural_grad(v, dv):
    vnorm_squared = batch_dot(v, v)
    dv = dv * ((1 - vnorm_squared) ** 2 / 4).expand_as(dv)
    return dv

def batch_add(u, v, c=1):
    numer = 1 + 2 * batch_dot(u, v) + batch_dot(v, v) * u + (1 - batch_dot(u, u)) * v
    denom = 1 + 2 * batch_dot(u, v) + batch_dot(v, v) * batch_dot(u, u)

    return numer/denom

def batch_exp_map(x, v, c=1):
    term = torch.tanh((torch.norm(v, dim=-1, keepdim=True) / (1 - torch.norm(x, dim=-1, keepdim=True).pow(2)))) * \
                 (v/(torch.norm(v, dim=-1, keepdim=True)))
    return batch_add(x, term, c)

def hyperbolic_step(param, grad, lr):
    ngrad = natural_grad(param, grad)
    return batch_exp_map(param, -lr * ngrad, c=1)

In [4]:
class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

    def addSentence(self, sentence):
        for token in sentence:
            self.addWord(token['form'])

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [5]:
from conllu import parse_tree, parse_tree_incr, parse, parse_incr
from io import open
import scipy.sparse.csgraph as csg
import networkx as nx
from collections import defaultdict
import json
import string


def unroll(node, G):
    if len(node.children) != 0:
        for child in node.children:
            G.add_edge(node.token['id'], child.token['id'])
            unroll(child, G)
    return G

sentences = []
data_file = open("UD_English-EWT/en_ewt-ud-train.conllu", "r", encoding="utf-8")
for sentence in parse_incr(data_file):
    sentences.append(sentence)
    
MIN_LENGTH = 10
MAX_LENGTH = 50

def check_length(sentence):
    return len(sentence) < MAX_LENGTH and len(sentence) > MIN_LENGTH 

def filterSentences(sentences):
    return [sent for sent in sentences if check_length(sent)]

input_vocab = Vocab("ewt_train_trimmed")
filtered_sentences = filterSentences(sentences)

sentences_text = []
for sent in filtered_sentences:
    input_vocab.addSentence(sent)
    sentences_text.append(sent.metadata['text'])
    
dev_dict  = {}
for idx in range(0, len(filtered_sentences)):
    curr_tree = filtered_sentences[idx].to_tree()
    G_curr = nx.Graph()
    G_curr = unroll(curr_tree, G_curr)
    G = nx.relabel_nodes(G_curr, lambda x: x-1)
    nx.write_edgelist(G, "train/"+str(idx)+".edges", data=False)
    G_final = nx.convert_node_labels_to_integers(G_curr, ordering = "decreasing degree")
    nx.write_edgelist(G_final, "ewt_train/"+str(idx)+".edges", data=False)
    dev_dict[idx] = list(G_final.edges)



In [6]:
def indexesFromSentence(vocab, sentence):
    return [vocab.word2index[token['form']] for token in sentence]

def tensorFromSentence(vocab, sentence):
    indexes = indexesFromSentence(vocab, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def pairfromidx(idx):
    input_tensor = tensorFromSentence(input_vocab, filtered_sentences[idx])
    G = load_graph("train/"+str(idx)+".edges")
    target_matrix = get_dist_mat(G)
    target_tensor = torch.from_numpy(target_matrix).float().to(device)
    target_tensor.requires_grad = False
    n = G.order()
    return (input_tensor, target_tensor, n, sentences_text[idx])


In [7]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    
    
class HyperbolicEncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(HyperbolicEncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = HyperbolicLSTM(cell_class=HyperbolicLSTMCell, input_size=input_size, hidden_size=hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    

class Attention(nn.Module):
    def __init__(self, input_size, hidden_size, max_length=MAX_LENGTH):
        super(Attention, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.max_length = max_length
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)


    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        attention_scores = self.attn(torch.cat((embedded[0], hidden.unsqueeze(0)), 1))
        attn_weights = F.softmax(attention_scores, dim=0)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        return output

In [8]:
#Hyperbolic modules.

class HypLinear(nn.Module):
    """Applies a hyperbolic "linear" transformation to the incoming data: :math:`y = xA^T + b`
    """

    def __init__(self, in_features, out_features, bias=True):
        super(HypLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))

        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input_):
        result = lu.torch_hyp_add(lu.torch_mv_mul_hyp(torch.transpose(self.weight,0,1), input_), self.bias) #(batch, input) x (input, output)
        return result

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

    
class HyperbolicLSTMCell(nn.Module):

    """
    Hyperbolic LSTM cell.
    """

    def __init__(self, input_size, hidden_size, use_bias=True):
        super(HyperbolicLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.use_bias = use_bias
        self.weight_ih = nn.Parameter(torch.FloatTensor(4*hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.FloatTensor(4*hidden_size, hidden_size))
        if use_bias:
            self.bias = nn.Parameter(torch.FloatTensor(1, 4*hidden_size))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """
        Initialize weight parameters.
        Weight matrices -> Xavier uniform
        bias -> constant 0
        """
        torch.nn.init.xavier_uniform_(self.weight_ih.data)
        torch.nn.init.xavier_uniform_(self.weight_hh.data)
        #Set bias vectors to 0.
        if self.use_bias:
            torch.nn.init.constant_(self.bias.data, val=0)

    def forward(self, input_, hx):
        """
        Args:
            input: A (batch, input_size) tensor containing input
                features.
            hx: A tuple (h_0, c_0), which contains the initial hidden
                and cell state, where the size of both states is
                (batch_size, hidden_size).
        Returns:
            h_1, c_1: Tensors containing the next hidden and cell state.
        """
        h_0, c_0 = hx

        #torch_mv_mul_hyp does matmul(x, M) for Mx.
        th = lu.torch_mv_mul_hyp(torch.transpose(self.weight_hh,0,1), h_0) #(batch, hidden) x (hidden, 4*hidden) = (batch, 4*hidden)
        ti = lu.torch_mv_mul_hyp(torch.transpose(self.weight_ih,0,1), input_)  # (batch, input) x (input, 4*hidden) = (batch, 4*hidden)
        rnn_out = lu.torch_hyp_add(lu.torch_hyp_add(th, ti), self.bias) #(batch, 4*hidden)

        f, i, o, g = torch.split(rnn_out, split_size_or_sections= self.hidden_size, dim=1) #Each (batch, hidden)

        f = lu.torch_exp_map_zero(torch.sigmoid(lu.torch_log_map_zero(f)))
        i = lu.torch_exp_map_zero(torch.sigmoid(lu.torch_log_map_zero(i)))
        o = lu.torch_exp_map_zero(torch.sigmoid(lu.torch_log_map_zero(o)))
        g = lu.torch_exp_map_zero(torch.tanh(lu.torch_log_map_zero(g)))

        c_1 = lu.torch_hyp_add(lu.torch_pointwise_prod(f, c_0), lu.torch_pointwise_prod(i, g))
        c_1_nl = lu.torch_exp_map_zero(torch.tanh(lu.torch_log_map_zero(c_1)))
        h_1 = lu.torch_pointwise_prod(o, c_1_nl)
        
        return h_1, c_1


    def __repr__(self):
        s = '{name}({input_size}, {hidden_size})'
        return s.format(name=self.__class__.__name__, **self.__dict__)

class HyperbolicLSTM(nn.Module):

    """A module that runs multiple steps of LSTM in Hyperbolic Space."""
    def __init__(self, cell_class, input_size, hidden_size, num_layers=1, use_bias=True, batch_first=False, **kwargs):
        super(HyperbolicLSTM, self).__init__()
        self.cell_class = cell_class
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.use_bias = use_bias
        self.batch_first = batch_first

        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size
            cell = cell_class(input_size=layer_input_size, hidden_size=hidden_size, **kwargs)
            setattr(self, 'cell_{}'.format(layer), cell)
        self.reset_parameters()

    def get_cell(self, layer):
        return getattr(self, 'cell_{}'.format(layer))

    def reset_parameters(self):
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)
            cell.reset_parameters()

    @staticmethod
    def _forward_rnn(cell, input_, hx):
        max_time = input_.size(0)
        output = []
        for time in range(max_time):
            if isinstance(cell, HyperbolicLSTMCell):
                h_next, c_next = cell(input_=input_[time], hx=hx)
            else:
                h_next, c_next = cell(input_=input_[time], hx=hx)
            hx_next = (h_next, c_next)
            output.append(h_next)
            hx = hx_next
        output = torch.stack(output, 0)
        return output, hx

    def forward(self, input_, hx=None):
        if self.batch_first:
            input_ = input_.transpose(0, 1)
        max_time, batch_size, _ = input_.size()
        if hx is None:
            hx = (torch.zeros(self.num_layers, batch_size, self.hidden_size), torch.zeros(self.num_layers, batch_size, self.hidden_size))
        h_n = []
        c_n = []
        layer_output = None
        for layer in range(self.num_layers):
            cell = self.get_cell(layer)
            hx_layer = (hx[0][layer,:,:], hx[1][layer,:,:])
        
            if layer == 0:
                layer_output, (layer_h_n, layer_c_n) = HyperbolicLSTM._forward_rnn(cell=cell, input_=input_, hx=hx_layer)
            else:
                layer_output, (layer_h_n, layer_c_n) = HyperbolicLSTM._forward_rnn(cell=cell, input_=layer_output, hx=hx_layer)
        
            h_n.append(layer_h_n)
            c_n.append(layer_c_n)
        output = layer_output
        h_n = torch.stack(h_n, 0)
        c_n = torch.stack(c_n, 0)
        return output, (h_n, c_n)              

In [9]:
def trainVanilla(input_tensor, ground_truth, n, encoder, encoder_optimizer, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
 
    input_length = input_tensor.size(0)
    target_length = ground_truth.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    final_embeddings = torch.zeros(input_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
    
    dist_recovered = distance_matrix_euclidean(encoder_outputs)
    loss += distortion(ground_truth, dist_recovered, n)
    loss.backward()
    encoder_optimizer.step()

    return loss.item()

In [10]:
def trainWAttention(input_tensor, ground_truth, n, encoder, encoder_optimizer, attention, attention_optimizer, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    attention_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = ground_truth.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    encoder_hiddens = torch.zeros(input_length, encoder.hidden_size, device=device)
    final_embeddings = torch.zeros(input_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        encoder_hiddens[ei] = encoder_hidden[0, 0]
        
    for idx in range(input_length):
        output = attention(input_tensor[idx], encoder_hiddens[idx], encoder_outputs)
        final_embeddings[idx] = output[0]
        
    dist_recovered = distance_matrix_euclidean(final_embeddings)
    loss += distortion(ground_truth, dist_recovered, n)
    loss.backward()
    encoder_optimizer.step()
    attention_optimizer.step()

    return loss.item()

In [11]:
def trainEuclidean(encoder, attention, n_iters=800, print_every=100, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    attention_optimizer = optim.SGD(attention.parameters(), lr=learning_rate)
    training_pairs = [pairfromidx(idx) for idx in range(n_iters)]

    for iter in range(1, n_iters + 1):     
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_matrix = training_pair[1]
        n = training_pair[2]
        loss = trainWAttention(input_tensor, target_matrix, n, encoder, encoder_optimizer, attention, attention_optimizer)
#         loss = train(input_tensor, target_matrix, n, encoder, encoder_optimizer)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0



In [12]:
hidden_size = 100
encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
attention = Attention(input_vocab.n_words, hidden_size).to(device)
trainEuclidean(encoder, attention)

0m 22s (- 2m 35s) (100 12%) 3.1245
0m 37s (- 1m 51s) (200 25%) 0.5900
0m 52s (- 1m 28s) (300 37%) 0.4712
1m 6s (- 1m 6s) (400 50%) 0.4527
1m 21s (- 0m 48s) (500 62%) 0.4416
1m 33s (- 0m 31s) (600 75%) 0.4353
1m 45s (- 0m 15s) (700 87%) 0.4397
1m 57s (- 0m 0s) (800 100%) 0.4314


In [13]:
def trainHyperbolic(input_tensor, ground_truth, n, encoder, encoder_optimizer, fc, fc_optimizer, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    fc_optimizer.zero_grad()
 
    input_length = input_tensor.size(0)
    target_length = ground_truth.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    final_embeddings = torch.zeros(input_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        
    for idx in range(input_length):
        output = fc(encoder_outputs[idx])
        final_embeddings[idx] = output[0]

    dist_recovered = distance_matrix_hyperbolic(final_embeddings) 
    loss += distortion(ground_truth, dist_recovered, n)
    loss.backward()
    encoder_optimizer.step()
    fc_optimizer.step()

    return loss.item()



In [14]:
def trainHypIters(encoder, fc, n_iters=800, print_every=100, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  

    encoder_optimizer = RiemannianSGD(encoder.parameters(), lr=learning_rate)
    fc_optimizer = RiemannianSGD(fc.parameters(), lr=learning_rate)
    training_pairs = [pairfromidx(idx) for idx in range(n_iters)]

    for iter in range(1, n_iters + 1):     
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_matrix = training_pair[1]
        n = training_pair[2]
        loss = trainHyperbolic(input_tensor, target_matrix, n, encoder, encoder_optimizer, fc, fc_optimizer)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0



In [15]:
hidden_size = 100
encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
fc = nn.Linear(hidden_size, hidden_size).to(device)
# trainHypIters(encoder, fc)