In [109]:
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [185]:
from NER_ETL import EntityETL

In [186]:
ner = EntityETL('glove/glove.6B.50d.txt', 50)

In [187]:
ner.load_train_vocab_nn('clean_data/clean_data.csv')

In [188]:
ner.vocab_size

8768

In [189]:
ner.ne_tag_map

{'UNK_NE': 0,
 'PAD': -1,
 'I-gpe': 1,
 'I-tim': 2,
 'B-art': 3,
 'I-art': 4,
 'I-per': 5,
 'B-geo': 6,
 'O': 7,
 'I-nat': 8,
 'I-eve': 9,
 'I-geo': 10,
 'B-nat': 11,
 'B-gpe': 12,
 'B-org': 13,
 'I-org': 14,
 'B-per': 15,
 'B-tim': 16,
 'B-eve': 17}

In [190]:
ner.pos_tag_map

{'UNK_POS': 0,
 'PAD': -1,
 'WRB': 1,
 'MD': 2,
 'JJS': 3,
 'RRB': 4,
 'VBN': 5,
 'CC': 6,
 'WDT': 7,
 'PRP$': 8,
 'LRB': 9,
 'VB': 10,
 'WP$': 11,
 'RB': 12,
 'IN': 13,
 'NN': 14,
 '.': 15,
 '$': 16,
 'PRP': 17,
 'RP': 18,
 ':': 19,
 'NNS': 20,
 'PDT': 21,
 'WP': 22,
 'RBR': 23,
 'NNP': 24,
 'DT': 25,
 'UH': 26,
 'NNPS': 27,
 'POS': 28,
 'VBP': 29,
 'VBG': 30,
 'RBS': 31,
 'VBZ': 32,
 'VBD': 33,
 'EX': 34,
 'TO': 35,
 'JJ': 36,
 'CD': 37,
 'JJR': 38}

In [191]:
train_sent, train_labels = ner.load_train_input_data('clean_data/clean_data.csv')

In [192]:
train_sent[2990]

['Other',
 'options',
 'include',
 'moving',
 'the',
 'U.N.',
 'mission',
 'headquarters',
 'from',
 'Eritrea',
 'to',
 'Ethiopia',
 'and',
 'downgrading',
 'the',
 'operation',
 'to',
 'either',
 'an',
 'observer',
 'or',
 'liaison',
 'effort',
 '.']

In [219]:
x=1
print(len(train_labels[x]))
print(len(train_sent[x]))

30
30


In [194]:
ner.load_embed_vects(embedding_dim=50)

In [195]:
nn_train_sent, nn_train_pos, nn_train_labels = ner.prep_train_for_nn()

In [220]:
print(len(nn_train_sent[x]))
print(len(nn_train_labels[x]))

30
30


In [249]:
ner.batch_starting_point = 0

In [246]:
batch_1, labels_1 = next(ner.nn_train_batch_generator(nn_train_sent, nn_train_labels, nn_train_pos))

In [247]:
batch_1

tensor([[2866, 5572, 1902,  ...,   -1,   -1,   -1],
        [6292, 6807, 5266,  ...,   -1,   -1,   -1],
        [2866, 3867, 2607,  ...,   -1,   -1,   -1],
        ...,
        [6292, 3419, 6332,  ...,   -1,   -1,   -1],
        [2797, 1949, 3725,  ...,   -1,   -1,   -1],
        [6676, 6555,  344,  ...,   -1,   -1,   -1]])

In [255]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, number_of_tags):
        super(Net, self).__init__()

        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(lstm_hidden_dim, number_of_tags)
    
    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        #run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        #reshape the Variable so that each row contains one token
        s = s.view(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*b
    
    def loss_fn(outputs, labels):
        #reshape labels to give a flat vector of length batch_size*seq_len
        labels = labels.view(-1)  

        #mask out 'PAD' tokens
        mask = (labels >= 0).float()

        #the number of tokens is the sum of elements in mask
        num_tokens = int(torch.sum(mask).data[0])

        #pick the values corresponding to labels and multiply by mask
        outputs = outputs[range(outputs.shape[0]), labels]*mask

        #cross entropy loss for all non 'PAD' tokens
        return -torch.sum(outputs)/num_tokens
    
    

    

In [256]:
model = Net(ner.vocab_size, 50, 20, len(ner.ne_tag_map))

In [257]:
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)


In [258]:
model.parameters()

<generator object Module.parameters at 0x7ff3f71e3620>

In [260]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs==labels)/float(labels.size)

train_iterator = ner.nn_train_batch_generator(nn_train_sent, nn_train_labels)

for _ in range(10):
    batch_sentences, batch_ne_labels = next(train_iterator)
    
    output_batch = model(batch_sentences)
    
    loss = Net.loss_fn(output_batch, batch_ne_labels)
    optimizer.zero_grad()  # clear previous gradients
    loss.backward()        # compute gradients of all variables wrt loss
    optimizer.step()       # perform updates using calculated gradients

    print(accuracy(output_batch, batch_ne_labels))
    

IndexError: index out of range in self