In [1]:
import pandas as pd
import numpy as np
import json
import csv
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn 

In [2]:
train_indexes = []
train_words = []
train_tags = []
with open("data/train") as f:
    for l in f:
        if l != "\n":
            l = l.strip()
            index = l.split(" ")[0]
            word = l.split(" ")[1]
            tag = l.split(" ")[2]
            
            train_indexes.append(index)
            train_words.append(word)
            train_tags.append(tag)

In [3]:
dev_indexes = []
dev_words = []
dev_tags = []
with open("data/dev") as f:
    for l in f:
        if l != "\n":
            l = l.strip()
            index = l.split(" ")[0]
            word = l.split(" ")[1]
            tag = l.split(" ")[2]
            
            dev_indexes.append(index)
            dev_words.append(word)
            dev_tags.append(tag)

In [4]:
train_data = pd.DataFrame(zip(train_indexes, train_words, train_tags), columns = ["index", "word", "tag"])
dev_data = pd.DataFrame(zip(dev_indexes, dev_words, dev_tags), columns = ["index", "word", "tag"])

In [5]:
train_words = train_data["word"].values
train_indexes = train_data["index"].values
train_pos_tags = train_data["tag"].values
train_words_dict = dict()

# Count words in the training set.

for word in train_words:
    train_words_dict[word] = train_words_dict.get(word,0) + 1

In [6]:
common_vocab = dict()
unknown_words = []

# Find words with less than 3 occurences and store the counts.

for word, count in train_words_dict.items():
    if count <= 3:
        unknown_words.append(word)
    else:
        common_vocab[word] = count

In [7]:
# Sort the words by occurrences and create the vocabulary.

final_words = ["< pad >", "< unk >"]
for word, count in sorted(common_vocab.items(), key = lambda item: item[1], reverse = True):
    final_words.append(word)

index = range(0,len(final_words))

In [8]:
final_vocabulary = pd.DataFrame(zip(final_words, index), columns = ["word", "index"])
final_vocabulary.index = final_vocabulary.index + 1  # shifting index
final_vocabulary = final_vocabulary.sort_index() 

In [9]:
final_vocabulary

Unnamed: 0,word,index
1,< pad >,0
2,< unk >,1
3,.,2
4,",",3
5,the,4
...,...,...
6180,Demel,6179
6181,Kekkila,6180
6182,T&N,6181
6183,Yr,6182


In [10]:
final_vocabulary_dictionary = final_vocabulary[["word","index"]].set_index("word").T.to_dict("list")
final_vocabulary_dictionary

{'< pad >': [0],
 '< unk >': [1],
 '.': [2],
 ',': [3],
 'the': [4],
 'of': [5],
 'in': [6],
 'to': [7],
 'a': [8],
 '(': [9],
 ')': [10],
 'and': [11],
 '"': [12],
 'on': [13],
 'said': [14],
 "'s": [15],
 'for': [16],
 '1': [17],
 '-': [18],
 'The': [19],
 'was': [20],
 '2': [21],
 '-DOCSTART-': [22],
 '0': [23],
 '3': [24],
 'at': [25],
 'with': [26],
 'that': [27],
 'from': [28],
 'by': [29],
 'is': [30],
 ':': [31],
 'as': [32],
 'he': [33],
 '4': [34],
 'had': [35],
 'has': [36],
 'it': [37],
 'his': [38],
 'not': [39],
 'were': [40],
 'be': [41],
 'an': [42],
 'have': [43],
 'after': [44],
 'who': [45],
 'will': [46],
 '5': [47],
 'but': [48],
 'first': [49],
 'U.S.': [50],
 'been': [51],
 '$': [52],
 '--': [53],
 'two': [54],
 'are': [55],
 'their': [56],
 '6': [57],
 'beat': [58],
 'which': [59],
 'would': [60],
 'up': [61],
 'I': [62],
 'its': [63],
 'they': [64],
 'percent': [65],
 'year': [66],
 'out': [67],
 'Thursday': [68],
 'this': [69],
 'last': [70],
 'million': [71],

In [11]:
vocab_size = len(final_vocabulary_dictionary)
vocab_size

6184

In [12]:
tags_dict = dict()
tags = list(train_data["tag"].unique())
for i, tag in enumerate(tags):
    tags_dict[tag] = i+1

In [13]:
tags_dict

{'B-ORG': 1,
 'O': 2,
 'B-MISC': 3,
 'B-PER': 4,
 'I-PER': 5,
 'B-LOC': 6,
 'I-ORG': 7,
 'I-MISC': 8,
 'I-LOC': 9}

In [14]:
new_tags = []
for tag in train_tags:
    new_tags.append(tags_dict[tag])

In [15]:
dev_new_tags = []
for tag in dev_tags:
    dev_new_tags.append(tags_dict[tag])

In [16]:
# Map word to index (train)
# Unknown word = 1
# Else, map to dictionary
word_indexes = []
true_ners = []
sentence = []
temp_ner = []
for i in range(0, len(train_words)):
    ner = new_tags[i]
    try:
        word = final_vocabulary_dictionary[train_words[i]][0]
        
    except:
        word = 1

    if i == len(train_words) - 1:
        temp_ner.append(ner)
        true_ners.append(temp_ner)
        sentence.append(word)
        word_indexes.append(sentence)
        break

    sentence.append(word)
    temp_ner.append(ner)
    if train_indexes[i+1] == "1":
        word_indexes.append(sentence)
        true_ners.append(temp_ner)
        sentence = []
        temp_ner = []

In [17]:
word_indexes[:10]

[[959, 1, 235, 764, 7, 4149, 211, 1, 2],
 [734, 2070],
 [1381, 136],
 [19,
  228,
  457,
  14,
  13,
  68,
  37,
  1,
  26,
  235,
  4150,
  7,
  2478,
  7,
  1,
  211,
  1,
  409,
  3544,
  2071,
  501,
  1791,
  1922,
  653,
  289,
  41,
  1,
  7,
  1923,
  2],
 [116,
  15,
  3112,
  7,
  4,
  228,
  487,
  15,
  2752,
  1060,
  1,
  1,
  14,
  13,
  73,
  2478,
  259,
  876,
  1,
  28,
  539,
  126,
  114,
  124,
  409,
  4,
  2479,
  4150,
  20,
  1,
  2],
 [12,
  119,
  170,
  197,
  342,
  214,
  565,
  1,
  156,
  101,
  170,
  197,
  822,
  214,
  3545,
  16,
  37,
  3,
  12,
  4,
  457,
  15,
  442,
  172,
  1,
  1297,
  2480,
  1,
  88,
  8,
  236,
  3546,
  2],
 [84,
  14,
  715,
  2479,
  2256,
  20,
  2753,
  11,
  198,
  37,
  20,
  338,
  27,
  790,
  20,
  960,
  37,
  259,
  41,
  556,
  29,
  4,
  228,
  487,
  2],
 [84,
  14,
  8,
  2257,
  70,
  222,
  29,
  959,
  1,
  3113,
  4916,
  3547,
  7,
  633,
  1923,
  1,
  3,
  1,
  11,
  4151,
  1,
  28,
  4,
  791,
  1

In [18]:
# Map word to index (dev)
dev_word_indexes = []
dev_true_ners = []
sentence = []
temp_ner = []
for i in range(0, len(dev_words)):
    ner = dev_new_tags[i]
    try:
        word = final_vocabulary_dictionary[dev_words[i]][0]
        
    except:
        word = 1

    if i == len(dev_words) - 1:
        temp_ner.append(ner)
        dev_true_ners.append(temp_ner)
        sentence.append(word)
        dev_word_indexes.append(sentence)
        break
        
    sentence.append(word)
    temp_ner.append(ner)
    if train_indexes[i+1] == "1":
        dev_word_indexes.append(sentence)
        dev_true_ners.append(temp_ner)
        sentence = []
        temp_ner = []

In [19]:
# Function to prepare datasets.

class PrepareDataset(Dataset):
    
    def __init__(self, word_index, label, transform=None):
        self.features = word_index
        self.labels = label
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        
        feature = self.features[index]
        label = self.labels[index]
        return torch.LongTensor(feature), torch.LongTensor(label)

In [20]:
def collate_fn(data):

    sequences, labels = zip(*data)
    length = [len(seq) for seq in sequences]
    padded_seq = torch.zeros(len(sequences), max(length)).long()
    label_seq = torch.zeros(len(sequences), max(length)).long()
    for i, seq in enumerate(zip(sequences, labels)):
        end = length[i]
        padded_seq[i,:end] = seq[0]
        label_seq[i,:end] = seq[1]
        
    return padded_seq, label_seq, torch.tensor([length])

In [21]:
# Split train - val
train_word_indexes = word_indexes[:11200]
train_word_ners = true_ners[:11200]
val_word_indexes = word_indexes[11200:]
val_word_ners = true_ners[11200:]

In [22]:
# Prepare datasets.

train_data_lstm = PrepareDataset(train_word_indexes, train_word_ners)
val_data_lstm = PrepareDataset(val_word_indexes, val_word_ners)
dev_data_lstm = PrepareDataset(dev_word_indexes, dev_true_ners)

In [23]:
# Create data loaders.

batch_size = 16

train_loader_lstm = torch.utils.data.DataLoader(train_data_lstm, batch_size = batch_size, collate_fn = collate_fn)
val_loader_lstm = torch.utils.data.DataLoader(val_data_lstm, batch_size = batch_size, collate_fn = collate_fn)
dev_loader_lstm = torch.utils.data.DataLoader(dev_data_lstm, batch_size = batch_size, collate_fn = collate_fn)

In [24]:
# First sentence, padded with 0, has length 40
for data,target,_ in train_loader_lstm:
    print(data[0].shape, target[0].shape)
    print(data[0], target[0])
    break

torch.Size([40]) torch.Size([40])
tensor([ 959,    1,  235,  764,    7, 4149,  211,    1,    2,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]) tensor([1, 2, 3, 2, 2, 2, 3, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [76]:
# LSTM architecture.

class LSTM(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, dropout_p):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 100, padding_idx = 0)
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.linear = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(p = dropout_p)
        self.elu = nn.ELU()
        self.lstm = nn.LSTM(input_size, hidden_dim // 2, n_layers, batch_first=True, bidirectional=True)   
    
#         self.fc = nn.Linear(output_size, 10)
        self.fc = nn.Linear(output_size, 9)

    
    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        cell = self.init_cell(batch_size)
        x = self.embedding(x)
        x = self.dropout(x)
#         out, hidden = self.lstm(x, (hidden, cell))
        out, hidden = self.lstm(x)

        out = self.dropout(out)
        out = self.elu(self.linear(out))
        out = self.fc(out)
        out = out.view(out.shape[0] * out.shape[1], -1)[:,-1]
        
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # 2 x 20 x 128
        hidden = torch.zeros(2, batch_size, self.hidden_dim // 2)
        
        return hidden
    
    def init_cell(self, batch_size):
        cell = torch.zeros(2, batch_size, self.hidden_dim // 2)
        
        return cell

In [77]:
model_1 = LSTM(100, 128, 256, 1, 0.33)

In [78]:
# weights = [0,1,0.7,1,1,1,1,1,1,1]
weights = [1,0.7,1,1,1,1,1,1,1]

weight_tensor = torch.FloatTensor(weights)

In [79]:
# Initialize loss function and optimizer.

criterion = nn.CrossEntropyLoss(ignore_index = 0, weight = weight_tensor)

optimizer = torch.optim.SGD(model_1.parameters(), lr = 0.5)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose = True)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma = 0.9, verbose = True)

Adjusting learning rate of group 0 to 5.0000e-01.


In [86]:
# Model training.

n_epochs = 5
for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0
    
    model_1.train() # prep model for training
    for data, target, _ in train_loader_lstm:
        optimizer.zero_grad() 
        output, hidden = model_1(data)
#         output = output.contiguous().view(-1, 10)
#         output = output.contiguous().view(-1, 9)
#         target = target.contiguous().view(-1)

#         target = target.view(-1).to(torch.float64)
        print(output.shape)
        print(target.shape)
        print(output)
        loss = criterion(output, target)
        loss.backward() 
        optimizer.step()
        train_loss += loss.item()*data.size(0)
    
    model_1.eval() # prep model for evaluation
    for data, target, _ in val_loader_lstm:
        output, hidden = model_1(data)
#         output = output.contiguous().view(-1, 10)
#         output = output.contiguous().view(-1, 10)
#         target = target.contiguous().view(-1)
        
        
        loss = criterion(output, target)
        valid_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(train_loader_lstm.dataset)
    valid_loss = valid_loss/len(val_loader_lstm.dataset)
    scheduler.step()
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(epoch+1, train_loss, valid_loss))

torch.Size([640])
torch.Size([16, 40])
tensor([-7.4302e-02,  2.2492e-03, -5.0630e-02, -1.5394e-02,  5.4972e-02,
        -9.4638e-03, -3.0981e-02,  6.5776e-02,  4.9027e-02,  2.1473e-02,
         1.6051e-02, -8.0813e-03, -9.1731e-03,  9.2439e-03,  4.1248e-03,
        -8.9288e-03,  1.1795e-02,  1.4176e-02, -2.2808e-04, -6.2785e-04,
         7.6839e-03,  1.7267e-02,  1.0683e-02, -5.1633e-03,  9.8800e-03,
         1.8193e-02, -3.5734e-03,  1.0928e-02,  2.3503e-03,  1.1179e-02,
        -1.4687e-03,  2.4723e-03,  6.3117e-03,  1.8562e-02,  5.0056e-03,
        -2.8291e-03,  1.2038e-02, -5.0723e-03, -3.2683e-03,  6.6601e-03,
         9.5724e-03,  3.4080e-03,  6.1816e-03,  8.1863e-04,  6.5112e-04,
        -1.6210e-03,  1.6366e-02,  5.5235e-03,  9.6755e-03,  8.8363e-03,
         1.0967e-02,  2.6648e-03, -8.8835e-03,  1.3977e-02,  2.0223e-03,
         1.5907e-03,  1.5590e-02,  1.2826e-02,  1.1099e-02,  7.7913e-04,
         1.2449e-02,  5.5221e-03,  1.1144e-02,  7.0263e-03,  2.8291e-03,
         9.5

RuntimeError: 0D or 1D target tensor expected, multi-target not supported

In [85]:
# Get predictions.

prediction_list = []
count = 0
for i, batch in enumerate(dev_loader_lstm):
    prediction = []
    lengths = []
    output, hidden = model_1(batch[0])
    for b in batch[0]:
        lengths.append((torch.count_nonzero(b).item()))
    for i,res in enumerate(output):
        for j in range(lengths[i]):
            prediction.append(torch.argmax(res[j]).item())
    prediction_list.append(prediction)

IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number