In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
from data_utils import create_vocabularies, load_dataset

DATA_PATH = "Data/ner_dataset.csv"

In [3]:
sentences, tags = load_dataset(DATA_PATH)

Total Number of sentences : 47959


In [4]:
X_train, X_test, y_train, y_test = train_test_split(sentences, 
                                                    tags, test_size=0.2)

In [5]:
UNK_TOKEN = '<unk>'
PAD_WORD_TOKEN = '<pad>'
PAD_TAG_TOKEN = '<pad_tag>'

word_vocab, tag_vocab = create_vocabularies(X_train, y_train)

In [6]:
print(f"Vocabulary size : {len(word_vocab)}")

Vocabulary size : 31985


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [8]:
class NERDataset:
    def __init__(self, vocab_dict, tag_dict, sent_list, tag_list):
        self.vocab = vocab_dict
        self.tags = tag_dict
        
        self.sentences = sent_list
        self.sentence_tags = tag_list
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sent = self.sentences[idx]
        tags = self.sentence_tags[idx]
        
        psent = [self.vocab.get(word, self.vocab[UNK_TOKEN]) for word in sent]
        ptags = [self.tags.get(tag) for tag in tags]
        
        return torch.LongTensor(psent), torch.LongTensor(ptags)

In [9]:
def collate_fn(batch):
    sentences, tags = map(list, zip(*batch))
    max_len = max([len(x) for x in sentences])
    
    bsent = torch.zeros([len(batch), max_len], dtype = torch.long)
    btags = torch.zeros([len(batch), max_len], dtype = torch.long)
    
    for i in range(len(batch)):
        bsent[i, :] = F.pad(sentences[i], (0, max_len - len(sentences[i])), 'constant', word_vocab[PAD_WORD_TOKEN])
        btags[i, :] = F.pad(tags[i], (0, max_len - len(tags[i])), 'constant', tag_vocab[PAD_TAG_TOKEN])
    
    return bsent, btags

In [10]:
train_data = NERDataset(word_vocab, tag_vocab, X_train, y_train)
test_data = NERDataset(word_vocab, tag_vocab, X_test, y_test)

BATCH_SIZE = 32

dl = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_dl = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [11]:
import torch.nn as nn
import torch.optim as optim

In [12]:
class LSTMNet(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, out_size):
        super(LSTMNet, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.output = nn.Linear(hidden_size, out_size)
        self.logSoftmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, x):
        ex = self.embedding(x)
        out, _ = self.lstm(ex)
        out = out.view(-1, out.shape[2])
        out = self.output(out)
        
        return self.logSoftmax(out)

In [13]:
VOCAB_SIZE = len(word_vocab)
OUT_SIZE = len(tag_vocab) - 1
EMB_SIZE = 300
HIDDEN_SIZE = 128

In [14]:
def loss_fn(outputs, label):
    label = label.view(-1)
    
    mask = (label >= 0).float()  # Mask for pad tag tokens i.e. -1
    
    label = label % outputs.shape[1] # because indexing with -ve number is 
                                     # not desired
        
    num_tokens = torch.sum(mask)  # number of tokens to be counted in loss
    
    # we have log likelihood, we are calculating NLLL by neglecting Pad tokens
    return -torch.sum(outputs[range(outputs.shape[0]), label] * mask) / num_tokens

def accuracy(outputs, labels):
    labels = labels.ravel()
    
    mask = (labels >= 0)
    
    outputs = np.argmax(outputs, axis = 1)
    
    return np.sum(outputs == labels) / float(np.sum(mask))

In [15]:
net = LSTMNet(VOCAB_SIZE, EMB_SIZE, HIDDEN_SIZE, OUT_SIZE)
opt = optim.Adam(net.parameters(), lr=1e-04)

In [16]:
net = net.cuda()



for k in range(10):
    train_l = 0
    train_a = 0
    
    test_l = 0
    test_a = 0
    
    for j, (sbatch, tbatch) in enumerate(dl):
        sbatch = sbatch.cuda()
        tbatch = tbatch.cuda()
        out_pred = net(sbatch)
        
        l = loss_fn(out_pred, tbatch)

        l.backward()
        train_l += l.item()
        opt.step()
        
        out = out_pred.data.cpu().numpy()
        tout = tbatch.data.cpu().numpy()
        train_a += accuracy(out, tout)
    print(f"Epoch : {k + 1}")
    print(f"Training Loss : {train_l / j:.3f} Accuracy : {train_a * 100 / j:.3f}")
    
    with torch.no_grad():
        for k, (sbatch, tbatch) in enumerate(test_dl):
            sbatch = sbatch.cuda()
            tbatch = tbatch.cuda()
            
            out_pred = net(sbatch)
            
            l = loss_fn(out_pred, tbatch)
            
            test_l += l.item()
            
            out = out_pred.data.cpu().numpy()
            tout = tbatch.data.cpu().numpy()
            test_a += accuracy(out, tout)
    
    print(f"Testing Loss : {test_l / k:.3f} Accuracy : {test_a * 100 / k:.3f}")

Epoch : 1
Training Loss : 0.763 Accuracy : 83.786
Testing Loss : 0.416 Accuracy : 89.314
Epoch : 2
Training Loss : 0.326 Accuracy : 91.117
Testing Loss : 0.298 Accuracy : 92.658
Epoch : 3
Training Loss : 0.227 Accuracy : 94.208
Testing Loss : 0.255 Accuracy : 94.070
Epoch : 4
Training Loss : 0.187 Accuracy : 95.244
Testing Loss : 0.253 Accuracy : 94.349
Epoch : 5
Training Loss : 0.172 Accuracy : 95.580
Testing Loss : 0.255 Accuracy : 94.510
Epoch : 6
Training Loss : 0.164 Accuracy : 95.696
Testing Loss : 0.261 Accuracy : 94.574
Epoch : 7
Training Loss : 0.159 Accuracy : 95.807
Testing Loss : 0.269 Accuracy : 94.623
Epoch : 8
Training Loss : 0.157 Accuracy : 95.836
Testing Loss : 0.279 Accuracy : 94.615
Epoch : 9
Training Loss : 0.155 Accuracy : 95.849
Testing Loss : 0.292 Accuracy : 94.551
Epoch : 10
Training Loss : 0.154 Accuracy : 95.862
Testing Loss : 0.300 Accuracy : 94.567


In [17]:
idx2word = dict(zip(word_vocab.values(), word_vocab.keys()))
idx2tag = dict(zip(tag_vocab.values(), tag_vocab.keys()))

In [63]:
out = torch.argmax(out_pred, axis = 1)
out = out.reshape(tbatch.shape[0], -1)

sentences = sbatch[:5].data.cpu().numpy()
labels = tbatch[:5].data.cpu().numpy()
preds = out[:5].data.cpu().numpy()

for i in range(5):
    print("Sentence : {}".format(i + 1))
    sent = sentences[i]
    label = labels[i]
    pred = preds[i]
    
    temp = (label >= 0).sum()
    
    sent = list(map(lambda x: idx2word.get(x, idx2word[1]), sent[:temp]))
    sent = '\t'.join(word for word in sent if word != '<pad>')
    
    label = list(map(lambda x: idx2tag.get(x, idx2tag[-1]), label[:temp]))
    label = '\t'.join(word for word in label if word != '<pad_tag>')
    
    pred = list(map(lambda x: idx2tag.get(x, idx2tag[-1]), pred[:temp]))
    pred = '\t'.join(word for word in pred if word != '<pad_tag>')
    
    print(sent)
    print("Truth Labels")
    print(label)
    print("Predictions")
    print(pred)
    print("\n\n")

Sentence : 1
Merck	withdrew	the	popular	drug	last	year	after	a	study	showed	it	doubled	the	risk	of	heart	problems	in	long-term	users	.
Truth Labels
B-org	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O
Predictions
B-org	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O



Sentence : 2
French	Foreign	Minister	Philippe	Douste-Blazy	said	Friday	he	hopes	Iran	will	hear	the	voice	of	reason	and	not	resume	nuclear	activities	.
Truth Labels
B-org	I-org	O	B-per	I-per	O	B-tim	O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O
Predictions
B-gpe	O	I-per	B-per	I-per	O	B-tim	O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O



Sentence : 3
Meanwhile	,	China	has	reported	two	new	outbreaks	of	bird	flu	among	poultry	in	the	northeastern	province	of	Liaoning	,	bringing	the	total	number	of	reported	outbreaks	in	the	country	over	the	past	month	to	six	.
Truth Labels
O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O	O	B-tim	O	B-tim	I-tim	O
Predictions
O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	B-geo	O	O	O	O	O	O	O	O	O	O	O	O	O	B-tim	O	O	O	