
# Diploma Thesis -- Gálfi András



## Task: Named Entity Recognition (_A Natural Language Processing task_)



### _Step 1_: Import in all the necesssary libraries

   (there can be imports later in this notebook, up here these are the ones usually needed for an NLP task)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

  from IPython.core.display import display, HTML


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [4]:
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


### _Step 2_: Import the data

   We need data because we want to use a neural network for this task and it needs the dataset for learning.


#### _Function_ read_and_sentence:

   Divide the given file into two datasets, *words* and *labels*:
   

In [5]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            # tags.append(entity_to_number[tag])
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [6]:
train_data_full, train_tags_full = read_and_sentence('D:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data_full, dev_tags_full = read_and_sentence('D:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data_full, test_tags_full = read_and_sentence('D:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [7]:
train_data_full[:5]

[['-DOCSTART-'],
 ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.']]


### _Step 3_: Prepare the data

In their current form the datasets can't be used for teaching. We need to prepare the data and make it understandable for the neural network.

This cell I modified the size of the datasets. I used it when I tested the whole project and it was more practical to use smaller datasets.

In [8]:
## train_data = train_data_full[:4000]
## train_tags = train_tags_full[:4000]

## dev_data = dev_data_full[:1000]
## dev_tags = dev_tags_full[:1000]

## test_data = test_data_full[:1000]
## test_tags = test_tags_full[:1000]

train_data = train_data_full
train_tags = train_tags_full

dev_data = dev_data_full
dev_tags = dev_tags_full

test_data = test_data_full
test_tags = test_tags_full


#### Create vocabularies:

1. turn the datasets (list of lists) into a single list
2. _function_ create_vocab:
    1. selecting the unique members of the list
    2. giving every unique element a number to represent it
3. two vocab is needed:
    1. one for the words --> *word_vocab*
    2. one for the labels --> *label_vocab*

In [9]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

In [10]:
tr_words = [i for sublist in train_data for i in sublist]
dv_words = [i for sublist in dev_data for i in sublist]
tst_words = [i for sublist in test_data for i in sublist]

tr_tags = [i for sublist in train_tags for i in sublist]
dv_tags = [i for sublist in dev_tags for i in sublist]
tst_tags = [i for sublist in test_tags for i in sublist]

In [11]:
words = set(tr_words + dv_words + tst_words)
vocab_size = len(words)
vocab_size

30290

In [12]:
from keras.preprocessing.text import Tokenizer

In [13]:
t = Tokenizer()
t.fit_on_texts(tr_words + dv_words + tst_words)

In [14]:
## print(t.word_counts)
## print(t.document_count)
## print(t.word_docs)

## word_vocab = t.word_index

## print(t.word_index)

In [15]:
def create_vocab(word_list):
    unique_list = []
    vocab = {}
    
    for val in word_list:
        if val not in unique_list:
            unique_list.append(val)
            
    for i, l in enumerate(unique_list):
        vocab[l] = i
    return vocab

In [16]:
word_vocab = create_vocab(tr_words + dv_words + tst_words)

In [17]:
label_vocab = create_vocab(tr_tags+dv_tags+tst_tags)

In [18]:
print(len(label_vocab))
label_vocab

9


{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [19]:
len(word_vocab)

30290


#### *Function* check_similar_label_length:

Check the longest streak of similar labels in the dataset.
(I used this function only to get to know the data a little better)

In [20]:
def check_similar_label_length(data):
    current_label = 'O'
    current_length = 0
    longest_similar_length = 0
    longest_label = 'O'
    
    for sentence in data:
        current_label = 'O'
        current_length = 0
        
        for i in sentence:
            if i == 'O':
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                    
                current_label = 'O'
                current_length = 0
                
            elif i != current_label:
                
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                
                current_label = i
                current_length = 1
            
            else:
                current_length += 1
    return longest_similar_length, longest_label
                
        

In [21]:
print(check_similar_label_length(train_tags))
print(check_similar_label_length(dev_tags))
print(check_similar_label_length(test_tags))

(9, 'I-ORG')
(9, 'I-ORG')
(5, 'I-MISC')


### check_label_order function:

This function is checking whether there is an appropiate B-xxx tag in fornt of each I-xxx sequence

In [22]:
def check_label_order(data): 
    for sentence in data:
        if 'I-PER' in sentence:
            if 'B-PER' not in sentence:
                return False, sentence
        
        if 'I-ORG' in sentence:
            if 'B-ORG' not in sentence:
                return False, sentence
        
        if 'I-MISC' in sentence:
            if 'B-MISC' not in sentence:
                return False, sentence
            
        if 'I-LOC' in sentence:
            if 'B-LOC' not in sentence:
                return False, sentence
            
        
        for i, j in enumerate(sentence[:-1]):
            if sentence[i + 1] ==  'I-PER':
                    if sentence[i] != 'I-PER' and sentence[i] != 'B-PER':
                        return False, sentence
            if sentence[i + 1] ==  'I-ORG':
                    if sentence[i] != 'I-ORG' and sentence[i] != 'B-ORG':
                        return False, sentence
            if sentence[i + 1] ==  'I-MISC':
                    if sentence[i] != 'I-MISC' and sentence[i] != 'B-MISC':
                        return False, sentence
            if sentence[i + 1] ==  'I-LOC':
                    if sentence[i] != 'I-LOC' and sentence[i] != 'B-LOC':
                        return False, sentence
    return True

In [23]:
print(check_label_order(train_tags))
print(check_label_order(dev_tags))
print(check_label_order(test_tags))

True
True
True


This cell makes tuples from the words and the tags.

In [24]:
training_data = list(zip(train_data, train_tags))
eval_data = list(zip(dev_data, dev_tags))
testing_data = list(zip(test_data, test_tags))

#### Function _prepare_sequence_:
Parameters:
- seq: a list of elements, this case it is a sentence
- to_ix: a vocabulary that contains the elements of the seq

The function assigns a number for every element of the sequence (according to the vocab). And returns it in a torch.tensor.long format.

In [25]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long).to(device)

In [26]:
testing_data[1][1]

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O']

In [27]:
import collections
label_collection = collections.Counter([label_vocab[w] for w in tr_tags])
label_collection

Counter({0: 170524,
         1: 6321,
         2: 3438,
         3: 6600,
         4: 4528,
         5: 7140,
         6: 3704,
         7: 1155,
         8: 1157})

In [28]:
## sum(list(label_collection.values()))
list(label_collection.values())

[170524, 6321, 3438, 6600, 4528, 7140, 3704, 1155, 1157]

In [29]:
len(list(label_collection.values()))

9

In [30]:
import collections

def calc_weights(label_list):
    weights = []
    label_collection = collections.Counter(label_list)
    label_values = list(label_collection.values())
    label_sum = sum(label_values)
    
    weights = label_sum / label_values
    
    ## for i in range(len(label_collection.values())):
        ## weights.append(label_sum / (len(label_values) * label_values[i]))
    return weights
    
    

In [31]:
weights = torch.Tensor(calc_weights([label_vocab[w] for w in tr_tags]))
weights

tensor([  1.1996,  32.3631,  59.5017,  30.9950,  45.1782,  28.6508,  55.2287,
        177.1143, 176.8081])

In [32]:
## weights[0] = 0.12
## weights


### Step 4: Neural Network

Now that the data is ready the only thing remains is the Neural Network.


First there are a a few global parameters that need to be assigned.

In [33]:
VOCAB_SIZE = len(word_vocab)
OUT_DIM = len(label_vocab)
EMBED_DIM = 64
HIDDEN_DIM = 32
learning_rate = 0.01

BATCH_SIZE = 256


This is the cell, where the model is defined.

In [34]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

The model, loss and optimizer are defined here. The model is assigned for the class created in the previous cell.

In [35]:
model = LSTMTagger(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)
model.to(device)

loss_function = nn.CrossEntropyLoss(weight=weights)
## loss_function = nn.CrossEntropyLoss()
loss_function.to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [36]:
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[1][0], word_vocab)
#     tag_scores = model(inputs)
#     print(tag_scores)

#### Function *class_performance*:

This function helps evaluate the results.

In [37]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[0], recall[0], fscore[0]

### This is the part where the training is defined.

In [38]:
def train(model, training_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for sentence, tags in training_data:
        
        optimizer.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_vocab)
        targets = prepare_sequence(tags, label_vocab)
        
        tag_scores = model(sentence_in)

        loss = criterion(tag_scores, targets)
        prec, recall, fscore = class_performance(tag_scores, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall += recall.item()
        epoch_fscore += fscore.item()
        
    return (epoch_loss / len(training_data),
            epoch_prec / len(training_data),
            epoch_recall / len(training_data),
            epoch_fscore / len(training_data),
    )

### This is the part where the evaluation is defined.

In [39]:
def evaluate(model, eval_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for sentence, tags in eval_data:
            
            sentence_in = prepare_sequence(sentence, word_vocab)
            targets = prepare_sequence(tags, label_vocab)
            
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            
            prec, recall, fscore = class_performance(tag_scores, targets)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(test_data),
        epoch_prec / len(test_data),
        epoch_recall / len(test_data),
        epoch_fscore / len(test_data),
    )

#### Function *epoch_time*:

Shows how much time has passed since the last epoch during training.

In [40]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [41]:
# for epoch in range(5):
#     print("Current epoch: ")
    
    
    
#     for sentence, tags in training_data:
#         model.zero_grad()
        
#         sentence_in = prepare_sequence(sentence, word_vocab)
#         targets = prepare_sequence(tags, label_vocab)

#         tag_scores = model(sentence_in)

#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()


### This is where the training and the evaluation are actually called. 

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, training_data, loss_function)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, eval_data, loss_function)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

Epoch: 01 | Epoch Time: 1m 28s
	Train Loss: 1.840 | Train Prec: 93.12% | Train Rec: 78.81% | Train Fscore: 83.57%
	 Val. Loss: 1.692 |  Val Prec: 66.38% | Val Rec: 75.41% | Val Fscore: 68.44%
Epoch: 02 | Epoch Time: 1m 26s
	Train Loss: 1.548 | Train Prec: 78.17% | Train Rec: 85.24% | Train Fscore: 79.60%
	 Val. Loss: 1.418 |  Val Prec: 68.53% | Val Rec: 82.67% | Val Fscore: 73.47%
Epoch: 03 | Epoch Time: 1m 22s
	Train Loss: 1.170 | Train Prec: 79.15% | Train Rec: 91.76% | Train Fscore: 83.58%
	 Val. Loss: 1.152 |  Val Prec: 67.12% | Val Rec: 85.40% | Val Fscore: 73.58%
Epoch: 04 | Epoch Time: 1m 22s
	Train Loss: 0.917 | Train Prec: 80.97% | Train Rec: 93.94% | Train Fscore: 85.71%
	 Val. Loss: 1.028 |  Val Prec: 67.67% | Val Rec: 85.76% | Val Fscore: 74.16%
Epoch: 05 | Epoch Time: 1m 20s
	Train Loss: 0.770 | Train Prec: 82.18% | Train Rec: 94.92% | Train Fscore: 86.93%
	 Val. Loss: 0.951 |  Val Prec: 70.59% | Val Rec: 86.22% | Val Fscore: 76.31%
Epoch: 06 | Epoch Time: 1m 21s
	Train Lo

In [44]:
sent_num = 337
print(testing_data[sent_num][0])
print(testing_data[sent_num][1])

['West', 'Indies', 'captain', 'Courtney', 'Walsh', 'elected', 'to', 'bat', 'after', 'winning', 'the', 'toss', 'in', 'the', 'first', 'match', 'in', 'the', 'World', 'Series', 'limited', 'overs', 'competition', 'against', 'Australia', 'at', 'the', 'Melbourne', 'Cricket', 'Ground', 'on', 'Friday', '.']
['B-LOC', 'I-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O']


In [45]:
with torch.no_grad():
    inputs = prepare_sequence(testing_data[sent_num][0], word_vocab)
    tag_scores = model(inputs)
    prediction = tag_scores.argmax(1)

    print(inputs)
    ## print(tag_scores)
    print(prediction)

tensor([ 1063, 13963,  2753, 13972, 13973,  8730,     5,  2313,   119,  3117,
           40, 14772,   236,    40,  1394,  1908,   236,    40,  1787,  1788,
         3703,  3704,  1578,   788,  1832,   156,    40,  9889, 12910, 27289,
           18,  1162,     9])
tensor([5, 8, 0, 3, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 7, 7, 7, 0,
        5, 8, 0, 5, 6, 6, 0, 0, 0])


In [46]:
from sklearn.metrics import classification_report

out_sent = []
out_lab = []
with torch.no_grad():
    for sentence, tags in testing_data:
        sent = prepare_sequence(sentence, word_vocab)
        sent_tag_scores = model(sent)
        predict = sent_tag_scores.argmax(1)
        
        label = prepare_sequence(tags, label_vocab)
        
        out_sent.append(predict.tolist())
        out_lab.append(label.tolist())
        
    out_sent = [item for sublists in out_sent for item in sublists]
    out_lab = [item for sublists in out_lab for item in sublists]
        
    ## print(out_sent)
    ## print(out_lab)
        
    print(classification_report(out_sent, out_lab))
    
        

              precision    recall  f1-score   support

           0       0.85      0.96      0.90     34227
           1       0.41      0.35      0.38      2036
           2       0.59      0.35      0.44      1205
           3       0.51      0.41      0.45      2025
           4       0.61      0.43      0.50      1668
           5       0.75      0.52      0.61      2378
           6       0.51      0.24      0.32      1905
           7       0.51      0.21      0.29       625
           8       0.68      0.29      0.41       597

    accuracy                           0.80     46666
   macro avg       0.60      0.42      0.48     46666
weighted avg       0.78      0.80      0.78     46666

