# Diploma Thesis Design 1
## Named Entity Recognition


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

  from IPython.core.display import display, HTML


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [3]:
import re
from pathlib import Path

## Importing the data from the filesystem

(all the data are from the *nlpprogress.com* website)

### Explaining the data structure:
- First column contains one word in each row
- Second column contains the part-of-speech tag of the word
- Third column contains the chunk tag of the word
- Fourth column contains the named entity tag of the word

- There are 4 types of named entities in the data
    - Person (PER)
    - Location (LOC)
    - Organization (ORG)
    - Miscellaneous/Other (MISC)


In [4]:
train_data = pd.read_csv(r'D:\Egyetem\Diplomaterv\data\conllpp_train.txt', sep=" ", names=['word', 'part-of-speech', 'chunk', 'named entity'])
dev = pd.read_csv(r'D:\Egyetem\Diplomaterv\data\conllpp_dev.txt', sep=" ", names=['word', 'part-of-speech', 'chunk', 'named entity'])
test = pd.read_csv(r'D:\Egyetem\Diplomaterv\data\conllpp_test.txt', sep=" ", names=['word', 'part-of-speech', 'chunk', 'named entity'])

In [5]:
train_data.head(10)

Unnamed: 0,word,part-of-speech,chunk,named entity
0,-DOCSTART-,-X-,-X-,O
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O
5,to,TO,B-VP,O
6,boycott,VB,I-VP,O
7,British,JJ,B-NP,B-MISC
8,lamb,NN,I-NP,O
9,.,.,O,O


### Importing the data in another way

In [6]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [7]:
texts, tags = read_and_sentence('D:\Egyetem\Diplomaterv\data\conllpp_train.txt')

In [8]:
tags[:5]

[['O'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 ['B-PER', 'I-PER'],
 ['B-LOC', 'O'],
 ['O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [9]:
texts[:5]

[['-DOCSTART-'],
 ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.']]

In [10]:
train_data

Unnamed: 0,word,part-of-speech,chunk,named entity
0,-DOCSTART-,-X-,-X-,O
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O
...,...,...,...,...
204562,three,CD,I-NP,O
204563,Swansea,NN,B-NP,B-ORG
204564,1,CD,I-NP,O
204565,Lincoln,NNP,I-NP,B-ORG


In [11]:
train_data[1:15]

Unnamed: 0,word,part-of-speech,chunk,named entity
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O
5,to,TO,B-VP,O
6,boycott,VB,I-VP,O
7,British,JJ,B-NP,B-MISC
8,lamb,NN,I-NP,O
9,.,.,O,O
10,Peter,NNP,B-NP,B-PER


In [12]:
len(train_data)

204567

In [13]:
train_data.tail(5)

Unnamed: 0,word,part-of-speech,chunk,named entity
204562,three,CD,I-NP,O
204563,Swansea,NN,B-NP,B-ORG
204564,1,CD,I-NP,O
204565,Lincoln,NNP,I-NP,B-ORG
204566,2,CD,I-NP,O


In [14]:
postags = train_data.groupby("part-of-speech")
## words = train.groupby("word")

In [15]:
ner_unique = train_data.groupby("named entity")
ner_unique.first()

Unnamed: 0_level_0,word,part-of-speech,chunk
named entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B-LOC,BRUSSELS,NNP,B-NP
B-MISC,German,JJ,B-NP
B-ORG,EU,NNP,B-NP
B-PER,Peter,NNP,B-NP
I-LOC,Strait,NNP,I-NP
I-MISC,Spongiform,NNP,I-NP
I-ORG,Commission,NNP,I-NP
I-PER,Blackburn,NNP,I-NP
O,-DOCSTART-,-X-,-X-


In [16]:
vmi = train_data[train_data.word == "-DOCSTART-"]
vmi

Unnamed: 0,word,part-of-speech,chunk,named entity
0,-DOCSTART-,-X-,-X-,O
470,-DOCSTART-,-X-,-X-,O
660,-DOCSTART-,-X-,-X-,O
900,-DOCSTART-,-X-,-X-,O
977,-DOCSTART-,-X-,-X-,O
...,...,...,...,...
203676,-DOCSTART-,-X-,-X-,O
203867,-DOCSTART-,-X-,-X-,O
204161,-DOCSTART-,-X-,-X-,O
204273,-DOCSTART-,-X-,-X-,O


In [17]:
postags.first()

Unnamed: 0_level_0,word,chunk,named entity
part-of-speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
$,$,I-NP,O
'',',I-NP,O
(,(,O,O
),),O,O
",",",",O,O
-X-,-DOCSTART-,-X-,O
.,.,O,O
:,--,O,O
CC,and,O,O
CD,1996-08-22,I-NP,O


In [18]:
postags.describe()

Unnamed: 0_level_0,word,word,word,word,chunk,chunk,chunk,chunk,named entity,named entity,named entity,named entity
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
part-of-speech,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
$,427,10,$,362,427,3,B-NP,325,427,2,O,362.0
'',35,1,',35,35,2,O,30,35,2,O,33.0
(,2866,2,(,2861,2866,3,O,2788,2866,4,O,2851.0
),2866,2,),2861,2866,2,O,2851,2866,4,O,2851.0
",",7291,2,",",7290,7291,7,O,6961,7291,5,O,7276.0
-X-,946,1,-DOCSTART-,946,946,1,-X-,946,946,1,O,946.0
.,7389,3,.,7374,7389,2,O,7344,7389,4,O,7376.0
:,2386,5,-,1243,2386,5,O,1878,2386,2,O,2382.0
CC,3653,19,and,2838,3653,7,O,2642,3653,4,O,3580.0
CD,19704,3491,1,1420,19704,9,I-NP,13553,19704,8,O,19596.0


In [19]:
full_train = train_data
## train_data = train_data.head(10000)

full_dev = dev
## dev = dev.head(10000)

full_test = test
## test = test.head(10000)


In [20]:
def label_to_number(data):
    entity = data["named entity"]
    label = 0
    if entity == "O":
        label = 0
    elif entity == "B-ORG":
        label = 1
    elif entity == "B-MISC":
        label = 2
    elif entity == "B-PER":
        label = 3
    elif entity == "I-PER":
        label = 4
    elif entity == "B-LOC":
        label = 5
    elif entity == "I-ORG":
        label = 6
    elif entity == "I-MISC":
        label = 7
    elif entity == "I-LOC":
        label = 8
    
    return label



In [21]:
# label_to_number = {'O': 0,'B-ORG': 1,'B-MISC': 2,'B-PER': 3,'I-PER': 4,'B-LOC': 5,'I-ORG': 6,'I-MISC': 7,'I-LOC': 8}

#### Weights (a calculated the weidhts in another notebooks (for example in dt_2), i just use them here

In [22]:
import torch

weights = torch.Tensor([  1.1996,  32.3631,  59.5017,  30.9950,  45.1782,  28.6508,  55.2287,
        177.1143, 176.8081])
weights

tensor([  1.1996,  32.3631,  59.5017,  30.9950,  45.1782,  28.6508,  55.2287,
        177.1143, 176.8081])

## First model (a feed-forward neural network):

In [23]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [24]:
train_word_list = []
train_label_list = []
for index, row in train_data.iterrows():
    train_word_list.append(row.word)
    train_label_list.append(label_to_number(row))

In [25]:
dev_word_list = []
dev_label_list = []
for index, row in dev.iterrows():
    dev_word_list.append(row.word)
    dev_label_list.append(label_to_number(row))

In [26]:
test_word_list = []
test_label_list = []
for index, row in test.iterrows():
    test_word_list.append(row.word)
    test_label_list.append(label_to_number(row))

In [27]:
print(train_label_list[:10])
print(train_word_list[:10])

[0, 1, 0, 2, 0, 0, 0, 2, 0, 0]
['-DOCSTART-', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [28]:
print(test_label_list[:10])
print(test_word_list[:10])

[0, 0, 0, 5, 0, 0, 0, 0, 5, 0]
['-DOCSTART-', 'SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN']


In [29]:
train_word_list = pd.Series(train_word_list).fillna("").tolist()

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

vectorizer = CountVectorizer()

word_to_ix = vectorizer.fit(train_word_list)

In [31]:
VOCAB_SIZE = len(word_to_ix.vocabulary_)
OUT_DIM = len(ner_unique.first())
EMBED_DIM = 100
HIDDEN_DIM_1 = 200
HIDDEN_DIM_2 = 100

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [32]:
tr_data, val_data = train_word_list, test_word_list

tr_data_vecs = torch.FloatTensor(word_to_ix.transform(tr_data).toarray())
val_data_vecs = torch.FloatTensor(word_to_ix.transform(val_data).toarray())

tr_labels = train_label_list
val_labels = test_label_list

In [33]:
tr_data_loader = [(sample, label) for sample, label in zip(tr_data_vecs, tr_labels)]
val_data_loader = [(sample, label) for sample, label in zip(val_data_vecs, val_labels)]

train_iterator = DataLoader(tr_data_loader,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            )

valid_iterator = DataLoader(val_data_loader,
                          batch_size=BATCH_SIZE,
                          shuffle=False,
                          )

In [34]:
print(type(tr_labels))

<class 'list'>


In [35]:
class SimpleFNN(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(SimpleFNN, self).__init__()
        self.linear = nn.Linear(vocab_size, OUT_DIM)
    
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [36]:
class FNN(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(FNN, self).__init__()
    
        self.hidden1 = nn.Linear(vocab_size, HIDDEN_DIM_1)
        self.act1 = nn.ReLU()
        
        self.hidden2 = nn.Linear(HIDDEN_DIM_1, HIDDEN_DIM_2)
        self.act2 = nn.ReLU()
        
        self.hidden3 = nn.Linear(HIDDEN_DIM_2, num_labels)
        self.act3 = nn.Sigmoid()
    
    def forward(self, bow_vec):
        bow_vec = self.hidden1(bow_vec)
        bow_vec = self.act1(bow_vec)
        
        bow_vec = self.hidden2(bow_vec)
        bow_vec = self.act2(bow_vec)
        
        bow_vec = self.hidden3(bow_vec)
        bow_vec = self.act3(bow_vec)
        return F.log_softmax(bow_vec, dim=1)

In [37]:
model = FNN(OUT_DIM, VOCAB_SIZE)

In [38]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.NLLLoss(weight=weights)
## criterion = nn.NLLLoss()

model = model.to(device)
criterion = criterion.to(device)

In [39]:
def class_accuracy(preds, y):
    
    rounded_preds = preds.argmax(1)
    correct = (rounded_preds == y).float()
    
    acc = correct.sum() / len(correct)
    return acc

In [40]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[0], recall[0], fscore[0]

In [41]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for texts, labels in iterator:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
                  
        predictions = model(texts)

        loss = criterion(predictions, labels)
        ## acc = class_accuracy(predictions, labels)
        prec, recall, fscore = class_performance(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall += recall.item()
        epoch_fscore += fscore.item()
        
    return (epoch_loss / len(iterator),
            epoch_prec / len(iterator),
            epoch_recall / len(iterator),
            epoch_fscore / len(iterator),
    )

In [42]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for texts, labels in iterator:
            
            texts = texts.to(device)
            labels = labels.to(device)
            
            predictions = model(texts)
            loss = criterion(predictions, labels)
            
            ## acc = class_accuracy(predictions, labels)
            prec, recall, fscore = class_performance(predictions, labels)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (epoch_loss / len(iterator),
            epoch_prec / len(iterator),
            epoch_recall / len(iterator),
            epoch_fscore / len(iterator),
    )

In [43]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [44]:
import warnings
warnings.filterwarnings('ignore')

In [45]:
import torch.nn.functional as F

N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, train_iterator, optimizer, criterion)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

Epoch: 01 | Epoch Time: 0m 36s
	Train Loss: 1.836 | Train Prec: 62.24% | Train Rec: 98.36% | Train Fscore: 75.82%
	 Val. Loss: 1.770 |  Val Prec: 60.94% | Val Rec: 99.05% | Val Fscore: 74.83%
Epoch: 02 | Epoch Time: 0m 38s
	Train Loss: 1.594 | Train Prec: 64.49% | Train Rec: 99.44% | Train Fscore: 78.18%
	 Val. Loss: 1.755 |  Val Prec: 63.04% | Val Rec: 98.92% | Val Fscore: 76.38%
Epoch: 03 | Epoch Time: 0m 39s
	Train Loss: 1.555 | Train Prec: 64.67% | Train Rec: 99.58% | Train Fscore: 78.35%
	 Val. Loss: 1.752 |  Val Prec: 61.29% | Val Rec: 99.05% | Val Fscore: 75.13%
Epoch: 04 | Epoch Time: 0m 40s
	Train Loss: 1.544 | Train Prec: 66.01% | Train Rec: 99.59% | Train Fscore: 79.34%
	 Val. Loss: 1.753 |  Val Prec: 62.56% | Val Rec: 98.96% | Val Fscore: 76.01%
Epoch: 05 | Epoch Time: 0m 43s
	Train Loss: 1.542 | Train Prec: 65.91% | Train Rec: 99.63% | Train Fscore: 79.29%
	 Val. Loss: 1.754 |  Val Prec: 63.08% | Val Rec: 98.96% | Val Fscore: 76.43%
Epoch: 06 | Epoch Time: 0m 40s
	Train Lo

In [46]:
from sklearn.metrics import classification_report

out_sent = []
out_lab = []
with torch.no_grad():
    for texts, labels in valid_iterator:
        texts = texts.to(device)
        labels = labels.to(device)
        
        sent_tag_scores = model(texts)
        predict = sent_tag_scores.argmax(1)
        
        
        out_sent.append(predict.tolist())
        out_lab.append(labels.tolist())
        
    out_sent = [item for sublists in out_sent for item in sublists]
    out_lab = [item for sublists in out_lab for item in sublists]
        
    ## print(out_sent)
    ## print(out_lab)
        
    print(classification_report(out_sent, out_lab))

              precision    recall  f1-score   support

           0       0.64      0.99      0.78     24947
           1       0.52      0.62      0.57      1417
           2       0.57      0.56      0.56       732
           3       0.00      0.38      0.00         8
           4       0.27      0.25      0.26      1218
           5       0.66      0.76      0.71      1442
           6       0.72      0.04      0.08     15944
           7       0.57      0.23      0.33       625
           8       0.55      0.43      0.48       333

    accuracy                           0.61     46666
   macro avg       0.50      0.47      0.42     46666
weighted avg       0.65      0.61      0.51     46666



## Second model ( a recurrent neural network / long-short term memory model) 

It can be found in the other notebooks