# Diploma Thesis Design 1
## Named Entity Recognition
### LSTM model

This notebook was created because this way the project stays cleaner and more understandeable.
The data is the same as in the *Diploma_thesis* notebook.

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [4]:
entity_to_number = {"O": 0, "B-LOC": 1, "I-LOC": 2, "B-MISC": 3, "I-MISC": 4, "B-PER": 5, "I-PER": 6, "B-ORG": 7, "I-ORG": 8}

In [5]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            tags.append(entity_to_number[tag])
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [6]:
train_data, train_tags = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data, dev_tags = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data, test_tags = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [7]:
test_tags[:5]

[[0],
 [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [5, 6],
 [1, 0, 1, 2, 2, 0],
 [1, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [8]:
train_words = [i for sublist in train_data for i in sublist]
dev_words = [i for sublist in dev_data for i in sublist]
test_words = [i for sublist in dev_data for i in sublist]

train_tags = [i for sublist in train_tags for i in sublist]
dev_tags = [i for sublist in dev_tags for i in sublist]
test_tags = [i for sublist in test_tags for i in sublist]

In [9]:
train_words = train_words[:40000]
dev_words = dev_words[:10000]
test_words = test_words[:10000]

train_tags = train_tags[:40000]
dev_tags = dev_tags[:10000]
test_tags = test_tags[:10000]


In [10]:
len(train_words)

40000

In [11]:
len(dev_words)

10000

In [12]:
len(test_words)

10000

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

vectorizer = CountVectorizer()

word_to_ix = vectorizer.fit(train_words)

In [14]:
len(word_to_ix.vocabulary_)

7021

In [15]:
VOCAB_SIZE = len(word_to_ix.vocabulary_)
OUT_DIM = len(entity_to_number)
EMBED_DIM = 100
HIDDEN_DIM = 20
learning_rate = 0.001

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
def convert_words_to_number(dataset, vocab):
    words_as_numbers = []
    
    for word in dataset:
        if word in vocab:
            words_as_numbers.append(torch.LongTensor(vocab[word]).to(device))
        else:
            words_as_numbers.append(torch.LongTensor(VOCAB_SIZE).to(device))
    
    return words_as_numbers
        

In [17]:
from torch.nn.utils.rnn import pad_sequence

# tr_data_vecs = torch.FloatTensor(word_to_ix.transform(train_words + dev_words).toarray())
# val_data_vecs = torch.FloatTensor(word_to_ix.transform(test_words).toarray())
tr_data_vecs = pad_sequence(
        convert_words_to_number(train_words + dev_words, word_to_ix.vocabulary_),
        batch_first=True,
        padding_value=VOCAB_SIZE+1,
)
val_data_vecs = pad_sequence(
        convert_words_to_number(test_words, word_to_ix.vocabulary_),
        batch_first=True,
        padding_value=VOCAB_SIZE+1,
)
tr_labels = torch.LongTensor(train_tags + dev_tags).to(device)
val_labels = torch.LongTensor(test_tags).to(device)

In [18]:
tr_data_vecs[10]

tensor([ 448,  448, 2442,  ..., 1177,   66,  190])

In [19]:
tr_data_loader = [(sample, label) for sample, label in zip(tr_data_vecs, tr_labels)]
val_data_loader = [(sample, label) for sample, label in zip(val_data_vecs, val_labels)]

train_iterator = DataLoader(tr_data_loader,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            )

valid_iterator = DataLoader(val_data_loader,
                          batch_size=BATCH_SIZE,
                          shuffle=False,
                          )

In [20]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[1], recall[1], fscore[1]

In [23]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels):
        super(LSTM, self).__init__()
        
        self.hidden_dim = HIDDEN_DIM
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=7020)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.hidden2tag = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [24]:
model = LSTM(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)

In [25]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

model = model.to(device)
criterion = criterion.to(device)

In [26]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for texts, labels in iterator:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
                  
        predictions = model(texts)

        loss = criterion(predictions, labels)
        prec, recall, fscore = class_performance(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall += recall.item()
        epoch_fscore += fscore.item()
        
    return (epoch_loss / len(iterator),
            epoch_prec / len(iterator),
            epoch_recall / len(iterator),
            epoch_fscore / len(iterator),
    )

In [27]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for texts, labels in iterator:
            
            texts = texts.to(device)
            labels = labels.to(device)
            
            predictions = model(texts)
            loss = criterion(predictions, labels)
            
            prec, recall, fscore = class_performance(predictions, labels)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(iterator),
        epoch_prec / len(iterator),
        epoch_recall / len(iterator),
        epoch_fscore / len(iterator),
    )

In [28]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
import torch.nn.functional as F

N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, train_iterator, optimizer, criterion)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

IndexError: index out of range in self