In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [4]:
import torch.nn.functional as F

In [5]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            # tags.append(entity_to_number[tag])
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [6]:
train_data_full, train_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data_full, dev_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data_full, test_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [7]:
## train_data = train_data_full[:40000]
## train_tags = train_tags_full[:40000]

## dev_data = dev_data_full[:10000]
## dev_tags = dev_tags_full[:10000]

## test_data = test_data_full[:10000]
## test_tags = test_tags_full[:10000]

train_data = train_data_full
train_tags = train_tags_full

dev_data = dev_data_full
dev_tags = dev_tags_full

test_data = test_data_full
test_tags = test_tags_full

In [8]:
tr_words = [i for sublist in train_data for i in sublist]
dv_words = [i for sublist in dev_data for i in sublist]
tst_words = [i for sublist in test_data for i in sublist]

tr_tags = [i for sublist in train_tags for i in sublist]
dv_tags = [i for sublist in dev_tags for i in sublist]
tst_tags = [i for sublist in test_tags for i in sublist]

In [9]:
def create_vocab(word_list):
    unique_list = []
    vocab = {}
    
    for val in word_list:
        if val not in unique_list:
            unique_list.append(val)
            
    for i, l in enumerate(unique_list):
        vocab[l] = i
    return vocab

In [10]:
word_vocab = create_vocab(tr_words+dv_words+tst_words)
og_length = len(word_vocab)

In [11]:
label_vocab = create_vocab(tr_tags+dv_tags+tst_tags)

In [12]:
print(len(label_vocab))
label_vocab

9


{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [13]:
len(word_vocab)

30290

In [14]:
def check_similar_label_length(data):
    current_label = 'O'
    current_length = 0
    longest_similar_length = 0
    longest_label = 'O'
    
    for sentence in data:
        current_label = 'O'
        current_length = 0
        
        for i in sentence:
            if i == 'O':
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                    
                current_label = 'O'
                current_length = 0
                
            elif i != current_label:
                
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                
                current_label = i
                current_length = 1
            
            else:
                current_length += 1
    return longest_similar_length, longest_label
                
        

In [15]:
print(check_similar_label_length(train_tags))
print(check_similar_label_length(dev_tags))
print(check_similar_label_length(test_tags))

(9, 'I-ORG')
(9, 'I-ORG')
(5, 'I-MISC')


### check_label_order function:

This function is checking whether there is an appropiate B-xxx tag in fornt of each I-xxx sequence

In [16]:
def check_label_order(data): 
    for sentence in data:
        if 'I-PER' in sentence:
            if 'B-PER' not in sentence:
                return False, sentence
        
        if 'I-ORG' in sentence:
            if 'B-ORG' not in sentence:
                return False, sentence
        
        if 'I-MISC' in sentence:
            if 'B-MISC' not in sentence:
                return False, sentence
            
        if 'I-LOC' in sentence:
            if 'B-LOC' not in sentence:
                return False, sentence
            
        
        for i, j in enumerate(sentence[:-1]):
            if sentence[i + 1] ==  'I-PER':
                    if sentence[i] != 'I-PER' and sentence[i] != 'B-PER':
                        return False, sentence
            if sentence[i + 1] ==  'I-ORG':
                    if sentence[i] != 'I-ORG' and sentence[i] != 'B-ORG':
                        return False, sentence
            if sentence[i + 1] ==  'I-MISC':
                    if sentence[i] != 'I-MISC' and sentence[i] != 'B-MISC':
                        return False, sentence
            if sentence[i + 1] ==  'I-LOC':
                    if sentence[i] != 'I-LOC' and sentence[i] != 'B-LOC':
                        return False, sentence
    return True

In [17]:
print(check_label_order(train_tags))
print(check_label_order(dev_tags))
print(check_label_order(test_tags))

True
True
True


In [18]:
training_data = list(zip(train_data, train_tags))
testing_data = list(zip(test_data, test_tags))

In [19]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [20]:
testing_data[1][1]

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O']

In [21]:
VOCAB_SIZE = len(word_vocab)
OUT_DIM = len(label_vocab)
EMBED_DIM = 64
HIDDEN_DIM = 32
learning_rate = 0.01

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [23]:
model = LSTMTagger(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [24]:
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[1][0], word_vocab)
#     tag_scores = model(inputs)
#     print(tag_scores)

In [25]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[0], recall[0], fscore[0]

In [26]:
def train(model, training_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for sentence, tags in training_data:
        
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_vocab)
        targets = prepare_sequence(tags, label_vocab)
        
        tag_scores = model(sentence_in)

        loss = criterion(tag_scores, targets)
        prec, recall, fscore = class_performance(tag_scores, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall += recall.item()
        epoch_fscore += fscore.item()
        
    return (epoch_loss / len(training_data),
            epoch_prec / len(training_data),
            epoch_recall / len(training_data),
            epoch_fscore / len(training_data),
    )

In [27]:
def evaluate(model, testing_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for sentence, tags in testing_data:
            
            sentence_in = prepare_sequence(sentence, word_vocab)
            targets = prepare_sequence(tags, label_vocab)
            
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            
            prec, recall, fscore = class_performance(tag_scores, targets)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(test_data),
        epoch_prec / len(test_data),
        epoch_recall / len(test_data),
        epoch_fscore / len(test_data),
    )

In [28]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
# for epoch in range(5):
#     print("Current epoch: ")
    
    
    
#     for sentence, tags in training_data:
#         model.zero_grad()
        
#         sentence_in = prepare_sequence(sentence, word_vocab)
#         targets = prepare_sequence(tags, label_vocab)

#         tag_scores = model(sentence_in)

#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()


In [30]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, training_data, loss_function)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, testing_data, loss_function)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 01 | Epoch Time: 2m 45s
	Train Loss: 0.836 | Train Prec: 98.13% | Train Rec: 78.75% | Train Fscore: 85.93%
	 Val. Loss: 0.792 |  Val Prec: 98.26% | Val Rec: 79.17% | Val Fscore: 86.27%
Epoch: 02 | Epoch Time: 2m 47s
	Train Loss: 0.691 | Train Prec: 97.91% | Train Rec: 81.45% | Train Fscore: 87.72%
	 Val. Loss: 0.715 |  Val Prec: 97.55% | Val Rec: 81.68% | Val Fscore: 87.69%
Epoch: 03 | Epoch Time: 2m 46s
	Train Loss: 0.614 | Train Prec: 97.57% | Train Rec: 84.29% | Train Fscore: 89.46%
	 Val. Loss: 0.674 |  Val Prec: 96.98% | Val Rec: 83.31% | Val Fscore: 88.53%
Epoch: 04 | Epoch Time: 2m 42s
	Train Loss: 0.560 | Train Prec: 97.45% | Train Rec: 86.20% | Train Fscore: 90.64%
	 Val. Loss: 0.646 |  Val Prec: 96.57% | Val Rec: 84.25% | Val Fscore: 88.96%
Epoch: 05 | Epoch Time: 2m 40s
	Train Loss: 0.519 | Train Prec: 97.41% | Train Rec: 87.73% | Train Fscore: 91.60%
	 Val. Loss: 0.626 |  Val Prec: 95.98% | Val Rec: 84.64% | Val Fscore: 88.99%
Epoch: 06 | Epoch Time: 2m 44s
	Train Lo

In [64]:
sent_num = 565
print(testing_data[sent_num][0])
print(testing_data[sent_num][1])

['ST', 'LOUIS', 'AT', 'CHICAGO']
['B-ORG', 'I-ORG', 'O', 'B-LOC']


In [66]:
with torch.no_grad():
    inputs = prepare_sequence(testing_data[sent_num][1], label_vocab)
    tag_scores = model(inputs)
    prediction = tag_scores.argmax(1)

    print(inputs)
    ## print(tag_scores)
    print(prediction)

tensor([1, 6, 0, 5])
tensor([1, 0, 0, 0])
