In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [4]:
import torch.nn.functional as F

In [5]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            # tags.append(entity_to_number[tag])
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [6]:
train_data_full, train_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data_full, dev_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data_full, test_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [7]:
## train_data = train_data_full[:40000]
## train_tags = train_tags_full[:40000]

## dev_data = dev_data_full[:10000]
## dev_tags = dev_tags_full[:10000]

## test_data = test_data_full[:10000]
## test_tags = test_tags_full[:10000]

train_data = train_data_full
train_tags = train_tags_full

dev_data = dev_data_full
dev_tags = dev_tags_full

test_data = test_data_full
test_tags = test_tags_full

In [8]:
tr_words = [i for sublist in train_data for i in sublist]
dv_words = [i for sublist in dev_data for i in sublist]
tst_words = [i for sublist in test_data for i in sublist]

tr_tags = [i for sublist in train_tags for i in sublist]
dv_tags = [i for sublist in dev_tags for i in sublist]
tst_tags = [i for sublist in test_tags for i in sublist]

In [9]:
def create_vocab(word_list):
    unique_list = []
    vocab = {}
    
    for val in word_list:
        if val not in unique_list:
            unique_list.append(val)
            
    for i, l in enumerate(unique_list):
        vocab[l] = i
    return vocab

In [10]:
word_vocab = create_vocab(tr_words+dv_words+tst_words)
og_length = len(word_vocab)

In [11]:
label_vocab = create_vocab(tr_tags+dv_tags+tst_tags)

In [12]:
print(len(label_vocab))
label_vocab

9


{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [13]:
len(word_vocab)

30290

In [14]:
def check_similar_label_length(data):
    current_label = 'O'
    current_length = 0
    longest_similar_length = 0
    longest_label = 'O'
    
    for sentence in data:
        current_label = 'O'
        current_length = 0
        
        for i in sentence:
            if i == 'O':
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                    
                current_label = 'O'
                current_length = 0
                
            elif i != current_label:
                
                if current_length > longest_similar_length:
                    longest_similar_length = current_length
                    longest_label = current_label
                
                current_label = i
                current_length = 1
            
            else:
                current_length += 1
    return longest_similar_length, longest_label
                
        

In [15]:
print(check_similar_label_length(train_tags))
print(check_similar_label_length(dev_tags))
print(check_similar_label_length(test_tags))

(9, 'I-ORG')
(9, 'I-ORG')
(5, 'I-MISC')


### check_label_order function:

This function is checking whether there is an appropiate B-xxx tag in fornt of each I-xxx sequence

In [16]:
def check_label_order(data): 
    for sentence in data:
        if 'I-PER' in sentence:
            if 'B-PER' not in sentence:
                return False, sentence
        
        if 'I-ORG' in sentence:
            if 'B-ORG' not in sentence:
                return False, sentence
        
        if 'I-MISC' in sentence:
            if 'B-MISC' not in sentence:
                return False, sentence
            
        if 'I-LOC' in sentence:
            if 'B-LOC' not in sentence:
                return False, sentence
            
        
        for i, j in enumerate(sentence[:-1]):
            if sentence[i + 1] ==  'I-PER':
                    if sentence[i] != 'I-PER' and sentence[i] != 'B-PER':
                        return False, sentence
            if sentence[i + 1] ==  'I-ORG':
                    if sentence[i] != 'I-ORG' and sentence[i] != 'B-ORG':
                        return False, sentence
            if sentence[i + 1] ==  'I-MISC':
                    if sentence[i] != 'I-MISC' and sentence[i] != 'B-MISC':
                        return False, sentence
            if sentence[i + 1] ==  'I-LOC':
                    if sentence[i] != 'I-LOC' and sentence[i] != 'B-LOC':
                        return False, sentence
    return True

In [17]:
print(check_label_order(train_tags))
print(check_label_order(dev_tags))
print(check_label_order(test_tags))

True
True
True


In [18]:
training_data = list(zip(train_data, train_tags))
testing_data = list(zip(test_data, test_tags))

In [19]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [20]:
testing_data[1][1]

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O']

In [21]:
VOCAB_SIZE = len(word_vocab)
OUT_DIM = len(label_vocab)
EMBED_DIM = 64
HIDDEN_DIM = 32
learning_rate = 0.01

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [23]:
model = LSTMTagger(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [24]:
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[1][0], word_vocab)
#     tag_scores = model(inputs)
#     print(tag_scores)

In [25]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[0], recall[0], fscore[0]

In [26]:
def train(model, training_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for sentence, tags in training_data:
        
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_vocab)
        targets = prepare_sequence(tags, label_vocab)
        
        tag_scores = model(sentence_in)

        loss = criterion(tag_scores, targets)
        prec, recall, fscore = class_performance(tag_scores, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall += recall.item()
        epoch_fscore += fscore.item()
        
    return (epoch_loss / len(training_data),
            epoch_prec / len(training_data),
            epoch_recall / len(training_data),
            epoch_fscore / len(training_data),
    )

In [27]:
def evaluate(model, testing_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for sentence, tags in testing_data:
            
            sentence_in = prepare_sequence(sentence, word_vocab)
            targets = prepare_sequence(tags, label_vocab)
            
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            
            prec, recall, fscore = class_performance(tag_scores, targets)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(test_data),
        epoch_prec / len(test_data),
        epoch_recall / len(test_data),
        epoch_fscore / len(test_data),
    )

In [28]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
# for epoch in range(5):
#     print("Current epoch: ")
    
    
    
#     for sentence, tags in training_data:
#         model.zero_grad()
        
#         sentence_in = prepare_sequence(sentence, word_vocab)
#         targets = prepare_sequence(tags, label_vocab)

#         tag_scores = model(sentence_in)

#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()


In [30]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, training_data, loss_function)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, testing_data, loss_function)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 01 | Epoch Time: 3m 11s
	Train Loss: 0.845 | Train Prec: 97.98% | Train Rec: 78.52% | Train Fscore: 85.69%
	 Val. Loss: 0.802 |  Val Prec: 98.33% | Val Rec: 78.99% | Val Fscore: 86.16%
Epoch: 02 | Epoch Time: 2m 59s
	Train Loss: 0.696 | Train Prec: 97.83% | Train Rec: 81.23% | Train Fscore: 87.53%
	 Val. Loss: 0.721 |  Val Prec: 97.49% | Val Rec: 82.15% | Val Fscore: 87.97%
Epoch: 03 | Epoch Time: 2m 40s
	Train Loss: 0.620 | Train Prec: 97.40% | Train Rec: 83.97% | Train Fscore: 89.20%
	 Val. Loss: 0.676 |  Val Prec: 97.15% | Val Rec: 83.72% | Val Fscore: 88.83%
Epoch: 04 | Epoch Time: 2m 36s
	Train Loss: 0.567 | Train Prec: 97.28% | Train Rec: 86.07% | Train Fscore: 90.49%
	 Val. Loss: 0.644 |  Val Prec: 96.75% | Val Rec: 84.77% | Val Fscore: 89.36%
Epoch: 05 | Epoch Time: 2m 58s
	Train Loss: 0.525 | Train Prec: 97.39% | Train Rec: 87.72% | Train Fscore: 91.57%
	 Val. Loss: 0.620 |  Val Prec: 96.77% | Val Rec: 86.08% | Val Fscore: 90.18%
Epoch: 06 | Epoch Time: 2m 46s
	Train Lo

In [31]:
print(testing_data[3][0])
print(testing_data[3][1])

['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']
['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']


In [32]:
with torch.no_grad():
    inputs = prepare_sequence(testing_data[3][1], label_vocab)
    tag_scores = model(inputs)

    print(inputs)
    print(tag_scores)

tensor([5, 0, 5, 8, 8, 0])
tensor([[-2.9190e-04, -9.5194e+00, -1.1069e+01, -9.8535e+00, -1.1782e+01,
         -9.2140e+00, -1.0334e+01, -1.2444e+01, -1.1933e+01],
        [-5.0068e-06, -1.3960e+01, -1.4420e+01, -1.2737e+01, -1.5006e+01,
         -1.8286e+01, -1.5508e+01, -1.6789e+01, -1.7489e+01],
        [-8.7022e-06, -1.3373e+01, -1.3046e+01, -1.2606e+01, -1.5486e+01,
         -1.3896e+01, -1.5178e+01, -1.6133e+01, -1.5895e+01],
        [-5.8459e-03, -7.3551e+00, -6.8860e+00, -6.0619e+00, -9.2522e+00,
         -6.4093e+00, -9.9195e+00, -1.0626e+01, -1.0727e+01],
        [-1.6480e-03, -1.0122e+01, -9.1045e+00, -7.6094e+00, -7.1840e+00,
         -1.0090e+01, -8.8298e+00, -1.0485e+01, -1.0592e+01],
        [-5.9605e-07, -1.6956e+01, -1.5902e+01, -1.5388e+01, -1.5545e+01,
         -2.1150e+01, -1.5606e+01, -1.6928e+01, -1.9042e+01]])
