In [26]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

In [27]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [28]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [29]:
import torch.nn.functional as F

In [30]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            # tags.append(entity_to_number[tag])
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [31]:
train_data_full, train_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data_full, dev_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data_full, test_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [32]:
train_data = train_data_full[:40000]
train_tags = train_tags_full[:40000]

dev_data = dev_data_full[:10000]
dev_tags = dev_tags_full[:10000]

test_data = test_data_full[:10000]
test_tags = test_tags_full[:10000]

In [33]:
tr_words = [i for sublist in train_data for i in sublist]
dv_words = [i for sublist in dev_data for i in sublist]
tst_words = [i for sublist in test_data for i in sublist]

tr_tags = [i for sublist in train_tags for i in sublist]
dv_tags = [i for sublist in dev_tags for i in sublist]
tst_tags = [i for sublist in test_tags for i in sublist]

In [34]:
def create_vocab(word_list):
    unique_list = []
    vocab = {}
    
    for val in word_list:
        if val not in unique_list:
            unique_list.append(val)
            
    for i, l in enumerate(unique_list):
        vocab[l] = i
    return vocab

In [35]:
word_vocab = create_vocab(tr_words)
og_length = len(word_vocab)

In [36]:
label_vocab = create_vocab(tr_tags)

In [37]:
print(len(label_vocab))
label_vocab

9


{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [38]:
len(word_vocab)

23624

In [39]:
training_data = list(zip(train_data, train_tags))

In [40]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [41]:
training_data[1][1]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [42]:
VOCAB_SIZE = len(word_vocab)
OUT_DIM = len(label_vocab)
EMBED_DIM = 64
HIDDEN_DIM = 32
learning_rate = 0.01

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [64]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [65]:
model = LSTMTagger(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [45]:
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[1][0], word_vocab)
#     tag_scores = model(inputs)
#     print(tag_scores)

In [58]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[0], recall[0], fscore[0]

In [62]:
def train(model, training_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for sentence, tags in training_data:
        
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_vocab)
        targets = prepare_sequence(tags, label_vocab)
        
        tag_scores = model(sentence_in)

        loss = criterion(tag_scores, targets)
        prec, recall, fscore = class_performance(tag_scores, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall = recall.item()
        epoch_fscore = fscore.item()
        
    return (epoch_loss / len(training_data),
            epoch_prec / len(training_data),
            epoch_recall / len(training_data),
            epoch_fscore / len(training_data),
    )

In [63]:
def evaluate(model, training_data, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for sentence, tags in training_data:
            
            sentence_in = prepare_sequence(sentence, word_vocab)
            targets = prepare_sequence(tags, label_vocab)
            
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            
            prec, recall, fscore = class_performance(tag_scores, targets)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(training_data),
        epoch_prec / len(training_data),
        epoch_recall / len(training_data),
        epoch_fscore / len(training_data),
    )

In [49]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [50]:
# for epoch in range(5):
#     print("Current epoch: ")
    
    
    
#     for sentence, tags in training_data:
#         model.zero_grad()
        
#         sentence_in = prepare_sequence(sentence, word_vocab)
#         targets = prepare_sequence(tags, label_vocab)

#         tag_scores = model(sentence_in)

#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()


In [66]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, training_data, loss_function)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, training_data, loss_function)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

Epoch: 01 | Epoch Time: 2m 26s
	Train Loss: 0.844 | Train Prec: 97.93% | Train Rec: 0.00% | Train Fscore: 0.00%
	 Val. Loss: 0.761 |  Val Prec: 98.10% | Val Rec: 79.83% | Val Fscore: 86.67%
Epoch: 02 | Epoch Time: 2m 39s
	Train Loss: 0.698 | Train Prec: 97.83% | Train Rec: 0.00% | Train Fscore: 0.01%
	 Val. Loss: 0.663 |  Val Prec: 97.70% | Val Rec: 82.91% | Val Fscore: 88.62%
Epoch: 03 | Epoch Time: 2m 37s
	Train Loss: 0.622 | Train Prec: 97.48% | Train Rec: 0.00% | Train Fscore: 0.01%
	 Val. Loss: 0.600 |  Val Prec: 97.55% | Val Rec: 85.05% | Val Fscore: 89.95%
Epoch: 04 | Epoch Time: 3m 3s
	Train Loss: 0.567 | Train Prec: 97.36% | Train Rec: 0.01% | Train Fscore: 0.01%
	 Val. Loss: 0.550 |  Val Prec: 97.52% | Val Rec: 86.49% | Val Fscore: 90.86%
Epoch: 05 | Epoch Time: 3m 33s
	Train Loss: 0.523 | Train Prec: 97.45% | Train Rec: 0.01% | Train Fscore: 0.01%
	 Val. Loss: 0.508 |  Val Prec: 97.71% | Val Rec: 87.91% | Val Fscore: 91.83%
