In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
%config IPCompleter.greedy=True

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%pylab inline
import tensorflow as tf
import re
from pathlib import Path

Populating the interactive namespace from numpy and matplotlib


In [3]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split as split
from torch.utils.data import DataLoader

In [4]:
entity_to_number = {"O": 0, "B-LOC": 1, "I-LOC": 2, "B-MISC": 3, "I-MISC": 4, "B-PER": 5, "I-PER": 6, "B-ORG": 7, "I-ORG": 8}

In [5]:
def read_and_sentence(file_path):
    file_path = Path(file_path)
    
    raw_text = file_path.read_text().strip()
    raw_sentences = re.split(r'\n\t?\n', raw_text)
    
    sentence_tokens = []
    sentence_tags = []
    
    for sents in raw_sentences:
        tokens = []
        tags = []
        
        for line in sents.split('\n'):
            token = line.split()[0]
            tag = line.split()[3]
            
            tokens.append(token)
            # tags.append(entity_to_number[tag])
            tags.append(tag)
            
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)
    
    return sentence_tokens, sentence_tags

In [6]:
train_data_full, train_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_train.txt')
dev_data_full, dev_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_dev.txt')
test_data_full, test_tags_full = read_and_sentence('E:\Egyetem\Diplomaterv\data\conllpp_test.txt')

In [7]:
train_data = train_data_full[:40000]
train_tags = train_tags_full[:40000]

dev_data = dev_data_full[:10000]
dev_tags = dev_tags_full[:10000]

test_data = test_data_full[:10000]
test_tags = test_tags_full[:10000]

In [8]:
tr_words = [i for sublist in train_data for i in sublist]
dv_words = [i for sublist in dev_data for i in sublist]
tst_words = [i for sublist in test_data for i in sublist]

tr_tags = [i for sublist in train_tags for i in sublist]
dv_tags = [i for sublist in dev_tags for i in sublist]
tst_tags = [i for sublist in test_tags for i in sublist]

In [9]:
def create_vocab(word_list):
    unique_list = []
    vocab = {}
    
    for val in word_list:
        if val not in unique_list:
            unique_list.append(val)
            
    for i, l in enumerate(unique_list):
        vocab[l] = i
    return vocab
            
    

In [10]:
word_vocab = create_vocab(tr_words)
og_length = len(word_vocab)

In [11]:
label_vocab = create_vocab(tr_tags)

In [12]:
print(len(label_vocab))
label_vocab

9


{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [13]:
len(word_vocab)

23624

In [14]:
word_vocab["UNK"] = og_length
word_vocab["PAD"] = og_length+1

In [15]:
len(word_vocab)

23626

## Next phase

In [16]:
VOCAB_SIZE = len(word_vocab)
OUT_DIM = len(label_vocab)
EMBED_DIM = 100
HIDDEN_DIM = 20
learning_rate = 0.001

BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
def prepare_datasets(sentences, labels):
    
    sentence_to_ix = []
    label_to_ix = []
    
    for sent in sentences:
        current_sent = []
        
        for word in sent:
            if word in word_vocab.keys():
                current_sent.append(word_vocab[word])
            else:
                current_sent.append(word_vocab["UNK"])
        
        sentence_to_ix.append(current_sent)
        
    for label in labels:
        l = [label_vocab[lab] for lab in label]
        label_to_ix.append(l)
   
    return sentence_to_ix, label_to_ix

In [18]:
train_sent_ix, train_label_ix = prepare_datasets(train_data, train_tags)
dev_sent_ix, dev_label_ix = prepare_datasets(dev_data, dev_tags)
test_sent_ix, test_label_ix = prepare_datasets(test_data, test_tags)

In [19]:
test_sent_ix[:5]

[[0],
 [1891, 676, 23624, 23624, 23624, 3395, 71, 21354, 2337, 23624, 10615, 9],
 [23624, 23624],
 [23624, 71, 868, 1139, 1140, 23624],
 [1807,
  1396,
  40,
  3410,
  162,
  201,
  6199,
  1906,
  2142,
  22,
  79,
  7123,
  1895,
  2519,
  788,
  705,
  236,
  79,
  8174,
  8521,
  2724,
  1908,
  18,
  1162,
  9]]

In [20]:
max_len = max([len(s) for s in train_sent_ix])
train_data_vecs = word_vocab["PAD"] * np.ones((len(train_sent_ix), max_len))
train_label_vecs = -1 * np.ones((len(train_sent_ix), max_len))

for j in range(len(train_sent_ix)):
    current_len = len(train_sent_ix[j])
    train_data_vecs[j][:current_len] = train_sent_ix[j]
    train_label_vecs[j][:current_len] = train_label_ix[j]


In [21]:
max_len = max([len(s) for s in test_sent_ix])
test_data_vecs = word_vocab["PAD"] * np.ones((len(test_sent_ix), max_len))
test_label_vecs = -1 * np.ones((len(test_sent_ix), max_len))

for j in range(len(test_sent_ix)):
    current_len = len(test_sent_ix[j])
    test_data_vecs[j][:current_len] = test_sent_ix[j]
    test_label_vecs[j][:current_len] = test_label_ix[j]


In [22]:
train_data_vecs = torch.LongTensor(train_data_vecs)
train_label_vecs = torch.LongTensor(train_label_vecs)

test_data_vecs = torch.LongTensor(test_data_vecs)
test_label_vecs = torch.LongTensor(test_label_vecs)

# train_data_vecs = tf.Variable(train_data_vecs)
# train_label_vecs = tf.Variable(train_label_vecs)
# test_data_vecs = tf.Variable(test_data_vecs)
# test_label_vecs = tf.Variable(test_label_vecs)

In [23]:
tr_data_loader = [(sample, label) for sample, label in zip(train_data_vecs, train_label_vecs)]
val_data_loader = [(sample, label) for sample, label in zip(test_data_vecs, test_label_vecs)]

train_iterator = DataLoader(tr_data_loader,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            )

valid_iterator = DataLoader(val_data_loader,
                          batch_size=BATCH_SIZE,
                          shuffle=False,
                          )

In [24]:
class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels):
        super(RNN, self).__init__()
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    
        self.fc = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, s):
        
        s = self.embedding(s)
        s, _ = self.lstm(s)
        s = s.view(-1, s.shape[2])
        s = self.fc(s)
        
        return F.log_softmax(s, dim=1)

In [25]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels):
        super(LSTM, self).__init__()
        
        self.hidden_dim = HIDDEN_DIM
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.hidden2tag = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [26]:
from sklearn.metrics import precision_recall_fscore_support

def class_performance(preds, y):

    rounded_preds = preds.argmax(1)

    precision, recall, fscore, support = precision_recall_fscore_support(
        rounded_preds.cpu(), y.cpu()
    )

    return precision[1], recall[1], fscore[1]

In [27]:
model = RNN(EMBED_DIM, HIDDEN_DIM, VOCAB_SIZE, OUT_DIM)

In [28]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

model = model.to(device)
criterion = criterion.to(device)

In [29]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0
    
    model.train()
    
    for texts, labels in iterator:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
                  
        predictions = model(texts)

        loss = criterion(predictions, labels)
        prec, recall, fscore = class_performance(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_prec += prec.item()
        epoch_recall = recall.item()
        epoch_fscore = fscore.item()
        
    return (epoch_loss / len(iterator),
            epoch_prec / len(iterator),
            epoch_recall / len(iterator),
            epoch_fscore / len(iterator),
    )

In [30]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_prec = 0
    epoch_recall = 0
    epoch_fscore = 0

    model.eval()

    with torch.no_grad():
        
        for texts, labels in iterator:
            
            texts = texts.to(device)
            labels = labels.to(device)
            
            predictions = model(texts)
            loss = criterion(predictions, labels)
            
            prec, recall, fscore = class_performance(predictions, labels)

            epoch_loss += loss.item()
            epoch_prec += prec.item()
            epoch_recall += recall.item()
            epoch_fscore += fscore.item()
        
    return (
        epoch_loss / len(iterator),
        epoch_prec / len(iterator),
        epoch_recall / len(iterator),
        epoch_fscore / len(iterator),
    )

In [31]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
import torch.nn.functional as F

N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_prec, train_rec, train_fscore = train(model, train_iterator, optimizer, criterion)
    
    valid_loss, valid_prec, valid_rec, valid_fscore = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Prec: {train_prec*100:.2f}% | Train Rec: {train_rec*100:.2f}% | Train Fscore: {train_fscore*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val Prec: {valid_prec*100:.2f}% | Val Rec: {valid_rec*100:.2f}% | Val Fscore: {valid_fscore*100:.2f}%")

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.