In [276]:
import os
import torch

from enum import Enum

import pandas as pd
import numpy as np

import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from uuid import uuid4

# 1. Implement functionality to read and process NER 2003 English Shared Task data in CoNNL file format

In [129]:
def read_file(path):
    with open(path, 'r') as f:
        return f.read()

def read_folder(folder, ext):
    return [
        read_file(os.path.join(folder, filename)) for filename in sorted(os.listdir(folder)) if filename.endswith(ext)
    ]

def read_connl_to_df(folder, ext):
    result = []
    offest = 0
    for txt in read_folder(folder, ext):
        rows = []
        sentences = txt.split('\n\n')
        for i, sentence in enumerate(sentences):
            for word in sentence.split('\n'):
                row = word.split(' ')
                row.append(int(offest + i))
                rows.append(row)
        result.append(pd.DataFrame(rows))
        offest = len(sentences)
    return result

def set_connl_df_naming(df):
    df.columns = ['word', 'part_of_speech', 'chunk', 'tag', 'sentence_id']
    return df.reset_index(drop=True)

In [131]:
test, train = [set_connl_df_naming(df.iloc[1:]) for df in read_connl_to_df('./dataset/', '.txt')]
df = test.append(train)

df.head(10)

3685
14988


Unnamed: 0,word,part_of_speech,chunk,tag,sentence_id
0,SOCCER,NN,B-NP,O,1.0
1,-,:,O,O,1.0
2,JAPAN,NNP,B-NP,B-LOC,1.0
3,GET,VB,B-VP,O,1.0
4,LUCKY,NNP,B-NP,O,1.0
5,WIN,NNP,I-NP,O,1.0
6,",",",",O,O,1.0
7,CHINA,NNP,B-NP,B-PER,1.0
8,IN,IN,B-PP,O,1.0
9,SURPRISE,DT,B-NP,O,1.0


# 2. Implement 3 strategies for loading the embeddings

1) load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

2) load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

3) load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding (20% of score).

In [108]:
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec('./glove.6B.100d.txt', word2vec_output_file)

(400000, 100)

In [252]:
class STRATEGY(Enum):
    CASED = 1
    LOWER = 2
    FALLBACK_TO_LOWER = 3

class W2V:
    
    def __init__(self, w2v):
        self.w2v = w2v
    
    def get_vector(self, word):
        try:
            return self.w2v.get_vector(word)
        except KeyError:
            return self.w2v.get_vector("unk")
        
    def get_vector_lowercased(self, word):
        try:
            return self.w2v.get_vector(str(word).lower())
        except KeyError:
            return self.w2v.get_vector("unk")
        
    def get_vector_lowercased_onfail(self, word):
        unk = self.w2v.get_vector("unk")
        original_case = self.get_vector(word)
        
        if np.array_equal(unk, original_case):
            return self.get_vector_lowercased(word)
        
        return original_case
    
    def enreach_df_with_vector_representation_of_words(self, df, strategy, col_in, col_out):
        if strategy == STRATEGY.CASED:
            df[col_out] = df[col_in].apply(lambda word: self.get_vector(word))
        if strategy == STRATEGY.LOWER:
            df[col_out] = df[col_in].apply(lambda word: self.get_vector_lowercased(word))
        if strategy == STRATEGY.FALLBACK_TO_LOWER:
            df[col_out] = df[col_in].apply(lambda word: self.get_vector_lowercased_onfail(word))
        return df
        
    @staticmethod
    def load_word2vec_format(path):
        embeddings = KeyedVectors.load_word2vec_format(path)
        return W2V(embeddings)

In [253]:
glove = W2V.load_word2vec_format(word2vec_output_file)

In [254]:
df_str_1 = glove.enreach_df_with_vector_representation_of_words(df.copy(deep=True), STRATEGY.CASED, "word", "vec")
df_str_2 = glove.enreach_df_with_vector_representation_of_words(df.copy(deep=True), STRATEGY.LOWER, "word", "vec")
df_str_3 = glove.enreach_df_with_vector_representation_of_words(df.copy(deep=True), STRATEGY.FALLBACK_TO_LOWER, "word", "vec")

df_str_1.head()

Unnamed: 0,word,part_of_speech,chunk,tag,sentence_id,vec
0,SOCCER,NN,B-NP,O,1.0,"[0.027166, -0.1762, -0.19623, 0.33527, 0.06239..."
1,-,:,O,O,1.0,"[-1.2557, 0.61036, 0.56793, -0.96596, -0.45249..."
2,JAPAN,NNP,B-NP,B-LOC,1.0,"[0.027166, -0.1762, -0.19623, 0.33527, 0.06239..."
3,GET,VB,B-VP,O,1.0,"[0.027166, -0.1762, -0.19623, 0.33527, 0.06239..."
4,LUCKY,NNP,B-NP,O,1.0,"[0.027166, -0.1762, -0.19623, 0.33527, 0.06239..."


# 3. Implement training on batches

[Article](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da)

[Tutorial](http://cs230.stanford.edu/blog/namedentity/)

[DOCS](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py)

In [255]:
labels = {}
for i, label in enumerate(df['tag'].unique()):
    labels[label] = i
    
labels

{'O': 0,
 'B-LOC': 1,
 'B-PER': 2,
 'I-PER': 3,
 'I-LOC': 4,
 'B-MISC': 5,
 'I-MISC': 6,
 'B-ORG': 7,
 'I-ORG': 8,
 None: 9}

In [341]:
class Vacabulary:
    def __init__(self, df):
        self.df = df.drop_duplicates('word', keep='first')
        self.df = self.df.reset_index()
        self.df.set_index("word", inplace=True)
    
    def get_vec_by_word(self, word):
        return self.df.loc[word]['vec']
    
    def get_index_by_word(self, word):
        return self.df.loc[word]['index']
    
    def get_vec_by_index(self, i):
        return self.df.iloc[i]['vec']
    
    def embedding_matrix(self):
        return self.df['vec'].to_numpy(copy=True)
    
    def get_padding(self):
        return len(self.df)
    
    def get_size(self):
        return len(self.df)

In [419]:
class PreparationPipeline:
    @staticmethod
    def __to_batches(df, batch_size):
        batches = []
        offset = 0
        for index, row in df.iterrows():
            if row['sentence_id'] % batch_size == 0 and \
                index + 1 < len(df) and \
                df.iloc[index + 1]['sentence_id'] % batch_size != 0:
                batches.append(df[offset:index+1])
                offset = index + 1
        return batches
    
    @staticmethod
    def __to_sentences(batch):
        result = []
        batch.groupby('sentence_id').apply(lambda x: result.append((x['word'].values, x['tag'].values)))
        return result
    
    @staticmethod
    def __to_indexes(batch, vocab, label_mapping):
        max_len = 0
        for (words, _) in batch:
            if max_len < len(words):
                max_len = len(words)
                
        batch_data = vocab.get_padding() * np.ones((len(batch), max_len))
        batch_labels = -1*np.ones((len(batch), max_len))
        
        for i in range(len(batch)):
            words, labels = batch[i]
            cur_len = len(words)
            batch_data[i][:cur_len] = np.array([vocab.get_index_by_word(word) for word in words])
            batch_labels[i][:cur_len] = np.array([label_mapping[label] for label in labels])
        
        #since all data are indices, we convert them to torch LongTensors
        batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

        #convert Tensors to Variables
        batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
        return (batch_data, batch_labels)
            
    
    @staticmethod
    def prepare(df, vocab, labels, batch_size):
        pp = PreparationPipeline
        
        batches = [pp.__to_sentences(batch) for batch in pp.__to_batches(test, batch_size)]
        
        result_batches = []
        result_labels  = []
        
        for batch in batches:
            batch_data, batch_labels = pp.__to_indexes(batch, vocab, labels)
            result_batches.append(batch_data)
            result_labels.append(batch_labels)
        
        return result_batches, result_labels

In [432]:
class Net(nn.Module):
    def __init__(self, embedding_matrix, lstm_hidden_dim, vocab_size, result_size):
        super(Net, self).__init__()
        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(
            embedding_matrix.size()[0], 
            embedding_matrix.size()[1],
        ).from_pretrained(
            embedding_matrix, 
            freeze=True,
        )            
        self.embedding.weight.requires_grad = False

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(embedding_matrix.size()[1], lstm_hidden_dim, num_layers=2, bidirectional=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(2*lstm_hidden_dim, result_size)
        
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01)
        
    def forward(self, sentence):
        print(sentence)
        #apply the embedding layer that maps each token to its embedding
        sentence = self.embedding(sentence)   # dim: batch_size x batch_max_len x embedding_dim

        #run the LSTM along the sentences of length batch_max_len
        sentence, _ = self.lstm(sentence)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        #reshape the Variable so that each row contains one token
        sentence = sentence.view(-1, sentence.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        sentence = self.fc(sentence)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(sentence, dim=1)   # dim: batch_size*batch_max_len x num_tags
    
    def loss_fn(outputs, labels):
        #reshape labels to give a flat vector of length batch_size*seq_len
        labels = labels.view(-1)  

        #mask out 'PAD' tokens
        mask = (labels >= 0).float()

        #the number of tokens is the sum of elements in mask
        num_tokens = int(torch.sum(mask).data[0])

        #pick the values corresponding to labels and multiply by mask
        outputs = outputs[range(outputs.shape[0]), labels]*mask

        #cross entropy loss for all non 'PAD' tokens
        return -torch.sum(outputs) / num_tokens
    
    def fit(self, train_batches, train_labels, epochs=5):
        for i in range(epochs):
            print("Epoch: {}".format(i))
            
            total_loss = 0
            for j, batch in enumerate(train_batches):
                self.zero_grad()
                print(batch.size())
                #pass through model, perform backpropagation and updates
                output_batch = self.forward(batch)
                exspected_labels = train_labels[j]
                loss = self.loss_fn(output_batch, exspected_labels)
                loss.backward()
                optimizer.step()
                
                total_loss += loss
                
            avg_epoch_loss = np.round((total_loss / len(train_batches)).item(), 3)
            
            print("Total epoch loss: {}".format(total_loss))
            print("Avg epoch loss: {}".format(avg_epoch_loss))
    
    def predict(self, test_batches):
        pass

In [433]:
vocab = Vacabulary(df_str_1)
matrix = vocab.embedding_matrix()
result = []
for i, arr in enumerate(matrix):
    result.append(torch.as_tensor(arr))

In [422]:
train_batches, train_labels = PreparationPipeline.prepare(df_str_1[:len(train)], vocab, labels, BATCH_SIZE)
test_batches, test_labels = PreparationPipeline.prepare(df_str_1[len(train):], vocab, labels, BATCH_SIZE)

In [434]:
HIDDEN_DIM = glove.w2v.vector_size
BATCH_SIZE = 64

net = Net(
    torch.stack(result),
    HIDDEN_DIM,
    vocab.get_size(),
    len(labels)
)

net.fit(train_batches, train_labels)

Epoch: 0
torch.Size([65, 72])
tensor([[    0,     1,     2,  ..., 27318, 27318, 27318],
        [    0,     1,     2,  ..., 27318, 27318, 27318],
        [   12,    13, 27318,  ..., 27318, 27318, 27318],
        ...,
        [ 1136,  1137,  1138,  ..., 27318, 27318, 27318],
        [ 1143,  1144,  1145,  ..., 27318, 27318, 27318],
        [ 1150,  1151,  1152,  ..., 27318, 27318, 27318]])


RuntimeError: index out of range at ../aten/src/TH/generic/THTensorEvenMoreMath.cpp:193

# 4. Implement the calculation of token-level Precision / Recall / F1 / F0.5 scores for all classes in average. 

# IMPORTANT! Please, implement “micro-average” approach

# Provide the report the performances (F1 and F0.5 scores) on the dev / test subsets w.r.t epoch number during the training for the first 5 epochs for each strategy of loading the embeddings