In [31]:
import os

import pandas as pd
import numpy as np

from gensim.scripts.glove2word2vec import glove2word2vec

# 1. Implement functionality to read and process NER 2003 English Shared Task data in CoNNL file format

In [32]:
def read_file(path):
    with open(path, 'r') as f:
        return f.read()

def read_folder(folder, ext):
    return [
        read_file(os.path.join(folder, filename)) for filename in sorted(os.listdir(folder)) if filename.endswith(ext)
    ]

def read_connl_to_df(folder, ext):
    result = []
    for txt in read_folder(folder, ext):
        rows = []
        for line in txt.split('\n'):
            row = line.split(' ')
            rows.append(row)
        result.append(pd.DataFrame(rows))
    return result

def set_connl_df_naming(df):
    df.columns = ['word', 'part_of_speech', 'chunk', 'tag']
    return df.reset_index(drop=True)

In [33]:
dev, test, train = [set_connl_df_naming(df.iloc[2:]) for df in read_connl_to_df('./dataset/', '.txt')]

train.head()

Unnamed: 0,word,part_of_speech,chunk,tag
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O


# 2. Implement 3 strategies for loading the embeddings

1) load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

2) load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

3) load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding (20% of score).

In [34]:
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec('./glove.6B.100d.txt', word2vec_output_file)

(400000, 100)

In [48]:
class W2V:
    def __init__(self, w2v):
        self.w2v = w2v
    
    def get_vector(self, word):
        try:
            return self.w2v.get_vector(word)
        except KeyError:
            return self.w2v.get_vector("unk")
        
    def get_vector_lowercased(self, word):
        try:
            return self.w2v.get_vector(word.lower())
        except KeyError:
            return self.w2v.get_vector("unk")
        
    def get_vector_lowercased_onfail(self, word):
        unk = self.w2v.get_vector("unk")
        original_case = self.get_vector(word)
        
        if np.array_equal(unk, original_case):
            return self.get_vector_lowercased(word)
        
        return original_case
    
    def enreach_df_with_vector_representation_of_words(self, df, col_in, col_out):
        df[col_out] = df[col_in].apply(lambda word: self.get_vector_lowercased_onfail(word))
        return df
        
    @staticmethod
    def load_word2vec_format(path):
        embeddings = KeyedVectors.load_word2vec_format(path)
        return W2V(embeddings)

In [49]:
glove = W2V.load_word2vec_format(word2vec_output_file)

In [50]:
dev, test, train = [glove.enreach_df_with_vector_representation_of_words(df, "word", "vec") for df in [dev, test, train]]

In [52]:
train.head()

Unnamed: 0,word,part_of_speech,chunk,tag,vec
0,EU,NNP,B-NP,B-ORG,"[-0.32714, 0.082503, 1.2561, 0.24888, 0.066019..."
1,rejects,VBZ,B-VP,O,"[0.013857, 0.70729, 0.81856, 0.8307, 0.063785,..."
2,German,JJ,B-NP,B-MISC,"[0.50719, 0.53343, 0.20154, 0.67101, -0.3352, ..."
3,call,NN,I-NP,O,"[-0.57833, -0.0036551, 0.34658, -0.13135, -0.5..."
4,to,TO,B-VP,O,"[-0.1897, 0.050024, 0.19084, -0.049184, -0.089..."


# 3. Implement training on batches (20% of score)

# 4. Implement the calculation of token-level Precision / Recall / F1 / F0.5 scores for all classes in average. IMPORTANT! Please, imple- ment “micro-average” approach.

# Provide the report the performances (F1 and F0.5 scores) on the dev / test subsets w.r.t epoch number during the training for the first 5 epochs for each strategy of loading the embeddings