In [42]:
# !{sys.executable} -m pip install nltk 
# !{sys.executable} -m pip install  atlas
# !{sys.executable} -m pip install --upgrade scipy
# !{sys.executable} -m pip install gensim

import numpy as np 
import sys 
import os 
from io import open 

import nltk 
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('all')
# nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

from logging import debug, info, warning, error
from sklearn.metrics import accuracy_score


stop_words = set(stopwords.words('english')) 

Word2vec model

In [None]:
import gensim
from gensim.models import KeyedVectors
# these are the vectors from a pre-trained model. 
# download them at https://code.google.com/archive/p/word2vec/ (it's about 1.5 GB)
# There is also a way to load the model instead of loading the downloaded vectors, I wil look into that, 
# link to papers: https://arxiv.org/pdf/1310.4546.pdf https://arxiv.org/pdf/1301.3781.pdf
# alternatively, spacy also has a pre-trained model, but the model architecture doesn't seem to be documented as well https://github.com/explosion/floret
goog_model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
print(goog_model.get_vector(key='world', norm=True))

In [43]:
class PARSE_LAYER: 
    ORG = 0
    SYM = 2 
    SEM = 3 
    CAT = 4 
    SNS = 5 
    ROL = 6 
    POS = 7

### Reading data

In [44]:
# From the repository of the data
def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return the list of lists per sentence/document'''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    for line in open(in_file, 'r'):
        if not line.strip() and cur_doc:
            docs.append(cur_doc)
            cur_doc = []
            doc_ids.append(num_lines)
        elif line.strip().startswith('# newdoc'):
            # Keep track of start of new documents in doc_ids
            # We form a list of all sentences in docs, but at some
            # point we have to put multi-sent docs in a single file
            num_lines += 1
            if add_doc:
                cur_doc.append(line.strip())
        elif not line.strip().startswith('#') and line.strip():
            if len(line.split()) != 7:
                raise ValueError("Line should always consist of 7 layer-values, found {0}\n{1}".format(len(line.split()), line.strip()))
            if split_lines:
                cur_doc.append(line.split())
            else:
                cur_doc.append(line.strip())
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        doc_ids = range(0, len(docs))
    info("Extracted {0} sents, for {1} docs".format(len(docs), doc_ids[-1] + 1))
    return docs, doc_ids


In [45]:
def read_data(directory, filename):
     path = os.path.join(directory, filename)
     docs, docs_ids = get_conll_blocks(path)
     return docs

In [46]:
def get_labels(data):
    labels = []
    for sentence in data:
        for word in sentence:
            if word[PARSE_LAYER.SYM].lower() in stop_words:
                continue
            elif word[PARSE_LAYER.SYM].lower() == '.':
                continue
            else:
                labels.append(word[PARSE_LAYER.SNS])
    return labels
    

In [47]:
language = 'en'
standard = 'gold'

directory = os.path.join('./data/4.0.0', language, standard)

train_data = read_data(os.path.join(directory), 'train.conll')
test_data = read_data(os.path.join(directory), 'test.conll')

train_labels = get_labels(train_data)
test_labels = get_labels(test_data)


Preprocessing

In [48]:
%%time
# Converts UPenn tags to tags relevant to WordNet
def upenn_to_wn_tag(tagged_sentence):
    wn_tags = []
    for tag in tagged_sentence:
        if tag[1].startswith('J'):
            wn_tags.append('a')
        elif tag[1].startswith('V'):
            wn_tags.append('v')
        elif tag[1].startswith('N'):
            wn_tags.append('n')
        elif tag[1].startswith('R'):
            wn_tags.append('r')
        else:
            wn_tags.append(None)

    return wn_tags

# Adds the WordNet POS tags to the PMB data
def add_pos_tag(data):
    for sentence in data:
        tagged_sentence = pos_tag([item[0] for item in sentence])
        wn_tags = upenn_to_wn_tag(tagged_sentence)
        for idx, word in enumerate(sentence):
            word.append(wn_tags[idx])
            
    return data

train_data = add_pos_tag(train_data)
test_data = add_pos_tag(test_data)

all_data = train_data + test_data

sem_tags_all = np.array([word[PARSE_LAYER.SEM] for sentence in all_data for word in sentence])
sem_classes = np.unique(sem_tags_all)

pos_tags_all = np.array([str(word[PARSE_LAYER.POS]) for sentence in all_data for word in sentence])
pos_classes = np.unique(pos_tags_all)

cat_tags_all = np.array([str(word[PARSE_LAYER.CAT]) for sentence in all_data for word in sentence])
cat_classes = np.unique(cat_tags_all)

CPU times: user 1.34 s, sys: 88.4 ms, total: 1.43 s
Wall time: 1.43 s


Getting 500 most frequent words in the training data,
and the sentences as lists e.g ['a', 'girl', 'giggles']

In [49]:
all_train_sentences = []
all_words = []
for sentence in train_data:
    sentence_words = []
    for word in sentence:
        if word[PARSE_LAYER.SYM] != '.':
            sentence_words.append(word[PARSE_LAYER.SYM])
            all_words.append(word[PARSE_LAYER.SYM])
    all_train_sentences.append(sentence_words)

unique_words = np.unique(all_words).tolist()

dictionary = {}
for word in unique_words:
    dictionary[word] = all_words.count(word)

sorted_dict = sorted(dictionary.items(), key=lambda x:x[1], reverse=True)

n_frequent_words = 300
most_frequent_words = [key for key, value in sorted_dict[0:n_frequent_words-1]]


### Features

Semantic, POS, and category tags of target word

In [10]:
def get_pos_idx(word_pos, pos_classes):
    return np.where(pos_classes == str(word_pos))[0][0]

def get_sem_idx(word_sem, sem_classes):
    return np.where(sem_classes == str(word_sem))[0][0]

def get_cat_idx(word_cat, cat_classes):
    return np.where(cat_classes == str(word_cat))[0][0]

def get_sempos_feats(data):
    features = []
    contents_per_sentence = []
    neighbours = 1
    for sentence in data:
        contents = []
        for word_idx in range(len((sentence))):
            word = sentence[word_idx]
            if word[PARSE_LAYER.SYM].lower() in stop_words:
                continue
            elif word[PARSE_LAYER.SYM] == '.':
                continue
            else:
                pos_idx = get_pos_idx(word[PARSE_LAYER.POS], pos_classes)
                sem_idx = get_sem_idx(word[PARSE_LAYER.SEM], sem_classes)
                cat_idx = get_cat_idx(word[PARSE_LAYER.CAT], cat_classes)
                features.append([sem_idx, pos_idx, cat_idx])
    return features

train_features = get_sempos_feats(train_data)
test_features = get_sempos_feats(test_data)

Sem, POS, category tags of target and neighbouring words

In [None]:
def get_n_feats(data):
    features = []
    contents_per_sentence = []
    neighbours_dist = 1
    for sentence in data:
        contents = []
        for word_idx in range(len((sentence))):
            word = sentence[word_idx]
            if word[PARSE_LAYER.SYM].lower() in stop_words:
                continue
            elif word[PARSE_LAYER.SYM] == '.':
                continue
            else:
                contents.append(word)
        neighs = []
        if len(contents) == 1:
            pos_idx = get_pos_idx(word[PARSE_LAYER.POS], pos_classes)
            sem_idx = get_sem_idx(word[PARSE_LAYER.SEM], sem_classes)
            cat_idx = get_cat_idx(word[PARSE_LAYER.CAT], cat_classes)
            neighs = [sem_idx, pos_idx, cat_idx, -1, -1, -1, -1, -1, -1]
            contents_per_sentence.append(neighs)
        else:
            word_idx = 0
            while word_idx != len(contents):
                word = contents[word_idx]
                pos_idx = get_pos_idx(word[PARSE_LAYER.POS], pos_classes)
                sem_idx = get_sem_idx(word[PARSE_LAYER.SEM], sem_classes)
                cat_idx = get_cat_idx(word[PARSE_LAYER.CAT], cat_classes)
                if len(neighs) != (neighbours_dist*2 + 1)*3:
                    neighs.append(sem_idx)
                    neighs.append(pos_idx)
                    neighs.append(cat_idx)
                    if word_idx == 0:
                        if len(contents) == 2:
                            n1 = contents[word_idx + 1]
                            neighs.append(get_sem_idx(n1[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n1[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n1[PARSE_LAYER.CAT], cat_classes))
                            neighs.append(-1)
                            neighs.append(-1)
                            neighs.append(-1)
                        else:
                            n1 = contents[word_idx + 1]
                            n2 = contents[word_idx + 2]
                            neighs.append(get_sem_idx(n1[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n1[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n1[PARSE_LAYER.CAT], cat_classes))
                            neighs.append(get_sem_idx(n2[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n2[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n2[PARSE_LAYER.CAT], cat_classes))
                    elif word_idx == len(contents) - 1:
                        if len(contents) == 2:
                            n1 = contents[word_idx - 1]
                            neighs.append(get_sem_idx(n1[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n1[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n1[PARSE_LAYER.CAT], cat_classes))
                            neighs.append(-1)
                            neighs.append(-1)
                            neighs.append(-1)
                        else:
                            n1 = contents[word_idx - 1]
                            n2 = contents[word_idx - 2]
                            neighs.append(get_sem_idx(n1[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n1[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n1[PARSE_LAYER.CAT], cat_classes))
                            neighs.append(get_sem_idx(n2[PARSE_LAYER.SEM], sem_classes))
                            neighs.append(get_pos_idx(n2[PARSE_LAYER.POS], pos_classes))
                            neighs.append(get_cat_idx(n2[PARSE_LAYER.CAT], cat_classes))
                    else:
                        n1 = contents[word_idx - 1]
                        n2 = contents[word_idx + 1]
                        neighs.append(get_sem_idx(n1[PARSE_LAYER.SEM], sem_classes))
                        neighs.append(get_pos_idx(n1[PARSE_LAYER.POS], pos_classes))
                        neighs.append(get_cat_idx(n1[PARSE_LAYER.CAT], cat_classes))
                        neighs.append(get_sem_idx(n2[PARSE_LAYER.SEM], sem_classes))
                        neighs.append(get_pos_idx(n2[PARSE_LAYER.POS], pos_classes))
                        neighs.append(get_cat_idx(n2[PARSE_LAYER.CAT], cat_classes))
                else:
                    contents_per_sentence.append(neighs)
                    neighs = []
                    word_idx += 1
                
    return contents_per_sentence

train_n_feats = get_n_feats(train_data)
test_n_feats = get_n_feats(test_data)

### Co-occurence feature
Count for the number of times a word co-occurs with the target word, per frequent word.

In [None]:
def get_co_occurence(target_word, word_list, sentences):
    co_occurences = [0 for i in word_list]
    for word_idx in range(len(word_list)):
        word = word_list[word_idx]
        for sentence in sentences:
            co_occurences[word_idx] += sentence.count(word)
    return co_occurences

def get_co_occurence_feature(data, frequent_words, full_sentences):
    features = []
    for sentence_idx in range(len(data)):
        print(sentence_idx)
        sentence = data[sentence_idx]
        for word in sentence:
            if word[PARSE_LAYER.SYM] in stop_words:
                continue
            elif word[PARSE_LAYER.SYM] != '.':
                co_occurences = get_co_occurence(word[PARSE_LAYER.SYM], frequent_words, full_sentences)
                features.append(co_occurences)
    return features

co_occurence_features_train = get_co_occurence_feature(train_data, most_frequent_words, all_train_sentences)
print('getting test features..')
co_occurence_features_test = get_co_occurence_feature(test_data, most_frequent_words, all_train_sentences)

In [54]:
np.save('co_feat_train.npy', co_occurence_features_train)
np.save('co_feat_test.npy', co_occurence_features_test)

In [55]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(co_occurence_features_train)
co_train_feats = scaler.transform(co_occurence_features_train)
co_test_feats = scaler.transform(co_occurence_features_test)

In [59]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
print(accuracy_score(test_labels, y_pred))



0.3696369636963696


In [60]:
model = RandomForestClassifier(random_state=42)
model.fit(train_n_feats, train_labels)
y_pred = model.predict(test_n_feats)
print(accuracy_score(test_labels, y_pred))

In [56]:
model = KNeighborsClassifier()
model.fit(co_train_feats, train_labels)
y_pred = model.predict(co_test_feats)
print(accuracy_score(test_labels, y_pred))

0.22239146991622238


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [None]:
%%time
def baseline_pred(data):
    pred = []
    for sentence in data:
        for word in sentence:
            if word[PARSE_LAYER.SYM].lower() in stop_words:
                continue
            elif word[PARSE_LAYER.SYM] == '.':
                continue
            if word[PARSE_LAYER.POS] != None:
                lemma = WordNetLemmatizer().lemmatize(word[0], pos=word[PARSE_LAYER.POS])
                syns = wn.synsets(word[PARSE_LAYER.SYM].replace('~', '_'), lang='eng', pos=word[PARSE_LAYER.POS])
                if len(syns) > 0:
                    pred.append(syns[0].name())
                else:
                    pred.append('O')
            else:
                pred.append('O')
    
    return pred

print("Accuracy:", accuracy_score(test_labels, baseline_pred(test_data)))

Accuracy: 0.5168824574765168
CPU times: user 23.6 ms, sys: 13.9 ms, total: 37.5 ms
Wall time: 36.4 ms
