# Word Sense Disambiguation
## Group 3

### Data import

In [19]:
import numpy as np 
import sys 
from io import open 
import os
from logging import debug, info, warning, error
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import gensim
import csv


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreeaioanatudor/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andreeaioanatudor/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreeaioanatudor/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/andreeaioanatudor/nltk_data...


In [20]:
class PARSE_LAYER: 
    SYM = 2 
    SEM = 3 
    CAT = 4 
    SNS = 5 
    ROL = 6
    POS = 7
    LEM = 8
    W2V = 9

In [21]:
# the strategies implemented for integrating a pre-trained word embedding in a supervised WSD system
class STRATEGY:
    AVERAGE = 0
    FRAC_DECAY = 1
    EXP_DECAY = 2

In [22]:
# From the repository of the data
def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return the list of lists per sentence/document'''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    for line in open(in_file, 'r', encoding="utf-8"):
        if not line.strip() and cur_doc:
            docs.append(cur_doc)
            cur_doc = []
            doc_ids.append(num_lines)
        elif line.strip().startswith('# newdoc'):
            # Keep track of start of new documents in doc_ids
            # We form a list of all sentences in docs, but at some
            # point we have to put multi-sent docs in a single file
            num_lines += 1
            if add_doc:
                cur_doc.append(line.strip())
        elif not line.strip().startswith('#') and line.strip():
            if len(line.split()) != 7:
                raise ValueError("Line should always consist of 7 layer-values, found {0}\n{1}".format(len(line.split()), line.strip()))
            if split_lines:
                cur_doc.append(line.split())
            else:
                cur_doc.append(line.strip())
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        doc_ids = range(0, len(docs))
    info("Extracted {0} sents, for {1} docs".format(len(docs), doc_ids[-1] + 1))
    return docs, doc_ids


In [23]:
def read_data(directory, filename):
     path = os.path.join(directory, filename)
     docs, docs_ids = get_conll_blocks(path)
     return docs

language = 'en'
standard = 'gold'

directory = os.path.join("./data/4.0.0", language, standard)

train_data = read_data(os.path.join(directory), "train.conll")
test_data = read_data(os.path.join(directory), "test.conll")

train_data_labels = [word[PARSE_LAYER.SNS] for sentence in train_data for word in sentence]
test_data_labels = [word[PARSE_LAYER.SNS] for sentence in test_data for word in sentence]

#### Load preprocessed data

In [24]:
def load_preprocessed(filename):
    data = list(csv.reader(open(filename, encoding="utf-8")))
    for idx_sen, sentence in enumerate(data):
        for idx, word in enumerate(sentence):
            word = word.replace('[', '').replace(']', '').replace('\'', '').split(', ')
            data[idx_sen][idx] = word
    return data

### Preprocesing
Run if no preprocessed data can be loaded.

In [45]:
# Load word2vec model trained on Google News
# Can be downloaded from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing [1]
# [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 
#     Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [58]:
%%time
# Converts UPenn tags to tags relevant to WordNet
def upenn_to_wn_tag(tagged_sentence):
    wn_tags = []
    for tag in tagged_sentence:
        if tag[1].startswith('J'):
            wn_tags.append('a')
        elif tag[1].startswith('V'):
            wn_tags.append('v')
        elif tag[1].startswith('N'):
            wn_tags.append('n')
        elif tag[1].startswith('R'):
            wn_tags.append('r')
        else:
            wn_tags.append('o')

    return wn_tags


# Uses word2vec word embeddings to find the closest word based on context
def get_closest_w2v_word(sentence, word_idx, n_neighbours=2, strategy=STRATEGY.AVERAGE):
    if strategy == STRATEGY.EXP_DECAY:
        return get_closest_exp_decay(sentence, word_idx, n_neighbours)
    return get_closest_average(sentence, word_idx, n_neighbours)
   
    
# Return the closest word based on context - average strategy
def get_closest_average(sentence, word_idx, n_neighbours=2):
    n = 0
    context_vec = np.zeros((300,))

    for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
        if neighbour_idx >=0 and neighbour_idx < len(sentence) and neighbour_idx != word_idx:
            try:
                context_vec += wv[sentence[neighbour_idx][PARSE_LAYER.LEM]]
                n += 1
            except:
                pass

    if n != 0:
        return wv.most_similar(positive=[context_vec/n], topn=1)[0][0]
    else:
        return sentence[word_idx][PARSE_LAYER.LEM]
    
    
# Return the closest word based on exponential decay
def get_closest_exp_decay(sentence, word_idx, n_neighbours=2):
    alpha = 1 - pow(0.1, 1 / (n_neighbors - 1))
    context_vec = np.zeros((300,))

    for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
        if neighbour_idx >=0 and neighbour_idx < len(sentence) and neighbour_idx != word_idx:
            try:
                context_vec += wv[sentence[neighbour_idx][PARSE_LAYER.LEM]] * pow((1 - alpha), abs(word_idx - neighbour_idx) - 1)
            except:
                pass
    return wv.most_similar(positive=[context_vec], topn=1)[0][0]

    
# Adds the WordNet POS tags, lemmas and closest word2vec word to the original data
def add_pos_lemma(data):
    for sentence in data:
        tagged_sentence = pos_tag([item[0] for item in sentence])
        wn_tags = upenn_to_wn_tag(tagged_sentence)
        for idx, word in enumerate(sentence):
            word.append(wn_tags[idx])
            if wn_tags[idx] != 'o':
                lemma = WordNetLemmatizer().lemmatize(word[0], pos=wn_tags[idx])
            else:
                lemma = WordNetLemmatizer().lemmatize(word[0])
            word.append(lemma.lower())
            word.append(get_closest_w2v_word(sentence, idx, strategy=STRATEGY.EXP_DECAY))
        
    return data


train_data = add_pos_lemma(train_data)
test_data = add_pos_lemma(test_data)

yes
yes
yes
yes
yes
yes


KeyboardInterrupt: 

In [48]:
with open("preprocessed_data/train_data_exp_3.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(train_data)

with open("preprocessed_data/test_data_exp_3.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(test_data)

### Baseline
Use the most frequent sense for that word based on its POS.

In [49]:
def baseline_pred(data):
    pred = []
    for sentence in data:
        for word in sentence:
            if word[PARSE_LAYER.POS] != 'o':
                syns = wn.synsets(word[PARSE_LAYER.LEM].replace('~', '_'), lang='eng', pos=word[PARSE_LAYER.POS])
                if len(syns) > 0:
                    pred.append(syns[0].name())
                else:
                    pred.append('O')
            else:
                pred.append('O')
    return pred

# Same as the function above, but can be used directly on the feature vectors.
def baseline_preprocessed(data, word, pos_tags):
    pred = []
    for feature in data:
        if pos_tags[feature[0]] != 'o':
            syns = wn.synsets(word.replace('~', '_'), lang='eng', pos=pos_tags[feature[0]])
            if len(syns) > 0:
                pred.append(syns[0].name())
            else:
                pred.append('O')
        else:
            pred.append('O')
    return pred

### RF with neighbouring POS tags and word embeddings
Use the lemma, POS tag and thematic role of the word alongside with the ones of two neighbors on the left and right.

#### Generate features

In [50]:
def return_labels(train_data, test_data):
    all_data = train_data + test_data
    all_lemmas = np.array([word[PARSE_LAYER.SYM] for sentence in all_data for word in sentence])
    lemma_labels = np.unique(all_lemmas)
    all_pos_tags = np.array([word[PARSE_LAYER.POS] for sentence in all_data for word in sentence])
    pos_labels = np.unique(all_pos_tags)
    all_w2v_words = np.array([word[PARSE_LAYER.W2V] for sentence in all_data for word in sentence])
    w2v_labels = np.unique(all_w2v_words)

    return lemma_labels, pos_labels, w2v_labels

def get_features(data, lemma_labels, pos_labels, w2v_labels, n_neighbours=3,):
    features = []

    for sentence in data:
        for word_idx in range(len(sentence)):
            feature = []
            feature.append(np.where(lemma_labels == sentence[word_idx][PARSE_LAYER.SYM])[0][0])
            feature.append(np.where(pos_labels == sentence[word_idx][PARSE_LAYER.POS])[0][0])

            for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
                if neighbour_idx != word_idx:
                    try:
                        feature.append(np.where(pos_labels == sentence[neighbour_idx][PARSE_LAYER.POS])[0][0])
                    except:
                        feature.append(-1)
            feature.append(np.where(w2v_labels == sentence[word_idx][PARSE_LAYER.W2V])[0][0])

            features.append(feature)

    return features

In [34]:
# train_data = load_preprocessed("preprocessed_data/train_data_3.csv")
# test_data = load_preprocessed("preprocessed_data/test_data_3.csv")

# print("Accuracy baseline:", accuracy_score(test_data_labels, baseline_pred(test_data)))

# # Random forest using just the lemma and pos
# lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
# train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels)
# test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels)
# train_features_pos = []

# for feature in train_features:
#     train_features_pos.append([feature[0], feature[1]])

# test_features_pos = []
# for feature in test_features:
#     test_features_pos.append([feature[0], feature[1]])

# rf_pos = RandomForestClassifier(n_estimators=50, random_state=42)
# rf_pos.fit(train_features_pos, train_data_labels)
# pred = rf_pos.predict(test_features_pos)
# print("Accuracy RF:", accuracy_score(test_data_labels, pred))

Accuracy baseline: 0.616185310071837
Accuracy RF: 0.8658554464154816


#### Train model for each word type

In [51]:
def get_word_based_dicts(data, labels):
    data_dict = {}
    labels_dict = {}
    
    for idx, feat_vec in enumerate(data):
        key = feat_vec[0]  # Lemma
        if key not in data_dict: 
            data_dict[key] = []
            labels_dict[key] = []
        # The features will be everything but the lemma
        data_dict[key].append(feat_vec[1:])
        labels_dict[key].append(labels[idx])

    return data_dict, labels_dict

##### Cross-validation

In [53]:
kf = KFold(shuffle=True, random_state=42)

for n_neighbors in [3, 4, 5]:
    train_data = load_preprocessed("preprocessed_data/train_data_" + str(n_neighbors) + ".csv")
    test_data = load_preprocessed("preprocessed_data/test_data_" + str(n_neighbors) + ".csv")
    lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
    train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels)
    test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels)
    train_data_dict, train_labels_dict = get_word_based_dicts(train_features, train_data_labels)
    test_data_dict, test_labels_dict = get_word_based_dicts(test_features, test_data_labels)
    
    print('RF classifier with context based on', n_neighbors, 'on each side')
    accuracies_all = []
    total_keys = 0
    for key in test_data_dict:
        if key in train_data_dict and len(test_data_dict[key]) >= 5:
            total_keys += 1
            for fold, (train_index, test_index) in enumerate(kf.split(train_data_dict[key])):
                train_data_dict[key] = np.array(train_data_dict[key])
                train_labels_dict[key] = np.array(train_labels_dict[key])
                accuracies = []
                correct = 0
                number_of_labels = 0

                rf = RandomForestClassifier(random_state=42)
                rf.fit(train_data_dict[key][train_index], train_labels_dict[key][train_index])
                pred_dict = rf.predict(train_data_dict[key][test_index])

                correct += sum(i==j for i, j in zip(pred_dict, train_labels_dict[key][test_index]))
                number_of_labels += len(pred_dict)
                accuracies.append(correct/number_of_labels)
            accuracies_all.append(sum(accuracies)/5)

    print('Validation accuracy:', sum(accuracies_all)/total_keys)

RF classifier with context based on 3 on each side
Validation accuracy: 0.1794931475288165
RF classifier with context based on 4 on each side
Validation accuracy: 0.17974555899891964
RF classifier with context based on 5 on each side
Validation accuracy: 0.17978463644220846


##### Testing

In [59]:
# The cell outputs the accuracy of using our training method on the testing keys (words)
# present in the training keys(words). Whenever a testing key is not present in the 
# training data, it uses the baseline.

train_data = load_preprocessed("preprocessed_data/train_data_exp_3.csv")
test_data = load_preprocessed("preprocessed_data/test_data_exp_3.csv")
lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels)
test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels)
train_data_dict, train_labels_dict = get_word_based_dicts(train_features, train_data_labels)
test_data_dict, test_labels_dict = get_word_based_dicts(test_features, test_data_labels)

rf_dict = {}
pred_dict = {}
correct = 0
number_of_labels = 0
for key in test_data_dict:
    rf = RandomForestClassifier(random_state=42)
    try:
        rf.fit(train_data_dict[key], train_labels_dict[key])
        pred_dict[key] = rf.predict(test_data_dict[key])
    except:
        pred_dict[key] = baseline_preprocessed(test_data_dict[key], lemma_labels[key], pos_labels)
    correct += sum(i==j for i, j in zip(pred_dict[key], test_labels_dict[key]))
    number_of_labels += len(pred_dict[key])
    
print("Accuracy:", correct / number_of_labels)

Accuracy: 0.901040903093388


In [56]:
# The cell only outputs the accuracy of using our training method on the testing keys (words)
# present in the training keys(words).

train_data = load_preprocessed("preprocessed_data/train_data_exp_3.csv")
test_data = load_preprocessed("preprocessed_data/test_data_exp_3.csv")
lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels)
test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels)
train_data_dict, train_labels_dict = get_word_based_dicts(train_features, train_data_labels)
test_data_dict, test_labels_dict = get_word_based_dicts(test_features, test_data_labels)

rf_dict = {}
pred_dict = {}
correct = 0
number_of_labels = 0
n_trained_keys = 0
for key in test_data_dict:
    rf = RandomForestClassifier(random_state=42)
    try:
        rf.fit(train_data_dict[key], train_labels_dict[key])
        pred_dict[key] = rf.predict(test_data_dict[key])
        correct += sum(i==j for i, j in zip(pred_dict[key], test_labels_dict[key]))
        number_of_labels += len(pred_dict[key])
        n_trained_keys += 1
    except:
        pass
    
print("Accuracy:", correct / number_of_labels)
print("Tested", str(n_trained_keys), "out of", str(len(test_data_dict)), "total words present in testing.")

Accuracy: 0.9307191568505889
Tested 1226 out of 1580 total words present in testing.
