# Word Sense Disambiguation
## Group 3

### Data import

In [1]:
import numpy as np 
import sys 
from io import open 
import os
from logging import debug, info, warning, error
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import gensim
import csv


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
class PARSE_LAYER: 
    SYM = 2 
    SEM = 3 
    CAT = 4 
    SNS = 5 
    ROL = 6
    POS = 7
    W2V = 8

In [3]:
# the strategies implemented for integrating a pre-trained word embedding in a supervised WSD system
class STRATEGY:
    AVERAGE = 0
    EXP_DECAY = 1

In [4]:
# From the repository of the data
def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return the list of lists per sentence/document'''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    for line in open(in_file, 'r', encoding="utf-8"):
        if not line.strip() and cur_doc:
            docs.append(cur_doc)
            cur_doc = []
            doc_ids.append(num_lines)
        elif line.strip().startswith('# newdoc'):
            # Keep track of start of new documents in doc_ids
            # We form a list of all sentences in docs, but at some
            # point we have to put multi-sent docs in a single file
            num_lines += 1
            if add_doc:
                cur_doc.append(line.strip())
        elif not line.strip().startswith('#') and line.strip():
            if len(line.split()) != 7:
                raise ValueError("Line should always consist of 7 layer-values, found {0}\n{1}".format(len(line.split()), line.strip()))
            if split_lines:
                cur_doc.append(line.split())
            else:
                cur_doc.append(line.strip())
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        doc_ids = range(0, len(docs))
    info("Extracted {0} sents, for {1} docs".format(len(docs), doc_ids[-1] + 1))
    return docs, doc_ids


In [5]:
def read_data(directory, filename):
     path = os.path.join(directory, filename)
     docs, docs_ids = get_conll_blocks(path)
     return docs

language = 'en'
standard = 'gold'

directory = os.path.join("./data/4.0.0", language, standard)

train_data = read_data(os.path.join(directory), "train.conll")
test_data = read_data(os.path.join(directory), "test.conll")

train_data_labels = [word[PARSE_LAYER.SNS] for sentence in train_data for word in sentence]
test_data_labels = [word[PARSE_LAYER.SNS] for sentence in test_data for word in sentence]

#### Load preprocessed data

In [6]:
def load_preprocessed(filename):
    data = list(csv.reader(open(filename, encoding="utf-8")))
    for idx_sen, sentence in enumerate(data):
        for idx, word in enumerate(sentence):
            word = word.replace('[', '').replace(']', '').replace('\'', '').split(', ')
            data[idx_sen][idx] = word
    return data

### Preprocesing
Run if no preprocessed data can be loaded.

In [7]:
# Load word2vec model trained on Google News
# Can be downloaded from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing [1]
# [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 
#     Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [8]:
# Converts UPenn tags to tags relevant to WordNet
def upenn_to_wn_tag(tagged_sentence):
    wn_tags = []
    for tag in tagged_sentence:
        if tag[1].startswith('J'):
            wn_tags.append('a')
        elif tag[1].startswith('V'):
            wn_tags.append('v')
        elif tag[1].startswith('N'):
            wn_tags.append('n')
        elif tag[1].startswith('R'):
            wn_tags.append('r')
        else:
            wn_tags.append('o')

    return wn_tags


# Uses word2vec word embeddings to find the closest word based on context
def get_closest_w2v_word(sentence, word_idx, n_neighbours=1, strategy=STRATEGY.AVERAGE):
    if strategy == STRATEGY.EXP_DECAY:
        return get_closest_exp_decay(sentence, word_idx, n_neighbours)
    return get_closest_average(sentence, word_idx, n_neighbours)
   
    
# Return the closest word based on context - average strategy
def get_closest_average(sentence, word_idx, n_neighbours=5):
    n = 0 # counts how many neighbours were found in w2v and included in the context
    context_vec = np.zeros((300,))

    for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
        if neighbour_idx >=0 and neighbour_idx < len(sentence) and neighbour_idx != word_idx:
            try:
                context_vec += wv[sentence[neighbour_idx][PARSE_LAYER.SYM]]
                n += 1
            except:
                pass

    if n != 0:
        return wv.most_similar(positive=[context_vec/n], topn=1)[0][0]
    else:
        return sentence[word_idx][PARSE_LAYER.SYM]
    
    
# Return the closest word based on exponential decay
def get_closest_exp_decay(sentence, word_idx, n_neighbours=3):
    alpha = 1 - pow(0.1, 1 / (n_neighbours - 1))
    n = 0 # counts how many neighbours were found in w2v and included in the context
    context_vec = np.zeros((300,))

    for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
        if neighbour_idx >=0 and neighbour_idx < len(sentence) and neighbour_idx != word_idx:
            try:
                context_vec += wv[sentence[neighbour_idx][PARSE_LAYER.SYM]] * pow((1 - alpha), abs(word_idx - neighbour_idx) - 1)
                n += 1
            except:
                pass
    if n != 0:
        return wv.most_similar(positive=[context_vec], topn=1)[0][0]
    else:
        return sentence[word_idx][PARSE_LAYER.SYM]

    
# Adds the WordNet POS tags and closest word2vec word to the original data
def add_pos_w2v(data):
    for sentence in data:
        tagged_sentence = pos_tag([item[0] for item in sentence])
        wn_tags = upenn_to_wn_tag(tagged_sentence)
        for idx, word in enumerate(sentence):
            word.append(wn_tags[idx])
            word.append(get_closest_w2v_word(sentence, idx, 5, strategy=STRATEGY.EXP_DECAY))
    return data

train_data = add_pos_w2v(test_data)
test_data = add_pos_w2v(test_data)

In [9]:
with open("preprocessed_data/train_data_exp_5.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(train_data)

with open("preprocessed_data/test_data_exp_5.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(test_data)

In [10]:
# Load any saved datasets.
train_data = load_preprocessed("preprocessed_data/train_data_1.csv")
test_data = load_preprocessed("preprocessed_data/test_data_1.csv")

print(f"Num. of nouns in training = {sum(w[PARSE_LAYER.POS] == 'n' for s in train_data for w in s)}")
print(f"Num. of verbs in training = {sum(w[PARSE_LAYER.POS] == 'v' for s in train_data for w in s)}")
print(f"Num. of adjectives in training = {sum(w[PARSE_LAYER.POS] == 'a' for s in train_data for w in s)}")
print(f"Num. of adverbs in training = {sum(w[PARSE_LAYER.POS] == 'r' for s in train_data for w in s)}")
print(f"Num. of unlabeled in training = {sum(w[PARSE_LAYER.POS] == 'o' for s in train_data for w in s)}\n")

print(f"Num. of nouns in testing = {sum(w[PARSE_LAYER.POS] == 'n' for s in test_data for w in s)}")
print(f"Num. of verbs in testing = {sum(w[PARSE_LAYER.POS] == 'v' for s in test_data for w in s)}")
print(f"Num. of adjectives in testing = {sum(w[PARSE_LAYER.POS] == 'a' for s in test_data for w in s)}")
print(f"Num. of adverbs in testing = {sum(w[PARSE_LAYER.POS] == 'r' for s in test_data for w in s)}")
print(f"Num. of unlabeled in testing = {sum(w[PARSE_LAYER.POS] == 'o' for s in test_data for w in s)}")

Num. of nouns in training = 12381
Num. of verbs in training = 10020
Num. of adjectives in training = 2287
Num. of adverbs in training = 1540
Num. of unlabeled in training = 24180

Num. of nouns in testing = 1636
Num. of verbs in testing = 1397
Num. of adjectives in testing = 315
Num. of adverbs in testing = 223
Num. of unlabeled in testing = 3250


### Baseline
Use the most frequent sense for that word based on its POS.

In [11]:
def baseline_pred(data):
    pred = []
    for sentence in data:
        for word in sentence:
            if word[PARSE_LAYER.POS] != 'o':
                syns = wn.synsets(word[PARSE_LAYER.SYM].replace('~', '_'), lang='eng', pos=word[PARSE_LAYER.POS])
                if len(syns) > 0:
                    pred.append(syns[0].name())
                else:
                    pred.append('O')
            else:
                pred.append('O')
    return pred

# Same as the function above, but can be used directly on the feature vectors.
def baseline_preprocessed(data, word, pos_tags):
    pred = []
    for feature in data:
        if pos_tags[feature[0]] != 'o':
            syns = wn.synsets(word.replace('~', '_'), lang='eng', pos=pos_tags[feature[0]])
            if len(syns) > 0:
                pred.append(syns[0].name())
            else:
                pred.append('O')
        else:
            pred.append('O')
    return pred

### RF with neighbouring POS tags and word embeddings
Use the lemma, POS tag and thematic role of the word alongside with the ones of two neighbors on the left and right.

#### Generate features

In [12]:
def return_labels(train_data, test_data):
    all_data = train_data + test_data
    all_lemmas = np.array([word[PARSE_LAYER.SYM] for sentence in all_data for word in sentence])
    lemma_labels = np.unique(all_lemmas)
    all_pos_tags = np.array([word[PARSE_LAYER.POS] for sentence in all_data for word in sentence])
    pos_labels = np.unique(all_pos_tags)
    all_w2v_words = np.array([word[PARSE_LAYER.W2V] for sentence in all_data for word in sentence])
    w2v_labels = np.unique(all_w2v_words)

    return lemma_labels, pos_labels, w2v_labels

def get_features(data, lemma_labels, pos_labels, w2v_labels, n_neighbours=3):
    features = []

    for sentence in data:
        for word_idx in range(len(sentence)):
            feature = []
            feature.append(np.where(lemma_labels == sentence[word_idx][PARSE_LAYER.SYM])[0][0])
            feature.append(np.where(pos_labels == sentence[word_idx][PARSE_LAYER.POS])[0][0])

            for neighbour_idx in range(word_idx - n_neighbours, word_idx + n_neighbours + 1):
                if neighbour_idx != word_idx:
                    try:
                        feature.append(np.where(pos_labels == sentence[neighbour_idx][PARSE_LAYER.POS])[0][0])
                    except:
                        feature.append(-1)
            feature.append(np.where(w2v_labels == sentence[word_idx][PARSE_LAYER.W2V])[0][0])

            features.append(feature)

    return features

#### Train model for each word type

In [13]:
def get_word_based_dicts(data, labels):
    data_dict = {}
    labels_dict = {}
    
    for idx, feat_vec in enumerate(data):
        key = feat_vec[0]  # Lemma
        if key not in data_dict: 
            data_dict[key] = []
            labels_dict[key] = []
        # The features will be everything but the lemma
        data_dict[key].append(feat_vec[1:])
        labels_dict[key].append(labels[idx])

    return data_dict, labels_dict

#### Evaluation

##### Baseline

In [14]:
# Load any of the trained data sets for baseline
train_data = load_preprocessed("preprocessed_data/train_data_1.csv")
test_data = load_preprocessed("preprocessed_data/test_data_1.csv")

print(f"Accuracy baseline: {accuracy_score(test_data_labels, baseline_pred(test_data))}\n")

test_n = [[test_data[i][j] for i in range(len(test_data)) for j in range(len(test_data[i]))\
    if test_data[i][j][PARSE_LAYER.POS] == 'n']]
test_v = [[test_data[i][j] for i in range(len(test_data)) for j in range(len(test_data[i]))\
    if test_data[i][j][PARSE_LAYER.POS] == 'v']]
test_a = [[test_data[i][j] for i in range(len(test_data)) for j in range(len(test_data[i]))\
    if test_data[i][j][PARSE_LAYER.POS] == 'a']]
test_r = [[test_data[i][j] for i in range(len(test_data)) for j in range(len(test_data[i]))\
    if test_data[i][j][PARSE_LAYER.POS] == 'r']]
test_o = [[test_data[i][j] for i in range(len(test_data)) for j in range(len(test_data[i]))\
    if test_data[i][j][PARSE_LAYER.POS] == 'o']]

print(f"Accuracy nouns: {accuracy_score([w[PARSE_LAYER.SNS] for w in test_n[0]], baseline_pred(test_n))}")
print(f"Accuracy verbs: {accuracy_score([w[PARSE_LAYER.SNS] for w in test_v[0]], baseline_pred(test_v))}")
print(f"Accuracy adj.: {accuracy_score([w[PARSE_LAYER.SNS] for w in test_a[0]], baseline_pred(test_a))}")
print(f"Accuracy adv.: {accuracy_score([w[PARSE_LAYER.SNS] for w in test_r[0]], baseline_pred(test_r))}")
print(f"Accuracy none: {accuracy_score([w[PARSE_LAYER.SNS] for w in test_o[0]], baseline_pred(test_o))}")

Accuracy baseline: 0.6195572496701364

Accuracy nouns: 0.508557457212714
Accuracy verbs: 0.2884753042233357
Accuracy adj.: 0.4603174603174603
Accuracy adv.: 0.16591928251121077
Accuracy none: 0.8643076923076923


##### POS + context

In [15]:
# The cell outputs the accuracy of using our training method on the testing keys (words)
# present in the training keys(words). Whenever a testing key is not present in the 
# training data, it uses the baseline.
for nn_w2v in [1, 3, 5]:
    for decay in [STRATEGY.AVERAGE, STRATEGY.EXP_DECAY]:
        if decay == STRATEGY.AVERAGE:
            train_data = load_preprocessed("preprocessed_data/train_data_" + str(nn_w2v) + ".csv")
            test_data = load_preprocessed("preprocessed_data/test_data_" + str(nn_w2v) + ".csv")
        elif nn_w2v > 1:
            train_data = load_preprocessed("preprocessed_data/train_data_exp_" + str(nn_w2v) + ".csv")
            test_data = load_preprocessed("preprocessed_data/test_data_exp_" + str(nn_w2v) + ".csv")
        lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
        for nn_pos in [0, 1, 3, 5]:
            train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels, nn_pos)
            test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels, nn_pos)
            train_data_dict, train_labels_dict = get_word_based_dicts(train_features, train_data_labels)
            test_data_dict, test_labels_dict = get_word_based_dicts(test_features, test_data_labels)

            rf_dict = {}
            pred_dict = {}
            correct = 0
            number_of_labels = 0
            for key in test_data_dict:
                rf = RandomForestClassifier(random_state=42)
                try:
                    rf.fit(train_data_dict[key], train_labels_dict[key])
                    pred_dict[key] = rf.predict(test_data_dict[key])
                except:
                    pred_dict[key] = baseline_preprocessed(test_data_dict[key], lemma_labels[key], pos_labels)
                correct += sum(i==j for i, j in zip(pred_dict[key], test_labels_dict[key]))
                number_of_labels += len(pred_dict[key])
            
            acc_msg = f"Accuracy (nn_w2v = {nn_w2v}, decay = {decay}, nn_pos = {nn_pos}): {correct / number_of_labels}"
            with open('results.txt', 'a') as f:
                f.write(acc_msg + '\n')
            print(acc_msg)

Accuracy (nn_w2v = 1, decay = 0, nn_pos = 0): 0.8840345990323999
Accuracy (nn_w2v = 1, decay = 0, nn_pos = 1): 0.8957630845917021
Accuracy (nn_w2v = 1, decay = 0, nn_pos = 3): 0.9019205395103357
Accuracy (nn_w2v = 1, decay = 0, nn_pos = 5): 0.9035332062747398
Accuracy (nn_w2v = 1, decay = 1, nn_pos = 0): 0.8840345990323999
Accuracy (nn_w2v = 1, decay = 1, nn_pos = 1): 0.8957630845917021
Accuracy (nn_w2v = 1, decay = 1, nn_pos = 3): 0.9019205395103357
Accuracy (nn_w2v = 1, decay = 1, nn_pos = 5): 0.9035332062747398
Accuracy (nn_w2v = 3, decay = 0, nn_pos = 0): 0.8806626594341006
Accuracy (nn_w2v = 3, decay = 0, nn_pos = 1): 0.8915115085764551
Accuracy (nn_w2v = 3, decay = 0, nn_pos = 3): 0.8986952059815276
Accuracy (nn_w2v = 3, decay = 0, nn_pos = 5): 0.902067145579827
Accuracy (nn_w2v = 3, decay = 1, nn_pos = 0): 0.883008356545961
Accuracy (nn_w2v = 3, decay = 1, nn_pos = 1): 0.8945902360357719
Accuracy (nn_w2v = 3, decay = 1, nn_pos = 3): 0.9008942970238968
Accuracy (nn_w2v = 3, decay

In [16]:
train_data = load_preprocessed("preprocessed_data/train_data_exp_5.csv")
test_data = load_preprocessed("preprocessed_data/test_data_exp_5.csv")
lemma_labels, pos_labels, w2v_labels = return_labels(train_data, test_data)
train_features = get_features(train_data, lemma_labels, pos_labels, w2v_labels, 5)
test_features = get_features(test_data, lemma_labels, pos_labels, w2v_labels, 5)
train_data_dict, train_labels_dict = get_word_based_dicts(train_features, train_data_labels)
test_data_dict, test_labels_dict = get_word_based_dicts(test_features, test_data_labels)

rf_dict = {}
pred_dict = {}
correct = [0, 0, 0, 0, 0]
number_of_labels = [0, 0, 0, 0, 0]
for key in test_data_dict:
    rf = RandomForestClassifier(random_state=42)
    try:
        rf.fit(train_data_dict[key], train_labels_dict[key])
        pred_dict[key] = rf.predict(test_data_dict[key])
    except:
        pred_dict[key] = baseline_preprocessed(test_data_dict[key], lemma_labels[key], pos_labels)
    for idx, w in enumerate(test_data_dict[key]):
        if pred_dict[key][idx] == test_labels_dict[key][idx]:
            correct[w[0]] += 1
        number_of_labels[w[0]] += 1

for idx, pos in enumerate(pos_labels):
    print(f"Accuracy of {pos}: {correct[idx]/number_of_labels[idx]}")

Accuracy of a: 0.7873015873015873
Accuracy of n: 0.8245721271393643
Accuracy of o: 0.9947692307692307
Accuracy of r: 0.9237668161434978
Accuracy of v: 0.8103078024337866


In [18]:
with open('mistakes.txt', 'a') as f:
    f.write("lemma, predicted, gold standard\n")
for key in pred_dict:
    for idx in range(len(pred_dict[key])):
        if pred_dict[key][idx] != test_labels_dict[key][idx]:
            with open('mistakes.txt', 'a') as f:
                f.write(f"{lemma_labels[key]}, {pred_dict[key][idx]}, {test_labels_dict[key][idx]}\n")