# Word Sense Disambiguation - Baseline
## Group 3

### Data import

In [1]:
import numpy as np 
import sys 
from io import open 
import os
from logging import debug, info, warning, error
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alina\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
class PARSE_LAYER:
    """Column indices into a whitespace-split CoNLL token row."""
    # Layers read from the file; SYM is the symbol used for the WordNet
    # lookup and SNS is the sense label used as the gold label.
    SYM, SEM, CAT, SNS, ROL = 2, 3, 4, 5, 6
    # Not present in the file: the WordNet POS column appended by add_pos_tag.
    POS = 7

In [3]:
# From the repository of the data
def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return its blocks and document ids.

    Blocks are separated by blank lines. Lines starting with '#' are comments;
    a '# newdoc' comment marks the start of a new document.

    Args:
        in_file: Path of the CoNLL file (read as UTF-8).
        split_lines: If True each token row is stored as a list of its 7
            whitespace-separated values, otherwise as the stripped line.
        add_doc: If True the '# newdoc' comment lines are kept in the block.

    Returns:
        A tuple (docs, doc_ids): docs is a list of blocks (each a list of
        rows) and doc_ids is a list with the document index of each block.
        Both are empty when the file contains no blocks.

    Raises:
        ValueError: If a token row does not have exactly 7 values.
    '''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    # Context manager so the handle is closed even when a row is malformed
    with open(in_file, 'r', encoding="utf-8") as conll_file:
        for line in conll_file:
            stripped = line.strip()
            if not stripped and cur_doc:
                # Blank line closes the current block
                docs.append(cur_doc)
                cur_doc = []
                doc_ids.append(num_lines)
            elif stripped.startswith('# newdoc'):
                # Keep track of start of new documents in doc_ids
                # We form a list of all sentences in docs, but at some
                # point we have to put multi-sent docs in a single file
                num_lines += 1
                if add_doc:
                    cur_doc.append(stripped)
            elif stripped and not stripped.startswith('#'):
                values = line.split()
                if len(values) != 7:
                    raise ValueError("Line should always consist of 7 layer-values, found {0}\n{1}".format(len(values), stripped))
                cur_doc.append(values if split_lines else stripped)
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        # A list (not a range) so both code paths return the same type
        doc_ids = list(range(len(docs)))
    # Guard against files without any block: doc_ids[-1] would raise IndexError
    num_docs = doc_ids[-1] + 1 if doc_ids else 0
    info("Extracted {0} sents, for {1} docs".format(len(docs), num_docs))
    return docs, doc_ids


In [4]:
def read_data(directory, filename):
    """Load the CoNLL file `filename` from `directory` and return its blocks.

    The document ids that `get_conll_blocks` also returns are discarded.
    """
    blocks, _ = get_conll_blocks(os.path.join(directory, filename))
    return blocks

# Language and annotation standard select the sub-directory of the data root
language = 'en'
standard = 'gold'

directory = os.path.join('./data/4.0.0', language, standard)

train_data = read_data(directory, 'train.conll')
test_data = read_data(directory, 'test.conll')

# Flatten the sense column of every token row into one label list per split
train_data_labels = [
    token[PARSE_LAYER.SNS] for block in train_data for token in block
]
test_data_labels = [
    token[PARSE_LAYER.SNS] for block in test_data for token in block
]

### Preprocessing

In [5]:
%%time
# Maps UPenn POS tags onto the POS letters used by WordNet
def upenn_to_wn_tag(tagged_sentence):
    """Return one WordNet POS letter per tagged token.

    `tagged_sentence` is a sequence of items whose element 1 is the UPenn
    tag string. Tags starting with J, V, N or R map to 'a', 'v', 'n' and
    'r' respectively; every other tag maps to None.
    """
    wn_letter_by_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    # [:1] yields '' for an empty tag, which falls through to None
    return [wn_letter_by_initial.get(item[1][:1]) for item in tagged_sentence]

# Appends a WordNet POS tag to every token row of the PMB data
def add_pos_tag(data):
    """Tag each sentence and append the WordNet POS letter to its rows.

    The rows are modified in place (one extra trailing value per row) and
    the same `data` object is returned.
    """
    for sentence in data:
        tokens = [row[0] for row in sentence]
        wn_tags = upenn_to_wn_tag(pos_tag(tokens))
        for row, wn_tag in zip(sentence, wn_tags):
            row.append(wn_tag)

    return data

# add_pos_tag appends to the rows in place and returns the same object
train_data, test_data = add_pos_tag(train_data), add_pos_tag(test_data)

CPU times: total: 4.09 s
Wall time: 4.17 s


### Baseline
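Predict, for each word, the most frequent WordNet sense (the first synset returned) for its POS tag; words without a WordNet POS tag or without any synset are labelled `O`.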

In [6]:
%%time
def baseline_pred(data):
    """Predict a sense label for every token using the first WordNet synset.

    For each token row the symbol layer is looked up in WordNet, restricted
    to the row's WordNet POS letter. The name of the first synset returned
    is the prediction, which this baseline treats as the most frequent sense.

    Args:
        data: Sentences as lists of token rows that already carry the
            WordNet POS letter (or None) at index PARSE_LAYER.POS.

    Returns:
        A flat list with one label per token, in reading order. The label is
        'O' when the token has no WordNet POS or no synset is found.
    """
    pred = []
    for sentence in data:
        for word in sentence:
            pos = word[PARSE_LAYER.POS]
            syns = []
            if pos is not None:
                # WordNet joins multi-word lemmas with '_'; presumably the
                # data uses '~' for the same purpose
                symbol = word[PARSE_LAYER.SYM].replace('~', '_')
                syns = wn.synsets(symbol, lang='eng', pos=pos)
            pred.append(syns[0].name() if syns else 'O')

    return pred

# Share of test tokens whose predicted label equals the gold sense label
print(
    "Accuracy:",
    accuracy_score(y_true=test_data_labels, y_pred=baseline_pred(test_data)),
)

Accuracy: 0.6195572496701364
CPU times: total: 1.66 s
Wall time: 1.66 s
