In [20]:
import numpy as np 
import sys 
from io import open 
import os 
!{sys.executable} -m pip install nltk 
import nltk 
nltk.download('wordnet')
nltk.download('omw-1.4')
from logging import debug, info, warning, error
from nltk.corpus import wordnet as wn
from sklearn.metrics import accuracy_score




[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lisakoopmans/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lisakoopmans/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
class PARSE_LAYER: 
    SYM = 2 
    SEM = 3 
    CAT = 4 
    SNS = 5 
    ROL = 6 

In [22]:
def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return the list of lists per sentence/document'''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    for line in open(in_file, 'r'):
        if not line.strip() and cur_doc:
            docs.append(cur_doc)
            cur_doc = []
            doc_ids.append(num_lines)
        elif line.strip().startswith('# newdoc'):
            # Keep track of start of new documents in doc_ids
            # We form a list of all sentences in docs, but at some
            # point we have to put multi-sent docs in a single file
            num_lines += 1
            if add_doc:
                cur_doc.append(line.strip())
        elif not line.strip().startswith('#') and line.strip():
            if len(line.split()) != 7:
                raise ValueError("Line should always consist of 7 layer-values, found {0}\n{1}".format(len(line.split()), line.strip()))
            if split_lines:
                cur_doc.append(line.split())
            else:
                cur_doc.append(line.strip())
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        doc_ids = range(0, len(docs))
    info("Extracted {0} sents, for {1} docs".format(len(docs), doc_ids[-1] + 1))
    return docs, doc_ids


In [31]:
def read_data(directory, filename):
     path = os.path.join(directory, filename)
     docs, docs_ids = get_conll_blocks(path)
     return docs

language = 'en'
standard = 'gold'

directory = os.path.join('./data/4.0.0', language, standard)
print(directory)

train_data = read_data(os.path.join(directory), 'train.conll')
test_data = read_data(os.path.join(directory), 'test.conll')
all_data = train_data + test_data

sem_tags_all = np.array([word[PARSE_LAYER.SEM] for sentence in all_data for word in sentence])
sem_classes = np.unique(sem_tags_all)
sem_classes = np.append(sem_classes, '.')

sem_tag_train = [np.where(sem_classes == sem_tags_all[idx])[0][0] for idx in range(len(train_data))]
sem_tag_test = [np.where(sem_classes == sem_tags_all[idx])[0][0] for idx in range(len(train_data), len(test_data))]

train_data_labels = [word[PARSE_LAYER.SNS] for sentence in train_data for word in sentence]
test_data_labels = [word[PARSE_LAYER.SNS] for sentence in test_data for word in sentence]

./data/4.0.0/en/gold
1053


In [27]:
def baseline(train_data, test_data, train_data_labels, test_data_labels):
    train_data_lemmas = [word[PARSE_LAYER.SYM] for sentence in train_data for word in sentence]
    test_data_lemmas = [word[PARSE_LAYER.SYM] for sentence in test_data for word in sentence]

    pred = []
    for lemma in test_data_lemmas:
        syns = wn.synsets(lemma, lang='eng')
        if len(syns) > 0:
            pred.append(syns[0].name())
        else:
            pred.append('O')
    print(len(test_data_labels), len(pred))
    return accuracy_score(test_data_labels, pred)


print(baseline(train_data, test_data, train_data_labels, test_data_labels))

6821 6821
0.43835214777891807


In [28]:
def get_sem_features(data):
    features = []
    for sentence in train_data:
        for word_idx in range(len(sentence)):
            # print(word_idx, len(sentence))
            if word_idx == 0 and len(sentence) >= 2:
                feature = ['.', sentence[word_idx][PARSE_LAYER.SEM], sentence[word_idx + 1][PARSE_LAYER.SEM]]
                feature = [np.where(sem_classes == feature[idx])[0][0] for idx in range(len(feature))]
            elif word_idx == len(sentence) - 1 and len(sentence) >= 2:
                feature = [sentence[word_idx - 1][PARSE_LAYER.SEM], sentence[word_idx][PARSE_LAYER.SEM], '.']
                feature = [np.where(sem_classes == feature[idx])[0][0] for idx in range(len(feature))]
                # print(feature, word_idx)
            else:
                if len(sentence) == 1:
                    feature = ['.', sentence[word_idx][PARSE_LAYER.SEM], '.']
                    feature = [np.where(sem_classes == feature[idx])[0][0] for idx in range(len(feature))]
                    # print(feature, word_idx)
                else:
                    feature = [sentence[word_idx - 1][PARSE_LAYER.SEM], sentence[word_idx][PARSE_LAYER.SEM], sentence[word_idx + 1][PARSE_LAYER.SEM]]
                    feature = [np.where(sem_classes == feature[idx])[0][0] for idx in range(len(feature))]
        # print(feature, word_idx)
            features.append(feature)
        
    return features
    
sem_train_features = get_sem_features(train_data)
sem_test_features = get_sem_features(test_data)
print(len(train_data_labels), len(test_data_labels))

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score




50408 6821


In [30]:
#  Random forest
rf = RandomForestClassifier()
rf.fit(sem_train_features, train_data_labels)
pred = rf.predict(sem_test_features)
print(len(test_data_labels), len(pred))
print(accuracy_score(test_data_labels, pred))

6821 50408


ValueError: Found input variables with inconsistent numbers of samples: [6821, 50408]

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(sem_train_features, train_data_labels)
pred = lr.predict(sem_test_features)
print(accuracy_score(test_data_labels, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5201753689890494


In [None]:
## SVM
# svm = SVC()
# svm.fit(sem_train_features, train_data_labels)
# pred = svm.predict(sem_test_features)
# print(accuracy_score(test_data_labels, pred))
# 0.5465600698301857 runs for about 3 hours