In [None]:
import nltk
import numpy as np
nltk.download('senseval')
nltk.download('stopwords')
from nltk.corpus import senseval
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import random
from nltk.classify import accuracy, NaiveBayesClassifier, MaxentClassifier
from collections import defaultdict

[nltk_data] Downloading package senseval to /root/nltk_data...
[nltk_data]   Unzipping corpora/senseval.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def extract_vocab_frequency(instances, stopwords=[], n=300):
    '''Return the n most frequent context words as (word, count) pairs.

    A word is counted at most once per instance, so the count is the number
    of instances whose context contains it. Tokens equal to the target word
    (the part of ``instance.word`` before its final ``-suffix``) and every
    entry of ``stopwords`` are left out.

    instances: iterable of objects exposing ``word`` and ``context``; each
        context item is indexed with ``[0]`` to obtain the token.
    stopwords: words to exclude from the counts (never mutated here).
    n: maximum number of pairs to return.

    Returns a list of (word, count) tuples, most frequent first.
    '''
    # Function-scope import: the notebook's import cell stays untouched.
    from collections import Counter

    excluded = set(stopwords)
    fd = Counter()
    for i in instances:
        # Split on the last hyphen only, so a hyphenated target such as
        # 'well-known-a' no longer breaks a two-name unpacking.
        target = i.word.rsplit('-', 1)[0]
        words = {c[0] for c in i.context if c[0] != target}
        fd.update(words - excluded)
    # The earlier slice [:n+1] handed back one entry more than requested.
    return fd.most_common(n)

In [None]:
def extract_vocab(instances, stopwords=[], n=300):
    '''Return only the words from extract_vocab_frequency, in the same order.

    The arguments are forwarded unchanged to extract_vocab_frequency and the
    count of each (word, count) pair is discarded.
    '''
    ranked_pairs = extract_vocab_frequency(instances, stopwords, n)
    vocabulary = []
    for token, _count in ranked_pairs:
        vocabulary.append(token)
    return vocabulary

In [None]:
def wsd_context_features(instance, vocab, dist=3):
    '''Extract the words within ``dist`` positions of the target as features.

    instance: object with ``position`` (index of the target inside the
        context) and ``context`` (sequence whose items give the token at
        index ``[0]``).
    vocab: accepted so the signature matches the callers; it is not
        consulted when building the features.
    dist: number of tokens taken on each side of the target.

    Returns a dict mapping every neighbouring word to True, left neighbours
    first. The target position itself is skipped and repeated words collapse
    into a single key.
    '''
    ind = instance.position
    con = instance.context
    # Clamp both ends so a target near the edge of the context stays in range.
    left = range(max(0, ind - dist), ind)
    right = range(ind + 1, min(ind + dist + 1, len(con)))
    return {con[i][0]: True for i in (*left, *right)}

In [None]:
def wsd_context_features_pos(instance, vocab, dist=3):
    '''Extract the words within ``dist`` positions of the target, with tags.

    instance: object with ``position`` (index of the target inside the
        context) and ``context`` (sequence whose items give the token at
        index ``[0]`` and its part-of-speech tag at index ``[1]``).
    vocab: accepted so the signature matches the callers; it is not
        consulted when building the features.
    dist: number of tokens taken on each side of the target.

    Returns a dict mapping every neighbouring word to its tag, left
    neighbours first. The target position itself is skipped; when a word
    occurs more than once in the window, the tag of its last occurrence wins.
    '''
    ind = instance.position
    con = instance.context
    # Clamp both ends so a target near the edge of the context stays in range.
    left = range(max(0, ind - dist), ind)
    right = range(ind + 1, min(ind + dist + 1, len(con)))
    return {con[i][0]: con[i][1] for i in (*left, *right)}

In [None]:
def get_sk(instances):
  '''Estimate the prior p(sk) of every sense as its relative frequency.

  instances: sequence of items whose element at index 1 is the sense label.

  Returns a dict mapping each label to (occurrences of that label) divided
  by (total number of items); an empty input gives an empty dict.
  '''
  sense_tags = [pair[1] for pair in instances]

  # Float zeros so the tallies are accumulated as floats.
  tally = dict.fromkeys(set(sense_tags), 0.0)
  for tag in sense_tags:
    tally[tag] += 1

  n_items = len(sense_tags)
  return {tag: float(count / n_items) for tag, count in tally.items()}

In [None]:
def theta_k_j(training_data, v, sk, unique_labels, vocab, window_number):
  '''Calculate the smoothed theta_k_j of every context word in ``v`` for sense ``sk``.

  For each word the result is

      (1 + contexts of sense sk containing the word)
      / (len(unique_labels) + contexts of any sense containing the word)

  where the contexts are those produced by wsd_context_features on the
  training instances.

  training_data: sequence of items with the instance at index 0 and its
      sense label at index 1.
  v: iterable of the context words to score (for a dict, its keys).
  sk: the sense label being scored.
  unique_labels: all sense labels; only its length is used (smoothing term).
  vocab, window_number: forwarded to wsd_context_features.

  Returns a dict mapping every word of ``v`` to its smoothed ratio.
  '''
  occ = dict.fromkeys(v, 0)        # contexts labelled sk that contain the word
  occ_total = dict.fromkeys(v, 0)  # contexts of any sense that contain the word

  # Single pass: each training context is extracted once and feeds both
  # tallies, instead of walking the training data twice.
  for i in training_data:
    words = wsd_context_features(i[0], vocab, window_number)
    same_sense = (sk == i[1])
    for word in words:
      if word in occ_total:
        occ_total[word] += 1
        if same_sense:
          occ[word] += 1

  # Laplacian (add-one) smoothing applied to each probability term.
  smoothing = len(unique_labels)
  return {feature: (1 + occ[feature]) / (smoothing + occ_total[feature])
          for feature in occ}

In [None]:
def theta_k_j_pos(training_data, v, sk, unique_labels, vocab, window_number):
  '''Calculate the smoothed theta_k_j of every (word, tag) feature in ``v`` for sense ``sk``.

  For each (word, tag) pair the result is

      (1 + contexts of sense sk containing the pair)
      / (len(unique_labels) + contexts of any sense containing the pair)

  where the contexts are those produced by wsd_context_features_pos on the
  training instances.

  training_data: sequence of items with the instance at index 0 and its
      sense label at index 1.
  v: dict mapping each context word to its part-of-speech tag.
  sk: the sense label being scored.
  unique_labels: all sense labels; only its length is used (smoothing term).
  vocab, window_number: forwarded to wsd_context_features_pos.

  Returns a dict keyed by (word, tag) holding the smoothed ratios.
  '''
  occ = {(vj, v[vj]): 0 for vj in v}  # contexts labelled sk containing the pair
  occ_total = dict.fromkeys(occ, 0)   # contexts of any sense containing the pair

  # Single pass: each training context is extracted once and feeds both
  # tallies, instead of walking the training data twice.
  for i in training_data:
    words = wsd_context_features_pos(i[0], vocab, window_number)
    same_sense = (sk == i[1])
    for word in words:
      key = (word, words[word])
      if key in occ_total:
        occ_total[key] += 1
        if same_sense:
          occ[key] += 1

  # Laplacian (add-one) smoothing applied to each probability term.
  smoothing = len(unique_labels)
  return {feature: (1 + occ[feature]) / (smoothing + occ_total[feature])
          for feature in occ}

In [None]:
def train_NB(instances, stopwords_list, training_data, test_data, unique_labels, vocab_n, windows_n):
  '''Predict a sense for every item of ``test_data`` using word context features.

  instances: instances from which the vocabulary is extracted.
  stopwords_list: words excluded from that vocabulary.
  training_data / test_data: sequences of items with the instance at
      index 0 and its sense label at index 1.
  unique_labels: candidate sense labels, indexable by position.
  vocab_n: vocabulary size passed to extract_vocab.
  windows_n: context window passed to the feature extractor.

  Returns (predicted_labels, true_labels), aligned with ``test_data``.
  '''
  # Extract the vocabulary based on instances; it is handed on to the
  # feature and theta helpers.
  vocab = extract_vocab(instances, stopwords=stopwords_list, n=vocab_n)
  sk = get_sk(training_data)
  predicted_labels = []
  true_labels = []
  for i in test_data:
    instance = i[0]
    true_label = i[1]
    # Get the context features of each test item.
    features = wsd_context_features(instance, vocab, windows_n)
    scores = []
    for label in unique_labels:
      # A label that never occurs in the training split has no prior;
      # sk[label] used to raise KeyError here. Treat it as probability 0.
      prior = sk.get(label, 0.0)
      if prior <= 0:
        scores.append(-np.inf)
        continue
      thetas = theta_k_j(training_data, features, label, unique_labels, vocab, windows_n)
      # Sum logs instead of multiplying probabilities so that a long window
      # cannot underflow every score to 0. The ranking is unchanged because
      # log is monotonic and the smoothed thetas are always > 0.
      score = np.log(prior)
      for feature in thetas:
        score = score + np.log(thetas[feature])
      scores.append(score)
    # The predicted label is the one with the highest score.
    predicted_labels.append(unique_labels[np.argmax(scores)])
    true_labels.append(true_label)

  # Return the true and predicted labels.
  return predicted_labels, true_labels

In [None]:
def train_NB_pos(instances, stopwords_list, training_data, test_data, unique_labels, vocab_n, windows_n):
  '''Predict a sense for every item of ``test_data`` using (word, POS tag) features.

  instances: instances from which the vocabulary is extracted.
  stopwords_list: words excluded from that vocabulary.
  training_data / test_data: sequences of items with the instance at
      index 0 and its sense label at index 1.
  unique_labels: candidate sense labels, indexable by position.
  vocab_n: vocabulary size passed to extract_vocab.
  windows_n: context window passed to the feature extractor.

  Returns (predicted_labels, true_labels), aligned with ``test_data``.
  '''
  # Extract the vocabulary based on instances; it is handed on to the
  # feature and theta helpers.
  vocab = extract_vocab(instances, stopwords=stopwords_list, n=vocab_n)
  sk = get_sk(training_data)
  predicted_labels = []
  true_labels = []
  for i in test_data:
    instance = i[0]
    true_label = i[1]
    # Get the context features of each test item, including POS tags.
    features = wsd_context_features_pos(instance, vocab, windows_n)
    scores = []
    for label in unique_labels:
      # A label that never occurs in the training split has no prior;
      # sk[label] used to raise KeyError here. Treat it as probability 0.
      prior = sk.get(label, 0.0)
      if prior <= 0:
        scores.append(-np.inf)
        continue
      # Must be the POS-aware estimator: theta_k_j only looks at the dict
      # keys (the words), which silently discarded the tags and made this
      # model identical to the word-only one.
      thetas = theta_k_j_pos(training_data, features, label, unique_labels, vocab, windows_n)
      # Sum logs instead of multiplying probabilities so that a long window
      # cannot underflow every score to 0. The ranking is unchanged because
      # log is monotonic and the smoothed thetas are always > 0.
      score = np.log(prior)
      for feature in thetas:
        score = score + np.log(thetas[feature])
      scores.append(score)
    # The predicted label is the one with the highest score.
    predicted_labels.append(unique_labels[np.argmax(scores)])
    true_labels.append(true_label)

  # Return the true and predicted labels.
  return predicted_labels, true_labels

In [None]:
def training_testing(word):
  '''Tune, test and report the word-feature Naive Bayes model for one senseval item.

  word: senseval file id passed to ``senseval.instances`` (e.g. 'hard.pos').

  The instances are shuffled with a fixed seed and split 60/20/20 into
  training, validation and test data. Every (window, vocab) combination is
  scored on the validation split; the best one is then evaluated on the
  test split. Accuracy, the confusion matrix and up to 10 misclassified
  instances are printed. Nothing is returned.
  '''
  # Get the senses and instances.
  senses = []
  instances = []
  for i in senseval.instances(word):
    senses.append((i, i.senses[0]))
    instances.append(i)

  # Get the stopwords list in order to use it for vocab extraction.
  stopwords_list = nltk.corpus.stopwords.words('english')
  # Get the unique labels. They are sorted because the iteration order of a
  # set of strings changes from run to run, which altered argmax
  # tie-breaking and made the printed label list disagree with the row
  # order of the confusion matrix.
  unique_labels = sorted(set(l for (i, l) in senses))
  print("Labels: {}".format(unique_labels))

  random.seed(334)
  random.shuffle(senses)
  n = len(senses)
  # Split the data in: 60% training, 20% validation, 20% testing.
  training_data = senses[:int(0.6 * n)]
  validation_data = senses[int(0.6 * n):int(0.8 * n)]
  test_data = senses[int(0.8 * n):n]

  # Different values for tuning the model.
  window_numbers = [2,3,4]
  vocab_numbers = [200, 300]

  best_param_window = 2
  best_param_vocab = 200
  best_acc = 0

  # Try every window/vocab combination on the validation data and keep the
  # best one for the final run on the test data.
  print("Validation...")
  for windows_n in window_numbers:
    for vocab_n in vocab_numbers:
      predicted_labels, true_labels = train_NB(instances, stopwords_list, training_data, validation_data, unique_labels, vocab_n, windows_n)

      # Score once and reuse the value for both the report and the comparison.
      acc = accuracy_score(true_labels, predicted_labels)
      print("Window size: {}, Vocab size: {}, Validation Accuracy: {}".format(windows_n, vocab_n, acc))
      if acc > best_acc:
        best_acc = acc
        best_param_window = windows_n
        best_param_vocab = vocab_n

  # Best params used for testing.
  print("Best param: window: {}, vocab: {}".format(best_param_window, best_param_vocab))
  print("Testing...")
  predicted_labels, true_labels = train_NB(instances, stopwords_list, training_data, test_data, unique_labels, best_param_vocab, best_param_window)

  print("Accuracy: {}".format(accuracy_score(true_labels, predicted_labels)))
  print("Confusion matrix:")
  # Explicit labels: rows/columns follow the order printed under "Labels".
  print(confusion_matrix(true_labels, predicted_labels, labels=unique_labels))
  examples = 0
  # Print up to 10 misclassified test instances.
  print("Examples of errors")
  for i in range(len(true_labels)):
    if true_labels[i] != predicted_labels[i]:
      print(test_data[i][0])
      print("True: {}, Predicted: {}".format(true_labels[i], predicted_labels[i]))
      examples += 1
    if examples == 10:
      break

In [None]:
# Run the word-feature pipeline on the 'hard.pos' senseval instances.
training_testing('hard.pos')

Labels: ['HARD3', 'HARD1', 'HARD2']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.8327566320645905
Window size: 2, Vocab size: 300, Validation Accuracy: 0.8327566320645905
Window size: 3, Vocab size: 200, Validation Accuracy: 0.8166089965397924
Window size: 3, Vocab size: 300, Validation Accuracy: 0.8166089965397924
Window size: 4, Vocab size: 200, Validation Accuracy: 0.8143021914648212
Window size: 4, Vocab size: 300, Validation Accuracy: 0.8143021914648212
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.8362168396770473
Confusion matrix:
[[701   0   0]
 [ 73  20   0]
 [ 69   0   4]]
Examples of errors
SensevalInstance(word='hard-a', position=23, context=[('i', 'PRP'), ('liked', 'VBD'), ('the', 'DT'), ('low-fat', 'JJ'), ('(', '('), ('no', 'DT'), ('shortening', 'VBG'), ('but', 'CC'), ('with', 'IN'), ('nuts', 'NNS'), ('or', 'CC'), ('chocolate', 'NN'), (')', 'SYM'), ('better', 'JJR'), ('than', 'IN'), ('the', 'DT'), ('no-fat', 'JJ'), (',', ','), ('which',

In [None]:
# Run the word-feature pipeline on the 'interest.pos' senseval instances.
training_testing('interest.pos')

Labels: ['interest_5', 'interest_4', 'interest_3', 'interest_2', 'interest_1', 'interest_6']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.7025316455696202
Window size: 2, Vocab size: 300, Validation Accuracy: 0.7025316455696202
Window size: 3, Vocab size: 200, Validation Accuracy: 0.6624472573839663
Window size: 3, Vocab size: 300, Validation Accuracy: 0.6624472573839663
Window size: 4, Vocab size: 200, Validation Accuracy: 0.6561181434599156
Window size: 4, Vocab size: 300, Validation Accuracy: 0.6561181434599156
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.6751054852320675
Confusion matrix:
[[ 21   0   0   0   4  46]
 [  0   0   0   0   0   4]
 [  0   0   0   0   0   8]
 [  0   0   0   1   1  39]
 [  2   0   0   0  69  49]
 [  1   0   0   0   0 229]]
Examples of errors
SensevalInstance(word='interest-n', position=10, context=[('smithkline', 'NN'), ('is', 'VBZ'), ('n', 'NN'), ("'t", 'VBG'), ('the', 'DT'), ('only', 'JJ'), ('company', 'NN'), ('to', '

In [None]:
# Run the word-feature pipeline on the 'line.pos' senseval instances.
training_testing('line.pos')

Labels: ['formation', 'product', 'phone', 'text', 'cord', 'division']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.5753920386007237
Window size: 2, Vocab size: 300, Validation Accuracy: 0.5753920386007237
Window size: 3, Vocab size: 200, Validation Accuracy: 0.5536791314837153
Window size: 3, Vocab size: 300, Validation Accuracy: 0.5536791314837153
Window size: 4, Vocab size: 200, Validation Accuracy: 0.5476477683956574
Window size: 4, Vocab size: 300, Validation Accuracy: 0.5476477683956574
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.6108433734939759
Confusion matrix:
[[  2   0   0   0  79   0]
 [  0  25   0   0  52   0]
 [  0   0   7   0  65   0]
 [  0   0   0  15  68   1]
 [  0   0   0   0 451   0]
 [  0   0   0   0  58   7]]
Examples of errors
SensevalInstance(word='line-n', position=64, context=[('from', 'IN'), ('canada', 'NNP'), ('comes', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('of', 'IN'), ('this', 'DT'), ('genre', 'NN'), (',', ','), ('zer

In [None]:
# Run the word-feature pipeline on the 'serve.pos' senseval instances.
training_testing('serve.pos')

Labels: ['SERVE2', 'SERVE12', 'SERVE6', 'SERVE10']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.7465753424657534
Window size: 2, Vocab size: 300, Validation Accuracy: 0.7465753424657534
Window size: 3, Vocab size: 200, Validation Accuracy: 0.7876712328767124
Window size: 3, Vocab size: 300, Validation Accuracy: 0.7876712328767124
Window size: 4, Vocab size: 200, Validation Accuracy: 0.7796803652968036
Window size: 4, Vocab size: 300, Validation Accuracy: 0.7796803652968036
Best param: window: 3, vocab: 200
Testing...
Accuracy: 0.7899543378995434
Confusion matrix:
[[362   7   2   0]
 [ 29 218   1   0]
 [ 16  51 101   0]
 [ 52  22   4  11]]
Examples of errors
SensevalInstance(word='serve-v', position=29, context=[('he', 'PRP'), ('has', 'VBZ'), ('declined', 'VBN'), ('requests', 'NNS'), ('for', 'IN'), ('comment', 'NN'), (',', ','), ('and', 'CC'), ('it', 'PRP'), ('isn', 'VBP'), ("'t", 'VBG'), ('clear', 'JJ'), ('what', 'WP'), ('positions', 'VBZ'), ('he', 'PRP'), ('wo

### BONUS: POS features

In [None]:
def training_testing_pos(word):
  '''Tune, test and report the (word, POS tag) Naive Bayes model for one senseval item.

  word: senseval file id passed to ``senseval.instances`` (e.g. 'hard.pos').

  The instances are shuffled with a fixed seed and split 60/20/20 into
  training, validation and test data. Every (window, vocab) combination is
  scored on the validation split with train_NB_pos; the best one is then
  evaluated on the test split with the same function. Accuracy, the
  confusion matrix and up to 10 misclassified instances are printed.
  Nothing is returned.
  '''
  # Get the senses and instances.
  senses = []
  instances = []
  for i in senseval.instances(word):
    senses.append((i, i.senses[0]))
    instances.append(i)

  # Get the stopwords list in order to use it for vocab extraction.
  stopwords_list = nltk.corpus.stopwords.words('english')
  # Get the unique labels. They are sorted because the iteration order of a
  # set of strings changes from run to run, which altered argmax
  # tie-breaking and made the printed label list disagree with the row
  # order of the confusion matrix.
  unique_labels = sorted(set(l for (i, l) in senses))
  print("Labels: {}".format(unique_labels))

  random.seed(334)
  random.shuffle(senses)
  n = len(senses)
  # Split the data in: 60% training, 20% validation, 20% testing.
  training_data = senses[:int(0.6 * n)]
  validation_data = senses[int(0.6 * n):int(0.8 * n)]
  test_data = senses[int(0.8 * n):n]

  # Different values for tuning the model.
  window_numbers = [2,3,4]
  vocab_numbers = [200, 300]

  best_param_window = 2
  best_param_vocab = 200
  best_acc = 0

  # Try every window/vocab combination on the validation data and keep the
  # best one for the final run on the test data.
  print("Validation...")
  for windows_n in window_numbers:
    for vocab_n in vocab_numbers:
      # Validate with the POS model itself; tuning with train_NB selected
      # the parameters for a different model than the one tested below.
      predicted_labels, true_labels = train_NB_pos(instances, stopwords_list, training_data, validation_data, unique_labels, vocab_n, windows_n)

      # Score once and reuse the value for both the report and the comparison.
      acc = accuracy_score(true_labels, predicted_labels)
      print("Window size: {}, Vocab size: {}, Validation Accuracy: {}".format(windows_n, vocab_n, acc))
      if acc > best_acc:
        best_acc = acc
        best_param_window = windows_n
        best_param_vocab = vocab_n

  # Best params used for testing.
  print("Best param: window: {}, vocab: {}".format(best_param_window, best_param_vocab))
  print("Testing...")
  predicted_labels, true_labels = train_NB_pos(instances, stopwords_list, training_data, test_data, unique_labels, best_param_vocab, best_param_window)

  print("Accuracy: {}".format(accuracy_score(true_labels, predicted_labels)))
  print("Confusion matrix:")
  # Explicit labels: rows/columns follow the order printed under "Labels".
  print(confusion_matrix(true_labels, predicted_labels, labels=unique_labels))
  examples = 0
  # Print up to 10 misclassified test instances.
  print("Examples of errors")
  for i in range(len(true_labels)):
    if true_labels[i] != predicted_labels[i]:
      print(test_data[i][0])
      print("True: {}, Predicted: {}".format(true_labels[i], predicted_labels[i]))
      examples += 1
    if examples == 10:
      break

In [None]:
# Run the POS-feature pipeline on the 'hard.pos' senseval instances.
training_testing_pos('hard.pos')

Labels: ['HARD1', 'HARD2', 'HARD3']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.8512110726643599
Window size: 2, Vocab size: 300, Validation Accuracy: 0.8512110726643599
Window size: 3, Vocab size: 200, Validation Accuracy: 0.8200692041522492
Window size: 3, Vocab size: 300, Validation Accuracy: 0.8200692041522492
Window size: 4, Vocab size: 200, Validation Accuracy: 0.8154555940023068
Window size: 4, Vocab size: 300, Validation Accuracy: 0.8154555940023068
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.8512110726643599
Confusion matrix:
[[701   0   0]
 [ 61  32   0]
 [ 68   0   5]]
Examples of errors
SensevalInstance(word='hard-a', position=23, context=[('i', 'PRP'), ('liked', 'VBD'), ('the', 'DT'), ('low-fat', 'JJ'), ('(', '('), ('no', 'DT'), ('shortening', 'VBG'), ('but', 'CC'), ('with', 'IN'), ('nuts', 'NNS'), ('or', 'CC'), ('chocolate', 'NN'), (')', 'SYM'), ('better', 'JJR'), ('than', 'IN'), ('the', 'DT'), ('no-fat', 'JJ'), (',', ','), ('which',

In [None]:
# Run the POS-feature pipeline on the 'interest.pos' senseval instances.
training_testing_pos('interest.pos')

Labels: ['interest_3', 'interest_2', 'interest_6', 'interest_4', 'interest_5', 'interest_1']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.70042194092827
Window size: 2, Vocab size: 300, Validation Accuracy: 0.70042194092827
Window size: 3, Vocab size: 200, Validation Accuracy: 0.6645569620253164
Window size: 3, Vocab size: 300, Validation Accuracy: 0.6645569620253164
Window size: 4, Vocab size: 200, Validation Accuracy: 0.6540084388185654
Window size: 4, Vocab size: 300, Validation Accuracy: 0.6540084388185654
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.6729957805907173
Confusion matrix:
[[ 20   0   0   0   4  47]
 [  0   0   0   0   0   4]
 [  0   0   0   0   0   8]
 [  0   0   0   1   1  39]
 [  2   0   0   0  69  49]
 [  1   0   0   0   0 229]]
Examples of errors
SensevalInstance(word='interest-n', position=10, context=[('smithkline', 'NN'), ('is', 'VBZ'), ('n', 'NN'), ("'t", 'VBG'), ('the', 'DT'), ('only', 'JJ'), ('company', 'NN'), ('to', 'TO')

In [None]:
# Run the POS-feature pipeline on the 'line.pos' senseval instances.
training_testing_pos('line.pos')

Labels: ['division', 'formation', 'cord', 'phone', 'text', 'product']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.571773220747889
Window size: 2, Vocab size: 300, Validation Accuracy: 0.571773220747889
Window size: 3, Vocab size: 200, Validation Accuracy: 0.5524728588661038
Window size: 3, Vocab size: 300, Validation Accuracy: 0.5524728588661038
Window size: 4, Vocab size: 200, Validation Accuracy: 0.5440289505428226
Window size: 4, Vocab size: 300, Validation Accuracy: 0.5440289505428226
Best param: window: 2, vocab: 200
Testing...
Accuracy: 0.6108433734939759
Confusion matrix:
[[  1   0   0   1  79   0]
 [  0  25   0   0  52   0]
 [  0   0   7   0  65   0]
 [  0   0   0  16  68   0]
 [  0   0   0   0 451   0]
 [  0   0   0   0  58   7]]
Examples of errors
SensevalInstance(word='line-n', position=64, context=[('from', 'IN'), ('canada', 'NNP'), ('comes', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('of', 'IN'), ('this', 'DT'), ('genre', 'NN'), (',', ','), ('zero'

In [None]:
# Run the POS-feature pipeline on the 'serve.pos' senseval instances.
training_testing_pos('serve.pos')

Labels: ['SERVE6', 'SERVE12', 'SERVE2', 'SERVE10']
Validation...
Window size: 2, Vocab size: 200, Validation Accuracy: 0.7465753424657534
Window size: 2, Vocab size: 300, Validation Accuracy: 0.7465753424657534
Window size: 3, Vocab size: 200, Validation Accuracy: 0.7876712328767124
Window size: 3, Vocab size: 300, Validation Accuracy: 0.7876712328767124
Window size: 4, Vocab size: 200, Validation Accuracy: 0.7796803652968036
Window size: 4, Vocab size: 300, Validation Accuracy: 0.7796803652968036
Best param: window: 3, vocab: 200
Testing...
Accuracy: 0.7899543378995434
Confusion matrix:
[[362   7   2   0]
 [ 29 218   1   0]
 [ 16  51 101   0]
 [ 52  22   4  11]]
Examples of errors
SensevalInstance(word='serve-v', position=29, context=[('he', 'PRP'), ('has', 'VBZ'), ('declined', 'VBN'), ('requests', 'NNS'), ('for', 'IN'), ('comment', 'NN'), (',', ','), ('and', 'CC'), ('it', 'PRP'), ('isn', 'VBP'), ("'t", 'VBG'), ('clear', 'JJ'), ('what', 'WP'), ('positions', 'VBZ'), ('he', 'PRP'), ('wo