In [17]:
import nltk
# Fetch the NLTK data packages this notebook reads through nltk.corpus
# (stopwords, wordnet and senseval are imported from there in a later cell).
nltk.download('stopwords')
nltk.download('wordnet')
# Kept as the last expression of the cell: its return value is what the
# notebook displays as the cell output.
nltk.download('senseval')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package senseval to /root/nltk_data...
[nltk_data]   Package senseval is already up-to-date!


True

In [18]:
import random
import string
from nltk.classify import accuracy
from nltk.corpus import senseval, wordnet, stopwords
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split

In [19]:
def get_pos_tag(tag):
	"""Reduce a part-of-speech tag to one of the WordNet POS constants.

	Only the first character of ``tag`` is looked at (presumably the tags
	are Penn-Treebank style, e.g. "JJ", "VBD", "NN", "RB" -- confirm against
	the corpus):

	* "J" -> ``wordnet.ADJ``
	* "V" -> ``wordnet.VERB``
	* "R" -> ``wordnet.ADV``
	* "N" and every other initial -> ``wordnet.NOUN``

	An empty ``tag`` is handed back unchanged.
	"""
	if not len(tag):
		return tag

	# NOUN doubles as the fallback, so it needs no entry of its own.
	pos_by_initial = {
		"J": wordnet.ADJ,
		"V": wordnet.VERB,
		"R": wordnet.ADV,
	}
	return pos_by_initial.get(tag[0], wordnet.NOUN)

In [20]:
# Stopword sets keyed by language. Filled on first use so that
# stopwords.words() is called once per language instead of once per token.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
	"""Return the NLTK stopword list for ``language`` as a cached frozenset."""
	if language not in _STOPWORD_CACHE:
		_STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
	return _STOPWORD_CACHE[language]


def _is_noise_token(word, stop_words):
	"""Return True if ``word`` is empty, punctuation-only, or a stopword.

	The comparison is case-insensitive. Checking every character (rather
	than testing whether the word is a substring of ``string.punctuation``)
	also catches multi-character punctuation tokens such as "--", "...",
	"``" and "''"; ``all()`` over an empty word is True, which covers the
	empty token.
	"""
	lowered = word.lower()
	if all(char in string.punctuation for char in lowered):
		return True
	return lowered in stop_words


def get_features(instance, window_size):
	"""Build a (features, sense) pair for one sense-tagged instance.

	The context is first filtered: entries that are not a tuple/list, and
	(word, tag) entries whose word is empty, punctuation-only or an English
	stopword, are dropped. The filtered context is then padded on each side
	with ``window_size // 2`` placeholder tokens (tags "None1" / "None2"),
	and the tags of the ``2 * (window_size // 2) + 1`` tokens centred on the
	target position are turned into features.

	Args:
		instance: object exposing ``position`` (index of the target token in
			``context``), ``context`` (sequence whose useful entries are
			(word, tag) pairs) and ``senses`` (exactly one sense label).
		window_size: total number of neighbours to look at; half of them
			(integer division) on each side of the target.

	Returns:
		``(features, sense)`` where ``features`` is a dict containing
		``initial_position_idx``, ``len_context``, ``new_position_idx``,
		``len_context_after_removal`` (length *including* the padding),
		``part_of_speech_<offset>`` / ``reduced_part_of_speech_<offset>``
		for every offset in the window, and one occurrence counter per tag
		and per reduced tag seen in the window.

	Raises:
		IndexError: if ``instance.senses`` is empty.
		AssertionError: if ``instance.senses`` holds more than one sense.

	Note:
		The window arithmetic assumes the target token itself survives the
		filtering; if it were dropped as the last kept position, the window
		could run past the end of the padded context.
	"""
	position = instance.position
	context = instance.context
	sense = instance.senses[0]
	assert len(instance.senses) == 1, "{}".format(instance.senses)

	stop_words = _stopword_set("english")
	half_window = window_size // 2

	features = {
		"initial_position_idx": position,
		"len_context": len(context),
	}

	# Drop noise tokens; every dropped entry that sits before the target
	# shifts the target one slot to the left.
	kept_tokens = []
	new_position = position
	for index, token in enumerate(context):
		is_tagged = isinstance(token, (tuple, list))
		if is_tagged and not _is_noise_token(token[0], stop_words):
			kept_tokens.append(token)
		elif index < position:
			new_position -= 1

	# Pad both ends so the window never leaves the list for targets that
	# sit close to the start or the end of the sentence.
	new_context = [(0, "None1")] * half_window + kept_tokens + [(0, "None2")] * half_window
	new_position += half_window
	features["new_position_idx"] = new_position
	features["len_context_after_removal"] = len(new_context)

	for i in range(new_position - half_window, new_position + half_window + 1):
		tag = new_context[i][1]
		reduced_tag = get_pos_tag(tag)
		offset = i - new_position
		# Positional features: which tag sits at each offset from the target.
		features["part_of_speech_{}".format(offset)] = tag
		features["reduced_part_of_speech_{}".format(offset)] = reduced_tag
		# Bag-of-tags features: how often each tag occurs inside the window.
		features["{}".format(tag)] = features.get("{}".format(tag), 0) + 1
		features["{}".format(reduced_tag)] = features.get("{}".format(reduced_tag), 0) + 1
	return (features, sense)

In [21]:
# Experiment settings, gathered here so they are easy to find and tune.
SEED = 42  # seed for the shuffle below, so the split is reproducible
SIZE_SPLIT = 0.75  # fraction of the instances used for training
WINDOW_SIZE = 12  # passed to get_features as window_size
MAX_TABLE_ROWS = 150  # cap on prediction rows written to each report

# For each target word: build features, train a Naive Bayes classifier and
# write a small text report to ./NaiveBayes_<word>.
for word in ["line", "hard", "serve", "interest"]:
	inst = senseval.instances("{}.pos".format(word))
	data = [get_features(instance, window_size=WINDOW_SIZE) for instance in inst]

	# Shuffle before splitting: a plain head/tail slice is only a fair
	# train/validation split if the corpus order is unrelated to the sense
	# labels, which this code has no way to guarantee.
	random.Random(SEED).shuffle(data)
	split_index = int(SIZE_SPLIT * len(data))
	train, validation = data[:split_index], data[split_index:]

	model = nltk.NaiveBayesClassifier.train(train)

	with open("./NaiveBayes_{}".format(word), "w") as f:
		f.write("Word: {}\n".format(word))
		f.write("Data length: {}\n".format(len(data)))
		f.write("Word senses: {}\n".format({sense for (_, sense) in data}))
		f.write("Accuracy on train: {}\n".format(accuracy(model, train)))
		f.write("Accuracy on validation: {}\n".format(accuracy(model, validation)))

		# Side-by-side sample of predictions vs. gold labels.
		table = PrettyTable()
		table.field_names = ["Predicted", "Truth"]
		for feature_set, actual_label in validation[:MAX_TABLE_ROWS]:
			predicted_label = model.classify(feature_set)
			table.add_row([predicted_label, actual_label])
		f.write(str(table))