In [11]:
import os, string, re, codecs, random
from collections import Counter
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV

In [2]:
def warn(*args, **kwargs):
	pass
import warnings
warnings.warn = warn

In [1]:
types = ['train.txt', 'valid.txt', 'test.txt', ]
average = 'micro'

In [2]:
def preprocess(file):
	translator = str.maketrans(" ", " ", string.punctuation)
	with open(file, 'r', encoding="utf-8") as f:
		text = f.read()
	text = text.lower().replace('\t', ' ').replace('<br /><br />', ' ').translate(translator)
	return text

In [14]:
def feature_extraction(set, n):
	file = preprocess(ds_path + set + types[0])
	word_list = file.split(" ")
	counter = Counter(word_list).most_common(n)
	dict = {}

	writer = open(set.split('-')[0] + '-vocab.txt', 'w')

	# save top words
	for i in range(n):
		word = counter[i][0]
		dict[word] = i + 1
		
		text = ("{}\t{}\t{}\n".format(word, i + 1, counter[i][1]))
		writer.write(text)

	for type in types:
		file = preprocess(ds_path + set + type)

		examples = file.split("\n")[:-1]
		ds_output = [i[-1] for i in examples]

		writer = open(set.split('-')[0] + '-' + type.split('.')[0] + '.txt', 'w')
		for i in range(len(examples)):
		    text = ""
		    for word in examples[i].split(' ')[:-1]:
		        if word in dict.keys(): 
		            text = "{} {}".format(text, dict[word])
		    if len(text) == 0: text = ' '
		    text = "{}\t{}\n".format(text, ds_output[i])
		    writer.write(text[1:])

	return dict


In [4]:
def get_bow(dict, set):
	bow = {}
	bow_f = {}
	for type in types: 
		name = type.split('.')[0]
		text  = preprocess(ds_path + set + type).split('\n')

		text = list(filter(None, text))

		output = np.array([int(line[-1]) for line in text])
		examples = [line[:-1] for line in text]

		vectorizer = CountVectorizer(vocabulary = dict.keys())

		vectors = np.asarray(vectorizer.fit_transform(examples).todense())

		freq = sparse.csr_matrix(normalize(vectors))
		vectors[vectors > 1] = 1
		binary = sparse.csr_matrix(vectors)

		bow[name] = [binary, output]
		bow_f[name] = [freq, output]

	return bow, bow_f


In [5]:
def train_model(set, clf, params, n_folds):
	train = set['train']
	valid = set['valid']
	test = set['test']

	train_input = train[0]
	valid_input = valid[0]
	test_input = test[0]

	train_truth = train[1]
	valid_truth = valid[1]
	test_truth = test[1]

	if params != None:
		clf = GridSearchCV(clf, params, cv=n_folds, refit=True)
	clf.fit(train_input, train_truth)

	best_param = None if params==None else clf.best_params_
	
	f1_train = f1_score(train_truth, clf.predict(train_input), average = average)
	f1_valid = f1_score(valid_truth, clf.predict(valid_input), average = average)
	f1_test = f1_score(test_truth, clf.predict(test_input), average = average)

	return f1_train, f1_valid, f1_test, best_param


In [6]:
def random_class(set):
	train_truth = set['train'][1]
	valid_truth = set['valid'][1]
	test_truth = set['test'][1]

	classes = len(np.unique(train_truth))

	f1_train = f1_score(train_truth, np.rint(np.random.random(len(train_truth)) * (classes - 1)), average = average)
	f1_valid = f1_score(valid_truth, np.rint(np.random.random(len(valid_truth)) * (classes - 1)), average = average)
	f1_test = f1_score(test_truth, np.rint(np.random.random(len(test_truth)) * (classes - 1)), average = average)
	return f1_train, f1_valid, f1_test


In [7]:
def majority_class(set):
	train_truth = set['train'][1]
	valid_truth = set['valid'][1]
	test_truth = set['test'][1]

	maj = np.argmax(np.bincount(train_truth))

	f1_train = f1_score(train_truth, np.array([maj for i in range(len(train_truth))]), average = average)
	f1_valid = f1_score(valid_truth, np.array([maj for i in range(len(valid_truth))]), average = average)
	f1_test = f1_score(test_truth, np.array([maj for i in range(len(test_truth))]), average = average)

	return f1_train, f1_valid, f1_test


In [8]:
n = 10000
ds_path = './hwk3_datasets/'
sets = ['yelp-', 'IMDB-']


### YELP DATASET 

In [15]:
set = sets[0]
vocab_list = feature_extraction(set, n)
yelp_bow, yelp_bowf = get_bow(vocab_list, set)


###### Binary Bag of Words

In [23]:
pred = random_class(yelp_bow)
print(set, "Random Classifier \n(train, valid, test) = ", [round(elem, 5) for elem in pred])

yelp- Random Classifier 
(train, valid, test) =  [0.12642999999999999, 0.13300000000000001, 0.123]
yelp- Random Classifier train f1_score 0.12642857142857142
yelp- Random Classifier valid f1_score 0.133
yelp- Random Classifier test f1_score 0.123



In [29]:
pred = majority_class(yelp_bow)
print(set, "Majority Classifier \n(train, valid, test) = ", pred)

yelp- Majority Classifier 
(train, valid, test) =  (0.35257142857142859, 0.35599999999999998, 0.35099999999999998)


In [30]:
param = [{'alpha': np.arange(0.6, 0.8, 0.01)}]
pred = train_model(yelp_bow, BernoulliNB(), param, 5)
print(set, "Naive Bayes Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

yelp- Naive Bayes Classifier 
(train, valid, test) =  (0.62471428571428567, 0.39600000000000002, 0.41549999999999998)
best params = {'alpha': 0.60999999999999999}



In [41]:
param = [{'max_iter': np.arange(400, 600, 50)}]
pred = train_model(yelp_bow, LinearSVC(), param, 5)
print(set, "Linear SVM Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}".format(pred[3]))

yelp- Linear SVM Classifier 
(train, valid, test) =  (0.99642857142857144, 0.45700000000000002, 0.44700000000000001)
best params = {'max_iter': 400}


In [33]:
param = [{'max_depth': [i for i in range(10, 20)], 'max_features': [1000 * i for i in range(2, 7)], 'max_leaf_nodes': [1000 * i for i in range(3, 6)]}]
pred = train_model(yelp_bow, DecisionTreeClassifier(), param, 5)
print(set, "Decision Tree \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

yelp- Decision Tree 
(train, valid, test) =  (0.56571428571428573, 0.38400000000000001, 0.38800000000000001)
best params = {'max_depth': 11, 'max_features': 2000, 'max_leaf_nodes': 5000}



###### Frequency Bag of Words

In [34]:
pred = random_class(yelp_bowf)
print(set, "Random Classifier \n(train, valid, test) = ", [round(elem, 5) for elem in pred])

yelp- Random Classifier 
(train, valid, test) =  [0.12014, 0.14000000000000001, 0.1195]


In [35]:
pred = majority_class(yelp_bowf)
print(set, "Majority Classifier \n(train, valid, test) = ", pred)

yelp- Majority Classifier 
(train, valid, test) =  (0.35257142857142859, 0.35599999999999998, 0.35099999999999998)


In [38]:
param = [{'alpha': np.arange(0.6, 0.8, 0.01)}]
pred = train_model(yelp_bowf, BernoulliNB(), param, 5)
print(set, "Naive Bayes Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

yelp- Naive Bayes Classifier 
(train, valid, test) =  (0.62471428571428567, 0.39600000000000002, 0.41549999999999998)
best params = {'alpha': 0.60999999999999999}



In [37]:
param = [{'max_iter': [500 * i for i in range(5)]}]
pred = train_model(yelp_bowf, LinearSVC(), param, 5)
print(set, "Linear SVM Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}".format(pred[3]))

yelp- Linear SVM Classifier 
(train, valid, test) =  (0.81742857142857139, 0.51200000000000001, 0.52400000000000002)
best params = {'max_iter': 500}


In [40]:
yelp_bowf['train'][0] = yelp_bowf['train'][0].todense()
yelp_bowf['valid'][0] = yelp_bowf['valid'][0].todense()
yelp_bowf['test'][0] = yelp_bowf['test'][0].todense()

pred = train_model(yelp_bowf, GaussianNB(), None, 5)
print(set, "Naive Bayes\n(train, valid, test) = ", pred[:3])

yelp- Naive Bayes
(train, valid, test) =  (0.73857142857142843, 0.27300000000000002, 0.28499999999999998)


<br>

### IMBD DATASET

<br>

###### Binary Bag of Words

In [None]:
set = sets[1]
vocab_list = feature_extraction(set, n)
IMDB_bow, IMDB_bowf = get_bow(vocab_list, set)

In [42]:
pred = random_class(IMDB_bow)
print(set, "Random Classifier \n(train, valid, test) = ", pred)

NameError: name 'IMDB_bow' is not defined

In [None]:
pred = majority_class(IMDB_bow)
print(set, "Majority Classifier \n(train, valid, test) = ", pred)

In [None]:
param = [{'alpha': np.arange(0.6, 0.8, 0.01)}]
pred = train_model(IMDB_bow, BernoulliNB(), param, 5)
print(set, "Naive Bayes Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

In [None]:
param = [{'max_depth': [i for i in range(10, 20)], 'max_features': [1000 * i for i in range(2, 7)], 'max_leaf_nodes': [1000 * i for i in range(3, 6)]}]
pred = train_model(IMDB_bow, DecisionTreeClassifier(), param, 5)
print(set, "Decision Tree \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

In [None]:
param = [{'max_iter': [500 * i for i in range(5)]}]
pred = train_model(IMDB_bow, LinearSVC(), param, 5)
print(set, "Linear SVM Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}".format(pred[3]))

###### Frequency Bag of Words

In [None]:
pred = random_class(IMDB_bowf)
print(set, "Random Classifier \n(train, valid, test) = ", pred)

In [None]:
pred = majority_class(IMDB_bowf)
print(set, "Majority Classifier \n(train, valid, test) = ", pred)

In [None]:
param = [{'max_depth': [i for i in range(10, 20)], 'max_features': [1000 * i for i in range(2, 7)], 'max_leaf_nodes': [1000 * i for i in range(3, 6)]}]
pred = train_model(IMDB_bowf, DecisionTreeClassifier(), param, 5)
print(set, "Decision Tree \n(train, valid, test) = ", pred[:3])
print("best params = {}\n".format(pred[3]))

In [None]:
param = [{'max_iter': [500 * i for i in range(5)]}]
pred = train_model(IMDB_bowf, LinearSVC(), param, 5)
print(set, "Linear SVM Classifier \n(train, valid, test) = ", pred[:3])
print("best params = {}".format(pred[3]))

In [None]:
IMDB_bowf['train'][0] = IMDB_bowf['train'][0].todense()
IMDB_bowf['valid'][0] = IMDB_bowf['valid'][0].todense()
IMDB_bowf['test'][0] = IMDB_bowf['test'][0].todense()

pred = train_model(IMDB_bowf, GaussianNB(), None, 5)
print(set, "Naive Bayes Classifier \n(train, valid, test) = ", pred[:3])