In [1]:
#Convert datasets into binary and frequency bag of words representations
import numpy as np
import pandas as pd
import string
import random
from collections import Counter
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV


# read datasets from csv files, first column is review, second column is rating/sentiment 
imdb_training_data = pd.read_csv('IMDB-train.txt', sep = "\t", header = None)
imdb_validation_data = pd.read_csv('IMDB-valid.txt', sep = "\t", header = None)
imdb_test_data = pd.read_csv('IMDB-test.txt', sep = "\t", header = None)

yelp_training_data = pd.read_csv('yelp-train.txt', sep = "\t", header = None)
yelp_validation_data = pd.read_csv('yelp-valid.txt', sep = "\t", header = None)
yelp_test_data = pd.read_csv('yelp-test.txt', sep = "\t", header = None)

In [2]:
# First, convert datasets into binary and frequency bag of words representations

# Method takes as input a review Data Frame and returns the top 10,000 words with highest frequency in the form of a tuple:
# (top words dictionary with word as key and rank as value, top words information in format specified by pdf
# that will be output into file)
def find_top_words(reviews_data):
    words = []
    top_words = []
    
    for review in reviews_data[0]:
        # preprocess by removing punctuation and <br /><br />, and converting reviews to lower case
        words.extend(review.lower().replace('<br /><br />', ' ').translate(str.maketrans("","", string.punctuation)).split(' '))
        
    # remove empty strings from words list
    words = list(filter(None, words))
    
    # Use Counter to find most 10,000 common words from word list, returned as tuple (string word, integer count)
    top_words = Counter(words).most_common(10000)
    
    # First element of tuple is dictionary of most common word String as key, frequency rank (starting at index 0) as value
    # Second element of tuple is List of Strings (1 for each top word) which consists of word, rank, and frequency delimited by tabs
    # enumerate enumerates top_words list, so that we can assign unique ID/rank to dictionary, and output string
    return {top_word[0]: index for index, top_word in enumerate(top_words)}, [top_word[0] + '\t' + str(index) + '\t' + str(top_word[1])  for index, top_word in enumerate(top_words)]
    
# find top words and output from training data sets
imdb_top_words, imdb_output = find_top_words(imdb_training_data)
yelp_top_words, yelp_output = find_top_words(yelp_training_data)
    
# method takes as input tuple list top_words (String word, frequency) and review data and returns array of vector representations of reviews, with a 
# 1 in the index of vector if the word at that index in top_words appears in the specific review
def generate_binary_bag_of_words_representation(top_words, reviews_data):
    vectors = []
    
    for review in reviews_data[0]:
        # preprocess review and split into individual words
        review_words = review.lower().replace('<br /><br />', ' ').translate(str.maketrans("","", string.punctuation)).split(' ')
        # initialize vector to contain 10,000 0's
        current_vector = [0] * 10000
        # for each word in review, if word is in list of top words, set current_vector at index corresponding to id of top word to 1
        for word in review_words:
            if word in top_words:
                current_vector[top_words[word]] = 1
        vectors.append(current_vector)
    return np.array(vectors)

# method takes as input tuple list top_words (String word, frequency) and review data and returns array of vector representations of reviews, based on
# frequency representation. Values of each vector will sum to 1, and each non-zero value in vector will correspond to its proportional
# occurrence weight in the specific review (feature[id] = (#id in review)/#(all ids in review)
def generate_frequency_bag_of_words_representation(top_words, reviews_data):
    vectors = []
    
    for review in reviews_data[0]:
        # preprocess review and split into individual words
        review_words = review.lower().replace('<br /><br />', ' ').translate(str.maketrans("","", string.punctuation)).split(' ')
        # initialize vector to contain 10,000 0's
        current_vector = [0] * 10000
        # for each word in review, if word is in list of top words, set current_vector at index corresponding to id of top word to 1
        for word in review_words:
            # if word is in top_words, increment vector at index given by word id from dictionary
            if word in top_words:
                current_vector[top_words[word]] += 1
        
        # calculate sum of all top_word frequencies in current review
        top_words_sum = sum(current_vector)
        
        # if there is at least 1 top word in current review, divide all frequencies in vector by total number of top word frequencies in vector
        # to yield proportion of given top words then add to vectors list, otherwise just add zero vector to vectors list
        if top_words_sum > 0:
            current_vector = np.divide(current_vector, top_words_sum)
        vectors.append(current_vector)
    return np.array(vectors)


# convert datasets to binary and frequency bag of words representations
imdb_training_binary_bow_data = generate_binary_bag_of_words_representation(imdb_top_words, imdb_training_data)
imdb_validation_binary_bow_data = generate_binary_bag_of_words_representation(imdb_top_words, imdb_validation_data)
imdb_test_binary_bow_data = generate_binary_bag_of_words_representation(imdb_top_words, imdb_test_data)
imdb_training_frequency_bow_data = generate_frequency_bag_of_words_representation(imdb_top_words, imdb_training_data)
imdb_validation_frequency_bow_data = generate_frequency_bag_of_words_representation(imdb_top_words, imdb_validation_data)
imdb_test_frequency_bow_data = generate_frequency_bag_of_words_representation(imdb_top_words, imdb_test_data)

yelp_training_binary_bow_data = generate_binary_bag_of_words_representation(yelp_top_words, yelp_training_data)
yelp_validation_binary_bow_data = generate_binary_bag_of_words_representation(yelp_top_words, yelp_validation_data)
yelp_test_binary_bow_data = generate_binary_bag_of_words_representation(yelp_top_words, yelp_test_data)
yelp_training_frequency_bow_data = generate_frequency_bag_of_words_representation(yelp_top_words, yelp_training_data)
yelp_validation_frequency_bow_data = generate_frequency_bag_of_words_representation(yelp_top_words, yelp_validation_data)
yelp_test_frequency_bow_data = generate_frequency_bag_of_words_representation(yelp_top_words, yelp_test_data)

ps_imdb_bbow = PredefinedSplit([-1 for s in imdb_training_binary_bow_data] + [0 for s in imdb_validation_binary_bow_data])
ps_imdb_fbow = PredefinedSplit([-1 for s in imdb_training_frequency_bow_data] + [0 for s in imdb_validation_frequency_bow_data])
ps_yelp_bbow = PredefinedSplit([-1 for s in yelp_training_binary_bow_data] + [0 for s in yelp_validation_binary_bow_data])
ps_yelp_fbow = PredefinedSplit([-1 for s in yelp_training_frequency_bow_data] + [0 for s in yelp_validation_frequency_bow_data])


In [3]:
# Test yelp data set with binary bag of words representation

# method calculates and returns the random classifier f1_score performance given the input data classifications
# and range of classification values which will be used to generate random classifications
def report_random_classifier_performance(data, classification_range):
    random_classifications = np.random.choice(classification_range, len(data))
    return sklearn.metrics.f1_score(data, random_classifications, average = 'micro')
    
# method takes as input classification data and returns majority classifier performance in the form of a f1_score
def report_majority_class_classifier_performance(data):
    # np.bincount returns array of counts for each index value seen in input data
    # np.argmax returns the index of the highest count, which results in the majority class
    majority_class = np.argmax(np.bincount(data))
    majority_classifications = [majority_class] * len(data)
    return sklearn.metrics.f1_score(data, majority_classifications, average = 'micro')
    
def main(): 
    print('Random classifier performance on yelp training data: ',report_random_classifier_performance(yelp_training_data[1], range(1,6)))
    print('Random classifier performance on yelp validation data: ',report_random_classifier_performance(yelp_validation_data[1], range(1,6)))
    print('Random classifier performance on yelp test data: ',report_random_classifier_performance(yelp_test_data[1], range(1,6)))
    print('Majority classifier performance on yelp training data: ',report_majority_class_classifier_performance(yelp_training_data[1]))
    print('Majority classifier performance on yelp validation data: ',report_majority_class_classifier_performance(yelp_validation_data[1]))
    print('Majority classifier performance on yelp test data: ',report_majority_class_classifier_performance(yelp_test_data[1]))
main()

Random classifier performance on yelp training data:  0.19942857142857146
Random classifier performance on yelp validation data:  0.199
Random classifier performance on yelp test data:  0.17900000000000002
Majority classifier performance on yelp training data:  0.3525714285714286
Majority classifier performance on yelp validation data:  0.356
Majority classifier performance on yelp test data:  0.351


In [4]:
# method tunes for optimal hyperparameters for the Bernoulli Naive Bayes classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_BNB_hyperparameters_yelp_bbow():
    parameters = {"alpha": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]}
    
    clf = BernoulliNB()
    grid_bnb = GridSearchCV(clf, parameters, cv=ps_yelp_bbow)
    grid_bnb.fit(np.concatenate((yelp_training_binary_bow_data, yelp_validation_binary_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Bernoulli Naive Bayes:", grid_bnb.best_params_)

    print('Optimal accuracy of Bernoulli Naive Bayes on Yelp dataset (bbow):', grid_bnb.score(yelp_test_binary_bow_data, yelp_test_data[1]))
    
    
# method tunes for optimal hyperparameters for the Decision Trees classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_decision_tree_hyperparameters_yelp_bbow():
    parameters = {'criterion':['gini', 'entropy'], 'splitter':['best', 'random'],  'max_depth':np.linspace(6,15,10)}
    
    clf = DecisionTreeClassifier()
    grid_tree = GridSearchCV(clf, parameters, cv=ps_yelp_bbow)
    grid_tree.fit(np.concatenate((yelp_training_binary_bow_data, yelp_validation_binary_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_tree.best_params_)

    print('Optimal accuracy of Decision Tree Classifier on Yelp dataset (bbow):', grid_tree.score(yelp_test_binary_bow_data, yelp_test_data[1]))

# method tunes for optimal hyperparameters for the linear Support Vector Machine classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_SVM_hyperparameters_yelp_bbow():
    parameters={'C':np.linspace(0.001, 10, 5), 'tol':np.linspace(1e-9, 1e-5, 5), "max_iter": range(1000, 10001, 1000)}
    
    clf = LinearSVC()
    grid_svc = GridSearchCV(clf, parameters, cv=ps_yelp_bbow)
    grid_svc.fit(np.concatenate((yelp_training_binary_bow_data, yelp_validation_binary_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_svc.best_params_)

    print('Optimal accuracy of Linear SVM Classifier on Yelp dataset (bbow):', grid_svc.score(yelp_test_binary_bow_data, yelp_test_data[1]))
    
def main():
    warnings.filterwarnings("ignore")  
    hypertune_BNB_hyperparameters_yelp_bbow()
    hypertune_decision_tree_hyperparameters_yelp_bbow()
    hypertune_SVM_hyperparameters_yelp_bbow()
    
main()

Best params for Bernoulli Naive Bayes: {'alpha': 0.01}
Optimal accuracy of Bernoulli Naive Bayes on Yelp dataset (bbow): 0.4415
Best params for Multinomial Naive Bayes: {'criterion': 'entropy', 'max_depth': 10.0, 'splitter': 'best'}
Optimal accuracy of Decision Tree Classifier on Yelp dataset (bbow): 0.3965
Best params for Multinomial Naive Bayes: {'C': 0.001, 'max_iter': 1000, 'tol': 1e-09}
Optimal accuracy of Linear SVM Classifier on Yelp dataset (bbow): 0.504


In [5]:
#Test Yelp data with Frequency bag of words representation

def hypertune_GNB_hyperparameters_yelp_fbow():
    parameters = {"var_smoothing":np.linspace(1, 5, 50)}
    
    clf = GaussianNB()
    grid_gnb = GridSearchCV(clf, parameters, cv=ps_yelp_fbow)
    grid_gnb.fit(np.concatenate((yelp_training_frequency_bow_data, yelp_validation_frequency_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Gaussian Naive Bayes:", grid_gnb.best_params_)

    print('Optimal accuracy of Gaussian Naive Bayes on Yelp dataset (fbow):', grid_gnb.score(yelp_test_frequency_bow_data, yelp_test_data[1]))
  
def hypertune_decision_tree_hyperparameters_yelp_fbow():
    parameters = {'criterion':['gini', 'entropy'], 'splitter':['best', 'random'],  'max_depth':np.linspace(6,15,10)}
    
    clf = DecisionTreeClassifier()
    grid_tree = GridSearchCV(clf, parameters, cv=ps_yelp_fbow)
    grid_tree.fit(np.concatenate((yelp_training_frequency_bow_data, yelp_validation_frequency_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_tree.best_params_)

    print('Optimal accuracy of Decision Tree Classifier on Yelp dataset (fbow):', grid_tree.score(yelp_test_frequency_bow_data, yelp_test_data[1]))
    
def hypertune_SVM_hyperparameters_yelp_fbow():
    parameters={'C':np.linspace(0.001, 10, 5), 'tol':np.linspace(1e-9, 1e-5, 5), "max_iter": range(1000, 10001, 1000)}
    
    clf = LinearSVC()
    grid_svc = GridSearchCV(clf, parameters, cv=ps_yelp_fbow)
    grid_svc.fit(np.concatenate((yelp_training_frequency_bow_data, yelp_validation_frequency_bow_data)), list(yelp_training_data[1]) + list(yelp_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_svc.best_params_)

    print('Optimal accuracy of Linear SVM Classifier on Yelp dataset (fbow):', grid_svc.score(yelp_test_frequency_bow_data, yelp_test_data[1]))
    
def main():
    warnings.filterwarnings("ignore")
    hypertune_GNB_hyperparameters_yelp_fbow()
    hypertune_decision_tree_hyperparameters_yelp_fbow()
    hypertune_SVM_hyperparameters_yelp_fbow()
        
main()

Best params for Gaussian Naive Bayes: {'var_smoothing': 1.0816326530612246}
Optimal accuracy of Gaussian Naive Bayes on Yelp dataset (fbow): 0.3635
Best params for Multinomial Naive Bayes: {'criterion': 'gini', 'max_depth': 8.0, 'splitter': 'best'}
Optimal accuracy of Decision Tree Classifier on Yelp dataset (fbow): 0.391
Best params for Multinomial Naive Bayes: {'C': 7.50025, 'max_iter': 1000, 'tol': 1e-09}
Optimal accuracy of Linear SVM Classifier on Yelp dataset (fbow): 0.5075


In [6]:
# Test IMDB data set with binary bag of words representation
# Random classifier

# method calculates and returns the random classifier f1_score performance given the input data classifications
# and range of classification values which will be used to generate random classifications
def report_random_classifier_performance(data, classification_range):
    random_classifications = np.random.choice(classification_range, len(data))
    return sklearn.metrics.f1_score(data, random_classifications, average = 'micro')
    
# method takes as input classification data and returns majority classifier performance in the form of a f1_score
def report_majority_class_classifier_performance(data):
    # np.bincount returns array of counts for each index value seen in input data
    # np.argmax returns the index of the highest count, which results in the majority class
    majority_class = np.argmax(np.bincount(data))
    majority_classifications = [majority_class] * len(data)
    return sklearn.metrics.f1_score(data, majority_classifications, average = 'micro')
    
def main(): 
    print('Random classifier performance on IMDB training data: ',report_random_classifier_performance(imdb_training_data[1], range(0,2)))
    print('Random classifier performance on IMDB validation data: ',report_random_classifier_performance(imdb_validation_data[1], range(0,2)))
    print('Random classifier performance on IMDB test data: ',report_random_classifier_performance(imdb_test_data[1], range(0,2)))
    print('Majority classifier performance on IMDB training data: ',report_majority_class_classifier_performance(imdb_training_data[1]))
    print('Majority classifier performance on IMDB validation data: ',report_majority_class_classifier_performance(imdb_validation_data[1]))
    print('Majority classifier performance on IMDB test data: ',report_majority_class_classifier_performance(imdb_test_data[1]))
main()


Random classifier performance on IMDB training data:  0.4984
Random classifier performance on IMDB validation data:  0.4969
Random classifier performance on IMDB test data:  0.496
Majority classifier performance on IMDB training data:  0.5
Majority classifier performance on IMDB validation data:  0.5
Majority classifier performance on IMDB test data:  0.5


In [7]:
def hypertune_BNB_hyperparameters_imdb_bbow():
    parameters = {"alpha":np.linspace(0.001, 1, 100)}
    
    clf = BernoulliNB()
    grid_bnb = GridSearchCV(clf, parameters, cv=ps_imdb_bbow)
    grid_bnb.fit(np.concatenate((imdb_training_binary_bow_data, imdb_validation_binary_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Bernoulli Naive Bayes:", grid_bnb.best_params_)

    print('Optimal accuracy of Bernoulli Naive Bayes on IMDB dataset (bbow):', grid_bnb.score(imdb_test_binary_bow_data, imdb_test_data[1]))
    
def hypertune_decision_tree_hyperparameters_imdb_bbow():
    parameters = {'criterion':['gini', 'entropy'], 'splitter':['best', 'random'],  'max_depth':np.linspace(6,15,10)}
    
    clf = DecisionTreeClassifier()
    grid_tree = GridSearchCV(clf, parameters, cv=ps_imdb_bbow)
    grid_tree.fit(np.concatenate((imdb_training_binary_bow_data, imdb_validation_binary_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_tree.best_params_)

    print('Optimal accuracy of Decision Tree Classifier on IMDB dataset (bbow):', grid_tree.score(imdb_test_binary_bow_data, imdb_test_data[1]))

def hypertune_SVM_hyperparameters_imdb_bbow():
    parameters={'C':np.linspace(0.001, 10, 5), 'tol':np.linspace(1e-9, 1e-5, 5), "max_iter": range(1000, 10001, 1000)}
    
    clf = LinearSVC()
    grid_svc = GridSearchCV(clf, parameters, cv=ps_imdb_bbow)
    grid_svc.fit(np.concatenate((imdb_training_binary_bow_data, imdb_validation_binary_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_svc.best_params_)

    print('Optimal accuracy of Linear SVM Classifier on IMDB dataset (bbow):', grid_svc.score(imdb_test_binary_bow_data, imdb_test_data[1]))
    
    
def main():
    warnings.filterwarnings("ignore")
    hypertune_BNB_hyperparameters_imdb_bbow()
    hypertune_decision_tree_hyperparameters_imdb_bbow()
    hypertune_SVM_hyperparameters_imdb_bbow()
    
main()

Best params for Bernoulli Naive Bayes: {'alpha': 0.28354545454545454}
Optimal accuracy of Bernoulli Naive Bayes on IMDB dataset (bbow): 0.84104
Best params for Multinomial Naive Bayes: {'criterion': 'gini', 'max_depth': 15.0, 'splitter': 'best'}
Optimal accuracy of Decision Tree Classifier on IMDB dataset (bbow): 0.72968
Best params for Multinomial Naive Bayes: {'C': 0.001, 'max_iter': 1000, 'tol': 1e-09}
Optimal accuracy of Linear SVM Classifier on IMDB dataset (bbow): 0.87976


In [8]:
#Test imdb data with Frequency bag of words representation

def hypertune_GNB_hyperparameters_imdb_fbow():
    parameters = {"var_smoothing":range(0.001, 5, 100)}
    
    clf = GaussianNB()
    grid_gnb = GridSearchCV(clf, parameters, cv=ps_imdb_fbow)
    grid_gnb.fit(np.concatenate((imdb_training_frequency_bow_data, imdb_validation_frequency_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Gaussian Naive Bayes:", grid_gnb.best_params_)

    print('Optimal accuracy of Gaussian Naive Bayes on IMDB dataset (fbow):', grid_gnb.score(imdb_test_frequency_bow_data, imdb_test_data[1]))
  
def hypertune_decision_tree_hyperparameters_imdb_fbow():
    parameters = {'criterion':['gini', 'entropy'], 'splitter':['best', 'random'],  'max_depth':np.linspace(6,15,10)}
    
    clf = DecisionTreeClassifier()
    grid_tree = GridSearchCV(clf, parameters, cv=ps_imdb_fbow)
    grid_tree.fit(np.concatenate((imdb_training_frequency_bow_data, imdb_validation_frequency_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_tree.best_params_)

    print('Optimal accuracy of Decision Tree Classifier on IMDB dataset (fbow):', grid_tree.score(imdb_test_frequency_bow_data, imdb_test_data[1]))
    
def hypertune_SVM_hyperparameters_imdb_fbow():
    parameters={'C':np.linspace(0.001, 10, 5), 'tol':np.linspace(1e-9, 1e-5, 5), "max_iter": range(1000, 10001, 1000)}
    
    clf = LinearSVC()
    grid_svc = GridSearchCV(clf, parameters, cv=ps_imdb_fbow)
    grid_svc.fit(np.concatenate((imdb_training_frequency_bow_data, imdb_validation_frequency_bow_data)), list(imdb_training_data[1]) + list(imdb_validation_data[1]))
    print("Best params for Multinomial Naive Bayes:", grid_svc.best_params_)

    print('Optimal accuracy of Linear SVM Classifier on IMDB dataset (fbow):', grid_svc.score(imdb_test_frequency_bow_data, imdb_test_data[1]))
    
def main():
    warnings.filterwarnings("ignore")
    hypertune_GNB_hyperparameters_imdb_fbow()
    hypertune_decision_tree_hyperparameters_imdb_fbow()
    hypertune_SVM_hyperparameters_imdb_fbow()
        
main()

AttributeError: module 'numpy' has no attribute 'range'