In [1]:
#Convert datasets into binary and frequency bag of words representations
import numpy as np
import pandas as pd
import string
import random
from collections import Counter
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.svm
import warnings

# Load the tab-separated review files (no header row).
# Column 0 is the review text, column 1 is the rating/sentiment label.
imdb_training_data, imdb_validation_data, imdb_test_data = (
    pd.read_csv(path, sep="\t", header=None)
    for path in ('IMDB-train.txt', 'IMDB-valid.txt', 'IMDB-test.txt')
)

yelp_training_data, yelp_validation_data, yelp_test_data = (
    pd.read_csv(path, sep="\t", header=None)
    for path in ('yelp-train.txt', 'yelp-valid.txt', 'yelp-test.txt')
)

In [2]:
# First, convert datasets into binary and frequency bag of words representations

# Method takes as input a review Data Frame and returns the top 10,000 words with highest frequency in the form of a tuple:
# (top words dictionary with word as key and rank as value, top words information in format specified by pdf
# that will be output into file)
def find_top_words(reviews_data, max_words=10000):
    """Build the vocabulary of the most frequent words in a set of reviews.

    Every review is lower-cased, each '<br /><br />' marker is replaced by a
    space, all characters in ``string.punctuation`` are removed, and the text
    is split on single spaces. Empty tokens are discarded.

    Args:
        reviews_data: Object whose item ``[0]`` is an iterable of review
            strings (for example a DataFrame whose column 0 holds the text).
        max_words: Maximum number of distinct words to keep. Defaults to
            10000, the value that used to be hard-coded.

    Returns:
        A tuple ``(word_to_rank, output_lines)`` where ``word_to_rank`` maps
        each kept word to its frequency rank (0 for the most frequent word)
        and ``output_lines`` holds, in rank order, one string per kept word
        made of the word, its rank and its count separated by tab characters.
    """
    # Build the punctuation-removal table once instead of once per review.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # Count review by review so the full token list never has to be held in memory.
    word_counts = Counter()
    for review in reviews_data[0]:
        cleaned = review.lower().replace('<br /><br />', ' ').translate(strip_punctuation)
        # Consecutive spaces produce empty strings when splitting on ' '; skip them.
        word_counts.update(token for token in cleaned.split(' ') if token)

    # List of (word, count) tuples, most frequent first.
    top_words = word_counts.most_common(max_words)

    word_to_rank = {word: rank for rank, (word, _count) in enumerate(top_words)}
    output_lines = [
        word + '\t' + str(rank) + '\t' + str(count)
        for rank, (word, count) in enumerate(top_words)
    ]
    return word_to_rank, output_lines
    
# Build each vocabulary (word -> rank) and its tab-delimited report lines from the training split only
imdb_top_words, imdb_output = find_top_words(reviews_data=imdb_training_data)
yelp_top_words, yelp_output = find_top_words(reviews_data=yelp_training_data)
    
# method takes as input the dictionary top_words (word as key, rank/index as value) and review data, and returns an array of vector representations of the reviews, with a
# 1 at an index of the vector if the word whose rank equals that index appears in the specific review
def generate_binary_bag_of_words_representation(top_words, reviews_data, vector_size=10000):
    """Encode each review as a 0/1 vector marking which vocabulary words occur.

    Reviews are preprocessed the same way as when the vocabulary is built:
    lower-cased, '<br /><br />' replaced by a space, ``string.punctuation``
    removed, then split on single spaces.

    Args:
        top_words: Dictionary mapping a word to its column index.
        reviews_data: Object whose item ``[0]`` is an iterable of review
            strings (for example a DataFrame whose column 0 holds the text).
        vector_size: Number of columns of every vector. Defaults to 10000,
            the value that used to be hard-coded. Every index stored in
            ``top_words`` must be smaller than this value.

    Returns:
        Integer numpy array of shape ``(number of reviews, vector_size)``.
        Entry ``[i, j]`` is 1 when a word with index ``j`` appears in review
        ``i`` and 0 otherwise. With no reviews the shape is
        ``(0, vector_size)``.
    """
    # Build the punctuation-removal table once instead of once per review.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    reviews = list(reviews_data[0])
    # Fill one preallocated matrix rather than building a Python list of
    # vector_size-element lists per review and converting it afterwards.
    vectors = np.zeros((len(reviews), vector_size), dtype=int)

    for row, review in enumerate(reviews):
        review_words = review.lower().replace('<br /><br />', ' ').translate(strip_punctuation).split(' ')
        # Repeated words set the same cell, so each distinct word is visited once.
        for word in set(review_words):
            if word in top_words:
                vectors[row, top_words[word]] = 1
    return vectors

# method takes as input the dictionary top_words (word as key, rank/index as value) and review data, and returns an array of vector representations of the reviews based on
# the frequency representation. The values of each vector sum to 1 (unless the review contains no top words, in which case the vector is all zeros), and each non-zero value
# corresponds to the word's proportional occurrence weight in the specific review: feature[id] = (#id in review) / (#all ids in review)
def generate_frequency_bag_of_words_representation(top_words, reviews_data, vector_size=10000):
    """Encode each review as a vector of relative vocabulary-word frequencies.

    Reviews are preprocessed the same way as when the vocabulary is built:
    lower-cased, '<br /><br />' replaced by a space, ``string.punctuation``
    removed, then split on single spaces.

    Args:
        top_words: Dictionary mapping a word to its column index.
        reviews_data: Object whose item ``[0]`` is an iterable of review
            strings (for example a DataFrame whose column 0 holds the text).
        vector_size: Number of columns of every vector. Defaults to 10000,
            the value that used to be hard-coded. Every index stored in
            ``top_words`` must be smaller than this value.

    Returns:
        Float numpy array of shape ``(number of reviews, vector_size)``.
        Entry ``[i, j]`` is the number of occurrences in review ``i`` of the
        word with index ``j`` divided by the number of occurrences of all
        vocabulary words in that review, so each row sums to 1. A review
        that contains no vocabulary word gets an all-zero row.
    """
    # Build the punctuation-removal table once instead of once per review.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    reviews = list(reviews_data[0])
    # Fill one preallocated matrix rather than collecting one list/array per review.
    vectors = np.zeros((len(reviews), vector_size), dtype=float)

    for row, review in enumerate(reviews):
        review_words = review.lower().replace('<br /><br />', ' ').translate(strip_punctuation).split(' ')
        # Count every occurrence of a vocabulary word in the column given by its index.
        for word in review_words:
            if word in top_words:
                vectors[row, top_words[word]] += 1

        # Total number of vocabulary-word occurrences in this review.
        top_words_sum = vectors[row].sum()
        # Turn counts into proportions; leave the row at zero when nothing matched
        # so that no division by zero happens.
        if top_words_sum > 0:
            vectors[row] /= top_words_sum
    return vectors


# Encode every split of both datasets in the binary and the frequency representation,
# always using the vocabulary that was built from the matching training split.
(imdb_training_binary_bow_data,
 imdb_validation_binary_bow_data,
 imdb_test_binary_bow_data) = (
    generate_binary_bag_of_words_representation(imdb_top_words, split)
    for split in (imdb_training_data, imdb_validation_data, imdb_test_data)
)
(imdb_training_frequency_bow_data,
 imdb_validation_frequency_bow_data,
 imdb_test_frequency_bow_data) = (
    generate_frequency_bag_of_words_representation(imdb_top_words, split)
    for split in (imdb_training_data, imdb_validation_data, imdb_test_data)
)

(yelp_training_binary_bow_data,
 yelp_validation_binary_bow_data,
 yelp_test_binary_bow_data) = (
    generate_binary_bag_of_words_representation(yelp_top_words, split)
    for split in (yelp_training_data, yelp_validation_data, yelp_test_data)
)
(yelp_training_frequency_bow_data,
 yelp_validation_frequency_bow_data,
 yelp_test_frequency_bow_data) = (
    generate_frequency_bag_of_words_representation(yelp_top_words, split)
    for split in (yelp_training_data, yelp_validation_data, yelp_test_data)
)

In [3]:
# Test yelp data set with binary bag of words representation

# method calculates and returns the random classifier f1_score performance given the input data classifications
# and range of classification values which will be used to generate random classifications
def report_random_classifier_performance(data, classification_range):
    """Score a classifier that guesses labels uniformly at random.

    Args:
        data: True labels.
        classification_range: Candidate label values to draw the guesses from.

    Returns:
        Micro-averaged F1 score of one random guess per entry of ``data``.
    """
    sample_count = len(data)
    guesses = np.random.choice(classification_range, size=sample_count)
    return sklearn.metrics.f1_score(y_true=data, y_pred=guesses, average='micro')
    
# method takes as input classification data and returns majority classifier performance in the form of a f1_score
def report_majority_class_classifier_performance(data):
    """Score a classifier that always predicts the most frequent label.

    The most frequent label is determined from ``data`` itself.

    Args:
        data: True labels; they are passed to ``np.bincount``, so they must
            be non-negative integers.

    Returns:
        Micro-averaged F1 score obtained by predicting that label everywhere.
    """
    # bincount()[k] is how often label k occurs; argmax picks the most common label.
    label_counts = np.bincount(data)
    majority_class = np.argmax(label_counts)
    constant_predictions = np.full(len(data), majority_class)
    return sklearn.metrics.f1_score(y_true=data, y_pred=constant_predictions, average='micro')
    
def main():
    """Print the random and majority baseline scores for every yelp split."""
    # Random guesses are drawn from the values 1 through 5.
    yelp_label_values = range(1, 6)
    random_baseline_runs = [
        ('Random classifier performance on yelp training data: ', yelp_training_data[1]),
        ('Random classifier performance on yelp validation data: ', yelp_validation_data[1]),
        ('Random classifier performance on yelp test data: ', yelp_test_data[1]),
    ]
    majority_baseline_runs = [
        ('Majority classifier performance on yelp training data: ', yelp_training_data[1]),
        ('Majority classifier performance on yelp validation data: ', yelp_validation_data[1]),
        ('Majority classifier performance on yelp test data: ', yelp_test_data[1]),
    ]

    for label, true_ratings in random_baseline_runs:
        print(label, report_random_classifier_performance(true_ratings, yelp_label_values))
    for label, true_ratings in majority_baseline_runs:
        print(label, report_majority_class_classifier_performance(true_ratings))


main()

Random classifier performance on yelp training data:  0.19542857142857142
Random classifier performance on yelp validation data:  0.21499999999999997
Random classifier performance on yelp test data:  0.202
Majority classifier performance on yelp training data:  0.3525714285714286
Majority classifier performance on yelp validation data:  0.356
Majority classifier performance on yelp test data:  0.351


In [10]:
# method trains Bernoulli Naive Bayes classifier with default hyperparameters on yelp training data, then tests the classifier on test data
def train_BNB_classifier_default_yelp_bbow():
    """Fit a default BernoulliNB on the yelp binary bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.naive_bayes.BernoulliNB().fit(yelp_training_binary_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')

    print('Default Bernoulli Naive Bayes classifier F1-measure without no hyperparameters specified:', test_f1)

# method tunes for optimal hyperparameters for the Bernoulli Naive Bayes classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_BNB_hyperparameters_yelp_bbow():
    """Pick the BernoulliNB alpha with the best validation F1 on the yelp binary bag-of-words data.

    Tries 100 evenly spaced alpha values between 0.001 and 1, prints the best
    one, refits a classifier with it and prints the micro-averaged F1 on the
    training, validation and test data.
    """
    train_features, train_labels = yelp_training_binary_bow_data, yelp_training_data[1]

    def fit_model(alpha):
        return sklearn.naive_bayes.BernoulliNB(alpha=alpha).fit(train_features, train_labels)

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    best_alpha, best_f1 = 0, 0
    for candidate_alpha in np.linspace(0.001, 1, 100):
        candidate_f1 = micro_f1(fit_model(candidate_alpha), yelp_validation_binary_bow_data, yelp_validation_data[1])
        # Strict comparison: the first alpha reaching the top score is kept.
        if candidate_f1 > best_f1:
            best_alpha, best_f1 = candidate_alpha, candidate_f1

    print('The best value of alpha is ', best_alpha)

    final_model = fit_model(best_alpha)
    evaluation_splits = (
        ("The F1-measure score on the training data using the best alpha is ", train_features, train_labels),
        ("The F1-measure score on the validation data using the best alpha is ", yelp_validation_binary_bow_data, yelp_validation_data[1]),
        ("The F1-measure score on the test data using the best alpha is ", yelp_test_binary_bow_data, yelp_test_data[1]),
    )
    for message, features, labels in evaluation_splits:
        print(message, micro_f1(final_model, features, labels))
    
# method trains Decision Tree classifier with default hyperparameters on yelp training data, then tests the classifier on test data
def train_decision_tree_default_yelp_bbow():
    """Fit a default DecisionTreeClassifier on the yelp binary bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.tree.DecisionTreeClassifier().fit(yelp_training_binary_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')
    print('Default Decision Trees classifier F1-measure without no hyperparameters specified:', test_f1)
    
# method tunes for optimal hyperparameters for the Decision Trees classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_decision_tree_hyperparameters_yelp_bbow():
    """Grid-search a DecisionTreeClassifier on the yelp binary bag-of-words data.

    Every combination of criterion ('gini', 'entropy'), splitter ('best',
    'random') and max_depth (6 through 15) is fitted on the training data and
    scored with micro-averaged F1 on the validation data. The best
    combination is printed, followed by the training, validation and test
    scores of the tree that achieved it.
    """
    train_features, train_labels = yelp_training_binary_bow_data, yelp_training_data[1]

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    # only tune using hyperparameters seen in class
    criterions = ['gini', 'entropy']
    splitters = ['best', 'random']
    # max_depth is documented as an integer. np.linspace(6, 15, 10) produced the
    # floats 6.0 ... 15.0, which newer scikit-learn releases reject during
    # parameter validation, so the same depths are generated as ints.
    max_depths = range(6, 16)

    # Start below every possible F1 score so the first candidate is always
    # recorded and the best_* values can never be left unset.
    best_f1 = -1.0
    best_hyperparameters = None
    best_tree = None

    for max_depth in max_depths:
        for criterion in criterions:
            for splitter in splitters:
                decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
                decision_tree.fit(train_features, train_labels)
                current_f1 = micro_f1(decision_tree, yelp_validation_binary_bow_data, yelp_validation_data[1])

                if current_f1 > best_f1:
                    best_f1 = current_f1
                    best_hyperparameters = [criterion, splitter, max_depth]
                    best_tree = decision_tree

    print('The optimal hyperparameters are criterion =',best_hyperparameters[0],', splitter =', best_hyperparameters[1], ',max_depth =', best_hyperparameters[2])

    # Report with the tree that actually won the search. No random_state is set,
    # so refitting with the same hyperparameters could yield a different tree
    # whose scores do not match the validation score used for the selection.
    print("The F1-measure score on the training data using the best hyperparameters is ", micro_f1(best_tree, train_features, train_labels))
    print("The F1-measure score on the validation data using the best hyperparameters is ", micro_f1(best_tree, yelp_validation_binary_bow_data, yelp_validation_data[1]))
    print("The F1-measure score on the test data using the best hyperparameters is ", micro_f1(best_tree, yelp_test_binary_bow_data, yelp_test_data[1]))
  
# method trains linear Support Vector Machine classifier with default hyperparameters on yelp training data, then tests the classifier on test data
def train_SVM_default_yelp_bbow():
    """Fit a default LinearSVC on the yelp binary bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.svm.LinearSVC().fit(yelp_training_binary_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')
    print('Default SVM classifier F1-measure without no hyperparameters specified:', test_f1)

# method tunes for optimal hyperparameters for the linear Support Vector Machine classifier using the yelp training and validation data,
# then classifies the test data using the trained classifier with the optimal hyperparameters
def hypertune_SVM_hyperparameters_yelp_bbow():
    """Grid-search a LinearSVC on the yelp binary bag-of-words data.

    Every combination of 5 evenly spaced C values between 0.001 and 10 and
    5 evenly spaced tolerances between 1e-9 and 1e-5 is fitted on the
    training data and scored with micro-averaged F1 on the validation data.
    The best combination is printed, followed by the training, validation and
    test scores of the model that achieved it.
    """
    train_features, train_labels = yelp_training_binary_bow_data, yelp_training_data[1]

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    c_values = np.linspace(0.001, 10, 5)
    tolerances = np.linspace(1e-9, 1e-5, 5)

    # Start below every possible F1 score so the first candidate is always
    # recorded and the best_* values can never be left unset.
    best_f1 = -1.0
    best_hyperparameters = None
    best_svc = None

    for c in c_values:
        for tol in tolerances:
            svc = sklearn.svm.LinearSVC(C = c, tol = tol)
            svc.fit(train_features, train_labels)
            current_f1 = micro_f1(svc, yelp_validation_binary_bow_data, yelp_validation_data[1])

            if current_f1 > best_f1:
                best_f1 = current_f1
                best_hyperparameters = [c, tol]
                best_svc = svc

    print('The optimal hyperparameters are C =',best_hyperparameters[0],', tolerance =', best_hyperparameters[1])

    # Report with the model that actually won the search. No random_state is
    # set, so a refit with the same hyperparameters is not guaranteed to
    # reproduce the model whose validation score drove the selection.
    print("The F1-measure score on the training data using the best hyperparameters is ", micro_f1(best_svc, train_features, train_labels))

    print("The F1-measure score on the validation data using the best hyperparameters is ", micro_f1(best_svc, yelp_validation_binary_bow_data, yelp_validation_data[1]))

    print("The F1-measure score on the test data using the best hyperparameters is ", micro_f1(best_svc, yelp_test_binary_bow_data, yelp_test_data[1]))
    
def main():
    """Run every yelp binary bag-of-words experiment in order."""
    # Installs a global "ignore" filter; it stays active after main() returns.
    warnings.filterwarnings("ignore")
    experiments = (
        train_BNB_classifier_default_yelp_bbow,
        hypertune_BNB_hyperparameters_yelp_bbow,
        train_decision_tree_default_yelp_bbow,
        hypertune_decision_tree_hyperparameters_yelp_bbow,
        train_SVM_default_yelp_bbow,
        hypertune_SVM_hyperparameters_yelp_bbow,
    )
    for run_experiment in experiments:
        run_experiment()


main()

Default Bernoulli Naive Bayes classifier F1-measure without no hyperparameters specified: 0.4115
The best value of alpha is  0.021181818181818184
The F1-measure score on the training data using the best alpha is  0.7397142857142858
The F1-measure score on the validation data using the best alpha is  0.421
The F1-measure score on the test data using the best alpha is  0.4345
Default Decision Trees classifier F1-measure without no hyperparameters specified: 0.351
The optimal hyperparameters are criterion = gini , splitter = random ,max_depth = 8.0
The F1-measure score on the training data using the best hyperparameters is  0.48128571428571426
The F1-measure score on the validation data using the best hyperparameters is  0.405
The F1-measure score on the test data using the best hyperparameters is  0.398
Default SVM classifier F1-measure without no hyperparameters specified: 0.4465
The optimal hyperparameters are C = 0.001 , tolerance = 1e-09
The F1-measure score on the training data usin

In [5]:
#Test Yelp data with Frequency bag of words representation

# method trains Gaussian Naive Bayes classifier with default hyperparameters on yelp training data, then tests the classifier on test data
def train_GNB_classifier_default_yelp_fbow():
    """Fit a default GaussianNB on the yelp frequency bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.naive_bayes.GaussianNB().fit(yelp_training_frequency_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')

    print('Default Gaussian Naive Bayes classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_GNB_hyperparameters_yelp_fbow():
    """Pick the GaussianNB var_smoothing with the best validation F1 on the yelp frequency bag-of-words data.

    Tries 50 evenly spaced var_smoothing values between 1 and 5, prints the
    best one, refits a classifier with it and prints the micro-averaged F1 on
    the training, validation and test data.
    """
    train_features, train_labels = yelp_training_frequency_bow_data, yelp_training_data[1]

    def fit_model(smoothing):
        return sklearn.naive_bayes.GaussianNB(var_smoothing=smoothing).fit(train_features, train_labels)

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    best_smoothing, best_f1 = 0, 0
    for candidate_smoothing in np.linspace(1, 5, 50):
        candidate_f1 = micro_f1(fit_model(candidate_smoothing), yelp_validation_frequency_bow_data, yelp_validation_data[1])
        # Strict comparison: the first value reaching the top score is kept.
        if candidate_f1 > best_f1:
            best_smoothing, best_f1 = candidate_smoothing, candidate_f1

    print('The best value of the smoothing variable is ', best_smoothing)

    final_model = fit_model(best_smoothing)
    evaluation_splits = (
        ("The F1-measure score on the training data using the best smoothing variable is ", train_features, train_labels),
        ("The F1-measure score on the validation data using the best smoothing variable is ", yelp_validation_frequency_bow_data, yelp_validation_data[1]),
        ("The F1-measure score on the test data using the best smoothing value is ", yelp_test_frequency_bow_data, yelp_test_data[1]),
    )
    for message, features, labels in evaluation_splits:
        print(message, micro_f1(final_model, features, labels))
  
def train_decision_tree_default_yelp_fbow():
    """Fit a default DecisionTreeClassifier on the yelp frequency bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.tree.DecisionTreeClassifier().fit(yelp_training_frequency_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')
    print('Default Decision Trees classifier F1-measure without no hyperparameters specified:', test_f1)
    
def hypertune_decision_tree_hyperparameters_yelp_fbow():
    """Grid-search a DecisionTreeClassifier on the yelp frequency bag-of-words data.

    Every combination of criterion ('gini', 'entropy'), splitter ('best',
    'random') and max_depth (6 through 15) is fitted on the training data and
    scored with micro-averaged F1 on the validation data. The best
    combination is printed, followed by the training, validation and test
    scores of the tree that achieved it.
    """
    train_features, train_labels = yelp_training_frequency_bow_data, yelp_training_data[1]

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    # only tune using hyperparameters seen in class
    criterions = ['gini', 'entropy']
    splitters = ['best', 'random']
    # max_depth is documented as an integer. np.linspace(6, 15, 10) produced the
    # floats 6.0 ... 15.0, which newer scikit-learn releases reject during
    # parameter validation, so the same depths are generated as ints.
    max_depths = range(6, 16)

    # Start below every possible F1 score so the first candidate is always
    # recorded and the best_* values can never be left unset.
    best_f1 = -1.0
    best_hyperparameters = None
    best_tree = None

    for max_depth in max_depths:
        for criterion in criterions:
            for splitter in splitters:
                decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
                decision_tree.fit(train_features, train_labels)
                current_f1 = micro_f1(decision_tree, yelp_validation_frequency_bow_data, yelp_validation_data[1])

                if current_f1 > best_f1:
                    best_f1 = current_f1
                    best_hyperparameters = [criterion, splitter, max_depth]
                    best_tree = decision_tree

    print('The optimal hyperparameters are criterion =',best_hyperparameters[0],', splitter =', best_hyperparameters[1], ',max_depth =', best_hyperparameters[2])

    # Report with the tree that actually won the search. No random_state is set,
    # so refitting with the same hyperparameters could yield a different tree
    # whose scores do not match the validation score used for the selection.
    print("The F1-measure score on the training data using the best hyperparameters is ", micro_f1(best_tree, train_features, train_labels))
    print("The F1-measure score on the validation data using the best hyperparameters is ", micro_f1(best_tree, yelp_validation_frequency_bow_data, yelp_validation_data[1]))
    print("The F1-measure score on the test data using the best hyperparameters is ", micro_f1(best_tree, yelp_test_frequency_bow_data, yelp_test_data[1]))
    
def train_SVM_default_yelp_fbow():
    """Fit a default LinearSVC on the yelp frequency bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.svm.LinearSVC().fit(yelp_training_frequency_bow_data, yelp_training_data[1])
    test_predictions = classifier.predict(yelp_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(yelp_test_data[1], test_predictions, average='micro')
    print('Default SVM classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_SVM_hyperparameters_yelp_fbow():
    """Grid-search a LinearSVC on the yelp frequency bag-of-words data.

    Every combination of 10 evenly spaced C values between 0.001 and 10 and
    5 evenly spaced tolerances between 1e-9 and 1e-5 is fitted on the
    training data and scored with micro-averaged F1 on the validation data.
    The best combination is printed, followed by the training, validation and
    test scores of the model that achieved it.
    """
    train_features, train_labels = yelp_training_frequency_bow_data, yelp_training_data[1]

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    c_values = np.linspace(0.001, 10, 10)
    tolerances = np.linspace(1e-9, 1e-5, 5)

    # Start below every possible F1 score so the first candidate is always
    # recorded and the best_* values can never be left unset.
    best_f1 = -1.0
    best_hyperparameters = None
    best_svc = None

    for c in c_values:
        for tol in tolerances:
            svc = sklearn.svm.LinearSVC(C = c, tol = tol)
            svc.fit(train_features, train_labels)
            current_f1 = micro_f1(svc, yelp_validation_frequency_bow_data, yelp_validation_data[1])

            if current_f1 > best_f1:
                best_f1 = current_f1
                best_hyperparameters = [c, tol]
                best_svc = svc

    print('The optimal hyperparameters are C =',best_hyperparameters[0],', tolerance =', best_hyperparameters[1])

    # Report with the model that actually won the search. No random_state is
    # set, so a refit with the same hyperparameters is not guaranteed to
    # reproduce the model whose validation score drove the selection.
    print("The F1-measure score on the training data using the best hyperparameters is ", micro_f1(best_svc, train_features, train_labels))
    print("The F1-measure score on the validation data using the best hyperparameters is ", micro_f1(best_svc, yelp_validation_frequency_bow_data, yelp_validation_data[1]))
    print("The F1-measure score on the test data using the best hyperparameters is ", micro_f1(best_svc, yelp_test_frequency_bow_data, yelp_test_data[1]))
    
def main():
    """Run every yelp frequency bag-of-words experiment in order."""
    # Installs a global "ignore" filter; it stays active after main() returns.
    warnings.filterwarnings("ignore")
    experiments = (
        train_GNB_classifier_default_yelp_fbow,
        hypertune_GNB_hyperparameters_yelp_fbow,
        train_decision_tree_default_yelp_fbow,
        hypertune_decision_tree_hyperparameters_yelp_fbow,
        train_SVM_default_yelp_fbow,
        hypertune_SVM_hyperparameters_yelp_fbow,
    )
    for run_experiment in experiments:
        run_experiment()


main()

Default Gaussian Naive Bayes classifier F1-measure without no hyperparameters specified: 0.302
The best value of the smoothing variable is  1.0816326530612246
The F1-measure score on the training data using the best smoothing variable is  0.37457142857142856
The F1-measure score on the validation data using the best smoothing variable is  0.37
The F1-measure score on the test data using the best smoothing value is  0.3605
Default Decision Trees classifier F1-measure without no hyperparameters specified: 0.346
The optimal hyperparameters are criterion = gini , splitter = best ,max_depth = 11.0
The F1-measure score on the training data using the best hyperparameters is  0.5775714285714286
The F1-measure score on the validation data using the best hyperparameters is  0.413
The F1-measure score on the test data using the best hyperparameters is  0.3855
Default SVM classifier F1-measure without no hyperparameters specified: 0.461
The optimal hyperparameters are C = 6.667000000000001 , toler

In [6]:
# Test IMDB data set with binary bag of words representation
# Random classifier

# method calculates and returns the random classifier f1_score performance given the input data classifications
# and range of classification values which will be used to generate random classifications
def report_random_classifier_performance(data, classification_range):
    """Score a classifier that guesses labels uniformly at random.

    Args:
        data: True labels.
        classification_range: Candidate label values to draw the guesses from.

    Returns:
        Micro-averaged F1 score of one random guess per entry of ``data``.
    """
    sample_count = len(data)
    guesses = np.random.choice(classification_range, size=sample_count)
    return sklearn.metrics.f1_score(y_true=data, y_pred=guesses, average='micro')
    
# method takes as input classification data and returns majority classifier performance in the form of a f1_score
def report_majority_class_classifier_performance(data):
    """Score a classifier that always predicts the most frequent label.

    The most frequent label is determined from ``data`` itself.

    Args:
        data: True labels; they are passed to ``np.bincount``, so they must
            be non-negative integers.

    Returns:
        Micro-averaged F1 score obtained by predicting that label everywhere.
    """
    # bincount()[k] is how often label k occurs; argmax picks the most common label.
    label_counts = np.bincount(data)
    majority_class = np.argmax(label_counts)
    constant_predictions = np.full(len(data), majority_class)
    return sklearn.metrics.f1_score(y_true=data, y_pred=constant_predictions, average='micro')
    
def main():
    """Print the random and majority baseline scores for every IMDB split."""
    # Random guesses are drawn from the values 0 and 1.
    imdb_label_values = range(0, 2)
    random_baseline_runs = [
        ('Random classifier performance on IMDB training data: ', imdb_training_data[1]),
        ('Random classifier performance on IMDB validation data: ', imdb_validation_data[1]),
        ('Random classifier performance on IMDB test data: ', imdb_test_data[1]),
    ]
    majority_baseline_runs = [
        ('Majority classifier performance on IMDB training data: ', imdb_training_data[1]),
        ('Majority classifier performance on IMDB validation data: ', imdb_validation_data[1]),
        ('Majority classifier performance on IMDB test data: ', imdb_test_data[1]),
    ]

    for label, true_sentiments in random_baseline_runs:
        print(label, report_random_classifier_performance(true_sentiments, imdb_label_values))
    for label, true_sentiments in majority_baseline_runs:
        print(label, report_majority_class_classifier_performance(true_sentiments))


main()


Random classifier performance on IMDB training data:  0.5012
Random classifier performance on IMDB validation data:  0.5012
Random classifier performance on IMDB test data:  0.50376
Majority classifier performance on IMDB training data:  0.5
Majority classifier performance on IMDB validation data:  0.5
Majority classifier performance on IMDB test data:  0.5


In [7]:
# method trains Bernoulli Naive Bayes classifier with default hyperparameters on imdb training data, then tests the classifier on test data
def train_BNB_classifier_default_imdb_bbow():
    """Fit a default BernoulliNB on the IMDB binary bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.naive_bayes.BernoulliNB().fit(imdb_training_binary_bow_data, imdb_training_data[1])
    test_predictions = classifier.predict(imdb_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')

    print('Default Bernoulli Naive Bayes classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_BNB_hyperparameters_imdb_bbow():
    """Pick the BernoulliNB alpha with the best validation F1 on the IMDB binary bag-of-words data.

    Tries 100 evenly spaced alpha values between 0.001 and 1, prints the best
    one, refits a classifier with it and prints the micro-averaged F1 on the
    training, validation and test data.
    """
    train_features, train_labels = imdb_training_binary_bow_data, imdb_training_data[1]

    def fit_model(alpha):
        return sklearn.naive_bayes.BernoulliNB(alpha=alpha).fit(train_features, train_labels)

    def micro_f1(model, features, labels):
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    best_alpha, best_f1 = 0, 0
    for candidate_alpha in np.linspace(0.001, 1, 100):
        candidate_f1 = micro_f1(fit_model(candidate_alpha), imdb_validation_binary_bow_data, imdb_validation_data[1])
        # Strict comparison: the first alpha reaching the top score is kept.
        if candidate_f1 > best_f1:
            best_alpha, best_f1 = candidate_alpha, candidate_f1

    print('The best value of alpha is ', best_alpha)

    final_model = fit_model(best_alpha)
    evaluation_splits = (
        ("The F1-measure score on the training data using the best alpha is ", train_features, train_labels),
        ("The F1-measure score on the validation data using the best alpha is ", imdb_validation_binary_bow_data, imdb_validation_data[1]),
        ("The F1-measure score on the test data using the best alpha is ", imdb_test_binary_bow_data, imdb_test_data[1]),
    )
    for message, features, labels in evaluation_splits:
        print(message, micro_f1(final_model, features, labels))
    
def train_decision_tree_default_imdb_bbow():
    """Fit a default DecisionTreeClassifier on the IMDB binary bag-of-words training data and print its micro-averaged F1 on the test data."""
    classifier = sklearn.tree.DecisionTreeClassifier().fit(imdb_training_binary_bow_data, imdb_training_data[1])
    test_predictions = classifier.predict(imdb_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')
    print('Default Decision Trees classifier F1-measure without no hyperparameters specified:', test_f1)
    
def hypertune_decision_tree_hyperparameters_imdb_bbow():
    """Grid-search decision tree hyperparameters on the IMDB binary bag-of-words data.

    Every combination of max_depth, criterion and splitter is fit on the
    training data and scored with micro-averaged F1 on the validation data.
    The best combination (the first one found wins on ties) is printed, a tree
    is refit with it on the training data, and its micro-averaged F1 on the
    training, validation and test data is printed.
    """
    criterions = ['gini', 'entropy']
    splitters = ['best', 'random']
    # max_depth has to be an integer. np.linspace(10, 20, 10) produced fractional
    # depths (10.0, 11.11..., 12.22..., ...), which scikit-learn releases with
    # parameter validation reject, so search the integer depths 10..20 instead.
    max_depths = range(10, 21)
    # Start below any achievable F1 so the first candidate is always recorded
    # and best_hyperparameters can never be left empty.
    best_f1 = -1.0
    best_hyperparameters = []

    for max_depth in max_depths:
        for criterion in criterions:
            for splitter in splitters:
                decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
                decision_tree.fit(imdb_training_binary_bow_data, imdb_training_data[1])
                predicted_classifications = decision_tree.predict(imdb_validation_binary_bow_data)
                current_f1 = sklearn.metrics.f1_score(imdb_validation_data[1], predicted_classifications, average = 'micro')

                # Strict comparison: an equal score does not replace an earlier winner.
                if current_f1 > best_f1:
                    best_f1 = current_f1
                    best_hyperparameters = [criterion, splitter, max_depth]

    print('The optimal hyperparameters are criterion =',best_hyperparameters[0],', splitter =', best_hyperparameters[1], ', max_depth =', best_hyperparameters[2])

    # Refit with the winning combination. No random_state is passed, so this
    # tree is not guaranteed to be identical to the one scored during the search.
    decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = best_hyperparameters[0], splitter = best_hyperparameters[1], max_depth = best_hyperparameters[2])
    decision_tree.fit(imdb_training_binary_bow_data, imdb_training_data[1])

    predicted_classifications = decision_tree.predict(imdb_training_binary_bow_data)
    print("The F1-measure score on the training data using the best hyperparameters is ", sklearn.metrics.f1_score(imdb_training_data[1], predicted_classifications, average = 'micro'))
    predicted_classifications = decision_tree.predict(imdb_validation_binary_bow_data)
    print("The F1-measure score on the validation data using the best hyperparameters is ", sklearn.metrics.f1_score(imdb_validation_data[1], predicted_classifications, average = 'micro'))
    predicted_classifications = decision_tree.predict(imdb_test_binary_bow_data)
    print("The F1-measure score on the test data using the best hyperparameters  is ", sklearn.metrics.f1_score(imdb_test_data[1], predicted_classifications, average = 'micro'))
  
def train_SVM_default_imdb_bbow():
    """Fit a LinearSVC with default settings on the IMDB binary bag-of-words
    training data and print its micro-averaged F1 score on the test data."""
    fitted_svm = sklearn.svm.LinearSVC().fit(
        imdb_training_binary_bow_data, imdb_training_data[1]
    )
    test_predictions = fitted_svm.predict(imdb_test_binary_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')
    print('Default SVM classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_SVM_hyperparameters_imdb_bbow():
    """Grid-search C and tol for LinearSVC on the IMDB binary bag-of-words data.

    Each (C, tol) pair is fit on the training data and scored with
    micro-averaged F1 on the validation data; the first pair reaching the
    highest score is kept. A model is then refit with that pair and its
    micro-averaged F1 on the training, validation and test data is printed.
    """
    def micro_f1(model, features, labels):
        # Micro-averaged F1 of the model's predictions for one data split.
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    penalty_grid = np.linspace(1e-3, 10, 10)
    tolerance_grid = np.linspace(1e-9, 1e-5, 5)
    top_score = 0
    top_params = []

    for penalty in penalty_grid:
        for tolerance in tolerance_grid:
            candidate = sklearn.svm.LinearSVC(C=penalty, tol=tolerance)
            candidate.fit(imdb_training_binary_bow_data, imdb_training_data[1])
            score = micro_f1(candidate, imdb_validation_binary_bow_data, imdb_validation_data[1])

            # Only a strictly better score replaces the current winner.
            if score > top_score:
                top_score, top_params = score, [penalty, tolerance]

    print('The optimal hyperparameters are C =', top_params[0], ', tolerance =', top_params[1])

    # Refit on the training data using the selected pair.
    final_svm = sklearn.svm.LinearSVC(C=top_params[0], tol=top_params[1])
    final_svm.fit(imdb_training_binary_bow_data, imdb_training_data[1])

    print("The F1-measure score on the training data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_training_binary_bow_data, imdb_training_data[1]))
    print("The F1-measure score on the validation data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_validation_binary_bow_data, imdb_validation_data[1]))
    print("The F1-measure score on the test data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_test_binary_bow_data, imdb_test_data[1]))
    
def main():
    """Run all IMDB binary bag-of-words experiments in order, with warnings silenced."""
    warnings.filterwarnings("ignore")

    # Each classifier is run with defaults first, then with its tuning routine.
    experiments = (
        train_BNB_classifier_default_imdb_bbow,
        hypertune_BNB_hyperparameters_imdb_bbow,
        train_decision_tree_default_imdb_bbow,
        hypertune_decision_tree_hyperparameters_imdb_bbow,
        train_SVM_default_imdb_bbow,
        hypertune_SVM_hyperparameters_imdb_bbow,
    )
    for run_experiment in experiments:
        run_experiment()

main()

Default Bernoulli Naive Bayes classifier F1-measure without no hyperparameters specified: 0.8368400000000001
The best value of alpha is  0.28354545454545454
The F1-measure score on the training data using the best alpha is  0.8729333333333333
The F1-measure score on the validation data using the best alpha is  0.8457
The F1-measure score on the test data using the best alpha is  0.8366
Default Decision Trees classifier F1-measure without no hyperparameters specified: 0.69528
The optimal hyperparameters are criterion = gini , splitter = best , max_depth = 16.0
The F1-measure score on the training data using the best hyperparameters is  0.8388666666666666
The F1-measure score on the validation data using the best hyperparameters is  0.7205999999999999
The F1-measure score on the test data using the best hyperparameters  is  0.72456
Default SVM classifier F1-measure without no hyperparameters specified: 0.83632
The optimal hyperparameters are C = 0.001 , tolerance = 1e-09
The F1-measure s

In [8]:
#Test imdb data with Frequency bag of words representation

def train_GNB_classifier_default_imdb_fbow():
    """Fit a Gaussian Naive Bayes model with default settings on the IMDB frequency
    bag-of-words training data and print its micro-averaged F1 score on the test data."""
    fitted_nb = sklearn.naive_bayes.GaussianNB().fit(
        imdb_training_frequency_bow_data, imdb_training_data[1]
    )
    test_predictions = fitted_nb.predict(imdb_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')

    print('Default Gaussian Naive Bayes classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_GNB_hyperparameters_imdb_fbow():
    """Tune GaussianNB's var_smoothing on the IMDB frequency bag-of-words data.

    Fifty evenly spaced values between 1e-13 and 1e-9 are each fit on the
    training data and scored with micro-averaged F1 on the validation data;
    the first value reaching the highest score is kept. A model is refit with
    it and its micro-averaged F1 on the training, validation and test data is
    printed.
    """
    def micro_f1(model, features, labels):
        # Micro-averaged F1 of the model's predictions for one data split.
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    top_score = 0
    top_smoothing = 0

    for candidate in np.linspace(1e-13, 1e-9, 50):
        model = sklearn.naive_bayes.GaussianNB(var_smoothing=candidate)
        model.fit(imdb_training_frequency_bow_data, imdb_training_data[1])
        score = micro_f1(model, imdb_validation_frequency_bow_data, imdb_validation_data[1])

        # Only a strictly better score replaces the current winner.
        if score > top_score:
            top_score, top_smoothing = score, candidate

    print('The best value of var_smoothing is ', top_smoothing)

    # Refit on the training data using the selected smoothing value.
    final_nb = sklearn.naive_bayes.GaussianNB(var_smoothing=top_smoothing)
    final_nb.fit(imdb_training_frequency_bow_data, imdb_training_data[1])

    print("The F1-measure score on the training data using the best smoothing value is ",
          micro_f1(final_nb, imdb_training_frequency_bow_data, imdb_training_data[1]))
    print("The F1-measure score on the validation data using the best smoothing value is ",
          micro_f1(final_nb, imdb_validation_frequency_bow_data, imdb_validation_data[1]))
    print("The F1-measure score on the test data using the best smoothing value is ",
          micro_f1(final_nb, imdb_test_frequency_bow_data, imdb_test_data[1]))
  
def train_decision_tree_default_imdb_fbow():
    """Fit a decision tree with default settings on the IMDB frequency bag-of-words
    training data and print its micro-averaged F1 score on the test data."""
    fitted_tree = sklearn.tree.DecisionTreeClassifier().fit(
        imdb_training_frequency_bow_data, imdb_training_data[1]
    )
    test_predictions = fitted_tree.predict(imdb_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')
    print('Default Decision Trees classifier F1-measure without no hyperparameters specified:', test_f1)
    
def hypertune_decision_tree_hyperparameters_imdb_fbow():
    """Grid-search decision tree hyperparameters on the IMDB frequency bag-of-words data.

    Every combination of max_depth, criterion and splitter is fit on the
    training data and scored with micro-averaged F1 on the validation data.
    The best combination (the first one found wins on ties) is printed, a tree
    is refit with it on the training data, and its micro-averaged F1 on the
    training, validation and test data is printed.
    """
    # only tune using hyperparameters seen in class
    criterions = ['gini', 'entropy']
    splitters = ['best', 'random']
    # max_depth has to be an integer. np.linspace(10, 20, 10) produced fractional
    # depths (10.0, 11.11..., 12.22..., ...), which scikit-learn releases with
    # parameter validation reject, so search the integer depths 10..20 instead.
    max_depths = range(10, 21)
    # Start below any achievable F1 so the first candidate is always recorded
    # and best_hyperparameters can never be left empty.
    best_f1 = -1.0
    best_hyperparameters = []

    for max_depth in max_depths:
        for criterion in criterions:
            for splitter in splitters:
                decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
                decision_tree.fit(imdb_training_frequency_bow_data, imdb_training_data[1])
                predicted_classifications = decision_tree.predict(imdb_validation_frequency_bow_data)
                current_f1 = sklearn.metrics.f1_score(imdb_validation_data[1], predicted_classifications, average = 'micro')

                # Strict comparison: an equal score does not replace an earlier winner.
                if current_f1 > best_f1:
                    best_f1 = current_f1
                    best_hyperparameters = [criterion, splitter, max_depth]

    print('The optimal hyperparameters are criterion =',best_hyperparameters[0],', splitter =', best_hyperparameters[1], ', max_depth =', best_hyperparameters[2])

    # Refit with the winning combination. No random_state is passed, so this
    # tree is not guaranteed to be identical to the one scored during the search.
    decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = best_hyperparameters[0], splitter = best_hyperparameters[1], max_depth = best_hyperparameters[2])
    decision_tree.fit(imdb_training_frequency_bow_data, imdb_training_data[1])

    predicted_classifications = decision_tree.predict(imdb_training_frequency_bow_data)
    print("The F1-measure score on the training data using the best hyperparameters is ", sklearn.metrics.f1_score(imdb_training_data[1], predicted_classifications, average = 'micro'))
    predicted_classifications = decision_tree.predict(imdb_validation_frequency_bow_data)
    print("The F1-measure score on the validation data using the best hyperparameters is ", sklearn.metrics.f1_score(imdb_validation_data[1], predicted_classifications, average = 'micro'))
    predicted_classifications = decision_tree.predict(imdb_test_frequency_bow_data)
    print("The F1-measure score on the test data using the best hyperparameters is ", sklearn.metrics.f1_score(imdb_test_data[1], predicted_classifications, average = 'micro'))
    
def train_SVM_default_imdb_fbow():
    """Fit a LinearSVC with default settings on the IMDB frequency bag-of-words
    training data and print its micro-averaged F1 score on the test data."""
    fitted_svm = sklearn.svm.LinearSVC().fit(
        imdb_training_frequency_bow_data, imdb_training_data[1]
    )
    test_predictions = fitted_svm.predict(imdb_test_frequency_bow_data)
    test_f1 = sklearn.metrics.f1_score(imdb_test_data[1], test_predictions, average='micro')
    print('Default SVM classifier F1-measure without no hyperparameters specified:', test_f1)

def hypertune_SVM_hyperparameters_imdb_fbow():
    """Grid-search C and tol for LinearSVC on the IMDB frequency bag-of-words data.

    Each (C, tol) pair is fit on the training data and scored with
    micro-averaged F1 on the validation data; the first pair reaching the
    highest score is kept. A model is then refit with that pair and its
    micro-averaged F1 on the training, validation and test data is printed.
    """
    def micro_f1(model, features, labels):
        # Micro-averaged F1 of the model's predictions for one data split.
        return sklearn.metrics.f1_score(labels, model.predict(features), average='micro')

    penalty_grid = np.linspace(10, 20, 10)
    tolerance_grid = np.linspace(1e-9, 1e-5, 10)
    top_score = 0
    top_params = []

    for penalty in penalty_grid:
        for tolerance in tolerance_grid:
            candidate = sklearn.svm.LinearSVC(C=penalty, tol=tolerance)
            candidate.fit(imdb_training_frequency_bow_data, imdb_training_data[1])
            score = micro_f1(candidate, imdb_validation_frequency_bow_data, imdb_validation_data[1])

            # Only a strictly better score replaces the current winner.
            if score > top_score:
                top_score, top_params = score, [penalty, tolerance]

    print('The optimal hyperparameters are C =', top_params[0], ', tolerance =', top_params[1])

    # Refit on the training data using the selected pair.
    final_svm = sklearn.svm.LinearSVC(C=top_params[0], tol=top_params[1])
    final_svm.fit(imdb_training_frequency_bow_data, imdb_training_data[1])

    print("The F1-measure score on the training data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_training_frequency_bow_data, imdb_training_data[1]))
    print("The F1-measure score on the validation data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_validation_frequency_bow_data, imdb_validation_data[1]))
    print("The F1-measure score on the test data using the best hyperparameters is ",
          micro_f1(final_svm, imdb_test_frequency_bow_data, imdb_test_data[1]))
    
def main():
    """Run all IMDB frequency bag-of-words experiments in order, with warnings silenced."""
    warnings.filterwarnings("ignore")

    # Each classifier is run with defaults first, then with its tuning routine.
    experiments = (
        train_GNB_classifier_default_imdb_fbow,
        hypertune_GNB_hyperparameters_imdb_fbow,
        train_decision_tree_default_imdb_fbow,
        hypertune_decision_tree_hyperparameters_imdb_fbow,
        train_SVM_default_imdb_fbow,
        hypertune_SVM_hyperparameters_imdb_fbow,
    )
    for run_experiment in experiments:
        run_experiment()

main()

Default Gaussian Naive Bayes classifier F1-measure without no hyperparameters specified: 0.69824
The best value of var_smoothing is  6.939081632653061e-10
The F1-measure score on the training data using the best smoothing value is  0.8628
The F1-measure score on the validation data using the best smoothing value is  0.7607999999999999
The F1-measure score on the test data using the best smoothing value is  0.6978
Default Decision Trees classifier F1-measure without no hyperparameters specified: 0.69832
The optimal hyperparameters are criterion = gini , splitter = random , max_depth = 18.0
The F1-measure score on the training data using the best hyperparameters is  0.8202666666666667
The F1-measure score on the validation data using the best hyperparameters is  0.7192
The F1-measure score on the test data using the best hyperparameters is  0.72216
Default SVM classifier F1-measure without no hyperparameters specified: 0.79332
The optimal hyperparameters are C = 20.0 , tolerance = 1e-09
