# Classifiers Jupyter Notebook
## Luke Mason & Karsen Hansen

In [265]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import os

## Load and clean the data

In [266]:
mytable = MyPyTable()
fName = os.path.join("input_data", "trimmed_data.csv")
dataset = mytable.load_from_file(fName)
dataset.remove_rows_with_missing_values()
rows_to_delete = ['Review Count', 'Attributes', 'Review Length']
myutils.remove_rows_from_data(rows_to_delete, dataset)
myutils.get_friend_count(2, dataset)
print(dataset.column_names)
print(dataset.data[0])

['Fans', 'Compliment Plain', 'Friends', 'Useful']
[12.0, 30.0, 37.0, 179.0]


## Stratified k-fold Accuracy Check (Naive Bayes and Decision Tree)

In [267]:
import copy
#==================================
#      Get stratified training    =
#      and testing sets           =
#==================================
k = 10
X, y = myutils.split_x_y_train(dataset.data)

# Get the training and testing folds
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X, y, k)

# Get the traininga and testing sets from the folds
x_train, y_train, x_test, y_test = myutils.get_values_from_folds(X, y, train_folds, test_folds)


print("Naive Bayes vs. Decision Tree Accuracy Comparison")
print("-----------------------------------")
#==================================
#      Naive Bayes Classifier     =
#==================================
myNB = MyNaiveBayesClassifier()
myNB.fit(x_train, y_train)

# Compare predicted with actual
y_predict_nb = myNB.predict(x_test)
count = 0
for i in range(len(y_predict_nb)):
    binned_predict = myutils.get_useful_bin(y_predict_nb[i])
    binned_test = myutils.get_useful_bin(y_test[i])
    if (binned_predict == binned_test):
        count = count + 1;

# Calculate accuracy and error
accuracy = count / len(y_predict_nb)
error = (len(y_predict_nb) - count) / len(y_predict_nb)

print("Naive Bayes: accuracy =", accuracy, "error =", error)

#==================================
#     Decision Tree Classifier    =
#==================================
myDT = MyDecisionTreeClassifier()
myDT_x_train = copy.deepcopy(x_train)
myDT_y_train = copy.deepcopy(y_train)
myDT.fit(myDT_x_train, myDT_y_train)

y_predict_dt = myDT.predict(x_test)
count = 0
for i in range(len(y_predict_dt)):
    binned_predict = myutils.get_useful_bin(y_predict_dt[i])
    binned_test = myutils.get_useful_bin(y_test[i])
    if (binned_predict == binned_test):
        count = count + 1;

accuracy = count / len(y_predict_dt)
error = (len(y_predict_dt) - count) / len(y_predict_dt)

print("Decision Tree: accuracy =", accuracy, "error =", error)
print("-----------------------------------")

Naive Bayes vs. Decision Tree Accuracy Comparison
-----------------------------------
Naive Bayes: accuracy = 0.8597194388777555 error = 0.1402805611222445
Decision Tree: accuracy = 0.5424181696726786 error = 0.4575818303273213
-----------------------------------


## Confusion Matrices (Naive Bayes & Decision Tree)

In [268]:
# Get the x values to perform the matrix over
x = []
for i in range(10):
    x.append(i + 1)

actual = []
predicted = []
for i in range(len(y_test)):
    actual.append(myutils.get_useful_bin(y_test[i]))
    predicted.append(myutils.get_useful_bin(y_predict_nb[i]))

# Get the matrix from stratified
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Make the table header and calculate the statistics
table_header = ['Useful', 1,2,3,4,5,6,7,8,9,10, 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Naive Bayes')
myutils.print_tabulate(complete_matrix, table_header)


Naive Bayes
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  167   21    9    4    1    0    0    0    0     0      202              82.67
       2   63  130   14    5    3    0    0    0    0     0      215              60.47
       3   23   18  120    8    4    0    0    0    0     0      173              69.36
       4    8    7   12  108    3    0    0    0    0     0      138              78.26
       5    0    2    3    2  251    0    0    0    0     0      258              97.29
       6    0    0    0    0    0  183    0    0    0     0      183             100
       7    0    0    0    0    0    0   81    0    0     0       81             100
       8    0    0    0    0    0    0    0   64    0     0       64             100
       9    0    0    0    0    0    0    0    0   49     0       49             100
      10    0    0    0    0    0    0    0    0    0   134      134             100


In [269]:
# Get the x values to perform the matrix over
x = []
for i in range(10):
    x.append(i + 1)

actual = []
predicted = []
for i in range(len(y_test)):
    actual.append(myutils.get_useful_bin(y_test[i]))
    predicted.append(myutils.get_useful_bin(y_predict_dt[i]))

# Get the matrix from stratified
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Make the table header and calculate the statistics
table_header = ['Useful', 1,2,3,4,5,6,7,8,9,10, 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Decision Tree')
myutils.print_tabulate(complete_matrix, table_header)


Decision Tree
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  152   29    5    5    5    3    0    2    1     0      202              75.25
       2   67   96   14   15   11    9    2    1    0     0      215              44.65
       3   48   37   60   12   11    5    0    0    0     0      173              34.68
       4   38   19   13   54   10    4    0    0    0     0      138              39.13
       5   24   53   25   23  111   12    6    1    2     1      258              43.02
       6   11   25   12   10   15   99    7    2    1     1      183              54.1
       7    7    9    5    1    3    2   50    1    3     0       81              61.73
       8    2    5    3    4    8    5    1   34    1     1       64              53.12
       9    1    2    2    1    4    0    2    0   37     0       49              75.51
      10    0    1    2    0    1    0    3    4    4   119      134              88.81


## Random Forest Parameter Tuning

In [270]:
import math
#==================================
#      Get stratified training    =
#      and testing sets           =
#==================================

X, y = myutils.split_x_y_train(dataset.data)
x_train, x_test, y_train, y_test = myevaluation.train_test_split(X, y, shuffle=True)

# Get the training and testing folds
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X, y, k)

# Get the traininga and testing sets from the folds
x_train, y_train, x_test, y_test = myutils.get_values_from_folds(X, y, train_folds, test_folds)

#==================================
#     Decision Tree Classifier    =
#==================================
myDT = MyDecisionTreeClassifier()
myDT_x_train = copy.deepcopy(x_train)
myDT_y_train = copy.deepcopy(y_train)
myDT.fit(myDT_x_train, myDT_y_train)

y_predict_dt = myDT.predict(x_test)
count = 0
for i in range(len(y_predict_dt)):
    binned_predict = myutils.get_useful_bin(y_predict_dt[i])
    binned_test = myutils.get_useful_bin(y_test[i])
    if (binned_predict == binned_test):
        count = count + 1;

accuracy = count / len(y_predict_dt)
error = (len(y_predict_dt) - count) / len(y_predict_dt)

print("Decision Tree vs. Random Forest Trials to Tune Parameters")
print("-----------------------------------")

print("Decision Tree: accuracy =", accuracy, "error =", error)

print("-----------------------------------")
#==================================
#     Random Forest Classifier    =
#==================================

#==================================
#          M: 10, N: 100          =
#==================================
myutils.tune_parameters(10, 100, 3, dataset)

#==================================
#          M: 10, N: 100          =
#==================================
myutils.tune_parameters(10, 100, 2, dataset)


print("-----------------------------------")
#==================================
#          M: 10, N: 500          =
#==================================
myutils.tune_parameters(10, 500, 3, dataset)

#==================================
#          M: 10, N: 500          =
#==================================
myutils.tune_parameters(10, 500, 2, dataset)


print("-----------------------------------")
#==================================
#          M: 100, N: 500         =
#==================================
myutils.tune_parameters(100, 500, 3, dataset)

#==================================
#          M: 100, N: 500         =
#==================================
myutils.tune_parameters(100, 500, 2, dataset)


print("-----------------------------------")
#==================================
#          M: 25, N: 50           =
#==================================
myutils.tune_parameters(25, 50, 3, dataset)

#==================================
#          M: 25, N: 50           =
#==================================
myutils.tune_parameters(25, 50, 2, dataset)


Decision Tree vs. Random Forest Trials to Tune Parameters
-----------------------------------
Decision Tree: accuracy = 0.5410821643286573 error = 0.4589178356713427
-----------------------------------
M = 10 N = 100 F = 3
0 -- accuracy = 0.25656565656565655 error = 0.7434343434343434
1 -- accuracy = 0.26666666666666666 error = 0.7333333333333333
2 -- accuracy = 0.27070707070707073 error = 0.7292929292929293
3 -- accuracy = 0.23636363636363636 error = 0.7636363636363637
4 -- accuracy = 0.28888888888888886 error = 0.7111111111111111
M = 10 N = 100 F = 2
0 -- accuracy = 0.5191919191919192 error = 0.4808080808080808
1 -- accuracy = 0.5575757575757576 error = 0.44242424242424244
2 -- accuracy = 0.5555555555555556 error = 0.4444444444444444
3 -- accuracy = 0.498989898989899 error = 0.501010101010101
4 -- accuracy = 0.5737373737373738 error = 0.4262626262626263
-----------------------------------
M = 10 N = 500 F = 3
0 -- accuracy = 0.25656565656565655 error = 0.7434343434343434
1 -- accurac

## Observations:
It seems as though an M, high N, and F as 2 yields the best results, so let's try to take that to the extreme. We are going to set M=5, N=1000, and F=2

In [271]:
print("Random Forest Hypothesis Tests")
print("-----------------------------------")
for i in range(5):
    print("Trial", i)
    myutils.tune_parameters(5, 1000, 2, dataset)
    print("-----------------------------------")

Random Forest Hypothesis Tests
-----------------------------------
Trial 0
M = 5 N = 1000 F = 2
0 -- accuracy = 0.5313131313131313 error = 0.4686868686868687
1 -- accuracy = 0.5696969696969697 error = 0.4303030303030303
2 -- accuracy = 0.6121212121212121 error = 0.3878787878787879
3 -- accuracy = 0.6161616161616161 error = 0.3838383838383838
4 -- accuracy = 0.5717171717171717 error = 0.42828282828282827
-----------------------------------
Trial 1
M = 5 N = 1000 F = 2
0 -- accuracy = 0.7191919191919192 error = 0.2808080808080808
1 -- accuracy = 0.6606060606060606 error = 0.3393939393939394
2 -- accuracy = 0.7131313131313132 error = 0.2868686868686869
3 -- accuracy = 0.6747474747474748 error = 0.32525252525252524
4 -- accuracy = 0.7535353535353535 error = 0.24646464646464647
-----------------------------------
Trial 2
M = 5 N = 1000 F = 2
0 -- accuracy = 0.35353535353535354 error = 0.6464646464646465
1 -- accuracy = 0.34545454545454546 error = 0.6545454545454545
2 -- accuracy = 0.3393939

## Results
The trials show mixed signals in terms of F's real impact. It seems there are attributes that fit the data better, and likely one that doesn't have the strongest relationship with classifying a useful review. That being said, the highest percentages have come from the above trials, so we will choose M = 5, N = 1000, and F = 2 for our parameters

In [282]:
adjusted_dataset = myutils.select_random_attributes(2, dataset.data)

X, y = myutils.split_x_y_train(adjusted_dataset)
x_train, x_test, y_train, y_test = myevaluation.train_test_split(X, y, shuffle=True)

remainder = []

for j in range(len(x_train)):
    row = x_train[j]
    row.append(y_train[j])
    remainder.append(row)
    
myRF = MyRandomForestClassifier()
myRF.fit(remainder, 5, 1000)
y_predict_rf = myRF.predict(x_test)

count = 0
for i in range(len(y_predict_rf)):
    binned_predict = myutils.get_useful_bin(y_predict_rf[i])
    binned_test = myutils.get_useful_bin(y_test[i])
    if (binned_predict == binned_test):
        count = count + 1;

accuracy = count / len(y_predict_rf)
error = (len(y_predict_dt) - count) / len(y_predict_rf)

print("Random Forest: accuracy =", accuracy, "error =", error)

# Get the x values to perform the matrix over
x = []
for i in range(10):
    x.append(i + 1)

actual = []
predicted = []
for i in range(len(y_test)):
    actual.append(myutils.get_useful_bin(y_test[i]))
    predicted.append(myutils.get_useful_bin(y_predict_rf[i]))

# Get the matrix from stratified
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Make the table header and calculate the statistics
table_header = ['Useful', 1,2,3,4,5,6,7,8,9,10, 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Random Forest')
myutils.print_tabulate(complete_matrix, table_header)

Random Forest: accuracy = 0.7292929292929293 error = 2.294949494949495

Random Forest
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  320    8    2    0    0    0    0    0    0     0      330              96.97
       2   33   13   16    3    7    0    0    0    0     0       72              18.06
       3    3    6    7    0    1    0    0    0    0     6       23              30.43
       4    1    2    5    5    3    0    0    0    0     1       17              29.41
       5    0    3    3    2   10    1    0    0    0    11       30              33.33
       6    0    1    0    0    2    3    0    0    0     7       13              23.08
       7    0    0    0    0    0    1    0    0    0     1        2               0
       8    0    0    0    0    0    0    0    0    0     4        4               0
       9    0    0    0    0    0    0    0    0    0     1        1               0
      10    0    0    0    0    0    0    0