In [78]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# reload MyPyTable from the mysklearn package (requires mypytable.py to be present there)
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import os

## Load and clean the data

In [79]:
# Load input_data/trimmed_data.csv into a MyPyTable; `dataset` is whatever
# load_from_file returns and is the object used by every later step.
mytable = MyPyTable()
fName = os.path.join("input_data", "trimmed_data.csv")
dataset = mytable.load_from_file(fName)
# The return value is ignored, so this is assumed to modify `dataset` in
# place — TODO confirm against MyPyTable.
dataset.remove_rows_with_missing_values()
# NOTE(review): despite the variable/helper names, these look like column
# names; the header printed below no longer lists them. Confirm the helper
# removes columns rather than rows.
rows_to_delete = ['Review Count', 'Attributes', 'Review Length']
myutils.remove_rows_from_data(rows_to_delete, dataset)
# NOTE(review): presumably 2 is the index of the 'Friends' column after the
# removal above — confirm against myutils.get_friend_count.
myutils.get_friend_count(2, dataset)
# Sanity check: show the remaining column names and the first data row.
print(dataset.column_names)
print(dataset.data[0])

['Fans', 'Compliment Plain', 'Friends', 'Useful']
[12.0, 30.0, 37.0, 179.0]


## Stratified k-fold Accuracy Check

In [84]:
import copy


def _binned_accuracy(y_predicted, y_actual):
    """Return ``(accuracy, error)`` after binning both label sequences.

    Every predicted label and the actual label at the same position are
    passed through ``myutils.get_useful_bin``; equal bins count as a hit.

    Args:
        y_predicted: predicted labels, one per test instance.
        y_actual: true labels, indexed in parallel with ``y_predicted``.

    Returns:
        Tuple ``(hits / n, (n - hits) / n)`` where ``n = len(y_predicted)``.

    Raises:
        ZeroDivisionError: if ``y_predicted`` is empty.
        IndexError: if ``y_actual`` is shorter than ``y_predicted``.
    """
    total = len(y_predicted)
    hits = 0
    for idx, predicted in enumerate(y_predicted):
        if myutils.get_useful_bin(predicted) == myutils.get_useful_bin(y_actual[idx]):
            hits += 1
    return hits / total, (total - hits) / total


#==================================
#      Get stratified training    =
#      and testing sets           =
#==================================
k = 10
X, y = myutils.split_x_y_train(dataset.data)
# NOTE(review): the four values returned here are overwritten a few lines
# below by get_values_from_folds. The call is kept only in case
# train_test_split shuffles X / y in place — confirm, then drop it if not.
x_train, x_test, y_train, y_test = myevaluation.train_test_split(X, y, shuffle=True)

# Get the training and testing folds
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X, y, k)

# Get the training and testing sets from the folds
x_train, y_train, x_test, y_test = myutils.get_values_from_folds(X, y, train_folds, test_folds)


#==================================
#      Naive Bayes Classifier     =
#==================================
myNB = MyNaiveBayesClassifier()
myNB.fit(x_train, y_train)

# Compare predicted with actual, then report accuracy and error
y_predict_nb = myNB.predict(x_test)
accuracy, error = _binned_accuracy(y_predict_nb, y_test)

print("Naive Bayes: accuracy =", accuracy, "error =", error)

#==================================
#     Decision Tree Classifier    =
#==================================
myDT = MyDecisionTreeClassifier()
# Fit on deep copies so the shared training data cannot be altered by fit()
myDT_x_train = copy.deepcopy(x_train)
myDT_y_train = copy.deepcopy(y_train)
myDT.fit(myDT_x_train, myDT_y_train)

y_predict_dt = myDT.predict(x_test)
accuracy, error = _binned_accuracy(y_predict_dt, y_test)

print("Decision Tree: accuracy =", accuracy, "error =", error)

#==================================
#     Random Forest Classifier    =
#==================================
# Build "features + label" rows as NEW lists. Appending to the x_train rows
# themselves would leave the class label inside x_train for any later use,
# and — if get_values_from_folds hands out shared row objects (not visible
# here) — inside x_test / X as well.
remainder = [list(row) + [label] for row, label in zip(x_train, y_train)]

myRF = MyRandomForestClassifier()
# NOTE(review): the meaning of the positional arguments 10 and 100 is defined
# by MyRandomForestClassifier.fit — consider naming them as constants.
myRF.fit(remainder, 10, 100)

y_predict_rf = myRF.predict(x_test)
accuracy, error = _binned_accuracy(y_predict_rf, y_test)
print("Random Forest: accuracy =", accuracy, "error =", error)

Naive Bayes: accuracy = 0.8597194388777555 error = 0.1402805611222445
Decision Tree: accuracy = 0.1683366733466934 error = 0.8316633266533067
Random Forest: accuracy = 0.14963259853039412 error = 0.8503674014696059


## Confusion Matrices

In [7]:
# Bin labels 1..10 that the confusion matrix is computed over
x = list(range(1, 11))

# Bin each true label and the Naive Bayes prediction at the same position
actual = [myutils.get_useful_bin(y_test[idx]) for idx in range(len(y_test))]
predicted = [myutils.get_useful_bin(y_predict_nb[idx]) for idx in range(len(y_test))]

# Confusion matrix of binned actual vs. binned predicted labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Header row: label column, one column per bin, then the two trailing columns
table_header = ['Useful', *range(1, 11), 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Naive Bayes')
myutils.print_tabulate(complete_matrix, table_header)


Naive Bayes
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  167   21    9    4    1    0    0    0    0     0      202              82.67
       2   63  130   14    5    3    0    0    0    0     0      215              60.47
       3   23   18  120    8    4    0    0    0    0     0      173              69.36
       4    8    7   12  108    3    0    0    0    0     0      138              78.26
       5    0    2    3    2  251    0    0    0    0     0      258              97.29
       6    0    0    0    0    0  183    0    0    0     0      183             100
       7    0    0    0    0    0    0   81    0    0     0       81             100
       8    0    0    0    0    0    0    0   64    0     0       64             100
       9    0    0    0    0    0    0    0    0   49     0       49             100
      10    0    0    0    0    0    0    0    0    0   135      135             100


In [8]:
# Bin labels 1..10 that the confusion matrix is computed over
x = list(range(1, 11))

# Bin each true label and the Decision Tree prediction at the same position
actual = [myutils.get_useful_bin(y_test[idx]) for idx in range(len(y_test))]
predicted = [myutils.get_useful_bin(y_predict_dt[idx]) for idx in range(len(y_test))]

# Confusion matrix of binned actual vs. binned predicted labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Header row: label column, one column per bin, then the two trailing columns
table_header = ['Useful', *range(1, 11), 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Decision Tree')
myutils.print_tabulate(complete_matrix, table_header)


Decision Tree
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  165   23    3    3    4    3    0    1    0     0      202              81.68
       2  150   51    1    3    0    7    2    1    0     0      215              23.72
       3  131   35    5    2    0    0    0    0    0     0      173               2.89
       4  107   23    3    5    0    0    0    0    0     0      138               3.62
       5  134   94   15   13    1    1    0    0    0     0      258               0.39
       6   24   67   37   39    9    7    0    0    0     0      183               3.83
       7   17   27    6   17   10    4    0    0    0     0       81               0
       8   16   12    9   12   10    5    0    0    0     0       64               0
       9   16    5    1    7   17    2    1    0    0     0       49               0
      10   12   21   12    5   30   23    4    2    3    23      135              17.04


In [85]:
# Bin labels 1..10 that the confusion matrix is computed over
x = list(range(1, 11))

# Bin each true label and the Random Forest prediction at the same position
actual = [myutils.get_useful_bin(y_test[idx]) for idx in range(len(y_test))]
predicted = [myutils.get_useful_bin(y_predict_rf[idx]) for idx in range(len(y_test))]

# Confusion matrix of binned actual vs. binned predicted labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Header row: label column, one column per bin, then the two trailing columns
table_header = ['Useful', *range(1, 11), 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Random Forest')
myutils.print_tabulate(complete_matrix, table_header)


Random Forest
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  202    0    0    0    0    0    0    0    0     0      202             100
       2  215    0    0    0    0    0    0    0    0     0      215               0
       3  173    0    0    0    0    0    0    0    0     0      173               0
       4  138    0    0    0    0    0    0    0    0     0      138               0
       5  224   34    0    0    0    0    0    0    0     0      258               0
       6   62  110   11    0    0    0    0    0    0     0      183               0
       7    2   61   18    0    0    0    0    0    0     0       81               0
       8    0   24   34    5    1    0    0    0    0     0       64               0
       9    0    0   30   17    1    1    0    0    0     0       49               0
      10    0    1    5   47   51    7    0    1    0    22      134              16.42
