# Classifiers Jupyter Notebook
## Luke Mason & Karsen Hansen

In [39]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# mypytable.py lives in the mysklearn package; import it and reload it like the other modules
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import os

## Load and clean the data

In [40]:
# Load the trimmed CSV through MyPyTable and clean it before any classifier sees it.
mytable = MyPyTable()
fName = os.path.join("input_data", "trimmed_data.csv")
# `dataset` is whatever load_from_file returns; later cells read dataset.data from it.
dataset = mytable.load_from_file(fName)
dataset.remove_rows_with_missing_values()
# NOTE(review): despite the "rows" naming, these three entries look like column labels,
# and none of them appears in the column_names printed at the end of this cell --
# confirm against myutils.remove_rows_from_data.
rows_to_delete = ['Review Count', 'Attributes', 'Review Length']
myutils.remove_rows_from_data(rows_to_delete, dataset)
# presumably 2 is the index of the 'Friends' column once the entries above are removed
# (it is the third name printed below) -- TODO confirm in myutils.get_friend_count.
myutils.get_friend_count(2, dataset)
# Sanity check: show the remaining header and the first data row.
print(dataset.column_names)
print(dataset.data[0])

['Fans', 'Compliment Plain', 'Friends', 'Useful']
[12.0, 30.0, 37.0, 179.0]


## Stratified k-fold Accuracy Check (Naive Bayes and Decision Tree)

In [41]:
import copy

# ------------------------------------------------------------------
# Build the stratified training / testing sets
# ------------------------------------------------------------------
k = 10
X, y = myutils.split_x_y_train(dataset.data)
# NOTE(review): these four names are rebound a few lines below, so only the side
# effects (if any) of this shuffle=True call survive -- confirm whether
# train_test_split reorders X / y in place before dropping the call.
x_train, x_test, y_train, y_test = myevaluation.train_test_split(X, y, shuffle=True)

# Stratified folds over the full data set, then the train/test values taken from them
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X, y, k)
x_train, y_train, x_test, y_test = myutils.get_values_from_folds(X, y, train_folds, test_folds)

# ------------------------------------------------------------------
# Naive Bayes classifier
# ------------------------------------------------------------------
myNB = MyNaiveBayesClassifier()
myNB.fit(x_train, y_train)
y_predict_nb = myNB.predict(x_test)

# A prediction is a hit when it lands in the same "useful" bin as the true label
total = len(y_predict_nb)
count = sum(
    1
    for i in range(total)
    if myutils.get_useful_bin(y_predict_nb[i]) == myutils.get_useful_bin(y_test[i])
)
accuracy = count / total
error = (total - count) / total

print("Naive Bayes: accuracy =", accuracy, "error =", error)

# ------------------------------------------------------------------
# Decision Tree classifier
# ------------------------------------------------------------------
myDT = MyDecisionTreeClassifier()
# The tree is fitted on deep copies of the training data
myDT_x_train = copy.deepcopy(x_train)
myDT_y_train = copy.deepcopy(y_train)
myDT.fit(myDT_x_train, myDT_y_train)
y_predict_dt = myDT.predict(x_test)

total = len(y_predict_dt)
count = sum(
    1
    for i in range(total)
    if myutils.get_useful_bin(y_predict_dt[i]) == myutils.get_useful_bin(y_test[i])
)
accuracy = count / total
error = (total - count) / total

print("Decision Tree: accuracy =", accuracy, "error =", error)

Naive Bayes: accuracy = 0.8597194388777555 error = 0.1402805611222445
Decision Tree: accuracy = 0.5404141616566466 error = 0.45958583834335337


## Confusion Matrices (Naive Bayes & Decision Tree)

In [42]:
# Matrix labels: the ten "useful" bins, 1 through 10
x = list(range(1, 11))

# Bin the true labels and the Naive Bayes predictions position by position
actual, predicted = [], []
for pos, truth in enumerate(y_test):
    actual.append(myutils.get_useful_bin(truth))
    predicted.append(myutils.get_useful_bin(y_predict_nb[pos]))

# Confusion matrix over the binned labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Header row for the printed table, then the matrix with its statistics columns
table_header = ['Useful'] + list(range(1, 11)) + ['Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Naive Bayes')
myutils.print_tabulate(complete_matrix, table_header)


Naive Bayes
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  167   21    9    4    1    0    0    0    0     0      202              82.67
       2   63  130   14    5    3    0    0    0    0     0      215              60.47
       3   23   18  120    8    4    0    0    0    0     0      173              69.36
       4    8    7   12  108    3    0    0    0    0     0      138              78.26
       5    0    2    3    2  251    0    0    0    0     0      258              97.29
       6    0    0    0    0    0  183    0    0    0     0      183             100
       7    0    0    0    0    0    0   81    0    0     0       81             100
       8    0    0    0    0    0    0    0   64    0     0       64             100
       9    0    0    0    0    0    0    0    0   49     0       49             100
      10    0    0    0    0    0    0    0    0    0   134      134             100


In [43]:
# Labels for the matrix axes: bins 1..10
x = [bin_label for bin_label in range(1, 11)]

# Pair each binned true label with the binned Decision Tree prediction at the same index
binned_pairs = [
    (myutils.get_useful_bin(y_test[idx]), myutils.get_useful_bin(y_predict_dt[idx]))
    for idx in range(len(y_test))
]
actual = [truth for truth, _ in binned_pairs]
predicted = [guess for _, guess in binned_pairs]

# Build the confusion matrix from the binned labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Table header plus the matrix extended with its statistics columns
table_header = ['Useful', *range(1, 11), 'Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Decision Tree')
myutils.print_tabulate(complete_matrix, table_header)


Decision Tree
  Useful    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
       1  157   26    3    5    5    3    0    2    1     0      202              77.72
       2   74   89   16   13   11    9    2    1    0     0      215              41.4
       3   50   37   61    9   11    5    0    0    0     0      173              35.26
       4   39   18   15   52   10    4    0    0    0     0      138              37.68
       5   30   47   28   20  111   12    6    1    2     1      258              43.02
       6   13   20   15   10   15   99    7    2    1     1      183              54.1
       7    7    9    5    1    3    2   50    1    3     0       81              61.73
       8    2    5    3    4    8    5    1   34    1     1       64              53.12
       9    1    2    2    1    4    0    2    0   37     0       49              75.51
      10    0    1    2    0    1    0    3    4    4   119      134              88.81


## Random Forest Parameter Tuning

In [48]:
import copy
import math


def binned_accuracy(predicted, actual):
    """Score predictions after mapping both sides through myutils.get_useful_bin.

    Args:
        predicted: predicted labels, one per test instance.
        actual: true labels, parallel to ``predicted``.

    Returns:
        (accuracy, error): fraction of positions whose bins match, and the
        complementary fraction of mismatches.

    Raises:
        ValueError: if ``predicted`` is empty, since the ratio is undefined.
    """
    total = len(predicted)
    if total == 0:
        raise ValueError("cannot score an empty list of predictions")
    hits = sum(
        1
        for idx in range(total)
        if myutils.get_useful_bin(predicted[idx]) == myutils.get_useful_bin(actual[idx])
    )
    return hits / total, (total - hits) / total


def stratified_train_test(table_rows, n_folds):
    """Split ``table_rows`` into train/test values using stratified k-fold.

    Args:
        table_rows: the table data (rows of attribute values plus the label).
        n_folds: number of folds passed to stratified_kfold_cross_validation.

    Returns:
        (x_train, y_train, x_test, y_test) as returned by
        myutils.get_values_from_folds.
    """
    X, y = myutils.split_x_y_train(table_rows)
    # NOTE(review): the return value was always overwritten in the original cell, so
    # it is discarded here. The call itself is kept because with shuffle=True it may
    # reorder X / y in place -- confirm in myevaluation before removing it.
    myevaluation.train_test_split(X, y, shuffle=True)
    train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(X, y, n_folds)
    return myutils.get_values_from_folds(X, y, train_folds, test_folds)


def run_forest_trials(table_rows, n_folds, m, n, trials=5):
    """Fit and score a MyRandomForestClassifier ``trials`` times for one (M, N) setting.

    Each trial re-splits the data, fits a fresh forest with ``fit(rows, m, n)`` and
    prints the binned accuracy and error.

    Args:
        table_rows: the table data handed to stratified_train_test.
        n_folds: number of stratified folds.
        m: second argument of MyRandomForestClassifier.fit (the "M" setting).
        n: third argument of MyRandomForestClassifier.fit (the "N" setting).
        trials: how many times to repeat the experiment.

    Returns:
        (y_test, y_predict_rf) from the last trial, so a later cell can build a
        confusion matrix; two empty lists when ``trials`` is 0.
    """
    print("M={}, N={}".format(m, n))
    y_test, y_predict_rf = [], []
    for trial in range(trials):
        x_train, y_train, x_test, y_test = stratified_train_test(table_rows, n_folds)

        # Build fresh rows with the label appended. The original appended to the
        # x_train rows themselves; if those list objects are shared with x_test, X
        # or the dataset (not visible from this notebook), that leaks the label into
        # the test rows and lengthens the rows again on every trial.
        remainder = [list(row) + [label] for row, label in zip(x_train, y_train)]

        forest = MyRandomForestClassifier()
        forest.fit(remainder, m, n)

        y_predict_rf = forest.predict(x_test)
        accuracy, error = binned_accuracy(y_predict_rf, y_test)
        print(trial, "-- accuracy =", accuracy, "error =", error)
    return y_test, y_predict_rf


#==================================
#     Decision Tree baseline      =
#==================================
# `k` (number of folds) and `dataset` come from the earlier cells of this notebook.
x_train, y_train, x_test, y_test = stratified_train_test(dataset.data, k)

myDT = MyDecisionTreeClassifier()
# Fit on deep copies of the training data
myDT_x_train = copy.deepcopy(x_train)
myDT_y_train = copy.deepcopy(y_train)
myDT.fit(myDT_x_train, myDT_y_train)

y_predict_dt = myDT.predict(x_test)
accuracy, error = binned_accuracy(y_predict_dt, y_test)

print("Decision Tree: accuracy =", accuracy, "error =", error)


#==================================
#     Random Forest Classifier    =
#==================================
# (M, N) settings to compare, in the order of the original hand-copied sections
FOREST_SETTINGS = [(10, 100), (10, 500), (100, 500), (25, 50)]

for m, n in FOREST_SETTINGS:
    # Rebinding after every setting keeps the latest completed results available to
    # the confusion-matrix cell even if a later, slower setting is interrupted.
    y_test, y_predict_rf = run_forest_trials(dataset.data, k, m, n)



Decision Tree: accuracy = 0.5390781563126252 error = 0.46092184368737477
M=10, N=100
0 -- accuracy = 0.8403473613894455 error = 0.15965263861055445
1 -- accuracy = 0.8403473613894455 error = 0.15965263861055445
2 -- accuracy = 0.8403473613894455 error = 0.15965263861055445
3 -- accuracy = 0.8403473613894455 error = 0.15965263861055445
4 -- accuracy = 0.8403473613894455 error = 0.15965263861055445
M=10, N=500
0 -- accuracy = 0.8403473613894455 error = 0.15965263861055445


KeyboardInterrupt: 

In [6]:
# Needs y_test and y_predict_rf from the random forest tuning cell; the recorded
# NameError shows what happens when this cell runs before that one has produced them.

# Axis labels for the matrix: bins 1 through 10
x = []
x.extend(range(1, 11))

# Bin every true label and the random forest prediction at the same position
actual = []
predicted = []
for position in range(len(y_test)):
    true_bin = myutils.get_useful_bin(y_test[position])
    forest_bin = myutils.get_useful_bin(y_predict_rf[position])
    actual.append(true_bin)
    predicted.append(forest_bin)

# Confusion matrix over the binned labels
matrix = myevaluation.confusion_matrix(actual, predicted, x)

# Printed header, and the matrix extended with its statistics columns
table_header = ['Useful']
table_header += [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
table_header += ['Total', 'Recognition (%)']
complete_matrix = myutils.calc_matrix_stats(matrix, False)

print()
print('Random Forest')
myutils.print_tabulate(complete_matrix, table_header)

NameError: name 'y_predict_rf' is not defined