# Classifier Evaluation
## Classifier 1: Decision Tree (unpruned, \_4444 attributes)

We're evaluating the natural, \_4444 format of the classifier, meaning without considering Scoring Margin and using 4 bins for all the attributes and for the classifier.

In [27]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyKNeighborsClassifier

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import copy
import random
from tabulate import tabulate

header, data = myutils.load_from_file("input_data/NCAA_Statistics_24444.csv")

# Now, we can move to create some decision trees. Let's first create trees over the whole dataset, then
# test upon our stratisfied k-fold splitting method.

class_col = myutils.get_column(data, header, "Win Percentage")
data = myutils.drop_column(data, header, "Win Percentage")
data = myutils.drop_column(data, header, "Scoring Margin")
atts = header[1:-1]

# Let's stratisfy
X_indices = range(len(class_col))
X_train_folds, X_test_folds = myevaluation.stratified_kfold_cross_validation(X_indices, class_col, n_splits=10)

y_preds = []
y_reals = []
correct = 0
total = 0
    
for fold_index in range(len(X_train_folds)):
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    
    for train_index in X_train_folds[fold_index]:
        X_train.append(copy.deepcopy(data[train_index]))
        y_train.append(copy.deepcopy(class_col[train_index]))
        
    for test_index in X_test_folds[fold_index]:
        X_test.append(copy.deepcopy(data[test_index]))
        y_test.append(copy.deepcopy(class_col[test_index]))
        
    # Get a classifier in here...
    my_dt = MyDecisionTreeClassifier()

    # Fitting...
    my_dt.fit(X_train, y_train)
    # ... and predicting!
    y_pred = my_dt.predict(X_test)
    
    # Counting and recording...
    for i in range(len(y_pred)):
        total += 1
        if y_pred[i] == y_test[i]:
            correct += 1
            
        y_preds.append(copy.deepcopy(y_pred[i]))
        y_reals.append(copy.deepcopy(y_test[i]))
        
print("Predictive Accuracy:", str(round(correct / total, 3)))
print("Error Rate:", str(1 - round(correct / total, 3)))
print()
print("Confusion Matrix:")
print()

labels = ["1", "2", "3", "4"]
conf_matrix = myevaluation.confusion_matrix(y_reals, y_preds, labels)

for index in range(len(conf_matrix)):
    conf_matrix[index].append(sum(conf_matrix[index]))
    if conf_matrix[index][-1] == 0:
        conf_matrix[index].append(0)
    else:
        conf_matrix[index].append(round(100 * conf_matrix[index][index] / conf_matrix[index][-1], 2))
    conf_matrix[index] = [index+1] + conf_matrix[index]
    
header = ["Win% Tier"]
for index in labels:
    header.append(index)
header.append("Total")
header.append("Recognition (%%)")

print(tabulate(conf_matrix, headers=header, tablefmt="rst", numalign="right"))

Predictive Accuracy: 0.562
Error Rate: 0.43799999999999994

Confusion Matrix:

  Win% Tier    1    2    3    4    Total    Recognition (%%)
          1    6   25    3    0       34               17.65
          2    8   60   46    2      116               51.72
          3    0   33  120    4      157               76.43
          4    0    3   25    5       33               15.15


## Classifier 2: K Nearest Neighbors

We're evaluating the natural, \_4444 format of the classifier, meaning without considering Scoring Margin and using 4 bins for all the attributes and for the classifier.

In [28]:
importlib.reload(myutils)

ncaa_path = os.path.join("input_data","NCAA_Statistics_24444.csv")
ncaa_data = MyPyTable().load_from_file(ncaa_path)
win_percentage = ncaa_data.get_column("Win Percentage")
scoring_margin = ncaa_data.get_column("Scoring Margin")
efg = ncaa_data.get_column("eFG%")
spg_bpg = ncaa_data.get_column("SPG+BPG")
Rebound_margin = ncaa_data.get_column("Rebound Margin")

X_indices = range(len(win_percentage))
X_train_folds, X_test_folds = myevaluation.stratified_kfold_cross_validation(X_indices,win_percentage,n_splits=10)


knn = MyKNeighborsClassifier(n_neighbors=10)
knn_predictions = []
knn_actual = []

for i in range(len(X_train_folds)):
    y_train_1 = []
    X_train_1 = []
    y_test_1 = []
    X_test_1 = []
    for index in X_train_folds[i]:
        X_train_1.append([scoring_margin[index],efg[index],spg_bpg[index],Rebound_margin[index]])
        y_train_1.append(win_percentage[index])
    for index in X_test_folds[i]:
        X_test_1.append([scoring_margin[index],efg[index],spg_bpg[index],Rebound_margin[index]])
        y_test_1.append(win_percentage[index])
    knn.fit(X_train_1,y_train_1)
    knn_predictions.append(knn.predict(X_test_1))
    knn_actual.append(y_test_1)

knn_predictions_1d_1 = []
for i in knn_predictions:
    for k in i:
        knn_predictions_1d_1.append(k)
knn_actual_1d_1 = []
for i in knn_actual:
    for j in i:
        knn_actual_1d_1.append(j)


knn_total_correct = 0
knn_total_predictions = len(knn_actual_1d_1)
for i in range(len(knn_predictions_1d_1)):
    if knn_predictions_1d_1[i] == knn_actual_1d_1[i]:
        knn_total_correct += 1

knn_accuracy = knn_total_correct /knn_total_predictions
knn_error_rate = 1 - knn_accuracy 

print()
print("KNN: " +"accuracy = "+str(knn_accuracy)+ ", error rate = " + str(knn_error_rate))

column_names = [1,2,3,4]

knn_matrix = myevaluation.confusion_matrix(knn_actual_1d_1,knn_predictions_1d_1,column_names)
sum_matrix_1 = []
for i in knn_matrix:
    sum_matrix_1.append(sum(i))

recognition_1 = []
for i in range(len(knn_matrix)):
    if sum_matrix_1[i] > 0:
        recognition_1.append((knn_matrix[i][i] / sum_matrix_1[i]) * 100)
    else:
        recognition_1.append(0)

for i in range(len(knn_matrix)):
    knn_matrix[i].append(sum_matrix_1[i])
for i in range(len(knn_matrix)):
    knn_matrix[i].insert(0,column_names[i])
for i in range(len(knn_matrix)):
    knn_matrix[i].append(recognition_1[i])

column_names_2= ["Win% Tier",1,2,3,4,"total","Recognition (%)"]
print()
print(tabulate(knn_matrix,column_names_2))
print()







KNN: accuracy = 0.5294117647058824, error rate = 0.47058823529411764

  Win% Tier    1    2    3    4    total    Recognition (%)
-----------  ---  ---  ---  ---  -------  -----------------
          1   24    9    1    0       34           70.5882
          2   28   69   19    0      116           59.4828
          3    2   68   84    3      157           53.5032
          4    0    8   22    3       33            9.09091

