# Classifier Evaluation
## Classifier 1: Decision Tree (unpruned, \_4444 attributes)

We're evaluating the natural \_4444 format of the classifier: Scoring Margin is not considered, and 4 bins are used for every attribute and for the class label.

In [12]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyDecisionTreeClassifier

import copy
import random
from tabulate import tabulate

# Load the header row and the data rows of the NCAA statistics table.
header, data = myutils.load_from_file("input_data/NCAA_Statistics_24444.csv")

# Now, we can move to create some decision trees. Let's first create trees over the whole dataset, then
# test upon our stratified k-fold splitting method.

# Pull the class labels out first, then remove that column from the feature rows.
class_col = myutils.get_column(data, header, "Win Percentage")
data = myutils.drop_column(data, header, "Win Percentage")
# Scoring Margin is deliberately left out of the features for this classifier variant.
# NOTE(review): `header` is not reassigned after the first drop -- confirm that
# drop_column keeps `header` in sync with `data` (e.g. mutates it in place), otherwise
# this second lookup may resolve to the wrong column position.
data = myutils.drop_column(data, header, "Scoring Margin")
# NOTE(review): `atts` is not read anywhere else in this cell; presumably kept for later cells.
atts = header[1:-1]

# Let's stratify: the folds are built over row *indices* (not the rows themselves),
# using the class labels, and are resolved back into rows inside the evaluation loop.
X_indices = range(len(class_col))
X_train_folds, X_test_folds = myevaluation.stratified_kfold_cross_validation(X_indices, class_col, n_splits=10)

# Running results pooled over every fold: the parallel predicted/actual label
# lists feed the confusion matrix, the counters feed the accuracy figures.
y_preds = []
y_reals = []
correct = 0
total = 0

for fold_index, train_indices in enumerate(X_train_folds):
    test_indices = X_test_folds[fold_index]

    # Resolve this fold's row indices into independent copies of the rows and
    # labels, so nothing the classifier does can touch the shared dataset.
    X_train = [copy.deepcopy(data[train_index]) for train_index in train_indices]
    y_train = [copy.deepcopy(class_col[train_index]) for train_index in train_indices]
    X_test = [copy.deepcopy(data[test_index]) for test_index in test_indices]
    y_test = [copy.deepcopy(class_col[test_index]) for test_index in test_indices]

    # A fresh tree per fold: fit on the training split, predict the held-out split.
    my_dt = MyDecisionTreeClassifier()
    my_dt.fit(X_train, y_train)
    y_pred = my_dt.predict(X_test)

    # Score each prediction against its true label and record the pair.
    for i, predicted in enumerate(y_pred):
        total += 1
        actual = y_test[i]
        if predicted == actual:
            correct += 1

        y_preds.append(copy.deepcopy(predicted))
        y_reals.append(copy.deepcopy(actual))

        
# Report the pooled cross-validation results: overall accuracy / error rate,
# followed by a confusion matrix with per-class totals and recognition rates.
accuracy = correct / total
print("Predictive Accuracy:", round(accuracy, 3))
# Round *after* subtracting; rounding only the accuracy and then subtracting
# can print floating-point noise such as 0.5379999999999999.
print("Error Rate:", round(1 - accuracy, 3))
print()
print("Confusion Matrix:")
print()

labels = ["1", "2", "3", "4"]
conf_matrix = myevaluation.confusion_matrix(y_reals, y_preds, labels)

# Build the display rows separately so conf_matrix keeps only the raw counts.
# Each row is: class label, one count per predicted label, row total, and the
# percentage of that class's instances that were predicted correctly.
# Assumes confusion_matrix returns one row per entry of `labels`, in order.
table_rows = []
for index, counts in enumerate(conf_matrix):
    row_total = sum(counts)
    if row_total == 0:
        # No instances of this class were seen; avoid dividing by zero.
        recognition = 0
    else:
        recognition = round(100 * counts[index] / row_total, 2)
    table_rows.append([labels[index], *counts, row_total, recognition])

# Use a dedicated name so the dataset `header` loaded earlier is not overwritten.
# The first column holds the actual class (the Win Percentage bin), and the
# percent sign is a plain "%" because this string is never %-formatted.
table_header = ["Win Percentage", *labels, "Total", "Recognition (%)"]

print(tabulate(table_rows, headers=table_header, tablefmt="rst", numalign="right"))

Predictive Accuracy: 0.462
Error Rate: 0.538

Confusion Matrix:

  MPG    1    2    3    4    Total    Recognition (%%)
    1   54    8    6    1       69               78.26
    2   35   25   12    9       81               30.86
    3   23   24   26   28      101               25.74
    4    3    9   25   52       89               58.43
