# Decision Tree Classification
## CS/DSA 5970

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, export_graphviz

import pickle as pkl

##################
# Default parameters
FIGURESIZE = (10, 6)
FONTSIZE = 18

# Apply the defaults globally so every figure in the notebook shares them.
# Tick labels are tied to FONTSIZE (rather than a separate literal) so that
# changing the constant above keeps all text on a figure the same size.
plt.rcParams.update({
    'figure.figsize': FIGURESIZE,
    'font.size': FONTSIZE,
    'xtick.labelsize': FONTSIZE,
    'ytick.labelsize': FONTSIZE,
})

In [None]:
def scatter_plot(ins, pred):
    """Scatter the first two input columns, coloured by predicted label.

    Rows whose prediction equals 1 are drawn as red dots ('Positive') and rows
    whose prediction equals 0 as green dots ('Negative'). Rows with any other
    prediction value are not drawn.

    :param ins: 2-D array-like of inputs; only columns 0 and 1 are plotted.
    :param pred: 1-D array-like of predicted labels, one per row of ``ins``.
    """
    # Coerce to arrays: with a plain list, ``pred == 1`` compares the whole
    # list to 1 (always False) and ``ins[idx, 0]`` is not valid indexing.
    ins = np.asarray(ins)
    pred = np.asarray(pred)

    elems_true = np.where(pred == 1)[0]
    elems_false = np.where(pred == 0)[0]

    fig, ax = plt.subplots(figsize=FIGURESIZE)
    # Labels are attached to the artists so the legend cannot drift out of
    # step with the plotting order.
    ax.plot(ins[elems_true, 0], ins[elems_true, 1], 'r.', label='Positive')
    ax.plot(ins[elems_false, 0], ins[elems_false, 1], 'g.', label='Negative')
    fig.legend(fontsize=FONTSIZE)

    # Label the axes object that was just created instead of relying on
    # pyplot's implicit "current axes".
    ax.set_xlabel('x[0]', fontsize=FONTSIZE)
    ax.set_ylabel('x[1]', fontsize=FONTSIZE)

In [None]:
def plot_probs(outs, proba):
    """Report and plot binary-classification quality from class scores.

    Prints the confusion matrix (at a 0.5 threshold), the log loss and the
    area under the ROC curve, and draws two figures: TPR / FPR / (TPR - FPR)
    as a function of the decision threshold, and the ROC curve itself.

    Column 0 of ``proba`` is used as the positive-class score throughout.

    :param outs: 1-D array-like of true binary labels.
    :param proba: 2-D array-like of per-class scores, one row per sample,
        with the positive-class probability in column 0.
    """
    # Single source of truth for the positive-class score, so that the
    # thresholded prediction, the log loss and the ROC curve all agree.
    scores = np.asarray(proba)[:, 0]

    pred = scores >= 0.5
    confusion = confusion_matrix(outs, pred)
    print("Confusion:", confusion)

    # Evaluate
    # Pass the 1-D positive-class score: handing log_loss the full two-column
    # matrix would make it read column 1 as the positive class, which
    # contradicts the column-0 convention used everywhere else here.
    print("log loss: ", log_loss(outs, scores))

    # TPR/FPR plot
    fpr, tpr, thresholds = roc_curve(outs, scores)
    fig, ax = plt.subplots(figsize=FIGURESIZE)
    ax.plot(thresholds, tpr, color='b')
    ax.plot(thresholds, fpr, color='r')
    ax.plot(thresholds, tpr - fpr, color='g')
    # Thresholds come back in decreasing order; flip the axis so the curves
    # read left-to-right from strict to permissive.
    ax.invert_xaxis()
    ax.set_xlabel('threshold', fontsize=FONTSIZE)
    ax.set_ylabel('fraction', fontsize=FONTSIZE)
    ax.legend(['TPR', 'FPR', 'distance'], fontsize=FONTSIZE)

    # ROC plot
    fig, ax = plt.subplots(figsize=FIGURESIZE)
    ax.plot(fpr, tpr, color='b')
    # Diagonal reference line (TPR == FPR).
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlabel('FPR', fontsize=FONTSIZE)
    ax.set_ylabel('TPR', fontsize=FONTSIZE)
    ax.set_aspect('equal', 'box')
    print("AUC:", auc(fpr, tpr))

## Load data

In [None]:
fname = '../ml_practices/imports/datasets/misc/classification_data.pkl'

# The file holds two consecutively pickled objects: the inputs, then the
# outputs. The context manager guarantees the handle is closed even if one of
# the loads raises.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(fname, 'rb') as fp:
    ins = pkl.load(fp)
    outs = pkl.load(fp)

## Decision Tree Classifier

In [None]:
# Create the model evaluated in this section. It was referenced below without
# ever being defined, which raises NameError. scikit-learn's default
# hyper-parameters are used; adjust them here (e.g. max_depth) as needed.
classifier = DecisionTreeClassifier()

# Out-of-fold predictions from 10-fold cross-validation: every sample is
# predicted by a model that did not see it during training.
pred = cross_val_predict(classifier, ins, outs, cv=10)
confusion = confusion_matrix(outs, pred)
# Bare expression so the notebook displays the matrix as the cell output.
confusion

In [None]:
# Out-of-fold class scores from 10-fold cross-validation, complemented
# element-wise (each entry p becomes 1 - p).
# NOTE(review): for a two-class problem whose rows sum to 1 this amounts to
# swapping the two columns -- confirm that is the intent.
proba = 1 - cross_val_predict(
    classifier,
    ins,
    outs,
    cv=10,
    method='predict_proba',
)

In [None]:
# Train on the complete data set and predict the same samples back, so the
# plot shows the fitted model's labelling of its own training data.
pred = classifier.fit(ins, outs).predict(ins)
scatter_plot(ins, pred)

# Write the fitted tree in Graphviz DOT format to 'model.dot'.
export_graphviz(
    classifier,
    out_file='model.dot',
    rounded=True,
    filled=True,
)