<h1>Various Analyses</h1>

Exploring how changing model parameters affects Decision Trees, Random Forests, and K-Fold Cross Validation

In [36]:
from sklearn.datasets import load_iris # Iris dataset
from sklearn.model_selection import KFold # K-fold Cross Validation
from sklearn import tree # Decision Tree
from sklearn import ensemble # Random forest
from sklearn import metrics # Accuracy scores

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import time

<h2>Helper Functions</h2>

Function for obtaining data splits over k-folds

In [37]:
def getSets(k, data, targets):
    """Split data and targets into k shuffled cross-validation folds.

    Parameters
    ----------
    k : int
        Number of folds.
    data : indexable
        Samples; indexed one row at a time with integer positions.
    targets : indexable
        Labels, indexed with the same positions as ``data``.

    Returns
    -------
    tuple of four lists, each of length k
        ``(trainSets, testSets, trainTargs, testTargs)``; entry ``i`` of each
        list holds the rows / labels of fold ``i`` as a plain Python list.
    """
    # shuffle and random_state must be given by keyword: current scikit-learn
    # releases declare them keyword-only, so KFold(k, True, 1) raises TypeError.
    # The fixed seed keeps the fold assignment identical between runs.
    kFold = KFold(n_splits=k, shuffle=True, random_state=1)

    # Data sets
    trainSets = []
    testSets = []
    # Target sets
    trainTargs = []
    testTargs = []

    for train, test in kFold.split(data):
        # Training values
        trainSets.append([data[i] for i in train])
        trainTargs.append([targets[i] for i in train])

        # Testing values
        testSets.append([data[i] for i in test])
        testTargs.append([targets[i] for i in test])

    return trainSets, testSets, trainTargs, testTargs

Function for printing out accuracy results

In [38]:
def printScores(scores):
    """Print every score in ``scores`` on its own line, to four decimals."""
    template = "Accuracy: {0:0.4f}"
    for value in scores:
        line = template.format(value)
        print(line)

Function for plotting results / scores

In [39]:
def plotScores(scores, xL, yL, t, xValues=None):
    """Draw a line plot of ``scores`` and show it.

    Parameters
    ----------
    scores : sequence of numbers
        Values plotted on the y axis.
    xL, yL : str
        Labels for the x and y axes.
    t : str
        Plot title.
    xValues : sequence of numbers, optional
        x position of each score (for example the fold counts or tree counts
        that produced them). When omitted, the scores are plotted against
        their index 0, 1, ..., len(scores) - 1.
    """
    n = len(scores)
    if xValues is None:
        # One integer position per score. np.linspace(0, n, num=n) would
        # stretch n points over [0, n] and give non-integer positions.
        x = np.arange(n)
    else:
        x = np.asarray(xValues)

    fig, ax = plt.subplots()
    # It's the holidays: green line with red star markers
    ax.plot(x, scores,
            color="forestgreen",
            marker="*",
            markersize=10,
            markeredgecolor="indianred",
            markerfacecolor="indianred")
    ax.set_xlabel(xL)
    ax.set_ylabel(yL)
    ax.set_title(t)
    plt.show()

Function for finding Variance

In [55]:
def getVariance(predictions):
    """Return the mean squared deviation of ``predictions`` from their own mean."""
    centre = np.mean(predictions)
    squaredDeviations = (predictions - centre) ** 2
    return np.mean(squaredDeviations)

Function for finding Bias

In [41]:
def getBias(predictions, target):
    """Return the mean of ``predictions`` minus the mean of ``target``."""
    return np.mean(predictions) - np.mean(target)

Function for finding Mean Square Error

In [42]:
def getMSE(predictions, target):
    """Return the mean squared deviation of ``predictions`` from the mean of ``target``.

    Every prediction is compared with the single value ``np.mean(target)``,
    not with its own element of ``target``.
    """
    offsets = predictions - np.mean(target)
    return np.mean(offsets ** 2)

Loading Data sets

In [91]:
# Iris dataset, loaded with scikit-learn's load_iris helper
iris = load_iris()

In [92]:
# Wine Quality dataset (white wines), read from a semicolon-delimited CSV
df = pd.read_csv('WineQuality/winequality-white.csv', delimiter=';')

# Feature matrix: every column except 'quality'
wines = df.drop(columns='quality').to_numpy()

# Labels: the 'quality' column
targets = df['quality'].to_numpy()

<h1>Decision Trees</h1>

Analysis on how the maximum depth (`max_depth`) of a tree affects model accuracy and execution time

Function to run a Decision Tree with max_depth n over k-folds

In [84]:
def runDecisionTree(k, data, targets, n, random_state=None):
    """Cross-validate a decision tree classifier of max_depth ``n`` over ``k`` folds.

    Parameters
    ----------
    k : int
        Number of cross-validation folds (passed to ``getSets``).
    data, targets : indexable
        Samples and their labels.
    n : int or None
        ``max_depth`` of each tree.
    random_state : int, optional
        Seed forwarded to ``DecisionTreeClassifier``. Leaving it as None keeps
        the estimator unseeded; pass an int to make repeated runs reproducible.

    Returns
    -------
    dict
        Keys "accuracy", "bias", "variance" and "mse", each the mean of the
        corresponding per-fold value.
    """
    trainS, testS, trainT, testT = getSets(k, data, targets)

    accuracy = []
    squareErrors = []
    variances = []
    biases = []

    for xTrain, xTest, yTrain, yTest in zip(trainS, testS, trainT, testT):
        # A fresh tree per fold, so nothing learned carries over between folds
        classTree = tree.DecisionTreeClassifier(max_depth=n, random_state=random_state)
        classTree.fit(xTrain, yTrain)
        predicted = classTree.predict(xTest)

        accuracy.append(metrics.accuracy_score(yTest, predicted))
        variances.append(getVariance(predicted))
        biases.append(getBias(predicted, yTest))
        squareErrors.append(getMSE(predicted, yTest))

    # Mean results across folds (kept in a new name so the `data` argument
    # is not rebound to something of a different kind)
    results = {
        "accuracy": np.mean(accuracy),
        "bias": np.mean(biases),
        "variance": np.mean(variances),
        "mse": np.mean(squareErrors)
    }
    return results

Function to run decision trees for every max_depth in `range(start, end)` (start inclusive, end exclusive), over k-folds

In [85]:
def runTreeDepths(k, start, end, data, targets):
    """Run ``runDecisionTree`` once per max_depth in ``range(start, end)``.

    Returns the list of result dicts in increasing depth order.
    """
    return [runDecisionTree(k, data, targets, depth) for depth in range(start, end)]

<h3>Iris Flowers</h3>

In [95]:
# Decision tree on the Iris data: 5 folds, max_depth = 3
runDecisionTree(5, iris.data, iris.target, 3)

{'accuracy': 0.9199999999999999,
 'bias': -2.2204460492503132e-17,
 'variance': 0.6511111111111111,
 'mse': 0.6542222222222223}

In [105]:
# Decision trees for i = 2..9 (range(2, 10) excludes 10)
# NOTE(review): i is passed as both the fold count k and max_depth, so the
# two vary together — confirm this is intended.
irisTrees = []
for i in range(2, 10):
    irisTrees.append(runDecisionTree(i, iris.data, iris.target, i))

xAxis = "number of k-folds"
yAxis = "mean accuracy"
title = "Mean Accuracy for Multiple K-fold Cross Validations"
# NOTE(review): irisTrees holds result dicts, while plotScores hands its
# scores straight to plot(); presumably the "accuracy" values need to be
# extracted before this call is re-enabled.
# plotScores(irisTrees, xAxis, yAxis, title)

<h3>Wine Quality</h3>

In [101]:
# Decision tree on the wine data: 5 folds, max_depth = 1
runDecisionTree(5, wines, targets, 1)

{'accuracy': 0.43670821954931105,
 'bias': 0.008505555439744761,
 'variance': 0.04907719932682346,
 'mse': 0.10690255438948444}

In [104]:
# Decision trees for i = 2..9 (range(2, 10) excludes 10)
# NOTE(review): i is passed as both the fold count k and max_depth, so the
# two vary together — confirm this is intended.
wineTrees = []
for i in range(2, 10):
    wineTrees.append(runDecisionTree(i, wines, targets, i))
    
xAxis = "number of k-folds"
yAxis = "mean accuracy"
title = "Mean Accuracy for Multiple K-fold Cross Validations"
# NOTE(review): wineTrees holds result dicts, while plotScores hands its
# scores straight to plot(); presumably the "accuracy" values need to be
# extracted before this call is re-enabled.
# plotScores(wineTrees, xAxis, yAxis, title)

<h1>Random Forest</h1>

Analysis on how the number of trees (`n_estimators`) in a forest affects model accuracy and execution time

Function to run a Random Forest over k-folds with n trees

In [112]:
def runRandomForest(k, n, data, targets, d, random_state=None):
    """Cross-validate a random forest of ``n`` trees with max_depth ``d`` over ``k`` folds.

    Parameters
    ----------
    k : int
        Number of cross-validation folds (passed to ``getSets``).
    n : int
        Number of trees (``n_estimators``).
    data, targets : indexable
        Samples and their labels.
    d : int or None
        ``max_depth`` of every tree in the forest.
    random_state : int, optional
        Seed forwarded to ``RandomForestClassifier``. Leaving it as None keeps
        the estimator unseeded; pass an int to make repeated runs reproducible.

    Returns
    -------
    dict
        Keys "accuracy", "bias", "variance" and "mse", each the mean of the
        corresponding per-fold value.
    """
    trainS, testS, trainT, testT = getSets(k, data, targets)

    accuracy = []
    squareErrors = []
    variances = []
    biases = []

    for xTrain, xTest, yTrain, yTest in zip(trainS, testS, trainT, testT):
        # A fresh forest per fold, so nothing learned carries over between folds
        forest = ensemble.RandomForestClassifier(
            n_estimators=n, max_depth=d, random_state=random_state
        )
        forest.fit(xTrain, yTrain)
        predicted = forest.predict(xTest)

        accuracy.append(metrics.accuracy_score(yTest, predicted))
        variances.append(getVariance(predicted))
        biases.append(getBias(predicted, yTest))
        squareErrors.append(getMSE(predicted, yTest))

    # Mean results across folds (kept in a new name so the `data` argument
    # is not rebound to something of a different kind)
    results = {
        "accuracy": np.mean(accuracy),
        "bias": np.mean(biases),
        "variance": np.mean(variances),
        "mse": np.mean(squareErrors)
    }
    return results

Function to run Random Forests of range(start, end) trees with max_depth d over k-folds

In [90]:
def runForestNums(k, start, end, data, targets, d):
    """Run ``runRandomForest`` once per tree count in ``range(start, end)``.

    Every forest uses max_depth ``d``; the result dicts are returned in
    increasing tree-count order.
    """
    return [runRandomForest(k, trees, data, targets, d) for trees in range(start, end)]

<h3>Iris Flowers</h3>

In [113]:
# Random forest on the Iris data: 5 folds, 10 trees, max_depth = 1
runRandomForest(5, 10, iris.data, iris.target, 1)

{'accuracy': 0.8,
 'bias': -0.053333333333333344,
 'variance': 0.5880000000000001,
 'mse': 0.6453333333333333}

In [115]:
# Random forests on 5 folds with 10 to 19 trees (range(10, 20) excludes 20)
# NOTE(review): i is passed as both the tree count and max_depth, so depth
# grows together with the number of trees — confirm this is intended.
irisForestsTree = []
for i in range(10, 20):
    irisForestsTree.append(runRandomForest(5, i, iris.data, iris.target, i))

# The sweep varies the tree count (folds are fixed at 5), so label the axis accordingly
xAxis = "number of trees"
yAxis = "mean accuracy"
title = "Mean Accuracy for Varying Tree Counts"
# NOTE(review): irisForestsTree holds result dicts, while plotScores hands its
# scores straight to plot(); presumably the "accuracy" values need to be
# extracted before this call is re-enabled.
# plotScores(irisForestsTree, xAxis, yAxis, title)

<h3>Wine Quality</h3>

In [118]:
# Random forest on the wine data: 5 folds, 10 trees, max_depth = 1
runRandomForest(5, 10, wines, targets, 1)

{'accuracy': 0.448761334973213,
 'bias': 0.12209084655312577,
 'variance': 0.0,
 'mse': 0.015361395361437805}

In [119]:
# Random forests on 5 folds with 10 to 19 trees (range(10, 20) excludes 20)
# NOTE(review): i is passed as both the tree count and max_depth, so depth
# grows together with the number of trees — confirm this is intended.
wineForestsTree = []
startTime = time.time() # Timer start

for i in range(10, 20):
    wineForestsTree.append(runRandomForest(5, i, wines, targets, i))

totalTime = time.time() - startTime # Timer end; elapsed seconds for the whole sweep

# The sweep varies the tree count (folds are fixed at 5), so label the axis accordingly
xAxis = "number of trees"
yAxis = "mean accuracy"
title = "Mean Accuracy for Varying Tree Counts"
# NOTE(review): wineForestsTree holds result dicts, while plotScores hands its
# scores straight to plot(); presumably the "accuracy" values need to be
# extracted before this call is re-enabled.
# plotScores(wineForestsTree, xAxis, yAxis, title)

<h1>K-Fold Cross Validation</h1>

Analysis on how number of folds (`k`) affects the validation of a model

<h3>Iris Flowers</h3>

In [69]:
# TODO: placeholder output only; this cell runs no k-fold analysis on the Iris data
print('h')

h


<h3>Wine Quality</h3>

In [68]:
# TODO: placeholder output only; this cell runs no k-fold analysis on the wine data
print('hh')

hh
