In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import copy
import warnings
from scipy import stats
from IPython.display import display
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import preprocessing

In [7]:
# Read the wine data from disk, then separate the label column ('Class')
# from the remaining predictor columns.
wine_df = pd.read_csv("wine.csv")
wine_data_df = wine_df.drop(['Class'], axis='columns')  # every column except the label
wine_class_df = wine_df['Class']                        # a single column (Series), despite the _df suffix
display(wine_df.columns)
wine_df.head()

Index(['Class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Mg',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Mg,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [26]:
# A fixed-seed generator yields the same 30 integer seeds on every run, which keeps results reproducible.
# (clf_comparison draws its own seeds from its `seed` argument rather than reading these globals.)
rng = np.random.RandomState(seed=1234)
random_seeds = rng.randint(0, high=99999, size=30)
random_seeds

array([92975, 58067, 34086, 60620, 89460, 82584, 32399, 55985, 41239,
        9449, 23706,  8222, 97963, 33950, 40684,  8060, 73498, 79222,
       79728, 73180, 93509, 93520, 49398, 68094, 36271,  3824, 22267,
       21580, 80165, 77734])

In [191]:
# Scores two classifiers against each other on a shared sequence of random splits
def clf_comparison(clf1, clf1_name, clf2, clf2_name, data_df, class_df, test_size=0.34, n_trials=30, seed=1234):
    """Evaluate two classifiers on identical train/test splits and print their statistics.

    One split seed per trial is drawn from ``np.random.RandomState(seed)``. For each
    seed the data is split once with ``train_test_split`` and both classifiers are
    scored on that same split through ``clf_evaluation`` (clf1 first, then clf2).
    Afterwards ``mean_var_CI_calculator`` is called once per classifier.

    clf1, clf2:           classifier objects handed to ``clf_evaluation``
    clf1_name, clf2_name: labels used in the printed reports
    data_df, class_df:    data and class labels handed to ``train_test_split``
    test_size:            forwarded to ``train_test_split`` and to the reports
    n_trials:             number of splits (and therefore scores per classifier)
    seed:                 seed for the generator that produces the split seeds

    Returns the pair ``(clf1_acc, clf2_acc)``: two lists holding the per-trial
    values returned by ``clf_evaluation``.
    """
    split_seeds = np.random.RandomState(seed).randint(low=0, high=99999, size=n_trials)
    clf1_acc, clf2_acc = [], []
    for split_seed in split_seeds:
        data_train, data_test, class_train, class_test = train_test_split(
            data_df, class_df, test_size=test_size, random_state=split_seed)
        # Both models see exactly the same partition of the data.
        for model, scores in ((clf1, clf1_acc), (clf2, clf2_acc)):
            scores.append(clf_evaluation(model, data_train, data_test, class_train, class_test))
    for label, scores in ((clf1_name, clf1_acc), (clf2_name, clf2_acc)):
        mean_var_CI_calculator(label, pd.Series(scores), test_size)
    return clf1_acc, clf2_acc
                        
# Single-split fit -> predict -> score routine.
# classifier:   any object exposing fit(data, labels) and predict(data)
# data/class_train/test: array-like containers holding corresponding data and class labels
def clf_evaluation(classifier, data_train, data_test, class_train, class_test):
    """Fit `classifier` on the training pair and return its accuracy on the test pair.

    The classifier passed in is fitted in place (its own ``fit`` method is called),
    then its predictions for ``data_test`` are compared with ``class_test`` using
    ``accuracy_score``, whose result is returned.
    """
    classifier.fit(data_train, class_train)
    predicted = classifier.predict(data_test)
    return accuracy_score(class_test, predicted)

# This calculates the mean accuracy, variance, and confidence interval and prints it
def mean_var_CI_calculator(clf_name, clf_acc, test_size):
    """Print the sample size, variance, mean and a 95% confidence interval of `clf_acc`.

    clf_name:  label for the classifier, concatenated into the report header (a str)
    clf_acc:   container of accuracy scores exposing .mean(), .var() and .std()
               (the caller in this notebook passes a pandas Series)
    test_size: hold-out size the scores were produced with; only used in the header.
               A float strictly between 0 and 1 is reported as a percentage
               (0.34 -> "34% holdout"); any other value is reported as a number
               of held-out samples.

    Nothing is returned; the report is written with print().
    """
    sample_mean = clf_acc.mean()
    sample_var = clf_acc.var()
    sample_std = clf_acc.std()
    sample_size = len(clf_acc)
    # Two-sided 95% interval using the normal critical value: mean +/- z * s / sqrt(n)
    z_crit = stats.norm.ppf(q = 0.975)
    err_margin = z_crit * sample_std / (sample_size ** 0.5)
    # The header used to print the raw fraction followed by '%' ("0.34% holdout");
    # scale fractions to a real percentage and label anything else as a sample count.
    if isinstance(test_size, float) and 0.0 < test_size < 1.0:
        holdout_desc = "{:g}% holdout".format(test_size * 100)
    else:
        holdout_desc = "{} samples held out".format(test_size)
    print("Accuracy statistics For classifier: " + clf_name + " with "
          + holdout_desc + " for testing")
    print("""    Sample Size:{:5d}
    Variance:{:>8.5f}
    Mean:{:>12.5f} +/- {:0.5f}
    95% CI =   [{:0.4f}, {:0.4f}] \n"""
          .format(sample_size,
                  sample_var,
                  sample_mean,
                  err_margin,
                  sample_mean-err_margin,
                  sample_mean+err_margin))

In [192]:
# Decision tree built with the hyperparameters that a GridSearch
#   over 10 models selected for assignment 1 (fixed random_state)
tree_clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=1234,
)

# Gaussian naive Bayes constructed with no arguments (library defaults)
nb_gauss_clf = naive_bayes.GaussianNB()

In [193]:
# Score both classifiers on the same splits, relying on clf_comparison's defaults
# (test_size=0.34, n_trials=30, seed=1234); keeps the per-trial accuracy lists.
tree_acc, nb_gauss_acc = clf_comparison(
    clf1=tree_clf, clf1_name="tree_clf",
    clf2=nb_gauss_clf, clf2_name="nb_gauss_clf",
    data_df=wine_data_df, class_df=wine_class_df,
)

Accuracy statistics For classifier: tree_clf with 0.34% holdout for testing
    Sample Size:   30
    Variance: 0.00092
    Mean:     0.91421 +/- 0.01086
    95% CI =   [0.9033, 0.9251] 

Accuracy statistics For classifier: nb_gauss_clf with 0.34% holdout for testing
    Sample Size:   30
    Variance: 0.00023
    Mean:     0.96885 +/- 0.00541
    95% CI =   [0.9634, 0.9743] 



The functions above used the same training and testing subsets to build a model for each classifier and test the accuracy of both models. This process was repeated 30 times, and the model accuracies were recorded in a separate list for each classifier. Then I calculated the above descriptive statistics for both lists of accuracy values.

To determine whether there's a statistically significant difference between the accuracies of these models, we'll perform a hypothesis test. As we have only 30 pairs (which is considered the lower bound for performing large-sample hypothesis tests), we'll perform a small-sample hypothesis test using Student's t-statistic.

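The small-sample test statistic computed below is the pooled-variance two-sample t statistic, given by the equation below. (Note: this form treats the two sets of accuracies as independent samples; it is not the paired t-test, which would instead be computed from the 30 per-split accuracy differences.)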

$$t=\frac{\big(\bar{x}_1 - \bar{x}_2\big)-D_0}{\sqrt{s_p^2\big(\frac{1}{n_1} + \frac{1}{n_2}\big)}}$$

where the pooled sample variance, $s_p^2$, is given by

$$s_p^2 = \frac{\big(n_1-1\big)s_1^2 + \big(n_2 -1\big)s_2^2}{n_1+n_2-2}$$

For our classifier accuracy data: 
* sample size $=n_1=n_2=30$ samples 
* tree_clf sample mean:         $\bar{x}_1 = 0.91421$ 
* tree_clf sample variance:     $s_1^2 = 0.00092$
* nb_gauss_clf sample mean:     $\bar{x}_2 = 0.96885$
* nb_gauss_clf sample variance: $s_2^2 = 0.00023$
* pooled pop. variance est:     $s_p^2 = \frac{0.00092\big(29\big)+0.00023\big(29\big)}{58} = 0.000575$

The null hypothesis ($H_0$) is that the mean accuracy of the two models is the same (i.e. $H_0:\mu_1 - \mu_2 = 0$), and the alternative hypothesis is that the accuracies differ (i.e. $H_a:\mu_1 - \mu_2 \neq 0$).

$$t=\frac{\big(0.91421 - 0.96885\big)-0}{\sqrt{0.000575\big(\frac{1}{30} + \frac{1}{30}\big)}}$$

This value is computed by hand in the next cell and is also given by the `scipy.stats.ttest_ind` function in the cell after it.

In [199]:
# Hand evaluation of the pooled-variance t statistic from the rounded summary
# values printed for the two classifiers (mean and variance, 30 trials each).
n_per_clf = 30
tree_mean, tree_var = 0.91421, 0.00092
nb_mean, nb_var = 0.96885, 0.00023

# Pooled variance: ((n1-1)*s1^2 + (n2-1)*s2^2) / (n1+n2-2)
sp_sq = ((tree_var*(n_per_clf-1))+(nb_var*(n_per_clf-1)))/(2*n_per_clf-2)
# Use the computed sp_sq instead of re-typing 0.000575, so the two cannot drift apart.
# NOTE: the name `paired_t` is kept for compatibility, but this is the pooled
# two-sample statistic (independent samples), not a paired-difference statistic.
paired_t = (tree_mean-nb_mean-0)/((sp_sq*(2/n_per_clf))**0.5)
paired_t

-8.825156138537452

In [200]:
# Cross-check of the hand-computed pooled statistic: ttest_ind treats the two
# accuracy lists as independent samples.
pooled_result = stats.ttest_ind(tree_acc, nb_gauss_acc)

# Both lists come from the same sequence of train/test splits (clf_comparison scores
# the two classifiers on each split), so the observations are paired. ttest_rel
# tests the per-split differences, which is the paired t-test for this design.
paired_result = stats.ttest_rel(tree_acc, nb_gauss_acc)
display(paired_result)

pooled_result

Ttest_indResult(statistic=-8.8269530081970586, pvalue=2.5620615813813117e-12)

In [128]:
# NOTE(review): leftover interactive lookup of the built-in `format` documentation; nothing
# else in the visible notebook uses its output, so this cell looks safe to delete — confirm.
help(format)

Help on built-in function format in module builtins:

format(value, format_spec='', /)
    Return value.__format__(format_spec)
    
    format_spec defaults to the empty string

