In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import copy
import warnings
from scipy import stats
from scipy.stats import t
from IPython.display import display
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import preprocessing

In [2]:
# Read the wine data set, then separate the predictors from the
# target feature ('Class') so they can be passed to the classifiers independently
wine_df = pd.read_csv("wine.csv")
wine_data_df = wine_df.drop(columns='Class')
wine_class_df = wine_df['Class']
display(wine_df.columns)
wine_df.head(5)

Index(['Class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Mg',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Mg,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Draw 30 integer seeds from a fixed-seed generator so that every
# train/test split made from them is reproducible across runs
rng = np.random.RandomState(seed=1234)
random_seeds = rng.randint(0, 99999, size=30)
random_seeds

array([92975, 58067, 34086, 60620, 89460, 82584, 32399, 55985, 41239,
        9449, 23706,  8222, 97963, 33950, 40684,  8060, 73498, 79222,
       79728, 73180, 93509, 93520, 49398, 68094, 36271,  3824, 22267,
       21580, 80165, 77734])

In [4]:
# Margin of error for a 95% confidence interval on the mean tree accuracy,
# using Student's t distribution.
# NOTE: `tree_acc` is produced by the clf_comparison(...) call further down the
# notebook; this cell raises NameError unless that cell has been run first.
tmp = pd.Series(tree_acc)
# The second argument of t.ppf is the degrees of freedom (n - 1), not the data itself
t_crit = stats.t.ppf(0.975, df=len(tmp) - 1)
err_margin = t_crit * tmp.std() / (len(tmp) ** 0.5)
err_margin

NameError: name 'tree_acc' is not defined

In [15]:
# This function compares 2 classifiers
def clf_comparison(clf1, clf1_name, clf2, clf2_name, data_df, class_df, test_size=0.34, n_trials=30, seed=1234):
    """Compare two classifiers over repeated random train/test splits.

    For each of `n_trials` splits (each seeded from a generator seeded with
    `seed`), both classifiers are fit on the same training partition and
    evaluated on the same test partition, so the per-trial results are paired.
    Summary statistics for each classifier, and for the absolute per-trial
    difference between them, are printed via mean_var_CI_calculator.

    Parameters:
        clf1, clf2:           classifier objects passed to clf_evaluation and
                              resub_and_gen_errors
        clf1_name, clf2_name: labels used in the printed summaries
        data_df, class_df:    feature data and class labels handed to train_test_split
        test_size:            forwarded to train_test_split and to the printed summaries
        n_trials:             number of random splits to evaluate
        seed:                 seed for the generator that produces the per-trial split seeds

    Returns:
        (clf1_acc, clf2_acc, clf1_resub_acc, clf2_resub_acc, clf1_gen_acc, clf2_gen_acc),
        six lists that each hold one value per trial.
    """
    rng = np.random.RandomState(seed)
    random_seeds = rng.randint(low=0, high=99999, size=n_trials)
    clf1_acc = [];       clf2_acc = []
    clf1_resub_acc = []; clf2_resub_acc = []
    clf1_gen_acc = [];   clf2_gen_acc = []
    for i in range(n_trials):
        data_train, data_test, class_train, class_test = train_test_split(
            data_df, class_df, test_size=test_size, random_state=random_seeds[i])
        clf1_acc.append(clf_evaluation(clf1, data_train, data_test, class_train, class_test))
        clf2_acc.append(clf_evaluation(clf2, data_train, data_test, class_train, class_test))
        # Resubstitution/generalization scores must be collected inside the loop so that
        # every split contributes a value, not only the final one.
        clf1_resub, clf1_gen = resub_and_gen_errors(clf1, data_train, data_test, class_train, class_test)
        clf2_resub, clf2_gen = resub_and_gen_errors(clf2, data_train, data_test, class_train, class_test)
        clf1_resub_acc.append(clf1_resub)
        clf2_resub_acc.append(clf2_resub)
        clf1_gen_acc.append(clf1_gen)
        clf2_gen_acc.append(clf2_gen)
    mean_var_CI_calculator(clf1_name, pd.Series(clf1_acc), test_size)
    mean_var_CI_calculator(clf2_name, pd.Series(clf2_acc), test_size)
    # The difference summary uses the absolute per-trial differences and reports the
    # same holdout fraction that was actually used for the splits.
    mean_var_CI_calculator("Difference of " + clf1_name + " and " + clf2_name,
                           (pd.Series(clf1_acc)-pd.Series(clf2_acc)).abs(), test_size)
    return clf1_acc, clf2_acc, clf1_resub_acc, clf2_resub_acc, clf1_gen_acc, clf2_gen_acc
                        
# Runs one fit -> predict -> score pass on a single train/test split.
# classifier:   a model exposing fit(X, y) and predict(X)
# data/class_train/test: array-like containers holding corresponding data and class labels
def clf_evaluation(classifier, data_train, data_test, class_train, class_test):
    """Fit `classifier` on the training partition and return its accuracy on the test partition.

    The classifier is fit in place; the returned value is
    accuracy_score(class_test, predictions for data_test).
    """
    classifier.fit(data_train, class_train)
    return accuracy_score(class_test, classifier.predict(data_test))

def resub_and_gen_errors(classifier, data_train, data_test, class_train, class_test):
    """Fit `classifier` on the training partition and score it on both partitions.

    Returns a (train_score, test_score) tuple holding whatever
    classifier.score(...) yields for the training data (resubstitution) and
    the held-out data (generalization). Despite the function name, these are
    the values of score(), not error rates.
    """
    classifier.fit(data_train, class_train)
    partitions = ((data_train, class_train), (data_test, class_test))
    # Training partition is scored first, then the held-out partition
    train_score, test_score = [classifier.score(features, labels)
                               for features, labels in partitions]
    return train_score, test_score

# This calculates the mean accuracy, variance, and confidence interval and prints it
def mean_var_CI_calculator(clf_name, clf_acc, test_size):
    """Print summary statistics and a 95% confidence interval for a set of accuracies.

    Parameters:
        clf_name:  label shown in the report header
        clf_acc:   pandas Series of accuracy values (uses .mean(), .var(), .std())
        test_size: holdout fraction (e.g. 0.34); shown in the header as a percentage

    Returns:
        None. The report is written to stdout.
    """
    sample_size = len(clf_acc)
    sample_mean = clf_acc.mean()
    sample_var = clf_acc.var()
    sample_std = clf_acc.std()
    # Normal-approximation critical value for a two-sided 95% interval; this is
    # reasonable for the 30 data points used in this notebook.
    z_crit = stats.norm.ppf(0.975)
    err_margin = z_crit * sample_std / (sample_size ** 0.5)
    # test_size is a fraction, so scale it before labelling it with a percent sign
    # (0.34 is reported as "34%", not "0.34%").
    holdout_pct = "{:g}".format(test_size * 100)
    print("Accuracy statistics For classifier: " + clf_name + " with "
          + holdout_pct + "% holdout for testing")
    print("- Sample Size: {:5d}".format(sample_size))
    print("- Variance: {:5f}".format(sample_var))
    print("- StdDev: {:5f}".format(sample_std))
    print("- Mean:{:>12.5f} +/- {:0.5f}".format(sample_mean, err_margin))
    print("- 95% CI =   [{:0.4f}, {:0.4f}]\n".format(sample_mean-err_margin, sample_mean+err_margin))

In [16]:
# Gaussian Naive Bayes, constructed with no arguments
nb_gauss_clf = naive_bayes.GaussianNB()

# Decision tree configured with the parameters found via a GridSearch
#   over 10 models and used for assignment 1
tree_clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=1234,
)

In [17]:
# Compare the decision tree against Gaussian Naive Bayes with a 0.34 holdout fraction
(tree_acc, nb_gauss_acc,
 tree_resub_acc, nb_resub_acc,
 tree_gen_acc, nb_gen_acc) = clf_comparison(
    tree_clf, "tree_clf",
    nb_gauss_clf, "nb_gauss_clf",
    wine_data_df, wine_class_df,
    test_size=0.34,
)

Accuracy statistics For classifier: tree_clf with 0.34% holdout for testing
- Sample Size:    30
- Variance: 0.000921
- StdDev: 0.030345
- Mean:     0.91421 +/- 0.01086
- 95% CI =   [0.9033, 0.9251]

Accuracy statistics For classifier: nb_gauss_clf with 0.34% holdout for testing
- Sample Size:    30
- Variance: 0.000229
- StdDev: 0.015129
- Mean:     0.96885 +/- 0.00541
- 95% CI =   [0.9634, 0.9743]

Accuracy statistics For classifier: Difference of tree_clf and nb_gauss_clf with 0.34% holdout for testing
- Sample Size:    30
- Variance: 0.000785
- StdDev: 0.028011
- Mean:     0.05464 +/- 0.01002
- 95% CI =   [0.0446, 0.0647]



The functions above used the same training and testing subsets to build a model for each classifier and to test the accuracy of both models. This process was repeated 30 times, and the model accuracies were recorded in a list for each classifier. Then I calculated the above descriptive statistics for both lists of accuracy values.

To determine whether there's a statistically significant difference between the accuracies of these models, we'll perform a hypothesis test. Because we have only 30 pairs (commonly regarded as the lower bound for large-sample hypothesis tests), we'll use a small-sample hypothesis test based on Student's t-statistic.

The small-sample hypothesis test for paired population means (ie the paired t-test) is given by the equation below

$$t = \frac{\bar{x}_d-D_0}{\frac{s_d}{\sqrt{n_d}}}$$

For our classifier accuracy data: 
* sample size $=n_d=30$ paired samples 
* sample mean difference        $\bar{x}_d = 0.05464$ 
* Sample Stdev of diffs         $s_d = 0.02801$
* Expected mean difference      $D_0 = 0$

The null hypothesis ($H_0$) is that the mean accuracy of the two models is the same (i.e. $H_0:\mu_1 - \mu_2 = D_0 = 0$), and the alternative hypothesis is that the accuracies differ (i.e. $H_a:\mu_1 - \mu_2 \neq 0$).

$$t=\frac{(0.05464)-0}{\frac{0.02801}{\sqrt{30}}} = \frac{(0.05464)-0}{0.00511} = 10.6846$$

For $n=30$ paired samples ($n-1=29$ degrees of freedom) and a two-tailed test at the $95\%$ confidence level ($\alpha/2 = 0.025$ in each tail), the critical t value is 2.045. The calculated 10.6846 is far outside those bounds, so the difference in model accuracy is statistically significant.

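I also used the paired (related-samples) t-test in scipy's stats module (`stats.ttest_rel`), which calculated a t-statistic of 10.6852 with a p-value of $1.44 \times 10^{-11}$. This agrees with the hand-calculated result; the small discrepancy comes from rounding the inputs to the manual calculation.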

In [95]:
# Paired t statistic by hand: mean difference (0.05464) divided by its
# standard error, s_d / sqrt(n) with s_d = 0.02801 and n = 30
paired_t = 0.05464 / (0.02801 / 30 ** 0.5)
paired_t

10.684598551261077

In [96]:
# Cross-check the hand calculation with scipy's related-samples t-test on the paired accuracies
stats.ttest_rel(a=nb_gauss_acc, b=tree_acc)

Ttest_relResult(statistic=10.685187333241123, pvalue=1.4369053553409192e-11)

# 2: Varying the Training-Testing Split

In [None]:
# TODO: Build a loop where the test size is varied and capture the resub and gen accs
splits = [0.75, 0.65, 0.55, 0.45, 0.35, 0.25, 0.15]
tree_resub_acc_list = []; tree_gen_acc_list = []
nb_resub_acc_list = []; nb_gen_acc_list = []

for split in splits:
    # Evaluate at the current holdout fraction; passing a fixed value here would
    # repeat the identical experiment once per entry in `splits`.
    # NOTE: this rebinds tree_acc / nb_gauss_acc, which the ttest_rel cell above also reads.
    tree_acc, nb_gauss_acc, tree_resub_acc, nb_resub_acc, tree_gen_acc, nb_gen_acc = clf_comparison(tree_clf,
                                                                                                "tree_clf",
                                                                                                nb_gauss_clf,
                                                                                                "nb_gauss_clf",
                                                                                                wine_data_df,
                                                                                                wine_class_df,
                                                                                                split)