In [1]:
%pylab inline
import numpy as np 
import pandas as pd 
from sklearn import tree
from sklearn import naive_bayes
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('./wdbc.data', header=None)

In [10]:
data = df.values # extract numpy array from df
y = data[:, 1] # get target 
X = data[:, 2:] # input features are from column 2 onwards

# encode target as integer
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

# Test-train split

In [11]:
clf1 = tree.DecisionTreeClassifier()
clf2 = naive_bayes.GaussianNB()

In [12]:
TRAIN_SIZE = 0.7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, shuffle=True, train_size=TRAIN_SIZE)

In [13]:
clf1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [14]:
clf2.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [15]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)

In [16]:
metrics.f1_score(y_test, y_pred1)

0.9370629370629372

In [17]:
metrics.f1_score(y_test, y_pred2)

0.9130434782608695

# $k$-fold 

In [18]:
k = 5
kf = model_selection.KFold(n_splits=k, shuffle=True)
kf.get_n_splits(X)

5

In [19]:
f1_scores1 = []
models1 = []

f1_scores2 = []
models2 = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf1 = tree.DecisionTreeClassifier()
    clf1.fit(X_train, y_train)
    
    clf2 = naive_bayes.GaussianNB()
    clf2.fit(X_train, y_train)
    
    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    
    f1_scores1.append(metrics.f1_score(y_test, y_pred1))
    models1.append(clf1)
    
    f1_scores2.append(metrics.f1_score(y_test, y_pred2))
    models2.append(clf2)

In [20]:
np.mean(f1_scores1)

0.8891648197867067

In [21]:
np.mean(f1_scores2)

0.9035713661648499

In [22]:
np.std(f1_scores1)

0.050406767395561464

In [23]:
np.std(f1_scores2)

0.05004301979436993

In [24]:
from scipy import stats

In [25]:
stats.ttest_ind(f1_scores1, f1_scores2, equal_var=False)

Ttest_indResult(statistic=-0.40565142574112634, pvalue=0.695631891295484)