In [18]:
%pylab inline
import numpy as np 
import pandas as pd 
from sklearn import tree
from sklearn import naive_bayes
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

Populating the interactive namespace from numpy and matplotlib


In [19]:
df = pd.read_csv('./wdbc.data', header=None)

In [20]:
data = df.values # extract numpy array from df
y = data[:, 1] # get target 
X = data[:, 2:] # input features are from column 2 onwards

# encode target as integer
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

# kNN 

In [21]:
from sklearn import neighbors

In [22]:
clf = neighbors.KNeighborsClassifier(metric='euclidean')

In [23]:
TRAIN_SIZE = 0.7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, shuffle=True, train_size=TRAIN_SIZE)

In [24]:
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
y_pred = clf.predict(X_test)

In [26]:
metrics.f1_score(y_test, y_pred)

0.8823529411764706

# Test-train split

In [27]:
clf1 = tree.DecisionTreeClassifier()
clf2 = naive_bayes.GaussianNB()
clf3 = neighbors.KNeighborsClassifier(metric='euclidean')

In [28]:
TRAIN_SIZE = 0.7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, shuffle=True, train_size=TRAIN_SIZE)

In [29]:
clf1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
clf2.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [31]:
clf3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [32]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [33]:
metrics.f1_score(y_test, y_pred1)

0.9333333333333333

In [34]:
metrics.f1_score(y_test, y_pred2)

0.9137931034482759

In [35]:
metrics.f1_score(y_test, y_pred3)

0.9075630252100839

# $k$-fold 

In [37]:
k = 5
kf = model_selection.KFold(n_splits=k, shuffle=True)
kf.get_n_splits(X)

5

In [38]:
f1_scores1 = []
models1 = []

f1_scores2 = []
models2 = []

f1_scores3 = []
models3 = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf1 = tree.DecisionTreeClassifier()
    clf1.fit(X_train, y_train)
    
    clf2 = naive_bayes.GaussianNB()
    clf2.fit(X_train, y_train)
    
    clf3 = neighbors.KNeighborsClassifier(metric='euclidean')
    clf3.fit(X_train, y_train)
    
    y_pred1 = clf1.predict(X_test)
    y_pred2 = clf2.predict(X_test)
    y_pred3 = clf3.predict(X_test)
    
    f1_scores1.append(metrics.f1_score(y_test, y_pred1))
    models1.append(clf1)
    
    f1_scores2.append(metrics.f1_score(y_test, y_pred2))
    models2.append(clf2)
    
    f1_scores3.append(metrics.f1_score(y_test, y_pred3))
    models3.append(clf3)

In [39]:
np.mean(f1_scores1)

0.8908313612729415

In [40]:
np.mean(f1_scores2)

0.9037411956615466

In [41]:
np.mean(f1_scores3)

0.8943666845673086

In [42]:
np.std(f1_scores1)

0.03212289653194093

In [43]:
np.std(f1_scores2)

0.012867567072041937

In [44]:
np.std(f1_scores3)

0.03782219916543761

In [46]:
from scipy import stats

In [47]:
print(stats.ttest_ind(f1_scores1, f1_scores2, equal_var=False))
print(stats.ttest_ind(f1_scores1, f1_scores3, equal_var=False))
print(stats.ttest_ind(f1_scores2, f1_scores3, equal_var=False))

Ttest_indResult(statistic=-0.7461413500884742, pvalue=0.48762023993818715)
Ttest_indResult(statistic=-0.14248852704723672, pvalue=0.8903077949787597)
Ttest_indResult(statistic=0.4692988848279598, pvalue=0.6589408124325007)
