### Loading Breast Cancer Wisconsin Dataset

In [1]:
import pandas as pd 
%matplotlib notebook

In [2]:
# Pull the raw WDBC file straight from the UCI repository. header=None tells
# pandas the file has no header row, so the columns get integer names.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases'
                       '/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

In [3]:
# Preview the first five rows. Column 1 carries the class label ('M'/'B');
# column 0 looks like a sample identifier and columns 2+ are numeric features.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
from sklearn.preprocessing import LabelEncoder 

# Feature matrix: every column from index 2 onward, as a plain NumPy array.
X = df.iloc[:, 2:].values

# Target vector: column 1 holds the string labels; encode them as integers.
# The encoder is kept around so the mapping can be inspected or inverted later.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)

# Display the learned classes; their positions are the integer codes.
le.classes_

array(['B', 'M'], dtype=object)

In [5]:
# Sanity check of the fitted mapping: 'M' is encoded as 1 and 'B' as 0.
le.transform(['M','B'])

array([1, 0])

In [6]:
from sklearn.model_selection import train_test_split  
# Hold out 20% of the samples for final testing. stratify=y keeps the class
# proportions the same in both partitions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [7]:
## Chaining the pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import make_pipeline 

# Standardize -> project onto 2 principal components -> logistic regression,
# chained into a single estimator and fitted on the training partition.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
).fit(X_train, y_train)

# Predictions on the held-out partition, followed by its accuracy.
y_pred = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

Test Accuracy: 0.956


### Stratified KFold 

In [8]:
import numpy as np 
from sklearn.model_selection import StratifiedKFold 
# Manual 10-fold stratified cross-validation over the training partition.
# random_state is deliberately not passed: it only has an effect together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set while shuffle keeps its default of False. The folds are unchanged.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []
for k, (train, test) in enumerate(kfold):
    # `train` / `test` are index arrays into X_train for this fold.
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows how many samples of each class landed in the fold's training part.
    print('Fold :%2d , class dist : %s ,Acc %.3f' %(k+1,np.bincount(y_train[train]),score))
    

Fold : 1 , class dist : [256 153] ,Acc 0.935
Fold : 2 , class dist : [256 153] ,Acc 0.935
Fold : 3 , class dist : [256 153] ,Acc 0.957
Fold : 4 , class dist : [256 153] ,Acc 0.957
Fold : 5 , class dist : [256 153] ,Acc 0.935
Fold : 6 , class dist : [257 153] ,Acc 0.956
Fold : 7 , class dist : [257 153] ,Acc 0.978
Fold : 8 , class dist : [257 153] ,Acc 0.933
Fold : 9 , class dist : [257 153] ,Acc 0.956
Fold :10 , class dist : [257 153] ,Acc 0.956


In [9]:
import sklearn
# Record the library version the outputs in this notebook were produced with.
print(str.format('The scikit-learn version is {}.', sklearn.__version__))

The scikit-learn version is 0.19.1.


In [10]:
from sklearn.model_selection import cross_val_score 
# Same 10-fold evaluation as the manual loop, delegated to scikit-learn.
# Returns one accuracy value per fold.
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)
print('CV accuracy scores :%s' % scores)


CV accuracy scores :[ 0.93478261  0.93478261  0.95652174  0.95652174  0.93478261  0.95555556
  0.97777778  0.93333333  0.95555556  0.95555556]


### Learning curves

In [11]:
import matplotlib.pyplot as plt 
from sklearn.model_selection import learning_curve 
# Rebind pipe_lr to a simpler pipeline for the curve diagnostics below:
# standardization followed directly by L2-penalised logistic regression (no PCA step).
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', random_state=1),
)

In [12]:
# Evaluate the pipeline on ten growing fractions (10% ... 100%) of the training
# data, each with 10-fold cross-validation. The score arrays have one row per
# training-set size and one column per fold.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

In [13]:
# Absolute number of training samples used at each of the ten learning-curve steps.
train_sizes

array([ 40,  81, 122, 163, 204, 245, 286, 327, 368, 409])

In [32]:
# Shape is (number of training-set sizes, number of CV folds).
# The former bare `train_scores` line was dropped: it was not the last
# statement in the cell, so it was evaluated but never displayed.
print(train_scores.shape)

(10, 10)


In [20]:
# Per-fold validation accuracies: one row per training-set size, one column per CV fold.
test_scores

array([[ 0.93478261,  0.93478261,  0.97826087,  0.95652174,  0.86956522,
         0.97777778,  0.95555556,  0.95555556,  0.95555556,  1.        ],
       [ 0.95652174,  0.97826087,  0.93478261,  0.97826087,  0.95652174,
         0.95555556,  0.97777778,  0.95555556,  0.97777778,  1.        ],
       [ 0.95652174,  0.97826087,  0.93478261,  0.97826087,  0.95652174,
         0.95555556,  0.97777778,  0.93333333,  0.97777778,  1.        ],
       [ 0.95652174,  0.97826087,  1.        ,  0.97826087,  0.95652174,
         0.95555556,  0.97777778,  1.        ,  1.        ,  1.        ],
       [ 0.97826087,  0.97826087,  0.95652174,  0.95652174,  0.95652174,
         0.95555556,  0.97777778,  1.        ,  0.97777778,  0.97777778],
       [ 0.95652174,  0.97826087,  1.        ,  0.97826087,  0.95652174,
         0.97777778,  0.97777778,  0.97777778,  0.97777778,  0.97777778],
       [ 0.97826087,  0.97826087,  0.97826087,  0.97826087,  0.95652174,
         0.97777778,  0.97777778,  0.97777778

In [34]:
# Collapse the fold axis (axis=1) so there is one mean and one standard
# deviation per training-set size, for both the training and validation scores.
train_mean = np.mean(train_scores, axis=1)
# Fixed: this was np.mean, which made the "std" identical to the mean and blew
# the shaded band in the plot up to roughly [0, 2 * mean].
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

In [42]:
# Learning curve: mean accuracy vs. number of training samples, with a shaded
# band of +/- one standard deviation across the CV folds.

# Training accuracy (blue, solid).
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Cross-validation accuracy (green, dashed). This line was previously drawn in
# blue, making it indistinguishable from the training curve and inconsistent
# with its own green band. It is labelled "validation" because these scores
# come from the CV folds of X_train, not from the held-out X_test.
plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
# Accuracy cannot exceed 1.0; leave a small margin so points at 1.0 are not clipped.
plt.ylim([0.8, 1.03])
plt.show()

<IPython.core.display.Javascript object>

### Validation curve

In [49]:
from sklearn.model_selection import validation_curve 
# Inverse regularization strengths to sweep, spaced one decade apart.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# For every C, run 10-fold CV and collect training and validation accuracy.
# 'logisticregression__C' addresses the C parameter of the pipeline's
# LogisticRegression step (make_pipeline names steps after the lower-cased class).
train_scores, test_scores = validation_curve(
    estimator=pipe_lr, X=X_train, y=y_train,
    param_name='logisticregression__C', param_range=param_range,
    cv=10,
)


In [53]:
# Collapse the fold axis (axis=1): one mean and one standard deviation per
# value of C, for both training and validation scores.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
# Fixed: this was np.mean, which made the validation band span roughly
# [0, 2 * mean] instead of +/- one standard deviation.
test_std = np.std(test_scores, axis=1)

# Training accuracy (blue, solid) with its +/- 1 std band.
plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='blue')

# Validation accuracy (green, dashed) with its +/- 1 std band.
plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
# C is swept over decades, so a log axis spaces the points evenly.
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('./figures/validation_curve.png', dpi=300)
plt.show()



<IPython.core.display.Javascript object>

### Grid Search 


In [57]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC 

# Standardize, then fit an SVM; the grid below tunes parameters of the SVM step.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate magnitudes, shared by C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: the linear kernel tunes C only, the RBF kernel tunes C and gamma.
param_grid = [
    dict(svc__C=param_range, svc__kernel=['linear']),
    dict(svc__C=param_range, svc__gamma=param_range, svc__kernel=['rbf']),
]

# Exhaustive search scored by 10-fold CV accuracy; n_jobs=-1 uses all cores.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

In [58]:
# Mean cross-validated accuracy of the best parameter combination found by the search.
print(gs.best_score_)

0.984615384615


In [59]:
# The parameter combination that achieved the best cross-validated score.
print(gs.best_params_)

{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [60]:
# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), so best_estimator_ is already trained. The explicit
# clf.fit(X_train, y_train) that used to sit here only repeated that work.
clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_test,y_test))

Test accuracy: 0.974


### Nested Grid Search


In [62]:
# Nested cross-validation for the SVM pipeline: the inner 2-fold grid search
# selects hyperparameters, and the outer 5-fold loop scores that whole
# tuning procedure on data it did not see.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
)
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

CV accuracy: 0.974 +/- 0.015


In [63]:
#NestedGridSearch DecisionTree 

from sklearn.tree import DecisionTreeClassifier 
# Same nested scheme (inner 2-fold search, outer 5-fold scoring) for a decision
# tree, tuning only the maximum depth; None lets the tree grow without a limit.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[dict(max_depth=[1, 2, 3, 4, 5, 6, 7, None])],
    scoring='accuracy',
    cv=2,
)
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

CV accuracy: 0.934 +/- 0.016


# Looking at different performance metrics

In [64]:
# Reading a confusion matrix

from sklearn.metrics import confusion_matrix 

# Fit the SVM pipeline as defined (not the grid-search winner) on the training
# data and predict the held-out partition.
y_pred = pipe_svc.fit(X_train, y_train).predict(X_test)

# Rows are true classes, columns are predicted classes.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

[[71  1]
 [ 2 40]]


In [65]:
# Draw the confusion matrix as a lightly shaded grid with the count written in each cell.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

# Visit every (row, column) cell; x is the column and y is the row on the axes.
for i, j in np.ndindex(*confmat.shape):
    ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

ax.set_xlabel('predicted label')
ax.set_ylabel('true label')

plt.tight_layout()
# plt.savefig('./figures/confusion_matrix.png', dpi=300)
plt.show()

<IPython.core.display.Javascript object>

In [69]:
# Precision,Recall 

from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score,f1_score 

# Report the three metrics for the predictions in y_pred, pairing each output
# template with the function that computes it.
for template, metric in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(template % metric(y_true=y_test, y_pred=y_pred))

Precision: 0.976
Recall: 0.952
F1: 0.964


In [70]:
from sklearn.metrics import make_scorer ,f1_score 
# Custom scorer: F1 computed with class 0 treated as the positive label.
scorer = make_scorer(f1_score, pos_label=0)

# Repeat the SVM grid search, selecting by that F1 score instead of accuracy.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring=scorer,
    cv=10,
).fit(X_train, y_train)

print(gs.best_score_, gs.best_params_, sep='\n')

0.988021913796
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [None]:
f