In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer(as_frame=True)

In [3]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
df=data.frame
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [5]:
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
data.data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [7]:
data.target

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int32

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(data.data,data.target,test_size=0.2,random_state=34)

In [9]:
print("x train:",x_train.shape)
print("x test:",x_test.shape)
print("y train:",y_train.shape)
print("y test:",y_test.shape)

x train: (455, 30)
x test: (114, 30)
y train: (455,)
y test: (114,)


In [10]:
from sklearn.svm import SVC
#model=SVC(kernel='linear')
model=SVC(C=10,kernel='linear')
#model=SVC(C=1,kernel='poly')
#model=SVC(C=1,kernel='sigmoid')
#model=SVC(C=10,kernel='rbf')


model

SVC(C=10, kernel='linear')

In [11]:
model.fit(x_train,y_train)

SVC(C=10, kernel='linear')

In [12]:
predicted=model.predict(x_test)
predicted

array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1])

In [13]:
(y_test)

7      0
202    0
267    1
138    0
369    0
      ..
28     0
196    0
493    1
397    1
391    1
Name: target, Length: 114, dtype: int32

In [14]:
from sklearn import metrics
metrics.accuracy_score(predicted,y_test)

0.9824561403508771

In [15]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,predicted)
cm

array([[38,  0],
       [ 2, 74]], dtype=int64)

In [16]:
from sklearn.metrics import classification_report
cf=classification_report(y_test,predicted)
print(cf)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        38
           1       1.00      0.97      0.99        76

    accuracy                           0.98       114
   macro avg       0.97      0.99      0.98       114
weighted avg       0.98      0.98      0.98       114



In [17]:
#using KFold cross validation

In [18]:
from sklearn.svm import SVC
model=SVC(kernel='linear')
from sklearn.model_selection import KFold,cross_val_score
kfold_val=KFold(10)
results=cross_val_score(model,data.data,data.target,cv=kfold_val)
print(results)
print(np.mean(results))

[0.9122807  0.92982456 0.94736842 0.96491228 0.96491228 0.96491228
 0.98245614 0.94736842 0.94736842 0.96428571]
0.9525689223057643


In [19]:
#Using Stratified KFold

In [20]:
from sklearn.svm import SVC
model=SVC(kernel='linear')
from sklearn.model_selection import StratifiedKFold,cross_val_score
kfold_val=StratifiedKFold(n_splits=10)
results=cross_val_score(model,data.data,data.target,cv=kfold_val)
print(results)
print(np.mean(results))

[0.98245614 0.92982456 0.92982456 0.94736842 0.96491228 0.98245614
 0.92982456 0.94736842 0.96491228 0.96428571]
0.9543233082706767


In [29]:
cross_val_score(SVC(kernel='linear'),data.data,data.target,cv=5)

array([0.94736842, 0.92982456, 0.97368421, 0.92105263, 0.95575221])

In [30]:
cross_val_score(SVC(kernel='poly'),data.data,data.target,cv=5)

array([0.84210526, 0.88596491, 0.92982456, 0.94736842, 0.9380531 ])

In [27]:
from sklearn.model_selection import KFold,cross_val_score
kernels=['rbf','linear']
C=[1,5,10]
avg_scores={}
for kval in kernels:
    for cval in C:
        cv_scores=cross_val_score(SVC(kernel=kval,C=cval,gamma='auto'),data.data,data.target,cv=5)
        avg_scores[kval + '_' + str(cval)]=np.average(cv_scores)
        
avg_scores        

{'rbf_1': 0.6274181027790716,
 'rbf_5': 0.6274181027790716,
 'rbf_10': 0.6274181027790716,
 'linear_1': 0.9455364073901569,
 'linear_5': 0.9508150908244062,
 'linear_10': 0.9525694767893185}

In [28]:
#using GridSearchCV

In [41]:
from sklearn.model_selection import GridSearchCV
clf=GridSearchCV(SVC(gamma='auto'),{
    'kernel' : ['linear','rbf'],
    'C' : [1,5,10]
},cv=5,return_train_score=False)
clf.fit(data.data,data.target)
                 

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': [1, 5, 10], 'kernel': ['linear', 'rbf']})

In [42]:
print(clf.cv_results_)

{'mean_fit_time': array([ 4.67418838,  0.06910439, 10.72601752,  0.16064286, 18.04166422,
        0.07010846]), 'std_fit_time': array([1.87118038, 0.01509603, 2.8769428 , 0.05608467, 5.23814548,
       0.01603717]), 'mean_score_time': array([0.00701346, 0.01662631, 0.0064105 , 0.03866215, 0.00701375,
       0.01682491]), 'std_score_time': array([0.00109741, 0.00049062, 0.00049025, 0.01824407, 0.00109707,
       0.00074857]), 'param_C': masked_array(data=[1, 1, 5, 5, 10, 10],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'}, {'C': 5, 'kernel': 'linear'}, {'C': 5, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}], 'split0_test_score': arr

In [43]:
df=pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.674188,1.87118,0.007013,0.001097,1,linear,"{'C': 1, 'kernel': 'linear'}",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,3
1,0.069104,0.015096,0.016626,0.000491,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.622807,0.622807,0.631579,0.631579,0.628319,0.627418,0.003949,4
2,10.726018,2.876943,0.006411,0.00049,5,linear,"{'C': 5, 'kernel': 'linear'}",0.947368,0.938596,0.973684,0.929825,0.964602,0.950815,0.016216,2
3,0.160643,0.056085,0.038662,0.018244,5,rbf,"{'C': 5, 'kernel': 'rbf'}",0.622807,0.622807,0.631579,0.631579,0.628319,0.627418,0.003949,4
4,18.041664,5.238145,0.007014,0.001097,10,linear,"{'C': 10, 'kernel': 'linear'}",0.938596,0.938596,0.973684,0.947368,0.964602,0.952569,0.0142,1
5,0.070108,0.016037,0.016825,0.000749,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.622807,0.622807,0.631579,0.631579,0.628319,0.627418,0.003949,4


In [44]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.945536
1,1,rbf,0.627418
2,5,linear,0.950815
3,5,rbf,0.627418
4,10,linear,0.952569
5,10,rbf,0.627418


In [45]:
clf.best_params_

{'C': 10, 'kernel': 'linear'}

In [46]:
clf.best_score_

0.9525694767893185

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters.
This is useful when you have too many parameters to try and your training time is longer. It helps
reduce the cost of computation

In [49]:
from sklearn.model_selection import RandomizedSearchCV
clf=RandomizedSearchCV(SVC(gamma='auto'),{
    'kernel' : ['linear','rbf'],
    'C' : [1,5,10]
},cv=5,return_train_score=False,n_iter=2)
clf.fit(data.data,data.target)
df=pd.DataFrame(clf.cv_results_)
df[['param_C','param_kernel','mean_test_score']]    

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.627418
1,5,linear,0.950815
