In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [12]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
column_names = ['id', 'diagnosis', 'mean_radius', 'mean_texture', 'mean_perimeter',
                'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity',
                'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
                'se_radius', 'se_texture', 'se_perimeter', 'se_area', 'se_smoothness',
                'se_compactness', 'se_concavity', 'se_concave_points', 'se_symmetry',
                'se_fractal_dimension', 'worst_radius', 'worst_texture', 'worst_perimeter',
                'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity',
                'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension']
df = pd.read_csv(url, header=None, names=column_names)
df

Unnamed: 0,id,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [13]:
from sklearn.preprocessing import LabelEncoder

X = df.iloc[:, 2:]
y = df.iloc[:, 1]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [14]:
pipe_svc = make_pipeline(StandardScaler(),
                         PCA(random_state=42),
                         SVC(random_state=42),
                         )

In [15]:
param_pca_range = [5, 10, 15, 20]
param_svc_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [
              {'pca__n_components': param_pca_range,
               'svc__C': param_svc_range,
               'svc__kernel': ['linear']
               },  # Fisrt configuration
              {'pca__n_components': param_pca_range,
               'svc__C': param_svc_range,
               'svc__gamma': param_svc_range,
               'svc__kernel': ['rbf']
               }  # Second configuration
              ]

# In the previous grid, two configurations will be evaluated, in addition to tunning of hyperparams for PCA and SVC.

In [16]:
# create grid to obtain the best hyperparams after using Scaling, PCA and finally SVC.
search = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      refit=True,
                      n_jobs=-1)

search.fit(X_train, y_train)

In [17]:
print(f'best params: {search.best_params_}')
print(f'best score:  {search.best_score_}')

cv_GridScore_mean = round( search.cv_results_["mean_test_score"].mean()*100 , 2)
cv_GridScore_std  = round( search.cv_results_["std_test_score"].mean()*100 , 2)

print(f'mean score: ({cv_GridScore_mean} +- {cv_GridScore_std})')

best params: {'pca__n_components': 20, 'svc__C': 0.1, 'svc__kernel': 'linear'}
best score:  0.9758937198067633
mean score: (77.03 +- 1.82)


In [18]:
search.cv_results_["mean_test_score"]

array([0.9384058 , 0.97135266, 0.9647343 , 0.96487923, 0.96700483,
       0.96700483, 0.93396135, 0.96913043, 0.97584541, 0.97140097,
       0.97362319, 0.97362319, 0.93618357, 0.96251208, 0.97584541,
       0.97144928, 0.95835749, 0.95821256, 0.93618357, 0.96468599,
       0.97589372, 0.96917874, 0.96700483, 0.96483092, 0.62855072,
       0.62855072, 0.62855072, 0.62855072, 0.62855072, 0.62855072,
       0.62855072, 0.62855072, 0.62855072, 0.62855072, 0.62855072,
       0.62855072, 0.7447343 , 0.94067633, 0.95816425, 0.62855072,
       0.62855072, 0.62855072, 0.94280193, 0.96913043, 0.95381643,
       0.8547343 , 0.63077295, 0.62855072, 0.97135266, 0.96690821,
       0.95381643, 0.86574879, 0.63294686, 0.62855072, 0.96256039,
       0.96700483, 0.92961353, 0.86574879, 0.63294686, 0.62855072,
       0.62855072, 0.62855072, 0.62855072, 0.62855072, 0.62855072,
       0.62855072, 0.62855072, 0.62855072, 0.62855072, 0.62855072,
       0.62855072, 0.62855072, 0.7447343 , 0.93845411, 0.94275

In [20]:
# we pick the best estimator from the grid
best_estimator = search.best_estimator_

#best_estimator.fit(X_train, y_train)  # this line is not needed. (X,y)_train were already used in SearchGridCV
                                       # This is due to default refit=True param in SearchGridCV.
y_pred = best_estimator.predict(X_test)
accuracy_v1 = best_estimator.score(X_test, y_test)

# calculating throught the function
accuracy_v2 = accuracy_score(y_true=y_test, y_pred=best_estimator.predict(X_test))

print(accuracy_v1, accuracy_v2)

0.9824561403508771 0.9824561403508771


</h3> It is important to highlight that these value for mean score is considerably lower than best score due to
 is calculated throught the 168 combinations of hyperparams, where models had both good and bad performance.
 Now, we can use the best hyperparams and apply the cross validations on the best model to calculate a reliable
 accuracy. <h3>

</h3> As note, here we use PCA with specified values for n_components. The best accuracy could be gotten without using
 PCA but was used to see how use other transformer inside grid.<h3>

In [21]:
# The cross validation is applied on the training sample.
cv_score = cross_val_score(estimator=best_estimator,
                           X=X_train, y=y_train, cv=10, n_jobs=-1)

cv_score_mean = round( cv_score.mean()*100, 2 )
cv_score_std  = round( cv_score.std()*100,  2 )

print(f'({cv_score_mean} +- {cv_score_std})%')

(97.59 +- 1.82)%


<h1> Confusion Matrix <h1>

In [42]:
from sklearn.metrics import confusion_matrix

# remember that:  1-M (Malign), 0-B (Bening)

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
confmat

array([[71,  0],
       [ 2, 41]])

In [44]:
def counting(y_test, y_pred):
    TP, TN, FP, FN = 0, 0, 0, 0
    for i, j in zip(y_test, y_pred):
        if i == j:
            if i == 1: TP += 1  # patients with test Positive(1-M) that were predicted correctly (True)
            else:      TN += 1  # patients with test Negative(0-B) that were predicted correctly (True)
        else:
            if i == 1: FP += 1  # patients with test Positive(1-M) that were predicted incorrectly (False)
            else:      FN += 1  # patients with test Negative(0-B) that were predicted incorrectly (False)

    return (TP, TN, FP, FN)

counts = counting(y_test, y_pred)
print(counts)

(41, 71, 2, 0)


Due to Confusion matrix is design as:

[ [TP] [FN]
  [FP] [TN] ]

for our results:

[ 41,  0
   2,  71 ]


In [52]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# we can evaluate different metrics

bal_acc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
pre_val = precision_score(y_true=y_test, y_pred=y_pred)
rec_val = recall_score(y_true=y_test, y_pred=y_pred)
f1_val  = f1_score(y_true=y_test, y_pred=y_pred)
mcc_val = matthews_corrcoef(y_true=y_test, y_pred=y_pred)

In [53]:
print(f'bal_acc = {bal_acc}')
print(f'pre_val = {pre_val}')
print(f'rec_val = {rec_val}')
print(f'f1_val = {f1_val}')
print(f'mcc_val = {mcc_val}')

bal_acc = 0.9767441860465116
pre_val = 1.0
rec_val = 0.9534883720930233
f1_val = 0.9761904761904763
mcc_val = 0.962998132394131
