# Breast Cancer Wisconsin (Diagnostic) Data Set

## 1. Upload necessary libraries 

In [1]:
#preliminaries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#Import library for splitign the data
from sklearn.model_selection import train_test_split  

#import standard scaler library 
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier  #kNN

## 2. Upload the breast cancer dataset 

In [31]:
datapath = 'wdbc.csv' #https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data

#column names 
cols = ['id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 'concavity_mean',
         'concave_points_mean', 'symmetry_mean', 
         'fractal_dimension_mean', 'radius_se', 'texture_se', 
         'perimeter_se', 'area_se', 'smoothness_se', 
         'compactness_se', 'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 'perimeter_worst',
         'area_worst', 'smoothness_worst', 
         'compactness_worst', 'concavity_worst', 
         'concave_points_worst', 'symmetry_worst', 
         'fractal_dimension_worst'] 

data = pd.read_csv(datapath, header=None, names=cols, error_bad_lines=False)

# Setting 'id_number' as our index
data.set_index(['id_number'], inplace = True) 
data.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
id_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 3. Exploratory Analysis  

In [32]:
data.shape 

(569, 31)

In [33]:
data.dtypes 

diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave_points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave_points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave_points_worst

In [34]:
data['diagnosis'] = data['diagnosis']\
  .map({'M':1, 'B':0})

In [35]:
data['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [36]:
x = data['diagnosis']
ax = sns.countplot(x=x, data=data)

In [37]:
data.describe()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [38]:
# Visual Exploratory Analysis 

# Scatterplot Matrix
# For this part I will the first 7 variables to allow us to really visualise the data
cols = ['radius_mean', 'texture_mean', 
        'perimeter_mean', 'area_mean', 
        'smoothness_mean', 'compactness_mean', 'diagnosis']

sns.pairplot(data,
             x_vars = cols,
             y_vars = cols,
             hue = 'diagnosis', 
             palette = ('Red', 'Blue'), 
             markers=["x", "D"])

<seaborn.axisgrid.PairGrid at 0x7f0bdbd6aba8>

In [39]:
#Pearson Correlation Matrix - To understand the variable correlation 

corr = data.corr(method = 'pearson') # Correlation Matrix

f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 275, as_cmap=True)
plt.title('Breast Cancer Attributes Heatmap')


# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,  cmap=cmap,square=True, 
            xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7f0bdb2e0080>

In [40]:
#Check for missing values 
data.isnull().sum()
data.isna().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave_points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

## Model Building 

In [41]:
#Split the data into X and y 
X = np.array(data.drop(['diagnosis'], 1)) #features 
y = np.array(data['diagnosis']) #class

In [42]:
#Split Training and test Dataset
#Split the data 80/20 - 80% training data and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)  

In [43]:
#Feature Scaling 
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [44]:
# Develop the K-NN Algorithm 
#No ideal value for K, to start out will use 5 - most commonly used 
classifier = KNeighborsClassifier(n_neighbors=5)  
classifier.fit(X_train, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [45]:
#make the prediction on the test data 
y_pred = classifier.predict(X_test) 

## Model Evaluation 

In [54]:
#Performance of the model will be evaluated by a confusion matrix 
#Import required library 
from sklearn.metrics import classification_report, confusion_matrix  

#Performance Metrics for k= 5
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

[[77  0]
 [ 3 34]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        77
          1       1.00      0.92      0.96        37

avg / total       0.97      0.97      0.97       114



In [55]:
#Test the accuracy
accuracy = classifier.score(X_test, y_test)
print(accuracy)
print("5 Nearest Neighbours accuracy:",accuracy, "%")

0.964912280702
5 Nearest Neighbours accuracy: 0.964912280702 %


In [56]:
#Finding the best k-value 
error = []

# Calculating error for K values between 1 and 40
for i in range(1, 40):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

In [57]:
plt.figure(figsize=(12, 6))  
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',  
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')  
plt.xlabel('K Value')  
plt.ylabel('Mean Error')

Text(0,0.5,'Mean Error')

From the graph we can see that the mean error is zero when k is between 2 and 4. We canplay around with k = 2 , 3, or 4 to analyse how it impacts the accurcy of the predictions. 

In [58]:
# k = 3 

#No ideal value for K, to start out will use 3 - most commonly used 
classifier_3 = KNeighborsClassifier(n_neighbors=3)  
classifier_3.fit(X_train, y_train)  

#make the prediction on the test data 
y_pred = classifier_3.predict(X_test) 

#Performance Metrics for k=3
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

[[77  0]
 [ 3 34]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        77
          1       1.00      0.92      0.96        37

avg / total       0.97      0.97      0.97       114



In [59]:
#Test the accuracy
accuracy_3 = classifier_3.score(X_test, y_test)
print(accuracy_3)
print("3 Nearest Neighbours accuracy:",accuracy_3, "%")

0.973684210526
3 Nearest Neighbours accuracy: 0.973684210526 %


In [64]:
# k = 4 

#No ideal value for K, to start out will use 4 - most commonly used 
classifier_4 = KNeighborsClassifier(n_neighbors=4)  
classifier_4.fit(X_train, y_train)  

#make the prediction on the test data 
y_pred = classifier_4.predict(X_test) 

#Performance Metrics for k=4
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

[[77  0]
 [ 3 34]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        77
          1       1.00      0.92      0.96        37

avg / total       0.97      0.97      0.97       114



In [65]:
#Test the accuracy
accuracy_4 = classifier_4.score(X_test, y_test)
print(accuracy_4)

print("4 Nearest Neighbours accuracy:",accuracy_4, "%")

0.973684210526
4 Nearest Neighbours accuracy: 0.973684210526 %
