In [1]:
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

### Classify cells as begning or malignant

### Citation: 

#### https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [2]:
os.chdir("Classification_Data/")

In [3]:
df = pd.read_csv("breast-cancer-wisconsin.data.txt", header=None)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
0     699 non-null int64
1     699 non-null int64
2     699 non-null int64
3     699 non-null int64
4     699 non-null int64
5     699 non-null int64
6     699 non-null object
7     699 non-null int64
8     699 non-null int64
9     699 non-null int64
10    699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
s = """Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape, Marginal Adhesion,Cell Size, Bare Nuclei, Bland Chromatin,Normal Nucleoli, Mitoses, Class"""

In [7]:
names = s.split(",")
names

['Sample code number',
 'Clump Thickness',
 'Uniformity of Cell Size',
 'Uniformity of Cell Shape',
 ' Marginal Adhesion',
 'Cell Size',
 ' Bare Nuclei',
 ' Bland Chromatin',
 'Normal Nucleoli',
 ' Mitoses',
 ' Class']

In [8]:
df.columns = names
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [9]:
df.columns = df.columns.str.replace("\s", "")
df.columns

Index(['Samplecodenumber', 'ClumpThickness', 'UniformityofCellSize',
       'UniformityofCellShape', 'MarginalAdhesion', 'CellSize', 'BareNuclei',
       'BlandChromatin', 'NormalNucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [12]:
df.replace("?", 1, inplace=True)
df = df.astype({"BareNuclei" : "int64"})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Samplecodenumber         699 non-null int64
ClumpThickness           699 non-null int64
UniformityofCellSize     699 non-null int64
UniformityofCellShape    699 non-null int64
MarginalAdhesion         699 non-null int64
CellSize                 699 non-null int64
BareNuclei               699 non-null int64
BlandChromatin           699 non-null int64
NormalNucleoli           699 non-null int64
Mitoses                  699 non-null int64
Class                    699 non-null int64
dtypes: int64(11)
memory usage: 60.1 KB


In [13]:
df.Class = df.Class.replace({2: 0, 4: 1})

In [14]:
df.Class.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [15]:
x = df[['ClumpThickness', 'UniformityofCellSize',
       'UniformityofCellShape', 'MarginalAdhesion', 'CellSize', 'BareNuclei',
       'BlandChromatin', 'NormalNucleoli', 'Mitoses']]
y = df.Class

#### KNN measures distances so I will scale the features by giving the features the same magnitude.

In [16]:
x_scaled = StandardScaler().fit_transform(x)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(x_scaled, y, random_state=0)

#### Grid Search to find the optimal parameters

In [18]:
grid = GridSearchCV(n_jobs=-1, 
                    cv=5, estimator=KNeighborsClassifier(), 
                    param_grid={"n_neighbors": list(range(1, 31, 2))})

In [19]:
grid_search = grid.fit(X_train, Y_train)

In [20]:
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=25, p=2,
           weights='uniform')

In [21]:
grid_search.best_score_

0.9637404580152672

In [22]:
grid_search.best_params_

{'n_neighbors': 25}

#### KNN will use best performing n_neighbors: 25.

In [23]:
knn = KNeighborsClassifier(25).fit(X_train, Y_train)

#### Train accuracy

In [24]:
pd.crosstab(Y_train, knn.predict(X_train), rownames=["Observed"], colnames=["Predicted"])

Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,337,9
1,8,170


In [25]:
print(classification_report(Y_train, knn.predict(X_train)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       346
           1       0.95      0.96      0.95       178

   micro avg       0.97      0.97      0.97       524
   macro avg       0.96      0.96      0.96       524
weighted avg       0.97      0.97      0.97       524



In [26]:
print(f"Accuracy of Train data: {accuracy_score(Y_train, knn.predict(X_train)):.2f}")

Accuracy of Train data: 0.97


#### Test accuracy

In [27]:
pd.crosstab(Y_test, knn.predict(X_test), rownames=["Observed"], colnames=["Predicted"])

Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,109,3
1,5,58


In [28]:
print(classification_report(Y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       112
           1       0.95      0.92      0.94        63

   micro avg       0.95      0.95      0.95       175
   macro avg       0.95      0.95      0.95       175
weighted avg       0.95      0.95      0.95       175



In [29]:
print(f"Accuracy of Test data: {accuracy_score(Y_test, knn.predict(X_test)):.2f}")

Accuracy of Test data: 0.95
