In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from imblearn.over_sampling import BorderlineSMOTE

matplotlib.style.use('ggplot')

In [2]:
# loading csv file
df = pd.read_csv('../datasets/breast-cancer-wisconsin.data', names=['sample', 'thickness', 'size', 'shape', 'adhesion', 'epithelial', 'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'status'], na_values='?')

print(df.isnull().sum())
df.head(15)

sample         0
thickness      0
size           0
shape          0
adhesion       0
epithelial     0
nuclei        16
chromatin      0
nucleoli       0
mitoses        0
status         0
dtype: int64


Unnamed: 0,sample,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [3]:
df['status'].value_counts()

2    458
4    241
Name: status, dtype: int64

In [4]:
y = df['status']
X = df.drop(columns=['status', 'sample'])
# 'status' goes for labels and number of 'sample' isn't important

print(X.dtypes)
X.head(15)

thickness       int64
size            int64
shape           int64
adhesion        int64
epithelial      int64
nuclei        float64
chromatin       int64
nucleoli        int64
mitoses         int64
dtype: object


Unnamed: 0,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1
5,8,10,10,8,7,10.0,9,7,1
6,1,1,1,1,2,10.0,3,1,1
7,2,1,2,1,2,1.0,3,1,1
8,2,1,1,1,2,1.0,1,1,5
9,4,2,1,1,2,1.0,2,1,1


In [5]:
# cleaning
X.fillna(X.mean(), inplace=True)
# changing NaN values for mean values
print(X.isna().sum())

thickness     0
size          0
shape         0
adhesion      0
epithelial    0
nuclei        0
chromatin     0
nucleoli      0
mitoses       0
dtype: int64


In [6]:
# making correction of types
X['nuclei'] = X['nuclei'].astype('int64')
print(X.dtypes)
X.head(10)

thickness     int64
size          int64
shape         int64
adhesion      int64
epithelial    int64
nuclei        int64
chromatin     int64
nucleoli      int64
mitoses       int64
dtype: object


Unnamed: 0,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
5,8,10,10,8,7,10,9,7,1
6,1,1,1,1,2,10,3,1,1
7,2,1,2,1,2,1,3,1,1
8,2,1,1,1,2,1,1,1,5
9,4,2,1,1,2,1,2,1,1


In [7]:
oversample = BorderlineSMOTE(sampling_strategy=0.7)
X, y = oversample.fit_resample(X, y)
y.value_counts()

2    458
4    320
Name: status, dtype: int64

In [8]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
np.random.seed(123)

In [9]:
# preprocessing
preprocesser = MinMaxScaler()
X = preprocesser.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [11]:
# decomposition
pca = PCA()
# classifier
knn = KNeighborsClassifier()

In [12]:
# pipeline
pipeline = Pipeline(steps=[('pca', pca), ('knn', knn)])

n_components = np.arange(2, 6)
n_neighbors  = np.arange(2, 9)
weights      = ['uniform', 'distance']

parameters = dict(pca__n_components=n_components, 
                  knn__n_neighbors =n_neighbors,
                  knn__weights     =weights)

In [13]:
# searching for best model and fitting
model = GridSearchCV(pipeline, parameters, cv=7)
model.fit(X_train, y_train)

In [14]:
print('Best number of PCA components:', model.best_estimator_.get_params()['pca__n_components'])
print('Best number of knn neighbors:', model.best_estimator_.get_params()['knn__n_neighbors'])
print('Best knn weights:', model.best_estimator_.get_params()['knn__weights'])
print('Best scores:', f'Accuracy: {100 * model.best_score_ :.4f}%')

Best number of PCA components: 5
Best number of knn neighbors: 5
Best knn weights: uniform
Best scores: Accuracy: 97.1269%


In [15]:
def plottingFun(model, X, y):
    fig = plt.figure()
    ax  = fig.add_subplot(111)
    
    padding    = 0.1
    resolution = 0.1
    
    # !!! (2 for benign, 4 for malignant)
    colors = {2:'royalblue', 4:'lightsalmon'}
    
    
    # calculate boundaries
    X_min, X_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    X_range = X_max - X_min
    y_range = y_max - y_min
    X_min  -= X_range * padding
    y_min  -= y_range * padding
    X_max  += X_range * padding
    y_max  += y_range * padding
    
    
    xx, yy = np.meshgrid(np.arange(X_min, X_max, resolution), np.arange(y_min, y_max, resolution))
    
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    plt.contourf(xx, yy, Z, cmap=plt.cm.seismic)
    plt.axis('tight')
    
    
    for label in np.unique(y):
        indices = np.where(y==label)
        plt.scatter(X[indices, 0], X[indices, 1], c=colors[label], alpha=0.7, label='{}'.format('benign' if label==2 else 'malignant'))
        
    plt.title('K = ' + str(model.get_params()['n_neighbors']))
    plt.legend(loc='best')
    plt.show()

In [16]:
pca_plot = PCA(n_components=2) # 2D
X_plot = pca_plot.fit_transform(X_train)

knn_plot = KNeighborsClassifier(n_neighbors = model.best_estimator_.get_params()['knn__n_neighbors'], 
                           weights     = model.best_estimator_.get_params()['knn__weights'])
knn_plot.fit(X_plot, y_train)

In [17]:
# plotting the samples and algorithm results
%matplotlib notebook
plottingFun(knn_plot, X_plot, y_train)

<IPython.core.display.Javascript object>

In [18]:
y_pred = model.predict(X_test)

In [19]:
def get_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / y_true.shape[0]

In [20]:
accuracy = get_accuracy(y_test, y_pred)
print( f'Accuracy: {100 * accuracy :.4f}%' )

Accuracy: 98.2906%


In [21]:
import seaborn as sn
from sklearn import metrics

In [22]:
pca_components = model.best_estimator_.get_params()['pca'].components_
print(pca_components)

[[ 0.29299737  0.40311309  0.38940134  0.34268479  0.25946825  0.4315539
   0.2899492   0.35706143  0.13213693]
 [-0.11601901  0.23472124  0.14040567  0.04545603  0.29698629 -0.82124621
   0.00639356  0.33826092  0.18043836]
 [-0.74422577  0.02469438 -0.06347962  0.62990724  0.06187836  0.09205658
   0.06800028 -0.16652186  0.00696278]
 [ 0.41168043  0.16757947  0.10035273  0.30394796  0.22484432 -0.2086493
  -0.09322924 -0.77242466  0.02366984]
 [-0.41138808  0.30539107  0.31575344 -0.61690486  0.3502419   0.17590492
   0.07911667 -0.3083924   0.04740305]]


In [23]:
# plotting the inportance of pca components
cols = ['thickness', 'size', 'shape', 'adhesion', 'epithelial', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
map = pd.DataFrame(pca_components, columns=cols)
plt.figure()
plt.title('PCA components')
sn.heatmap(map, cmap='seismic')
plt.show()

<IPython.core.display.Javascript object>

In [24]:
cm = metrics.confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 7))
sn.heatmap(cm, annot=True, cmap=plt.cm.Spectral)
plt.ylabel('Actual classes', size=15)
plt.xlabel('Predicted classes', size=15)
plt.title(f'Accuracy: {accuracy*100:.2f}%', size=15)
plt.show()

<IPython.core.display.Javascript object>