# 데이터 준비

https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ <br>
"?" 문자열이 존재. 이를 누락 값으로 처리하여 최빈값으로 대체 필요. <br> 
class 특징 값으로 양성 종양(2), 악성 종양(4)를 갖는데 이를 각각 0과 1로 변환해야 함

In [193]:
import pandas as pd

col_names = ["id", "clump_thickness", "un_cell_size", "un_cell_shape", "marginal_adheshion", "single_eph_cell_size", "bare_nuclei", "bland_chromatin", "normal_nucleoli", "mitoses", "class"]
breast_cancer = pd.read_csv('./breast-cancer-wisconsin.data', header=None, names=col_names)
breast_cancer.head()

Unnamed: 0,id,clump_thickness,un_cell_size,un_cell_shape,marginal_adheshion,single_eph_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [194]:
breast_cancer.info()

# 'bare_nuclei'의 Dtype만 'object'임

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    699 non-null    int64 
 1   clump_thickness       699 non-null    int64 
 2   un_cell_size          699 non-null    int64 
 3   un_cell_shape         699 non-null    int64 
 4   marginal_adheshion    699 non-null    int64 
 5   single_eph_cell_size  699 non-null    int64 
 6   bare_nuclei           699 non-null    object
 7   bland_chromatin       699 non-null    int64 
 8   normal_nucleoli       699 non-null    int64 
 9   mitoses               699 non-null    int64 
 10  class                 699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


## 누락값 대체 및 클래스 레이블을 0과 1로 변환

In [195]:
breast_cancer.isnull().values.sum() # null이 없다.

0

In [196]:
import numpy as np

breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.replace('?', np.NaN)
breast_cancer.bare_nuclei = \
    breast_cancer.bare_nuclei.fillna(breast_cancer.bare_nuclei.value_counts().index[0])  # 최빈값으로 대체

In [197]:
breast_cancer['cancer_ind'] = 0 
breast_cancer.loc[breast_cancer['class'] == 4, 'cancer_ind'] = 1

# 'class'이라는 Column의 값은 2 또는 4 값이 있는데
# 2로 되어있는 것은 0, 4로 되어어있는 것은 1로 바꾸기 위해 `cancer_ind`라는 Column을 추가하였다.

## 불필요한 변수 제거 및 표준화 적용

In [198]:
X_df = breast_cancer.drop(['id', 'class', 'cancer_ind'], axis=1)
y = breast_cancer['cancer_ind']

In [199]:
from sklearn.preprocessing import StandardScaler  # Z-Score (x-평균/표준편차) -> 표준화 점수로 변환 -> 비교가 가능하도록 하기 위해서

scaler = StandardScaler()

In [200]:
X_scaled_df = scaler.fit_transform(X_df) 

In [201]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.3, random_state=42)

# K-NeighborsClassifer를 이용한 학습

In [202]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3) 
knn.fit(X_train, y_train)

In [203]:
y_pred = knn.predict(X_test)

# 성능 측정 : KNN 모델의 오차행렬, 정확도, AUC

In [204]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9761904761904762

In [205]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[141,   2],
       [  3,  64]])

In [206]:
from sklearn.metrics import roc_auc_score, roc_curve

roc_auc_score(y_test, y_pred)

0.9706189333055005

# GridSearchCV를 이용한 hyper parameter 최적 값 선택

In [214]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(knn, {'n_neighbors' : [1, 2, 3, 4, 5]}, \
    n_jobs=-1, cv=7, scoring='roc_auc')

grid_search.fit(X_train, y_train)

In [215]:
grid_search.best_params_

{'n_neighbors': 4}

In [216]:
knn_best = grid_search.best_estimator_

In [217]:
y_pred = knn_best.predict(X_test)

In [218]:
accuracy_score(y_test, y_pred)

0.9619047619047619

In [220]:
confusion_matrix(y_test, y_pred)

array([[141,   2],
       [  6,  61]])

In [222]:
roc_auc_score(y_test, y_pred)

0.948230873604008