## GridSearchCV
#### 유방암(Breast cancer) 데이터 분류

##### 1. 데이터 전처리 및 탐색

In [71]:
# Load the breast cancer dataset bundled with scikit-learn.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [72]:
import pandas as pd

# Wrap the feature matrix in a DataFrame and append the class label as the
# last column, then preview the final rows.
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names).assign(
    target=cancer.target
)
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [73]:
# (rows, columns) of the DataFrame, including the added 'target' column.
df.shape

(569, 31)

In [74]:
# Label encoding: 0 = malignant, 1 = benign.
# Count how many samples fall into each class.
df['target'].value_counts()

1    357
0    212
Name: target, dtype: int64

In [75]:
# Class names indexed by label value (index 0 -> 'malignant', 1 -> 'benign').
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

##### 2. 훈련/테스트 데이터셋 분리

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. Stratifying on the label keeps
# the class ratio the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=2023,
    stratify=cancer.target,
)
# Display the shapes of the four resulting arrays.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((455, 30), (114, 30), (455,), (114,))

In [107]:
# Distribution of the label values in the training split:
# returns the distinct labels and how often each occurs.
import numpy as np
np.unique(y_train, return_counts=True)

(array([0, 1]), array([170, 285], dtype=int64))

In [108]:
# Same label/count breakdown for the test split.
np.unique(y_test, return_counts=True)

(array([0, 1]), array([42, 72], dtype=int64))

##### 3. 학습

In [79]:
# Create a decision tree classifier with a fixed seed and list its
# hyperparameters (everything except random_state is left at its default).
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2023)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [80]:
# Classification is supervised learning, so both X and y are required;
# training uses X_train and y_train.
dtc.fit(X_train, y_train)

##### 4. 예측

In [81]:
# Predict class labels for the held-out test features.
pred = dtc.predict(X_test)

In [82]:
# Put the true and predicted test labels side by side for inspection.
# The column names are Korean for "actual y value" / "predicted y value".
rf = pd.DataFrame({'y 실제값': y_test, 'y 예측값': pred})
rf.head()

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,1,1
3,1,1
4,1,1


##### 5. 평가

In [83]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.9210526315789473

In [84]:
# Score the fitted tree directly on the test set; the output matches the
# accuracy_score result computed from the predictions.
dtc.score(X_test, y_test)

0.9210526315789473

### GridSearchCV 적용
- 학습/훈련시 사용

In [85]:
# Hyperparameter grid to search: every combination of the listed
# max_depth and min_samples_split values (3 x 3 = 9 candidates).
params = {
    'max_depth': [2, 5, 8],
    'min_samples_split': [2, 3, 4]
}

In [86]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc,                    # estimator: the decision tree classifier
    param_grid = params,    # hyperparameter combinations to try
    scoring='accuracy',     # evaluation metric: accuracy
    cv=5                    # number of cross-validation folds
)
# 3 x 3 combinations x 5 folds = 45 cross-validation fits, plus one final
# refit of the best combination on the whole training set
# (refit=True is the GridSearchCV default), i.e. 46 fits in total.

In [87]:
# Run the grid search on the training data.
grid_dt.fit(X_train, y_train)

In [88]:
# Hyperparameter combination with the best cross-validation score.
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [89]:
# Mean cross-validated accuracy of the best combination
# (measured on folds of the training data, not on the test set).
grid_dt.best_score_

0.9472527472527472

In [90]:
# Evaluate the best model found by the search on the test set.
# NOTE(review): the test accuracy printed below (0.8947) is lower than the
# untuned tree's test accuracy (0.9211). A higher cross-validation score on
# the training folds does not guarantee a higher score on this test split.
grid_dt.best_estimator_.score(X_test, y_test)

0.8947368421052632

- 파라메터의 범위를 좁혀가면서 계속 수행

In [91]:
# Narrow the max_depth range around the previously selected value and
# repeat the search with the same metric and fold count.
params = dict(
    max_depth=[4, 5, 6],
    min_samples_split=[2, 3, 4],
)
grid_dt = GridSearchCV(
    estimator=dtc,          # decision tree classifier to tune
    param_grid=params,      # narrowed hyperparameter grid
    scoring='accuracy',     # evaluation metric: accuracy
    cv=5,                   # number of cross-validation folds
)
grid_dt.fit(X_train, y_train)

In [92]:
# Best combination found in the narrowed grid.
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

- 베스트 모델(최적 분류기)로 예측 및 평가

In [93]:
# Keep the refitted best estimator and score it on the test set.
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.8947368421052632