<a href="https://colab.research.google.com/github/MoriartyKang/ML/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [None]:
import pandas as pd

# Load the wine dataset from a remote CSV into a DataFrame; later cells read
# its 'alcohol', 'sugar', 'pH' and 'class' columns.
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [None]:
# Inspect the first 5 rows of wine
wine.head(5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [None]:
# Check the total number of rows (and columns) in wine
print(wine.shape)

(6497, 4)


In [None]:
# Summary statistics for wine (per-column mean, standard deviation, min, max, etc.)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [None]:
# Count how many samples belong to each class (white wine vs. red wine)
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [None]:
# Separate the DataFrame into a feature matrix (three numeric columns) and a
# label vector, both as NumPy arrays.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

In [None]:
# Randomly split the dataset into 80% training data and 20% test data
# (random_state is fixed so the split is reproducible).
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42)

In [None]:
# Split the training data once more: 80% is kept for fitting (sub_*) and
# 20% is held out as a validation set (val_*).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

In [None]:
# Check the sizes of the sub-training set and the validation set
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [None]:
# Fit a decision tree (default hyperparameters) on the sub-training split,
# then print its accuracy on that split followed by its accuracy on the
# validation split; a large gap between the two indicates overfitting.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

print(dt.score(sub_input, sub_target), dt.score(val_input, val_target), sep='\n')

0.9971133028626413
0.864423076923077


## 교차 검증

In [None]:
# Cross-validate the decision tree on the full training set, leaving cv at
# its default; the result is a dict with 'fit_time', 'score_time' and
# 'test_score' arrays (one entry per fold).
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.03606963, 0.02873015, 0.04017568, 0.03807163, 0.02842617]), 'score_time': array([0.01300716, 0.00208879, 0.00672793, 0.00457644, 0.00606632]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [None]:
# Average the per-fold cross-validation scores
import numpy as np

print(np.mean(scores['test_score']))

0.855300214703487


In [None]:

# Repeat the cross-validation with an explicitly constructed StratifiedKFold
# splitter (default settings). The recorded mean score is identical to the
# earlier cross_validate call that left cv unset.
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [None]:
# 10-fold stratified cross-validation; the data is shuffled before splitting
# and random_state is fixed so the folds are reproducible.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid of candidate min_impurity_decrease values: 0.0001, 0.0002, ..., 0.0005.
params = {'min_impurity_decrease': [step / 10000 for step in range(1, 6)]}

In [None]:
# Grid search over `params` with a decision tree as the base estimator;
# n_jobs=-1 asks scikit-learn to run the fits in parallel on all CPU cores.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [None]:
# Run the grid search (cross-validating every candidate) on the training set
gs.fit(train_input, train_target)

In [None]:
# Take the best estimator found by the grid search and report its accuracy
# on the training set. Note that this rebinds the global name `dt`.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [None]:
# Show the parameter combination that achieved the best mean CV score
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [None]:
# Mean cross-validation score for each candidate in the grid
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [None]:
# Recover the best parameters by hand: find the index of the highest mean CV
# score and look up the parameter dict stored at that index.
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [None]:
# Larger grid over three hyperparameters at once:
#   min_impurity_decrease: from 0.0001 up to (excluding) 0.001 in steps of 0.0001
#   max_depth:             integers 5..19
#   min_samples_split:     2, 12, 22, ..., 92
# Every combination of the three is evaluated, so the search cost grows
# multiplicatively with each added parameter.
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }

In [None]:
# Rebuild the grid search with the current `params` and fit it on the
# training set (parallelised across all cores via n_jobs=-1).
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [None]:
# Best parameter combination found by the multi-parameter grid search
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [None]:
# Highest mean cross-validation score among all candidates
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [None]:
# Show the mean fit time recorded during cross-validation for each candidate
gs.cv_results_['mean_fit_time']

array([0.01929951, 0.04127059, 0.06968102, ..., 0.00833912, 0.00462952,
       0.00543079])

### 랜덤 서치

In [None]:
from scipy.stats import uniform, randint

In [None]:
# Sample from a discrete uniform distribution: a frozen scipy.stats.randint
# over the integers 0..9 (the upper bound 10 is excluded), drawn ten times.
rgen = randint(low=0, high=10)
rgen.rvs(size=10)

array([7, 1, 0, 8, 6, 2, 8, 0, 0, 6])

In [None]:
np.unique(rgen.rvs(1000), return_counts=True) # also return how often each value occurs

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 92,  91, 100,  96, 110,  87, 117, 103, 108,  96]))

In [None]:
# Frozen continuous uniform distribution on [0, 1]; draw ten real-valued samples.
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)

array([0.02777908, 0.53140565, 0.22600484, 0.30637314, 0.79826869,
       0.36162616, 0.47980262, 0.6328237 , 0.77838442, 0.89945592])

In [None]:
# Search space for the randomized search, expressed as frozen scipy.stats
# distributions instead of explicit value lists.
# randint(low, high) excludes `high`, e.g. max_depth is drawn from 20..49.
# NOTE(review): uniform(loc, scale) spans [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011]; if the intended upper
# bound was 0.001, the scale should be 0.0009 — confirm.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [None]:
# Use randomized search: instead of trying every combination, draw
# n_iter=100 candidates from the distributions in `params` and cross-validate
# each one. random_state fixes the sampling; n_jobs=-1 uses all CPU cores.
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

In [None]:
# Best parameter combination among the sampled candidates
print(rs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [None]:
# Highest mean cross-validation score among the sampled candidates
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [None]:
# Evaluate the best estimator from the randomized search on the held-out
# test set. Note that this rebinds the global name `dt`.
dt = rs.best_estimator_

print(dt.score(test_input, test_target))

0.86


In [None]:
# Mean fit time per sampled candidate (one entry per candidate)
rs.cv_results_['mean_fit_time']

array([0.00700192, 0.00723543, 0.00753069, 0.00771675, 0.00730247,
       0.00910559, 0.00687523, 0.00688968, 0.00839233, 0.00708389,
       0.00699997, 0.01288209, 0.00767064, 0.00821848, 0.00686083,
       0.00853434, 0.00680208, 0.00710239, 0.00765295, 0.00697207,
       0.00807629, 0.00672312, 0.00706315, 0.00695734, 0.00796437,
       0.00772848, 0.0069561 , 0.00723109, 0.00728536, 0.0068984 ,
       0.00687976, 0.01176839, 0.00691381, 0.00846853, 0.00933995,
       0.00726266, 0.00684819, 0.00989232, 0.00802827, 0.0073596 ,
       0.00736637, 0.00780416, 0.00685239, 0.00733032, 0.0082962 ,
       0.00680099, 0.00794973, 0.00706286, 0.00767932, 0.00697613,
       0.00737538, 0.00786824, 0.00719199, 0.00675659, 0.00674534,
       0.00718417, 0.00696778, 0.00804858, 0.00711455, 0.00685945,
       0.00817904, 0.00684524, 0.00665708, 0.0075717 , 0.00662036,
       0.00669594, 0.00682979, 0.00699449, 0.00688806, 0.00898037,
       0.00678153, 0.00813766, 0.00856791, 0.00669684, 0.00957

In [None]:
# Average fit time over all sampled candidates, for comparison with rs2 below
print(np.mean(rs.cv_results_['mean_fit_time']))

0.007553496837615967


### 결정트리 분할 옵션 변경

In [None]:
# Same randomized search as `rs` (same params, n_iter and random_state), but
# the base tree is built with splitter='random' instead of the default.
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)

In [None]:
# Report the best parameters and the best mean CV score of the
# splitter='random' search, then score its best estimator on the test set.
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))

# Rebinds the global name `dt` to the best estimator of rs2.
dt = rs2.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [None]:
# Mean fit time per sampled candidate for the splitter='random' search
rs2.cv_results_['mean_fit_time']

array([0.00377913, 0.00353098, 0.00389204, 0.00341649, 0.00312347,
       0.00333924, 0.00311871, 0.00315027, 0.0049367 , 0.00348568,
       0.00328889, 0.00314245, 0.00452585, 0.00398345, 0.00423646,
       0.00363727, 0.00362587, 0.0058104 , 0.01034102, 0.00422406,
       0.00357037, 0.00336652, 0.00469718, 0.00368772, 0.00331492,
       0.00371165, 0.0031621 , 0.00319381, 0.0035378 , 0.00338721,
       0.00317349, 0.00363564, 0.00302095, 0.00336609, 0.00347886,
       0.00352073, 0.00336347, 0.00349951, 0.00306621, 0.00352716,
       0.00326672, 0.00366669, 0.00337658, 0.00336814, 0.00333042,
       0.00321841, 0.00319586, 0.00404549, 0.00347619, 0.00341334,
       0.00502219, 0.00328984, 0.00383258, 0.00319757, 0.00334921,
       0.0033721 , 0.00380354, 0.00344663, 0.00344024, 0.00337806,
       0.00354857, 0.00306773, 0.00335994, 0.00351701, 0.00317197,
       0.0032052 , 0.0033731 , 0.00555482, 0.00312719, 0.00342116,
       0.00328517, 0.00324864, 0.00353289, 0.00425963, 0.01077

In [None]:
# Average fit time over all candidates, to compare against the same figure for rs
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.00504103422164917


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

---
계산 속도를 향상시킴, 최적 분할을 탐색하지 않고 무작위로 뽑은 분할 후보 중 가장 좋은 것을 사용함 (`splitter='random'`; 불순도 기준은 기본값인 지니 불순도를 그대로 사용)
