<a href="https://colab.research.google.com/github/Inkyu-Yang356/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [7]:
import pandas as pd

# Load the wine dataset from a remote CSV file into a DataFrame.
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [8]:
# Show the first 5 rows of wine
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [9]:
# Print the shape of wine (number of rows, number of columns)
print(wine.shape)

(6497, 4)


In [10]:
# Summary statistics of wine (per-column mean, standard deviation, min, max, etc.)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [11]:
# Count the samples in each class (white wine vs. red wine)
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [12]:
# Labels come from the 'class' column; the features are the remaining
# three columns (alcohol, sugar, pH). Both are converted to NumPy arrays.
target = wine.loc[:, 'class'].to_numpy()
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; random_state is fixed so the
# split is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42)

In [14]:
# Split the training set once more, reserving 20% of it as a validation set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

In [15]:
# Check the sizes of the sub-training and validation splits.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [16]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the sub-training split; fit() hands back the
# fitted estimator, so construction and training are chained.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Accuracy on the data the tree was trained on, then on the validation split.
train_accuracy = dt.score(sub_input, sub_target)
val_accuracy = dt.score(val_input, val_target)
print(train_accuracy)
print(val_accuracy)

0.9971133028626413
0.864423076923077


## 교차 검증

In [17]:
from sklearn.model_selection import cross_validate

# The result is a dict holding fit_time, score_time and test_score arrays
# (one entry per fold).
scores = cross_validate(dt, train_input, train_target)  # cross-validate the decision tree model
print(scores)

{'fit_time': array([0.01666832, 0.01454496, 0.02345228, 0.02039123, 0.07605338]), 'score_time': array([0.00230622, 0.00427628, 0.00459242, 0.00207567, 0.00572205]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [18]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(np.mean(scores['test_score']))

0.855300214703487


In [19]:
from sklearn.model_selection import StratifiedKFold

# Same cross-validation, but with a StratifiedKFold splitter passed explicitly
# through the cv argument.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [20]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # 10 folds, with shuffling enabled
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate values of min_impurity_decrease to try in the grid search.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [22]:
# Grid search over params using a fresh decision tree as the base estimator.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [23]:
# Run the grid search on the training data.
gs.fit(train_input, train_target)

In [24]:
# Rebind dt to the best estimator found by the grid search and score it
# on the training data.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [25]:
print(gs.best_params_) # best min_impurity_decrease value found

{'min_impurity_decrease': 0.0001}


In [26]:
print(gs.cv_results_['mean_test_score'])  # cross-validation result (mean test score) per candidate

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [27]:
# Locate the candidate with the highest mean cross-validation score by hand
# and print its parameter setting.
mean_scores = gs.cv_results_['mean_test_score']
best_index = mean_scores.argmax()
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [28]:
# Larger grid: three hyperparameters searched jointly.
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }  # a moderate range for each parameter

In [29]:
# Rebuild the grid search with the larger grid and run it on the training data.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [30]:
# Best parameter combination found by the larger grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [31]:
# Highest mean cross-validation score among all candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [32]:
# Show the mean fit time of each candidate during cross-validation
gs.cv_results_['mean_fit_time']

array([0.02508965, 0.01821637, 0.04008589, ..., 0.00748053, 0.00628705,
       0.00528779])

### 랜덤 서치

In [33]:
from scipy.stats import uniform, randint

In [34]:
# Sampling from a uniform distribution
rgen = randint(0, 10)  # integers from 0 through 9 (the upper bound 10 is excluded)
rgen.rvs(10)  # draw 10 random samples

array([2, 5, 0, 6, 1, 7, 9, 3, 2, 0])

In [35]:
np.unique(rgen.rvs(1000), return_counts=True) # also return how often each value occurs

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([102,  79, 124, 111, 106,  99, 103, 102,  92,  82]))

In [36]:
ugen = uniform(0, 1)  # real numbers between 0 and 1
ugen.rvs(10)

array([0.11890752, 0.34643523, 0.04332004, 0.2780947 , 0.67018034,
       0.13265397, 0.2198937 , 0.57474302, 0.34730468, 0.87127923])

In [37]:
# Distributions to sample each hyperparameter from during the random search.
# NOTE(review): scipy's uniform takes (loc, scale), so the first entry
# presumably samples from [0.0001, 0.0011] rather than [0.0001, 0.001] —
# confirm the intended upper bound.
params = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [38]:
from sklearn.model_selection import RandomizedSearchCV

# Random search: 100 parameter settings sampled from the distributions in params.
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)  # n_jobs=-1: use all CPU cores
rs.fit(train_input, train_target)  # fit on the training data

In [39]:
print(rs.best_params_)  # best parameter combination found by the random search

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [40]:
print(np.max(rs.cv_results_['mean_test_score']))  # maximum of the mean test scores

0.8695428296438884


In [41]:
dt = rs.best_estimator_  # the best-performing model from the random search

# Score that model on the held-out test set.
print(dt.score(test_input, test_target))

0.86


In [42]:
# Show the mean fit time of each sampled parameter setting.
rs.cv_results_['mean_fit_time']

array([0.00852537, 0.00716767, 0.00771227, 0.00840969, 0.0067307 ,
       0.00780797, 0.0087719 , 0.00793247, 0.00691891, 0.00884576,
       0.00697098, 0.00680428, 0.00719876, 0.0075284 , 0.00656314,
       0.00716882, 0.00723963, 0.00685396, 0.00786052, 0.00749373,
       0.00819478, 0.00730796, 0.0078608 , 0.00699811, 0.00853281,
       0.00781965, 0.00737877, 0.00739875, 0.00890965, 0.00659661,
       0.00685239, 0.00741091, 0.00687318, 0.00803456, 0.008635  ,
       0.00729899, 0.0068501 , 0.0092298 , 0.00682793, 0.00699806,
       0.00696988, 0.00744815, 0.00760064, 0.00814452, 0.00820293,
       0.00674748, 0.00715594, 0.00685267, 0.0075757 , 0.00796571,
       0.0071979 , 0.00753651, 0.00649037, 0.00643516, 0.00661273,
       0.00704789, 0.00680308, 0.00783772, 0.00667939, 0.00680509,
       0.00819483, 0.00684953, 0.00787449, 0.00694318, 0.00630918,
       0.00729151, 0.00646558, 0.00674596, 0.00708985, 0.01218739,
       0.00668998, 0.00782919, 0.00658951, 0.00700808, 0.00694

In [43]:
# Average fit time over all sampled parameter settings.
print(np.mean(rs.cv_results_['mean_fit_time']))

0.007462034702301025


### 결정트리 분할 옵션 변경

In [44]:
# Same random search settings as rs, but the base tree uses splitter='random'.
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)  # fit on the training data

In [45]:
# Report the best parameter combination of the random-splitter search and
# the highest mean cross-validation score among its sampled settings.
cv_means = rs2.cv_results_['mean_test_score']
print(rs2.best_params_)
print(cv_means.max())

# Rebind dt to the best estimator of this search and score it on the test set.
dt = rs2.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [46]:
rs2.cv_results_['mean_fit_time']  # mean fit time of each sampled parameter setting

array([0.00442381, 0.0055666 , 0.00323925, 0.00326633, 0.00484309,
       0.0040556 , 0.00763226, 0.00499649, 0.00712061, 0.00490375,
       0.0076385 , 0.00346246, 0.00829353, 0.00330386, 0.00307541,
       0.00535321, 0.00837011, 0.00937924, 0.00838637, 0.00651445,
       0.00536628, 0.00601964, 0.00748405, 0.00743713, 0.00334754,
       0.00821757, 0.00706453, 0.00721297, 0.00516047, 0.00949435,
       0.00406961, 0.00403185, 0.00369735, 0.00869226, 0.00340462,
       0.00452175, 0.0085989 , 0.00832686, 0.00707512, 0.0042779 ,
       0.01098995, 0.00754838, 0.0085372 , 0.00939455, 0.00473599,
       0.00331445, 0.00302019, 0.00308847, 0.00300875, 0.00298781,
       0.00550299, 0.00768099, 0.007552  , 0.00612755, 0.00587811,
       0.00514441, 0.00534339, 0.00889544, 0.0077075 , 0.00502892,
       0.00974884, 0.0073719 , 0.00647864, 0.00741563, 0.00512629,
       0.00459218, 0.00308685, 0.00594239, 0.00782032, 0.00713282,
       0.00350132, 0.00407963, 0.00452094, 0.00504208, 0.00434

In [47]:
# Average fit time over all sampled parameter settings of the random-splitter search.
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.005798003673553467


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

첫 번째는 학습해야 할 Parameters가 많으며, 또 결정 트리 모델에 splitter가 지정되어 있지 않지만(기본값 'best'), 두 번째는 splitter='random'으로 분할 지점을 무작위로 선택해 노드를 나누기 때문에 학습 시간이 더 짧게 걸린다.