In [16]:
import warnings
warnings.filterwarnings('ignore')

### 교차 검증 실습

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [18]:
fish_df = pd.read_csv('../../data/fish.csv')
fish_input = fish_df.drop('Species', axis=1)
fish_target = fish_df['Species']

X_train, X_test, y_train, y_test = train_test_split(fish_input, fish_target, random_state=42, stratify=fish_target)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 생선 다중 분류 with cross_val_score

In [28]:
# 교차검증 점수
from sklearn.model_selection import cross_val_score

lg_clf = LogisticRegression(max_iter=1000, solver='newton-cg')

scores = cross_val_score(lg_clf, fish_input, fish_target, cv=5, scoring='accuracy')


# 학습/예측/평가
print("훈련별 정확도: ", scores)
print("모델 정확도", np.mean(scores))

훈련별 정확도:  [0.6875     0.9375     0.96875    1.         0.67741935]
모델 정확도 0.854233870967742


### 생선 다중 분류 with GridSearchCV

In [33]:
# GridSearchCV 사용 -> 최적의 파라미터, 평가점수, 모델확인
from sklearn.model_selection import GridSearchCV, StratifiedKFold
params = {
    'max_iter': [10, 100, 200, 500, 1000, 1500],
    'solver': ['liblinear', 'newton-cg', 'lbfgs']
}
stratifiedkfold = StratifiedKFold()
grid = GridSearchCV(lg_clf, param_grid=params, scoring='accuracy', cv=stratifiedkfold)

grid.fit(fish_input, fish_target)
print("최적의 파라미터: ", grid.best_params_)
print("최적화된 모델 객체: ", grid.best_estimator_)
print("최적화된 점수: ", grid.best_score_)

# 예측 결과 평가
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

최적의 파라미터:  {'max_iter': 1000, 'solver': 'lbfgs'}
최적화된 모델 객체:  LogisticRegression(max_iter=1000)
최적화된 점수:  0.917741935483871


0.975

---
### HyperOpt

---

### Optuna

**hyper.hp클래스**
<table border="1">
  <thead>
    <tr>
      <th>함수명</th>
      <th>설명</th>
      <th>사용 방법</th>
      <th>예시 코드</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>hp.uniform</td>
      <td>연속적인 실수 값 샘플링</td>
      <td>hp.uniform(label, low, high)</td>
      <td><code>hp.uniform('learning_rate', 0.01, 0.1)</code></td>
    </tr>
    <tr>
      <td>hp.quniform</td>
      <td>연속적이지만 일정 간격(q)을 갖는 값 샘플링</td>
      <td>hp.quniform(label, low, high, q)</td>
      <td><code>hp.quniform('num_layers', 1, 5, 1)</code></td>
    </tr>
    <tr>
      <td>hp.loguniform</td>
      <td>로그 스케일로 분포된 실수 값 샘플링</td>
      <td>hp.loguniform(label, low, high)</td>
      <td><code>hp.loguniform('reg_param', -3, 0)</code></td>
    </tr>
    <tr>
      <td>hp.randint</td>
      <td>정수 값 샘플링</td>
      <td>hp.randint(label, upper)</td>
      <td><code>hp.randint('num_trees', 1, 100)</code></td>
    </tr>
    <tr>
      <td>hp.choice</td>
      <td>주어진 리스트 중 임의의 값 샘플링</td>
      <td>hp.choice(label, options)</td>
      <td><code>hp.choice('optimizer', ['adam', 'sgd', 'rmsprop'])</code></td>
    </tr>
    <tr>
      <td>hp.normal</td>
      <td>정규분포에서 값 샘플링</td>
      <td>hp.normal(label, mean, std)</td>
      <td><code>hp.normal('dropout_rate', 0.3, 0.05)</code></td>
    </tr>
    <tr>
      <td>hp.lognormal</td>
      <td>로그 정규분포에서 값 샘플링</td>
      <td>hp.lognormal(label, mean, std)</td>
      <td><code>hp.lognormal('scale', 0, 1)</code></td>
    </tr>
  </tbody>
</table>

In [None]:
!pip install hyperopt

In [None]:
!pip install optuna

In [36]:
from hyperopt import hp

search_space = {
    'x': hp.quniform('x', -10, 10, 1),
    'y': hp.quniform('y', -15, 15, 1)
}

In [37]:
import hyperopt

# 목적 함수
def objective(search_space):
    x = search_space['x']
    y = search_space['y']

    return {
        'loss': x**2 + 20 * y,
        'status': hyperopt.STATUS_OK
    }

In [39]:
from hyperopt import fmin, tpe, Trials

# 하이퍼 파라미터 탐색 과정을 저장하는 객체
trials = Trials()

# fmin(): 목적 함수의 최소값을 찾는 함수
best_val = fmin(
    fn=objective,       # 목적함수
    space=search_space, # 검색공간
    algo=tpe.suggest,   # 베이지안 최적화 적용
    max_evals=500,       # 반복 횟수
    trials=trials       # 탐색과정 저장
)

best_val

  0%|          | 0/500 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 500/500 [00:05<00:00, 88.33trial/s, best loss: -300.0] 


{'x': np.float64(0.0), 'y': np.float64(-15.0)}

In [46]:
# 탐색과정 -> 목적함수 반환값 (loos와 실행 상태 저장)
print(trials.results[:10])
print(trials.vals)


[{'loss': -276.0, 'status': 'ok'}, {'loss': 261.0, 'status': 'ok'}, {'loss': -195.0, 'status': 'ok'}, {'loss': -204.0, 'status': 'ok'}, {'loss': -176.0, 'status': 'ok'}, {'loss': 285.0, 'status': 'ok'}, {'loss': 244.0, 'status': 'ok'}, {'loss': 41.0, 'status': 'ok'}, {'loss': -71.0, 'status': 'ok'}, {'loss': 24.0, 'status': 'ok'}]
{'x': [np.float64(-2.0), np.float64(1.0), np.float64(5.0), np.float64(6.0), np.float64(-8.0), np.float64(5.0), np.float64(-2.0), np.float64(-1.0), np.float64(-7.0), np.float64(-8.0), np.float64(-1.0), np.float64(2.0), np.float64(-3.0), np.float64(0.0), np.float64(8.0), np.float64(-7.0), np.float64(9.0), np.float64(7.0), np.float64(2.0), np.float64(7.0), np.float64(-4.0), np.float64(-4.0), np.float64(-10.0), np.float64(-4.0), np.float64(-4.0), np.float64(-5.0), np.float64(-10.0), np.float64(-6.0), np.float64(-3.0), np.float64(-9.0), np.float64(-5.0), np.float64(3.0), np.float64(-0.0), np.float64(-2.0), np.float64(-6.0), np.float64(3.0), np.float64(-3.0), np.fl

- hyperopt를 활용한 XGBoost 하이퍼 파라미터 튜닝

In [47]:
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)

# 1. 검색 공간
search_space = {
    'n_estimators': hp.quniform('n_estimators', 100, 500, 100), 
    'max_depth': hp.quniform('max_depth', 3, 10, 1), 
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)

}

# 2. 목적함수
def xgb_objective(search_space):
    xgb_clf = XGBClassifier(
        n_estimators=int(search_space['n_estimators']),
        max_depth=int(search_space['max_depth']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree']
    )

    return {
        'loss': x**2 + 20 * y,
        'status': hyperopt.STATUS_OK
    }

# 3. 