In [1]:
import warnings
warnings.filterwarnings('ignore')

### 교차 검증 실습

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
fish_df = pd.read_csv('../../data/fish.csv')
fish_input = fish_df.drop('Species', axis=1)
fish_target = fish_df['Species']

X_train, X_test, y_train, y_test = train_test_split(fish_input, fish_target, random_state=42, stratify=fish_target)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 생선 다중 분류 with cross_val_score

In [4]:
# 교차검증 점수
from sklearn.model_selection import cross_val_score

lg_clf = LogisticRegression(max_iter=1000, solver='newton-cg')

scores = cross_val_score(lg_clf, fish_input, fish_target, cv=5, scoring='accuracy')


# 학습/예측/평가
print("훈련별 정확도: ", scores)
print("모델 정확도", np.mean(scores))

훈련별 정확도:  [0.6875     0.9375     0.96875    1.         0.67741935]
모델 정확도 0.854233870967742


### 생선 다중 분류 with GridSearchCV

In [5]:
# GridSearchCV 사용 -> 최적의 파라미터, 평가점수, 모델확인
from sklearn.model_selection import GridSearchCV, StratifiedKFold
params = {
    'max_iter': [10, 100, 200, 500, 1000, 1500],
    'solver': ['liblinear', 'newton-cg', 'lbfgs']
}
stratifiedkfold = StratifiedKFold()
grid = GridSearchCV(lg_clf, param_grid=params, scoring='accuracy', cv=stratifiedkfold)

grid.fit(fish_input, fish_target)
print("최적의 파라미터: ", grid.best_params_)
print("최적화된 모델 객체: ", grid.best_estimator_)
print("최적화된 점수: ", grid.best_score_)

# 예측 결과 평가
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

최적의 파라미터:  {'max_iter': 1000, 'solver': 'lbfgs'}
최적화된 모델 객체:  LogisticRegression(max_iter=1000)
최적화된 점수:  0.917741935483871


0.975

---
### HyperOpt

---

### Optuna

**hyper.hp클래스**
<table border="1">
  <thead>
    <tr>
      <th>함수명</th>
      <th>설명</th>
      <th>사용 방법</th>
      <th>예시 코드</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>hp.uniform</td>
      <td>연속적인 실수 값 샘플링</td>
      <td>hp.uniform(label, low, high)</td>
      <td><code>hp.uniform('learning_rate', 0.01, 0.1)</code></td>
    </tr>
    <tr>
      <td>hp.quniform</td>
      <td>연속적이지만 일정 간격(q)을 갖는 값 샘플링</td>
      <td>hp.quniform(label, low, high, q)</td>
      <td><code>hp.quniform('num_layers', 1, 5, 1)</code></td>
    </tr>
    <tr>
      <td>hp.loguniform</td>
      <td>로그 스케일로 분포된 실수 값 샘플링</td>
      <td>hp.loguniform(label, low, high)</td>
      <td><code>hp.loguniform('reg_param', -3, 0)</code></td>
    </tr>
    <tr>
      <td>hp.randint</td>
      <td>정수 값 샘플링</td>
      <td>hp.randint(label, upper)</td>
      <td><code>hp.randint('num_trees', 1, 100)</code></td>
    </tr>
    <tr>
      <td>hp.choice</td>
      <td>주어진 리스트 중 임의의 값 샘플링</td>
      <td>hp.choice(label, options)</td>
      <td><code>hp.choice('optimizer', ['adam', 'sgd', 'rmsprop'])</code></td>
    </tr>
    <tr>
      <td>hp.normal</td>
      <td>정규분포에서 값 샘플링</td>
      <td>hp.normal(label, mean, std)</td>
      <td><code>hp.normal('dropout_rate', 0.3, 0.05)</code></td>
    </tr>
    <tr>
      <td>hp.lognormal</td>
      <td>로그 정규분포에서 값 샘플링</td>
      <td>hp.lognormal(label, mean, std)</td>
      <td><code>hp.lognormal('scale', 0, 1)</code></td>
    </tr>
  </tbody>
</table>

In [6]:
!pip install hyperopt



In [7]:
!pip install optuna



In [8]:
from hyperopt import hp

search_space = {
    'x': hp.quniform('x', -10, 10, 1),
    'y': hp.quniform('y', -15, 15, 1)
}

In [9]:
import hyperopt

# 목적 함수
def objective(search_space):
    x = search_space['x']
    y = search_space['y']

    return {
        'loss': x**2 + 20 * y,
        'status': hyperopt.STATUS_OK
    }

In [10]:
from hyperopt import fmin, tpe, Trials

# 하이퍼 파라미터 탐색 과정을 저장하는 객체
trials = Trials()

# fmin(): 목적 함수의 최소값을 찾는 함수
best_val = fmin(
    fn=objective,       # 목적함수
    space=search_space, # 검색공간
    algo=tpe.suggest,   # 베이지안 최적화 적용
    max_evals=500,       # 반복 횟수
    trials=trials       # 탐색과정 저장
)

best_val

100%|██████████| 500/500 [00:06<00:00, 81.29trial/s, best loss: -300.0] 


{'x': np.float64(0.0), 'y': np.float64(-15.0)}

In [11]:
# 탐색과정 -> 목적함수 반환값 (loos와 실행 상태 저장)
print(trials.results[:10])
print(trials.vals)


[{'loss': -11.0, 'status': 'ok'}, {'loss': -96.0, 'status': 'ok'}, {'loss': 144.0, 'status': 'ok'}, {'loss': 216.0, 'status': 'ok'}, {'loss': 16.0, 'status': 'ok'}, {'loss': -104.0, 'status': 'ok'}, {'loss': 205.0, 'status': 'ok'}, {'loss': 236.0, 'status': 'ok'}, {'loss': -216.0, 'status': 'ok'}, {'loss': 264.0, 'status': 'ok'}]
{'x': [np.float64(7.0), np.float64(-2.0), np.float64(2.0), np.float64(-4.0), np.float64(-4.0), np.float64(-6.0), np.float64(-5.0), np.float64(4.0), np.float64(8.0), np.float64(2.0), np.float64(-1.0), np.float64(-4.0), np.float64(-2.0), np.float64(-9.0), np.float64(-6.0), np.float64(-9.0), np.float64(8.0), np.float64(-3.0), np.float64(8.0), np.float64(8.0), np.float64(5.0), np.float64(5.0), np.float64(5.0), np.float64(5.0), np.float64(1.0), np.float64(1.0), np.float64(10.0), np.float64(3.0), np.float64(3.0), np.float64(-0.0), np.float64(-0.0), np.float64(3.0), np.float64(1.0), np.float64(6.0), np.float64(10.0), np.float64(-2.0), np.float64(3.0), np.float64(1.0)

- hyperopt를 활용한 XGBoost 하이퍼 파라미터 튜닝

In [12]:
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score

In [13]:
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)

# 1. 검색 공간
search_space = {
    'n_estimators': hp.quniform('n_estimators', 100, 500, 100), 
    'max_depth': hp.quniform('max_depth', 3, 10, 1), 
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)

}

# 2. 목적함수
def xgb_objective(search_space):
    xgb_clf = XGBClassifier(
        n_estimators=int(search_space['n_estimators']),
        max_depth=int(search_space['max_depth']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree']
    )
    mean_acc = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy',cv=3).mean()
    return {
        'loss': -1 * mean_acc,          # 손실함수는 최소화 하는 방향으로 가니까 정확도 평균에 음수를 넣음
        'status': hyperopt.STATUS_OK
    }

best = fmin(
    fn=xgb_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

best

500trial [00:00, ?trial/s, best loss=?]


{'x': np.float64(0.0), 'y': np.float64(-15.0)}

In [14]:
import optuna

def objective(trial):
    x = trial.suggest_uniform('x', -10, 10)
    y = trial.suggest_uniform('y', -15, 15)

    return (x - 3) ** 2 + (y + 5) ** 2

# 스터디 생성
# minimize: 목적함수의 리턴값이 최소가 되는 방향으로
study = optuna.create_study(direction="minimize")

# 최적화 실행
study.optimize(objective, n_trials=500)

# 결과 확인
print(study.best_value)
print(study.best_params)

[I 2025-09-25 16:12:23,354] A new study created in memory with name: no-name-4b2f8b29-3ad4-4fdf-9162-a4d6834d023a
[I 2025-09-25 16:12:23,356] Trial 0 finished with value: 190.00285763424017 and parameters: {'x': -3.9931813771257074, 'y': 6.878479358102297}. Best is trial 0 with value: 190.00285763424017.
[I 2025-09-25 16:12:23,357] Trial 1 finished with value: 15.127277733012534 and parameters: {'x': 6.364212247962062, 'y': -3.048243436369539}. Best is trial 1 with value: 15.127277733012534.
[I 2025-09-25 16:12:23,357] Trial 2 finished with value: 237.85489945540672 and parameters: {'x': -9.669918025970798, 'y': -13.793638420505301}. Best is trial 1 with value: 15.127277733012534.
[I 2025-09-25 16:12:23,358] Trial 3 finished with value: 126.47870464525487 and parameters: {'x': 8.26765541021167, 'y': 4.9363228170446725}. Best is trial 1 with value: 15.127277733012534.
[I 2025-09-25 16:12:23,359] Trial 4 finished with value: 68.7057920578992 and parameters: {'x': -4.011806223574359, 'y':

0.008039066698608273
{'x': 3.0174575410839712, 'y': -4.9120551254596965}


In [15]:
!pip install plotly



In [16]:
import optuna.visualization as vis
import plotly

vis.plot_param_importances(study).show()

In [17]:
vis.plot_optimization_history(study).show()

- optuna를 활용한 XGBoost 하이퍼 파라미터 튜닝

In [22]:
trial = Trials()

# 1. 목적 함수
def xgb_optuna_objective(trial):

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, 100),
        'max_depth': trial.suggest_int('n_estimators', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }


    xgb_clf = XGBClassifier(**params)
    mean_acc = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy',cv=3).mean()
    return -mean_acc



# 2. study 객체 -> 최적화
study = optuna.create_study(direction="maximize")
study.optimize(xgb_optuna_objective, n_trials=50)


# 3. 결과 출력
print(study.best_value)
print(study.best_params)

[I 2025-09-25 16:37:17,939] A new study created in memory with name: no-name-3c2349b0-61f6-48b0-8cb2-05386f06acf6
[I 2025-09-25 16:37:18,214] Trial 0 finished with value: -0.960093896713615 and parameters: {'n_estimators': 200, 'learning_rate': 0.01710992457137359, 'colsample_bytree': 0.6133701931703666}. Best is trial 0 with value: -0.960093896713615.
[I 2025-09-25 16:37:18,501] Trial 1 finished with value: -0.960093896713615 and parameters: {'n_estimators': 300, 'learning_rate': 0.052073008822431585, 'colsample_bytree': 0.9105448042479937}. Best is trial 0 with value: -0.960093896713615.
[I 2025-09-25 16:37:18,752] Trial 2 finished with value: -0.9624413145539906 and parameters: {'n_estimators': 300, 'learning_rate': 0.12768003538568096, 'colsample_bytree': 0.8076959061634145}. Best is trial 0 with value: -0.960093896713615.
[I 2025-09-25 16:37:19,078] Trial 3 finished with value: -0.9647887323943661 and parameters: {'n_estimators': 400, 'learning_rate': 0.06672201120576389, 'colsamp

-0.9389671361502346
{'n_estimators': 100, 'learning_rate': 0.010283577549969601, 'colsample_bytree': 0.9629109905798786}


#### HyperOpt vs Optuna

In [29]:
from sklearn.metrics import accuracy_score
xgb_hpopt = XGBClassifier(
    n_estimator=400,
    max_depth=5,
    learning_rate=0.14,
    colsample_bytree=0.5
)

xgb_optuna = XGBClassifier(
    n_estimators=300,
    max_depth=9,
    learning_rate=0.18,
    colsample_bytree=0.5
)

In [30]:
xgb_hpopt.fit(X_train, y_train)
xgb_optuna.fit(X_train, y_train)

hpopt_pred = xgb_hpopt.predict(X_test)
optuna_pred = xgb_optuna.predict(X_test)

print(f'HyperOpt 최적 파라미터 적용: {accuracy_score(y_test, hpopt_pred)}')
print(f'HyperOpt 최적 파라미터 적용: {accuracy_score(y_test, optuna_pred)}')

HyperOpt 최적 파라미터 적용: 0.958041958041958
HyperOpt 최적 파라미터 적용: 0.958041958041958
