In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 데이터 로드
data = pd.read_csv('../titanic_train.csv')

# 간단한 전처리
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Embarked'].fillna('S', inplace=True)
data.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], inplace=True)
data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

# 특징 및 타겟 분리
X = data.drop(columns=['Survived'])
y = data['Survived']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
import optuna
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb

In [3]:
# 함수 정의
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'accuracy',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
    }

    # 모델학습
    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 정확도 계산
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [4]:
# optuna 스터디 생성 및 최적화
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2024-07-04 16:41:15,200] A new study created in memory with name: no-name-1c424341-b920-44d2-ab02-543ce698470c
[I 2024-07-04 16:41:15,419] Trial 0 finished with value: 0.5865921787709497 and parameters: {'max_depth': 6, 'num_leaves': 204, 'learning_rate': 7.791422771913986e-06, 'n_estimators': 103, 'min_child_samples': 85, 'subsample': 0.5527933638000618, 'colsample_bytree': 0.5603532829094447, 'reg_alpha': 0.6868599023792641, 'reg_lambda': 0.07490947073628001}. Best is trial 0 with value: 0.5865921787709497.
[I 2024-07-04 16:41:15,678] Trial 1 finished with value: 0.7932960893854749 and parameters: {'max_depth': 8, 'num_leaves': 184, 'learning_rate': 0.0024882324108051607, 'n_estimators': 814, 'min_child_samples': 26, 'subsample': 0.9117967896538771, 'colsample_bytree': 0.523410125379955, 'reg_alpha': 1.2387079604189207e-05, 'reg_lambda': 4.395108251267898e-06}. Best is trial 1 with value: 0.7932960893854749.
[I 2024-07-04 16:41:15,745] Trial 2 finished with value: 0.58659217877094

In [5]:
# 최적의 하이퍼 파라미터를 출력해보기
print('best parameters : ', study.best_params)
print('best accuracy : ', study.best_value)

best parameters :  {'max_depth': 15, 'num_leaves': 159, 'learning_rate': 0.047465076314686626, 'n_estimators': 420, 'min_child_samples': 78, 'subsample': 0.999713559984131, 'colsample_bytree': 0.638214280959998, 'reg_alpha': 3.394783617917728e-05, 'reg_lambda': 9.921859110977678e-06}
best accuracy :  0.8547486033519553
