In [158]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import RobustScaler

import optuna
import lightgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import json

plt.rc('font', family='NanumBarunGothic')

In [160]:
train_df = pd.read_csv(r"C:\Users\User\LG_Aimers\MainTask\train.csv")
train_id = train_df.pop('ID')
test_df = pd.read_csv(r"C:\Users\User\LG_Aimers\MainTask\test.csv")
test_id = test_df.pop('ID')

In [161]:
categorical_columns = [
    "시술 시기 코드",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "난자 출처",
    "정자 출처",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [162]:
numeric_columns = [
    "시술 당시 나이",
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 기증자 나이",
    "정자 기증자 나이"
]

In [163]:
times_values = ["총 시술 횟수",
                "클리닉 내 총 시술 횟수",
                "IVF 시술 횟수",
                "DI 시술 횟수",
                "총 임신 횟수",
                "IVF 임신 횟수",
                "DI 임신 횟수",
                "총 출산 횟수",
                "IVF 출산 횟수",
                "DI 출산 횟수"]

donar_age_values = ["난자 기증자 나이", "정자 기증자 나이"]

train_df["시술 당시 나이"] = train_df["시술 당시 나이"].map({
    "만18-34세" : 1,
    "만35-37세" : 2,
    "만38-39세" : 3,
    "만40-42세" : 4,
    "만43-44세" : 5,
    "만45-50세" : 6,
    "알 수 없음" : 0
})

train_df[times_values] = train_df[times_values].map(lambda x : int(x[0]))

for donar_age_value in donar_age_values:
    train_df[donar_age_value] = train_df[donar_age_value].map({
        '만20세 이하' : 1,
        '만21-25세' : 2,
        '만26-30세' : 3,
        '만31-35세' : 4,
        '만36-40세' : 5,
        '만41-45세' : 6,
        "알 수 없음" : 0
    })

test_df["시술 당시 나이"] = test_df["시술 당시 나이"].map({
    "만18-34세" : 1,
    "만35-37세" : 2,
    "만38-39세" : 3,
    "만40-42세" : 4,
    "만43-44세" : 5,
    "만45-50세" : 6,
    "알 수 없음" : 0
})

test_df[times_values] = test_df[times_values].map(lambda x : int(x[0]))

for donar_age_value in donar_age_values:
    test_df[donar_age_value] = test_df[donar_age_value].map({
        '만20세 이하' : 1,
        '만21-25세' : 2,
        '만26-30세' : 3,
        '만31-35세' : 4,
        '만36-40세' : 5,
        '만41-45세' : 6,
        "알 수 없음" : 0
    })

In [164]:
# Fill missing values in numeric columns using the mean
train_df[numeric_columns] = train_df[numeric_columns].fillna(train_df[numeric_columns].median())
test_df[numeric_columns] = test_df[numeric_columns].fillna(train_df[numeric_columns].median())

train_df = train_df.fillna("NA")
test_df = test_df.fillna("NA")

In [165]:
if train_df.isnull().any().any():
    print(train_df.isnull().sum())
if test_df.isnull().any().any():
    print(test_df.isnull().sum())

In [166]:
# Convert categorical columns to category dtype
for col in categorical_columns:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype("category")
        test_df[col] = test_df[col].astype("category")
    else:
        categorical_columns.remove(col)

In [167]:
# Function to safely divide and replace division by zero with 99
def safe_divide(numerator, denominator):
    return numerator / (denominator + 1e-6)

# Apply the safe division function to each calculation
train_df["미세주입에서 생성된 배아 비율1"] = safe_divide(train_df["미세주입에서 생성된 배아 수"], train_df["총 생성 배아 수"])
test_df["미세주입에서 생성된 배아 비율1"] = safe_divide(test_df["미세주입에서 생성된 배아 수"], test_df["총 생성 배아 수"])

train_df["저장된 배아 비율"] = safe_divide(train_df["저장된 배아 수"], train_df["총 생성 배아 수"])
test_df["저장된 배아 비율"] = safe_divide(test_df["저장된 배아 수"], test_df["총 생성 배아 수"])

train_df["미세주입 후 저장된 배아 비율"] = safe_divide(train_df["미세주입 후 저장된 배아 수"], train_df["저장된 배아 수"])
test_df["미세주입 후 저장된 배아 비율"] = safe_divide(test_df["미세주입 후 저장된 배아 수"], test_df["저장된 배아 수"])

train_df["혼합된 난자 비율"] = safe_divide(train_df["혼합된 난자 수"], train_df["수집된 신선 난자 수"] + train_df["해동 난자 수"])
test_df["혼합된 난자 비율"] = safe_divide(test_df["혼합된 난자 수"], test_df["수집된 신선 난자 수"] + test_df["해동 난자 수"])

train_df["저장된 신선 난자 비율"] = safe_divide(train_df["저장된 신선 난자 수"], train_df["수집된 신선 난자 수"])
test_df["저장된 신선 난자 비율"] = safe_divide(test_df["저장된 신선 난자 수"], test_df["수집된 신선 난자 수"])

train_df["미세주입된 난자 비율"] = safe_divide(train_df["미세주입된 난자 수"], train_df["혼합된 난자 수"])
test_df["미세주입된 난자 비율"] = safe_divide(test_df["미세주입된 난자 수"], test_df["혼합된 난자 수"])


In [168]:
train_df[numeric_columns] =train_df[numeric_columns].astype(str)
train_df[categorical_columns] =train_df[categorical_columns].astype(str)

# 학습

# CATBOOST

In [214]:
import optuna
import numpy as np
import pandas as pd
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X, y = train_df.drop(['임신 성공 여부'], axis=1), train_df['임신 성공 여부']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def objective(trial):
    model = CatBoostClassifier(
        task_type='GPU', 
        devices='0', 
        iterations=trial.suggest_int('iterations', 500, 2000),
        learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        depth=trial.suggest_int('depth', 4, 12),
        cat_features=categorical_columns, 
        verbose=0,  
    )
    
    model.fit(X_train, y_train)

    y_pred_prob = model.predict_proba(X_test)[:, 1] 
    
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    
    print(f"Trial {trial.number}: AUC = {roc_auc:.4f}")
    
    return roc_auc

study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=50, interval_steps=20)
)

study.optimize(objective, n_trials=1000) 
print("Best hyperparameters: ", study.best_params)
print("Best ROC AUC: ", study.best_value)


[I 2025-02-19 16:08:41,027] A new study created in memory with name: no-name-6f49b848-d9f7-4393-8185-6e37813bf8d9
  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:11:00,183] Trial 0 finished with value: 0.7262162537407912 and parameters: {'iterations': 1062, 'learning_rate': 0.07114476009343425, 'depth': 10}. Best is trial 0 with value: 0.7262162537407912.


Trial 0: AUC = 0.7262


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:12:08,792] Trial 1 finished with value: 0.7247180892996571 and parameters: {'iterations': 1398, 'learning_rate': 0.00029380279387035364, 'depth': 5}. Best is trial 0 with value: 0.7262162537407912.


Trial 1: AUC = 0.7247


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:13:11,936] Trial 2 finished with value: 0.7374949384200336 and parameters: {'iterations': 587, 'learning_rate': 0.0396760507705299, 'depth': 9}. Best is trial 2 with value: 0.7374949384200336.


Trial 2: AUC = 0.7375


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:19:27,489] Trial 3 finished with value: 0.7323003937628256 and parameters: {'iterations': 1562, 'learning_rate': 0.00011527987128232407, 'depth': 12}. Best is trial 2 with value: 0.7374949384200336.


Trial 3: AUC = 0.7323


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:20:51,477] Trial 4 finished with value: 0.7271925803722847 and parameters: {'iterations': 1749, 'learning_rate': 0.0004335281794951569, 'depth': 5}. Best is trial 2 with value: 0.7374949384200336.


Trial 4: AUC = 0.7272


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:21:58,290] Trial 5 finished with value: 0.7307217680401388 and parameters: {'iterations': 775, 'learning_rate': 0.0008179499475211679, 'depth': 8}. Best is trial 2 with value: 0.7374949384200336.


Trial 5: AUC = 0.7307


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:23:51,331] Trial 6 finished with value: 0.7321921580345533 and parameters: {'iterations': 1148, 'learning_rate': 0.0007476312062252305, 'depth': 9}. Best is trial 2 with value: 0.7374949384200336.


Trial 6: AUC = 0.7322


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:24:42,954] Trial 7 finished with value: 0.7290267789776735 and parameters: {'iterations': 709, 'learning_rate': 0.0007523742884534858, 'depth': 7}. Best is trial 2 with value: 0.7374949384200336.


Trial 7: AUC = 0.7290


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:25:42,613] Trial 8 finished with value: 0.7381208866428457 and parameters: {'iterations': 1184, 'learning_rate': 0.0226739865237804, 'depth': 5}. Best is trial 8 with value: 0.7381208866428457.


Trial 8: AUC = 0.7381


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:26:35,625] Trial 9 finished with value: 0.7357139698718167 and parameters: {'iterations': 1271, 'learning_rate': 0.005987474910461402, 'depth': 4}. Best is trial 8 with value: 0.7381208866428457.


Trial 9: AUC = 0.7357


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:28:24,690] Trial 10 finished with value: 0.7383094438787408 and parameters: {'iterations': 1918, 'learning_rate': 0.012578389235983662, 'depth': 6}. Best is trial 10 with value: 0.7383094438787408.


Trial 10: AUC = 0.7383


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:30:15,084] Trial 11 finished with value: 0.7384141543938629 and parameters: {'iterations': 1899, 'learning_rate': 0.013980053602502707, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 11: AUC = 0.7384


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:32:30,713] Trial 12 finished with value: 0.738251385227547 and parameters: {'iterations': 1975, 'learning_rate': 0.008201915027201747, 'depth': 7}. Best is trial 11 with value: 0.7384141543938629.


Trial 12: AUC = 0.7383


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:34:27,401] Trial 13 finished with value: 0.7383685907383215 and parameters: {'iterations': 1994, 'learning_rate': 0.014293584268844563, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 13: AUC = 0.7384


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:36:21,790] Trial 14 finished with value: 0.7360339802998515 and parameters: {'iterations': 1698, 'learning_rate': 0.002591374126878954, 'depth': 7}. Best is trial 11 with value: 0.7384141543938629.


Trial 14: AUC = 0.7360


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:37:35,046] Trial 15 finished with value: 0.732493855873152 and parameters: {'iterations': 1813, 'learning_rate': 0.0019113354590616638, 'depth': 4}. Best is trial 11 with value: 0.7384141543938629.


Trial 15: AUC = 0.7325


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:39:04,658] Trial 16 finished with value: 0.7383700546167027 and parameters: {'iterations': 1559, 'learning_rate': 0.02019150237252517, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 16: AUC = 0.7384


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:40:35,426] Trial 17 finished with value: 0.734712405447919 and parameters: {'iterations': 1534, 'learning_rate': 0.09534689385187049, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 17: AUC = 0.7347


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:42:43,737] Trial 18 finished with value: 0.7365592917341568 and parameters: {'iterations': 1572, 'learning_rate': 0.03658219102210516, 'depth': 8}. Best is trial 11 with value: 0.7384141543938629.


Trial 18: AUC = 0.7366


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:48:31,012] Trial 19 finished with value: 0.7369641153954375 and parameters: {'iterations': 1411, 'learning_rate': 0.004612405396824376, 'depth': 12}. Best is trial 11 with value: 0.7384141543938629.


Trial 19: AUC = 0.7370


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:50:55,157] Trial 20 finished with value: 0.7376375431025761 and parameters: {'iterations': 1808, 'learning_rate': 0.0204245291147279, 'depth': 8}. Best is trial 11 with value: 0.7384141543938629.


Trial 20: AUC = 0.7376


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:52:47,534] Trial 21 finished with value: 0.7383226037926458 and parameters: {'iterations': 1997, 'learning_rate': 0.012409240034701615, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 21: AUC = 0.7383


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:54:25,472] Trial 22 finished with value: 0.7377757640866653 and parameters: {'iterations': 1705, 'learning_rate': 0.03769270504511828, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 22: AUC = 0.7378


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:55:56,311] Trial 23 finished with value: 0.7381644229150146 and parameters: {'iterations': 1892, 'learning_rate': 0.012476266568110624, 'depth': 5}. Best is trial 11 with value: 0.7384141543938629.


Trial 23: AUC = 0.7382


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:57:45,307] Trial 24 finished with value: 0.7369688580086508 and parameters: {'iterations': 1635, 'learning_rate': 0.0036926620513174436, 'depth': 7}. Best is trial 11 with value: 0.7384141543938629.


Trial 24: AUC = 0.7370


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 16:58:41,671] Trial 25 finished with value: 0.7379260215020946 and parameters: {'iterations': 1457, 'learning_rate': 0.02148652516146319, 'depth': 4}. Best is trial 11 with value: 0.7384141543938629.


Trial 25: AUC = 0.7379


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:00:26,686] Trial 26 finished with value: 0.73807332602791 and parameters: {'iterations': 1878, 'learning_rate': 0.00813811158480941, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 26: AUC = 0.7381


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:01:56,404] Trial 27 finished with value: 0.7373402382219026 and parameters: {'iterations': 1811, 'learning_rate': 0.04943141615398296, 'depth': 5}. Best is trial 11 with value: 0.7384141543938629.


Trial 27: AUC = 0.7373


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:03:32,351] Trial 28 finished with value: 0.7347940118174704 and parameters: {'iterations': 980, 'learning_rate': 0.0023158102441079177, 'depth': 9}. Best is trial 11 with value: 0.7384141543938629.


Trial 28: AUC = 0.7348


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:08:01,981] Trial 29 finished with value: 0.7151725118413299 and parameters: {'iterations': 1697, 'learning_rate': 0.06140295958729102, 'depth': 11}. Best is trial 11 with value: 0.7384141543938629.


Trial 29: AUC = 0.7152


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:09:29,144] Trial 30 finished with value: 0.7382082722455122 and parameters: {'iterations': 1283, 'learning_rate': 0.025961277113798673, 'depth': 7}. Best is trial 11 with value: 0.7384141543938629.


Trial 30: AUC = 0.7382


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:11:19,614] Trial 31 finished with value: 0.7383477278258265 and parameters: {'iterations': 1985, 'learning_rate': 0.014844145083571054, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 31: AUC = 0.7383


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[I 2025-02-19 17:13:12,352] Trial 32 finished with value: 0.7383971725227869 and parameters: {'iterations': 1985, 'learning_rate': 0.014638300109369172, 'depth': 6}. Best is trial 11 with value: 0.7384141543938629.


Trial 32: AUC = 0.7384


  learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
[W 2025-02-19 17:14:22,603] Trial 33 failed with parameters: {'iterations': 1872, 'learning_rate': 0.008032205730628418, 'depth': 5} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\User\AppData\Local\Temp\ipykernel_18648\2419899885.py", line 25, in objective
    model.fit(X_train, y_train)
  File "C:\Users\User\anaconda3\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\User\anaconda3\Lib\site-packages\catboost\core.py", line 2410, in _fit
    self._train(
  File "C:\Users\User\anaconda3\Lib\site-packages\catboost\core.py", l

KeyboardInterrupt: 

# CATBOOST X LGBM


In [None]:
import optuna
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

# 데이터 준비
X = train_df.drop(['임신 성공 여부'], axis=1)
y = train_df['임신 성공 여부']
cat_features = categorical_columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def objective(trial):
    # CatBoost 하이퍼파라미터
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 500, 1500),
        'depth': trial.suggest_int('cat_depth', 4, 10),
        'learning_rate': trial.suggest_float('cat_lr', 0.01, 0.2, log=True),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'class_weights': [1, 3],
        'random_seed': 42,
        'verbose': 0
    }
    cat_model = CatBoostClassifier(**cat_params)
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)
    cat_model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, use_best_model=True)
    cat_pred_prob = cat_model.predict_proba(X_test)[:, 1]
    
    # LightGBM 하이퍼파라미터
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('lgb_lr', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('lgb_max_depth', 4, 10),
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'class_weight': {0: 1, 1: 3},
        'random_state': 42
    }
    X_train_lgb = X_train.copy()
    X_test_lgb = X_test.copy()
    X_train_lgb[cat_features] = X_train_lgb[cat_features].astype('category')
    X_test_lgb[cat_features] = X_test_lgb[cat_features].astype('category')
    
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train_lgb, y_train, eval_set=[(X_test_lgb, y_test)], callbacks=[lgb.early_stopping(50)])
    lgb_pred_prob = lgb_model.predict_proba(X_test_lgb)[:, 1]
    
    # 앙상블
    ensemble_pred_prob = (cat_pred_prob + lgb_pred_prob) / 2
    return roc_auc_score(y_test, ensemble_pred_prob)

# Optuna 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 최적 하이퍼파라미터 출력
print("Best trial:", study.best_trial.params)

[I 2025-02-19 14:10:56,648] A new study created in memory with name: no-name-d58b8514-537c-4c80-878d-396bc7ffffe2


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511008 -> initscore=0.044039
[LightGBM] [Info] Start training from score 0.044039
Training until validation scores don't improve for 50 rounds


[I 2025-02-19 14:12:41,886] Trial 0 finished with value: 0.7375302283209461 and parameters: {'cat_iterations': 1112, 'cat_depth': 9, 'cat_lr': 0.03830467838356166, 'lgb_n_estimators': 1407, 'lgb_lr': 0.17918952839126617, 'lgb_max_depth': 10}. Best is trial 0 with value: 0.7375302283209461.


Early stopping, best iteration is:
[38]	valid_0's auc: 0.736334
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511008 -> initscore=0.044039
[LightGBM] [Info] Start training from score 0.044039
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[631]	valid_0's auc: 0.73695


[I 2025-02-19 14:13:37,286] Trial 1 finished with value: 0.7372029386167762 and parameters: {'cat_iterations': 1217, 'cat_depth': 4, 'cat_lr': 0.08915582310662644, 'lgb_n_estimators': 1207, 'lgb_lr': 0.029070421389034314, 'lgb_max_depth': 4}. Best is trial 0 with value: 0.7375302283209461.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511008 -> initscore=0.044039
[LightGBM] [Info] Start training from score 0.044039
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.736795


[I 2025-02-19 14:15:20,130] Trial 2 finished with value: 0.7373578884494226 and parameters: {'cat_iterations': 1114, 'cat_depth': 4, 'cat_lr': 0.03883280156846449, 'lgb_n_estimators': 658, 'lgb_lr': 0.16682406903054436, 'lgb_max_depth': 7}. Best is trial 0 with value: 0.7375302283209461.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012908 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511008 -> initscore=0.044039
[LightGBM] [Info] Start training from score 0.044039
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[148]	valid_0's auc: 0.736738


[I 2025-02-19 14:17:19,873] Trial 3 finished with value: 0.7372219100195034 and parameters: {'cat_iterations': 811, 'cat_depth': 5, 'cat_lr': 0.027290723449181008, 'lgb_n_estimators': 975, 'lgb_lr': 0.06580661782784465, 'lgb_max_depth': 9}. Best is trial 0 with value: 0.7375302283209461.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511008 -> initscore=0.044039
[LightGBM] [Info] Start training from score 0.044039
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	valid_0's auc: 0.736631


[I 2025-02-19 14:20:55,440] Trial 4 finished with value: 0.7374053512042338 and parameters: {'cat_iterations': 704, 'cat_depth': 10, 'cat_lr': 0.022798764749376497, 'lgb_n_estimators': 664, 'lgb_lr': 0.12958849164225786, 'lgb_max_depth': 5}. Best is trial 0 with value: 0.7375302283209461.
