In [None]:
!pip install optuna catboost



In [None]:
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier

In [None]:
# Patch LightGBM and XGBoost so that they are compatible with scikit-learn
class SklearnLGBM(LGBMClassifier):
    """LGBMClassifier subclass that overrides the ``_more_tags`` hook.

    The only change relative to the parent class is that the estimator tags
    returned by ``_more_tags`` declare ``non_deterministic`` as True.

    NOTE(review): scikit-learn 1.6+ moved estimator tags to
    ``__sklearn_tags__``; confirm ``_more_tags`` is still consulted by the
    installed version.
    """

    def _more_tags(self):
        """Return the tag overrides contributed by this subclass."""
        tag_overrides = dict(non_deterministic=True)
        return tag_overrides

class SklearnXGB(XGBClassifier):
    """XGBClassifier subclass that overrides the ``_more_tags`` hook.

    The only change relative to the parent class is that the estimator tags
    returned by ``_more_tags`` declare ``non_deterministic`` as True.
    """

    def _more_tags(self):
        """Return the tag overrides contributed by this subclass."""
        tag_overrides = dict(non_deterministic=True)
        return tag_overrides

In [None]:
# Load the training and test tables from the working directory
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [None]:
# Remove the ID column from both frames; the test IDs are kept aside because
# the submission file needs them.
df_train = df_train.drop(columns=['ID'])
df_test_ids = df_test.pop('ID')  # pop() returns the column and removes it from df_test

In [None]:
# Separate the target column from the features
target_col = '임신 성공 여부'
X = df_train.drop(columns=[target_col], errors='ignore').copy()
y = df_train[target_col]
# The test frame is not expected to carry the target; drop it defensively if present.
df_test = df_test.drop(columns=[target_col], errors='ignore').copy()

In [None]:
# Drop the columns the original author identified as having a single unique
# value (the list is hard-coded, not computed here).
drop_cols = ['불임 원인 - 여성 요인', '난자 채취 경과일']
X = X.drop(columns=drop_cols, errors='ignore')
df_test = df_test.drop(columns=drop_cols, errors='ignore')

# Convert count-like columns to integers
cols_to_slice = ['총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수',
                 '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수']


def _leading_int(series):
    """Return the first run of digits in each value as an int (0 when there is none)."""
    first_digits = series.astype(str).str.extract(r'(\d+)', expand=False)
    return first_digits.astype(float).fillna(0).astype(int)


for col in cols_to_slice:
    X[col] = _leading_int(X[col])
    df_test[col] = _leading_int(df_test[col])

# Missing-value handling
# Numeric columns: fill gaps with the per-column mean learned from X.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
imputer_num = SimpleImputer(strategy="mean")
imputer_num.fit(X[numeric_features])
for frame in (X, df_test):
    frame[numeric_features] = imputer_num.transform(frame[numeric_features])

# Object columns: fill gaps with the most frequent value learned from X.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_cat.fit(X[categorical_features])
for frame in (X, df_test):
    frame[categorical_features] = imputer_cat.transform(frame[categorical_features])

# Encode categorical columns as ordinal codes; categories not seen while
# fitting on X are mapped to -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X[categorical_features])
for frame in (X, df_test):
    frame[categorical_features] = encoder.transform(frame[categorical_features])

# Feature scaling: standardise the columns listed in numeric_features using
# statistics fitted on X, then apply the same transform to the test frame.
scaler = StandardScaler()
scaler.fit(X[numeric_features])
for frame in (X, df_test):
    frame[numeric_features] = scaler.transform(frame[numeric_features])

# Feature selection driven by LightGBM importances.
# NOTE(review): the selector is fitted on every row of X, i.e. before the
# train/validation split further down, so the held-out rows influence which
# features survive and validation scores will be somewhat optimistic.
lgbm = SklearnLGBM(n_estimators=300, random_state=42, n_jobs=-1)
lgbm.fit(X, y)
feature_importance = pd.Series(lgbm.feature_importances_, index=X.columns).sort_values(ascending=False)

# LightGBM's default importance is the raw number of splits per feature
# (integers), so comparing it with 0.0015 directly can only remove features
# that were never used. Convert to each feature's share of the total so the
# threshold acts as a fraction (0.15% of all splits).
importance_total = feature_importance.sum()
if importance_total > 0:
    feature_importance = feature_importance / importance_total

selected_features = feature_importance[feature_importance > 0.0015].index.tolist()
X = X[selected_features]
df_test = df_test[selected_features]

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 779
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


In [None]:
# Hold out 20% of the rows for validation (train_test_split is imported in
# the notebook's import cell).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Optuna hyperparameter tuning for LightGBM
def objective(trial):
    """Score one LightGBM configuration sampled by Optuna.

    Samples ``n_estimators``, ``learning_rate``, ``num_leaves`` and
    ``max_depth`` from the trial, fits an ``LGBMClassifier`` on the
    module-level ``X_train``/``y_train`` and evaluates it on
    ``X_val``/``y_val``.

    Args:
        trial: Optuna trial object used to draw the hyperparameters.

    Returns:
        ROC-AUC of the predicted positive-class probabilities on the
        validation split.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.02, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 30, 100),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
    }
    # verbose=-1 silences LightGBM's per-fit info logging, which otherwise
    # repeats for every trial.
    lgbm = LGBMClassifier(**params, random_state=42, verbose=-1)
    lgbm.fit(X_train, y_train)
    y_val_pred = lgbm.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, y_val_pred)

# Seed the sampler so the search proposes the same sequence of trials on
# every run, matching the fixed seed (42) used for the models and the split.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
best_params = study.best_params
print(f"✅ Best LightGBM Params: {best_params}")



[I 2025-02-15 15:37:29,479] A new study created in memory with name: no-name-6f2e6f1e-c5e8-4922-b20c-945bac9e7739
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:38:27,375] Trial 0 finished with value: 0.7378759350048184 and parameters: {'n_estimators': 920, 'learning_rate': 0.015209365412205742, 'num_leaves': 62, 'max_depth': 9}. Best is trial 0 with value: 0.7378759350048184.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:38:55,160] Trial 1 finished with value: 0.7384502080493636 and parameters: {'n_estimators': 700, 'learning_rate': 0.01370271278969953, 'num_leaves': 76, 'max_depth': 5}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.139891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:39:15,653] Trial 2 finished with value: 0.7360347954538932 and parameters: {'n_estimators': 476, 'learning_rate': 0.009053537831575384, 'num_leaves': 61, 'max_depth': 5}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:40:07,400] Trial 3 finished with value: 0.7381236904729735 and parameters: {'n_estimators': 897, 'learning_rate': 0.007800227735383351, 'num_leaves': 81, 'max_depth': 10}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:40:41,666] Trial 4 finished with value: 0.7376769820652156 and parameters: {'n_estimators': 703, 'learning_rate': 0.007476032560684762, 'num_leaves': 52, 'max_depth': 6}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:41:14,827] Trial 5 finished with value: 0.7376260904249439 and parameters: {'n_estimators': 624, 'learning_rate': 0.016512691073333217, 'num_leaves': 89, 'max_depth': 10}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:41:53,213] Trial 6 finished with value: 0.7379761284833267 and parameters: {'n_estimators': 780, 'learning_rate': 0.006140020046868114, 'num_leaves': 40, 'max_depth': 7}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:42:17,736] Trial 7 finished with value: 0.7379047491680851 and parameters: {'n_estimators': 416, 'learning_rate': 0.014707925450520956, 'num_leaves': 86, 'max_depth': 10}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:42:48,297] Trial 8 finished with value: 0.7360350241377286 and parameters: {'n_estimators': 598, 'learning_rate': 0.0054841546598852, 'num_leaves': 52, 'max_depth': 6}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:43:29,083] Trial 9 finished with value: 0.7375282886398239 and parameters: {'n_estimators': 937, 'learning_rate': 0.01719076032401121, 'num_leaves': 56, 'max_depth': 9}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:43:57,275] Trial 10 finished with value: 0.7373046528252829 and parameters: {'n_estimators': 809, 'learning_rate': 0.011315472074432798, 'num_leaves': 99, 'max_depth': 4}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:44:46,983] Trial 11 finished with value: 0.7379304306623921 and parameters: {'n_estimators': 990, 'learning_rate': 0.01130475875932105, 'num_leaves': 76, 'max_depth': 8}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:45:15,474] Trial 12 finished with value: 0.7363835432960515 and parameters: {'n_estimators': 814, 'learning_rate': 0.008647953317122523, 'num_leaves': 75, 'max_depth': 4}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:45:50,213] Trial 13 finished with value: 0.7380834700792765 and parameters: {'n_estimators': 688, 'learning_rate': 0.012394621244833103, 'num_leaves': 74, 'max_depth': 7}. Best is trial 1 with value: 0.7384502080493636.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:46:23,239] Trial 14 finished with value: 0.7384962993263293 and parameters: {'n_estimators': 869, 'learning_rate': 0.019900431071966256, 'num_leaves': 85, 'max_depth': 5}. Best is trial 14 with value: 0.7384962993263293.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:46:51,596] Trial 15 finished with value: 0.7382774618778555 and parameters: {'n_estimators': 744, 'learning_rate': 0.01973408391342559, 'num_leaves': 99, 'max_depth': 5}. Best is trial 14 with value: 0.7384962993263293.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:47:13,570] Trial 16 finished with value: 0.7386841985481393 and parameters: {'n_estimators': 564, 'learning_rate': 0.019641696824474305, 'num_leaves': 70, 'max_depth': 5}. Best is trial 16 with value: 0.7386841985481393.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:47:40,691] Trial 17 finished with value: 0.7386067436326581 and parameters: {'n_estimators': 539, 'learning_rate': 0.019771423456472637, 'num_leaves': 36, 'max_depth': 6}. Best is trial 16 with value: 0.7386841985481393.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:48:07,312] Trial 18 finished with value: 0.7386085691088643 and parameters: {'n_estimators': 529, 'learning_rate': 0.017314755131987904, 'num_leaves': 32, 'max_depth': 6}. Best is trial 16 with value: 0.7386841985481393.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:48:28,135] Trial 19 finished with value: 0.738683645313009 and parameters: {'n_estimators': 505, 'learning_rate': 0.01721502545418227, 'num_leaves': 30, 'max_depth': 7}. Best is trial 16 with value: 0.7386841985481393.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:48:50,902] Trial 20 finished with value: 0.7384870700861232 and parameters: {'n_estimators': 461, 'learning_rate': 0.013356790652324525, 'num_leaves': 44, 'max_depth': 8}. Best is trial 16 with value: 0.7386841985481393.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:49:13,026] Trial 21 finished with value: 0.7387012729421066 and parameters: {'n_estimators': 537, 'learning_rate': 0.016751732549216886, 'num_leaves': 30, 'max_depth': 6}. Best is trial 21 with value: 0.7387012729421066.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:49:37,432] Trial 22 finished with value: 0.738741299603646 and parameters: {'n_estimators': 587, 'learning_rate': 0.017638081542511567, 'num_leaves': 30, 'max_depth': 7}. Best is trial 22 with value: 0.738741299603646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:50:03,940] Trial 23 finished with value: 0.7387672267583176 and parameters: {'n_estimators': 590, 'learning_rate': 0.016333325126522115, 'num_leaves': 45, 'max_depth': 7}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:50:32,558] Trial 24 finished with value: 0.738563565328391 and parameters: {'n_estimators': 631, 'learning_rate': 0.015248352035279456, 'num_leaves': 44, 'max_depth': 8}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:50:59,616] Trial 25 finished with value: 0.7383289876414262 and parameters: {'n_estimators': 588, 'learning_rate': 0.01069833124891951, 'num_leaves': 37, 'max_depth': 7}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.143292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:51:30,569] Trial 26 finished with value: 0.7387276155224357 and parameters: {'n_estimators': 654, 'learning_rate': 0.012593689151682295, 'num_leaves': 45, 'max_depth': 8}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:52:00,383] Trial 27 finished with value: 0.738759494448545 and parameters: {'n_estimators': 640, 'learning_rate': 0.01255005291814575, 'num_leaves': 45, 'max_depth': 9}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.106561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:52:33,977] Trial 28 finished with value: 0.7386662154105463 and parameters: {'n_estimators': 661, 'learning_rate': 0.009759163998477646, 'num_leaves': 49, 'max_depth': 9}. Best is trial 23 with value: 0.7387672267583176.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.02),


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


[I 2025-02-15 15:53:09,555] Trial 29 finished with value: 0.738174427327243 and parameters: {'n_estimators': 749, 'learning_rate': 0.015105831170752737, 'num_leaves': 62, 'max_depth': 9}. Best is trial 23 with value: 0.7387672267583176.


✅ Best LightGBM Params: {'n_estimators': 590, 'learning_rate': 0.016333325126522115, 'num_leaves': 45, 'max_depth': 7}


In [None]:
# Build the three models to blend. Only LightGBM uses the parameters found by
# the Optuna search; XGBoost and CatBoost use fixed settings.
lgbm_best = LGBMClassifier(random_state=42, **best_params)
xgb_best = XGBClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=6,
    eval_metric="logloss",
    random_state=42,
)
cat_best = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=6,
    verbose=0,
    random_seed=42,
    allow_const_label=True,
)

In [None]:
# Fit each model on the training split, in the order LightGBM, XGBoost, CatBoost
for base_model in (lgbm_best, xgb_best, cat_best):
    base_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.095043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521


<catboost.core.CatBoostClassifier at 0x7f43fe763910>

In [None]:
# Optimise the blending weights with Optuna.
# The fitted models do not change between trials, so their validation
# probabilities are computed once here instead of on every trial.
val_pred_lgbm = lgbm_best.predict_proba(X_val)[:, 1]
val_pred_xgb = xgb_best.predict_proba(X_val)[:, 1]
val_pred_cat = cat_best.predict_proba(X_val)[:, 1]


def blend_objective(trial):
    """Score one set of blending weights on the validation split.

    Args:
        trial: Optuna trial used to draw ``w_lgbm`` and ``w_xgb``; the
            CatBoost weight is the remainder so the three weights sum to 1.

    Returns:
        ROC-AUC of the weighted average of the three models' validation
        probabilities.
    """
    w_lgbm = trial.suggest_float('w_lgbm', 0.2, 0.6)
    # Cap the XGBoost weight at what is left after LightGBM so the remaining
    # CatBoost weight cannot go negative (independent [0.2, 0.6] ranges allowed
    # w_lgbm + w_xgb up to 1.2). The cap is always >= 0.4, so the range is valid.
    w_xgb = trial.suggest_float('w_xgb', 0.2, min(0.6, 1.0 - w_lgbm))
    w_cat = 1.0 - (w_lgbm + w_xgb)  # remainder, so the weights sum to 1

    y_pred_blend = (w_lgbm * val_pred_lgbm) + \
                   (w_xgb * val_pred_xgb) + \
                   (w_cat * val_pred_cat)

    return roc_auc_score(y_val, y_pred_blend)

# Seed the sampler so the weight search is reproducible across runs,
# matching the fixed seed (42) used elsewhere in the notebook.
study_blend = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_blend.optimize(blend_objective, n_trials=30)
best_weights = study_blend.best_params
print(f"✅ Best Blending Weights: {best_weights}")


[I 2025-02-15 15:59:48,989] A new study created in memory with name: no-name-bd5a3ccd-0848-4825-a8ac-19c5ac473618
[I 2025-02-15 15:59:52,504] Trial 0 finished with value: 0.7380197152229158 and parameters: {'w_lgbm': 0.20297895404289906, 'w_xgb': 0.4438578641990716}. Best is trial 0 with value: 0.7380197152229158.
[I 2025-02-15 15:59:55,548] Trial 1 finished with value: 0.7380499214615824 and parameters: {'w_lgbm': 0.23768580032422953, 'w_xgb': 0.32097389554784345}. Best is trial 1 with value: 0.7380499214615824.
[I 2025-02-15 15:59:58,568] Trial 2 finished with value: 0.7383676382054754 and parameters: {'w_lgbm': 0.3530146550270041, 'w_xgb': 0.4260216837279861}. Best is trial 2 with value: 0.7383676382054754.
[I 2025-02-15 16:00:02,130] Trial 3 finished with value: 0.7379454139474012 and parameters: {'w_lgbm': 0.21488986840698968, 'w_xgb': 0.27777165039129464}. Best is trial 2 with value: 0.7383676382054754.
[I 2025-02-15 16:00:05,475] Trial 4 finished with value: 0.7384752833817135 a

✅ Best Blending Weights: {'w_lgbm': 0.5993703344739661, 'w_xgb': 0.200713664663019}


In [None]:
# Apply the optimised blending weights; CatBoost receives the remainder
w_lgbm = best_weights['w_lgbm']
w_xgb = best_weights['w_xgb']
w_cat = 1.0 - (w_lgbm + w_xgb)

# Evaluate the blend on the validation split
weighted_models = ((w_lgbm, lgbm_best), (w_xgb, xgb_best), (w_cat, cat_best))
y_val_pred_blend = sum(
    weight * model.predict_proba(X_val)[:, 1] for weight, model in weighted_models
)

roc_auc_blend = roc_auc_score(y_val, y_val_pred_blend)
print(f'✅ Blended Model Validation ROC-AUC: {roc_auc_blend:.4f}')

✅ Blended Model Validation ROC-AUC: 0.7387


In [None]:
# Predict on the test data: weighted sum of each model's positive-class probability
test_preds_blend = sum(
    weight * model.predict_proba(df_test)[:, 1]
    for weight, model in ((w_lgbm, lgbm_best), (w_xgb, xgb_best), (w_cat, cat_best))
)



In [None]:
# Create the submission file.
# Uses test_preds_blend, the blended predictions computed in the previous
# cell; the earlier reference to `test_preds` is not defined anywhere in this
# notebook and only worked when a stale variable was left in the kernel.
submission = pd.DataFrame({'ID': df_test_ids, 'probability': test_preds_blend})
submission.to_csv('submission_25.csv', index=False)
print("✅ Submission file saved: submission_25.csv")

✅ Submission file saved: submission_25.csv
