# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

# Setting

In [2]:
pd.set_option("display.max_rows", None)  # 모든 행 출력
pd.set_option("display.max_columns", None)  # 모든 열 출력
rc('font', family='AppleGothic')
pd.options.display.float_format = '{:.4f}'.format

# plt.rcParams['font.family'] = 'AppleGothic'  # 또는 'Malgun Gothic' (Windows 사용자)
plt.rcParams['axes.unicode_minus'] = False   # 음수 기호 깨짐 방지

# Data

In [13]:
# Data
file_path = '/Users/toad/Documents/ToyProject/LG_Aimers_Infertility/'

train = pd.read_csv(file_path + 'Data/train.csv').drop(columns=['ID'])
test = pd.read_csv(file_path + 'Data/test.csv').drop(columns=['ID'])

In [14]:
cat2num = {
    '0회': 0, '1회': 1, '2회': 2, '3회': 3, '4회': 4, '5회': 5, '6회 이상': 6
}
target_feature = [
    '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수',
    'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수',
    '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수'
]
for t in target_feature:
    train[t] = train[t].map(cat2num)
    test[t] = test[t].map(cat2num)

In [15]:

def safe_divide(numerator, denominator):
    return np.where(denominator == 0, 0, numerator / denominator).astype(float) * 100

def derived_features(train, test):
    train["전체 시술 임신 성공률"] = safe_divide(train["총 임신 횟수"], train["총 시술 횟수"])
    test["전체 시술 임신 성공률"] = safe_divide(test["총 임신 횟수"], test["총 시술 횟수"])

    train["전체 시술 출산 성공률"] = safe_divide(train["총 출산 횟수"], train["총 시술 횟수"])
    test["전체 시술 출산 성공률"] = safe_divide(test["총 출산 횟수"], test["총 시술 횟수"])

    train["배아 이식 대비 임신률"] = safe_divide(train["총 임신 횟수"], train["이식된 배아 수"])
    test["배아 이식 대비 임신률"] = safe_divide(test["총 임신 횟수"], test["이식된 배아 수"])

    train["배아 활용률"] = safe_divide(train["저장된 배아 수"] + train["해동된 배아 수"], train["총 생성 배아 수"])
    test["배아 활용률"] = safe_divide(test["저장된 배아 수"] + test["해동된 배아 수"], test["총 생성 배아 수"])

    train["IVF 임신 성공률"] = safe_divide(train["IVF 임신 횟수"], train["IVF 시술 횟수"])
    test["IVF 임신 성공률"] = safe_divide(test["IVF 임신 횟수"], test["IVF 시술 횟수"])

    train["IVF 출산 성공률"] = safe_divide(train["IVF 출산 횟수"], train["IVF 시술 횟수"])
    test["IVF 출산 성공률"] = safe_divide(test["IVF 출산 횟수"], test["IVF 시술 횟수"])

    train["DI 임신 성공률"] = safe_divide(train["DI 임신 횟수"], train["DI 시술 횟수"])
    test["DI 임신 성공률"] = safe_divide(test["DI 임신 횟수"], test["DI 시술 횟수"])

    train["DI 출산 성공률"] = safe_divide(train["DI 출산 횟수"], train["DI 시술 횟수"])
    test["DI 출산 성공률"] = safe_divide(test["DI 출산 횟수"], test["DI 시술 횟수"])

    train["미세주입 배아 생성률"] = safe_divide(train["미세주입에서 생성된 배아 수"], train["미세주입된 난자 수"])
    test["미세주입 배아 생성률"] = safe_divide(test["미세주입에서 생성된 배아 수"], test["미세주입된 난자 수"])

    train["미세주입 배아 이식률"] = safe_divide(train["미세주입 배아 이식 수"], train["미세주입에서 생성된 배아 수"])
    test["미세주입 배아 이식률"] = safe_divide(test["미세주입 배아 이식 수"], test["미세주입에서 생성된 배아 수"])

    train["해동 비율"] = safe_divide(train["해동된 배아 수"], train["해동 난자 수"])
    test["해동 비율"] = safe_divide(test["해동된 배아 수"], test["해동 난자 수"])
    
    return train, test

columns_to_replace = ["전체 시술 임신 성공률", "전체 시술 출산 성공률", "배아 이식 대비 임신률", "배아 활용률", 
                      "IVF 임신 성공률", "IVF 출산 성공률", "DI 임신 성공률", "DI 출산 성공률", 
                      "미세주입 배아 생성률", "미세주입 배아 이식률", "해동 비율"]

In [16]:
train, test = derived_features(train, test)

for col in columns_to_replace:
    train[col].replace([np.inf, -np.inf], 0, inplace=True)
    test[col].replace([np.inf, -np.inf], 0, inplace=True)
    train[col].fillna(0, inplace=True)
    test[col].fillna(0, inplace=True)

for col in columns_to_replace:
    train[col] = train[col].astype(float).clip(0, 100)
    test[col] = test[col].astype(float).clip(0, 100)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are

In [17]:
y = train.pop('임신 성공 여부')

In [18]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [19]:
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='mean')

train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

categorical_cols = train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_cols] = ordinal_encoder.fit_transform(train[categorical_cols])
test[categorical_cols] = ordinal_encoder.transform(test[categorical_cols])

In [20]:
columns_to_drop = [
    "남성 주 불임 원인","남성 부 불임 원인",
    "여성 주 불임 원인","여성 부 불임 원인","부부 주 불임 원인","부부 부 불임 원인",
    "불임 원인 - 자궁경부 문제","불임 원인 - 자궁내막증","불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인","불임 원인 - 정자 운동성","불임 원인 - 정자 형태",
    "DI 임신 횟수","IVF 출산 횟수","DI 출산 횟수","기증자 정자와 혼합된 난자 수","대리모 여부","난자 혼합 경과일",
]

train.drop(columns=columns_to_drop, axis=1, inplace=True)
test.drop(columns=columns_to_drop, axis=1, inplace=True)

# Data Split

In [30]:
from sklearn.model_selection import train_test_split

x_train, x_temp, y_train, y_temp = train_test_split(train, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

print(f'Train : {x_train.shape}, {y_train.shape}')
print(f'Validation : {x_val.shape}, {y_val.shape}')
print(f'Test : {x_test.shape}, {y_test.shape}')
print(f'Submission : {test.shape}')

Train : (205080, 60), (205080,)
Validation : (25635, 60), (25635,)
Test : (25636, 60), (25636,)
Submission : (90067, 60)


# Train

In [31]:
from catboost import CatBoostRegressor

In [34]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
import optuna

In [23]:
from scipy.special import expit
def custom_auc(y_true, y_pred):
    """LightGBM의 eval_metric을 위한 커스텀 AUC 함수"""
    y_pred = expit(y_pred)  # Sigmoid 변환 (0~1 범위로 조정)
    return "auc", roc_auc_score(y_true, y_pred), True

In [48]:
models = ['lgbm', 'xgb', 'catboost']
best_params = []
def objective(trial, model_type='lgbm'):
    param = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
    }

    if model_type == "lgbm":
        param.update({
            'objective': 'regression',
            'metric': 'rmse',
            'eval_metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': trial.suggest_int('num_leaves', 20, 80),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # LGBM 전용
            'verbose': -1,
            'early_stopping_rounds': 50,
            'n_jobs': -1
        })
    elif model_type == "xgb":
        param.update({
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'hist',
            'gamma': trial.suggest_float('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # XGBoost 전용
            'verbose' : 0,
        })
    elif model_type == "catboost":
        param.update({
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),  # ✅ CatBoost 전용
            'verbose': False
        })
    

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in skf.split(x_train, y_train):
        x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

         # 모델 선택
        if model_type == "lgbm":
            model = LGBMRegressor(**param)
        elif model_type == "xgb":
            model = XGBRegressor(**param)
        elif model_type == "catboost":
            model = CatBoostRegressor(**param)
        # 모델 학습
        model.fit(x_train_fold, y_train_fold, 
                eval_set=[(x_val_fold, y_val_fold)], 
                )
        # preds = model.predict_proba(x_val)[:, 1]
        preds = model.predict(x_val_fold)
        preds = expit(preds) # sigmoid 변환
        auc_score = roc_auc_score(y_val_fold, preds)
        scores.append(auc_score)
    return np.mean(scores)

for model_type in models:
    print(f"Model : {model_type}")
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, model_type), n_trials=5)
    best_params.append(study.best_params)
    print("Best hyperparameters({model_type}):", study.best_params)

[I 2025-02-19 22:27:31,521] A new study created in memory with name: no-name-55df75cc-432d-42bb-91bf-6a68deb60509


Model : lgbm


[I 2025-02-19 22:27:35,811] Trial 0 finished with value: 0.7385260448260879 and parameters: {'max_depth': 5, 'learning_rate': 0.13703065941981868, 'n_estimators': 817, 'subsample': 0.7002346740331031, 'num_leaves': 47, 'colsample_bytree': 0.864922271230465}. Best is trial 0 with value: 0.7385260448260879.
[I 2025-02-19 22:27:40,799] Trial 1 finished with value: 0.738242515594056 and parameters: {'max_depth': 5, 'learning_rate': 0.1736967988772074, 'n_estimators': 341, 'subsample': 0.8425970351810051, 'num_leaves': 32, 'colsample_bytree': 0.8728085271214501}. Best is trial 0 with value: 0.7385260448260879.
[I 2025-02-19 22:27:52,148] Trial 2 finished with value: 0.7387461905993223 and parameters: {'max_depth': 5, 'learning_rate': 0.03616798248306044, 'n_estimators': 440, 'subsample': 0.7103762092889467, 'num_leaves': 64, 'colsample_bytree': 0.8471844883421515}. Best is trial 2 with value: 0.7387461905993223.
[I 2025-02-19 22:27:55,710] Trial 3 finished with value: 0.7387298596278358 and

Best hyperparameters({model_type}): {'max_depth': 5, 'learning_rate': 0.03616798248306044, 'n_estimators': 440, 'subsample': 0.7103762092889467, 'num_leaves': 64, 'colsample_bytree': 0.8471844883421515}
Model : xgb
[0]	validation_0-rmse:0.43310
[1]	validation_0-rmse:0.42956
[2]	validation_0-rmse:0.42570
[3]	validation_0-rmse:0.42252
[4]	validation_0-rmse:0.42012
[5]	validation_0-rmse:0.41823
[6]	validation_0-rmse:0.41650
[7]	validation_0-rmse:0.41509
[8]	validation_0-rmse:0.41393
[9]	validation_0-rmse:0.41311
[10]	validation_0-rmse:0.41232
[11]	validation_0-rmse:0.41167
[12]	validation_0-rmse:0.41122
[13]	validation_0-rmse:0.41078
[14]	validation_0-rmse:0.41050
[15]	validation_0-rmse:0.41014
[16]	validation_0-rmse:0.40981
[17]	validation_0-rmse:0.40956
[18]	validation_0-rmse:0.40938
[19]	validation_0-rmse:0.40920
[20]	validation_0-rmse:0.40903
[21]	validation_0-rmse:0.40889
[22]	validation_0-rmse:0.40880
[23]	validation_0-rmse:0.40870
[24]	validation_0-rmse:0.40861
[25]	validation_0-rm

[I 2025-02-19 22:28:42,790] Trial 0 finished with value: 0.7391695016317043 and parameters: {'max_depth': 5, 'learning_rate': 0.11122347828588397, 'n_estimators': 363, 'subsample': 0.7654733123293791, 'gamma': 0.6329122323134845, 'reg_alpha': 2.1050421899601823, 'reg_lambda': 3.60608038576645, 'colsample_bytree': 0.8585541278196673}. Best is trial 0 with value: 0.7391695016317043.


[0]	validation_0-rmse:0.43369
[1]	validation_0-rmse:0.43013
[2]	validation_0-rmse:0.42650
[3]	validation_0-rmse:0.42342
[4]	validation_0-rmse:0.42100
[5]	validation_0-rmse:0.41912
[6]	validation_0-rmse:0.41730
[7]	validation_0-rmse:0.41577
[8]	validation_0-rmse:0.41454
[9]	validation_0-rmse:0.41366
[10]	validation_0-rmse:0.41278
[11]	validation_0-rmse:0.41199
[12]	validation_0-rmse:0.41146
[13]	validation_0-rmse:0.41091
[14]	validation_0-rmse:0.41055
[15]	validation_0-rmse:0.41017
[16]	validation_0-rmse:0.40985
[17]	validation_0-rmse:0.40957
[18]	validation_0-rmse:0.40932
[19]	validation_0-rmse:0.40914
[20]	validation_0-rmse:0.40896
[21]	validation_0-rmse:0.40881
[22]	validation_0-rmse:0.40870
[23]	validation_0-rmse:0.40858
[24]	validation_0-rmse:0.40851
[25]	validation_0-rmse:0.40843
[26]	validation_0-rmse:0.40834
[27]	validation_0-rmse:0.40825
[28]	validation_0-rmse:0.40818
[29]	validation_0-rmse:0.40813
[30]	validation_0-rmse:0.40809
[31]	validation_0-rmse:0.40805
[32]	validation_0-

[I 2025-02-19 22:29:50,141] Trial 1 finished with value: 0.7388143060043975 and parameters: {'max_depth': 7, 'learning_rate': 0.09535949390472166, 'n_estimators': 623, 'subsample': 0.6257421360169542, 'gamma': 1.0907769624331558, 'reg_alpha': 1.5235523745137276, 'reg_lambda': 3.2591785825850126, 'colsample_bytree': 0.7927823665044877}. Best is trial 0 with value: 0.7391695016317043.


[0]	validation_0-rmse:0.43480
[1]	validation_0-rmse:0.43169
[2]	validation_0-rmse:0.42891
[3]	validation_0-rmse:0.42642
[4]	validation_0-rmse:0.42439
[5]	validation_0-rmse:0.42248
[6]	validation_0-rmse:0.42081
[7]	validation_0-rmse:0.41930
[8]	validation_0-rmse:0.41803
[9]	validation_0-rmse:0.41690
[10]	validation_0-rmse:0.41598
[11]	validation_0-rmse:0.41515
[12]	validation_0-rmse:0.41442
[13]	validation_0-rmse:0.41373
[14]	validation_0-rmse:0.41315
[15]	validation_0-rmse:0.41262
[16]	validation_0-rmse:0.41212
[17]	validation_0-rmse:0.41172
[18]	validation_0-rmse:0.41136
[19]	validation_0-rmse:0.41104
[20]	validation_0-rmse:0.41077
[21]	validation_0-rmse:0.41048
[22]	validation_0-rmse:0.41028
[23]	validation_0-rmse:0.41008
[24]	validation_0-rmse:0.40987
[25]	validation_0-rmse:0.40971
[26]	validation_0-rmse:0.40956
[27]	validation_0-rmse:0.40943
[28]	validation_0-rmse:0.40929
[29]	validation_0-rmse:0.40918
[30]	validation_0-rmse:0.40908
[31]	validation_0-rmse:0.40899
[32]	validation_0-

[I 2025-02-19 22:30:35,988] Trial 2 finished with value: 0.7386536089441721 and parameters: {'max_depth': 5, 'learning_rate': 0.07226027696116928, 'n_estimators': 376, 'subsample': 0.5017802938089374, 'gamma': 1.1138786743381202, 'reg_alpha': 3.3135505227352393, 'reg_lambda': 4.590310650793137, 'colsample_bytree': 0.937949055967786}. Best is trial 0 with value: 0.7391695016317043.


[0]	validation_0-rmse:0.43386
[1]	validation_0-rmse:0.43116
[2]	validation_0-rmse:0.42776
[3]	validation_0-rmse:0.42490
[4]	validation_0-rmse:0.42258
[5]	validation_0-rmse:0.42074
[6]	validation_0-rmse:0.41897
[7]	validation_0-rmse:0.41752
[8]	validation_0-rmse:0.41634
[9]	validation_0-rmse:0.41544
[10]	validation_0-rmse:0.41454
[11]	validation_0-rmse:0.41373
[12]	validation_0-rmse:0.41314
[13]	validation_0-rmse:0.41259
[14]	validation_0-rmse:0.41224
[15]	validation_0-rmse:0.41178
[16]	validation_0-rmse:0.41144
[17]	validation_0-rmse:0.41110
[18]	validation_0-rmse:0.41084
[19]	validation_0-rmse:0.41058
[20]	validation_0-rmse:0.41037
[21]	validation_0-rmse:0.41024
[22]	validation_0-rmse:0.41013
[23]	validation_0-rmse:0.41000
[24]	validation_0-rmse:0.40987
[25]	validation_0-rmse:0.40974
[26]	validation_0-rmse:0.40959
[27]	validation_0-rmse:0.40947
[28]	validation_0-rmse:0.40943
[29]	validation_0-rmse:0.40938
[30]	validation_0-rmse:0.40931
[31]	validation_0-rmse:0.40927
[32]	validation_0-

[I 2025-02-19 22:31:19,605] Trial 3 finished with value: 0.735043375564569 and parameters: {'max_depth': 4, 'learning_rate': 0.0964489065115604, 'n_estimators': 411, 'subsample': 0.8264114799021542, 'gamma': 4.958095281255147, 'reg_alpha': 2.9329962950039907, 'reg_lambda': 0.8604254827441526, 'colsample_bytree': 0.7873889230022074}. Best is trial 0 with value: 0.7391695016317043.


[0]	validation_0-rmse:0.43521
[1]	validation_0-rmse:0.43271
[2]	validation_0-rmse:0.42968
[3]	validation_0-rmse:0.42689
[4]	validation_0-rmse:0.42555
[5]	validation_0-rmse:0.42362
[6]	validation_0-rmse:0.42163
[7]	validation_0-rmse:0.41993
[8]	validation_0-rmse:0.41851
[9]	validation_0-rmse:0.41744
[10]	validation_0-rmse:0.41638
[11]	validation_0-rmse:0.41546
[12]	validation_0-rmse:0.41474
[13]	validation_0-rmse:0.41401
[14]	validation_0-rmse:0.41353
[15]	validation_0-rmse:0.41299
[16]	validation_0-rmse:0.41249
[17]	validation_0-rmse:0.41202
[18]	validation_0-rmse:0.41170
[19]	validation_0-rmse:0.41146
[20]	validation_0-rmse:0.41116
[21]	validation_0-rmse:0.41093
[22]	validation_0-rmse:0.41072
[23]	validation_0-rmse:0.41054
[24]	validation_0-rmse:0.41041
[25]	validation_0-rmse:0.41026
[26]	validation_0-rmse:0.41009
[27]	validation_0-rmse:0.41000
[28]	validation_0-rmse:0.40985
[29]	validation_0-rmse:0.40976
[30]	validation_0-rmse:0.40966
[31]	validation_0-rmse:0.40959
[32]	validation_0-

[I 2025-02-19 22:32:11,662] Trial 4 finished with value: 0.7356825335018174 and parameters: {'max_depth': 6, 'learning_rate': 0.07762192484339268, 'n_estimators': 532, 'subsample': 0.6390454434876959, 'gamma': 3.765812226673032, 'reg_alpha': 4.657044263898371, 'reg_lambda': 4.8107121136935636, 'colsample_bytree': 0.6844455812922281}. Best is trial 0 with value: 0.7391695016317043.
[I 2025-02-19 22:32:11,664] A new study created in memory with name: no-name-b2aa1ab9-71e9-457d-b1df-fd6db028ee94


Best hyperparameters({model_type}): {'max_depth': 5, 'learning_rate': 0.11122347828588397, 'n_estimators': 363, 'subsample': 0.7654733123293791, 'gamma': 0.6329122323134845, 'reg_alpha': 2.1050421899601823, 'reg_lambda': 3.60608038576645, 'colsample_bytree': 0.8585541278196673}
Model : catboost


[I 2025-02-19 22:33:06,847] Trial 0 finished with value: 0.7383397479111179 and parameters: {'max_depth': 10, 'learning_rate': 0.12737218141219975, 'n_estimators': 668, 'subsample': 0.5896769445533399, 'l2_leaf_reg': 9.5251938515435, 'border_count': 207, 'colsample_bylevel': 0.685931170104237}. Best is trial 0 with value: 0.7383397479111179.
[I 2025-02-19 22:33:49,450] Trial 1 finished with value: 0.7392884204861511 and parameters: {'max_depth': 6, 'learning_rate': 0.058957299687438514, 'n_estimators': 899, 'subsample': 0.872250036484848, 'l2_leaf_reg': 2.302103648190275, 'border_count': 230, 'colsample_bylevel': 0.8706719833442083}. Best is trial 1 with value: 0.7392884204861511.
[I 2025-02-19 22:34:15,604] Trial 2 finished with value: 0.738699496973255 and parameters: {'max_depth': 9, 'learning_rate': 0.10484065638259912, 'n_estimators': 430, 'subsample': 0.6442772354426505, 'l2_leaf_reg': 6.193461691670228, 'border_count': 134, 'colsample_bylevel': 0.7991346510660341}. Best is trial

Best hyperparameters({model_type}): {'max_depth': 5, 'learning_rate': 0.03919906140974328, 'n_estimators': 961, 'subsample': 0.6822346992655712, 'l2_leaf_reg': 8.711569534471371, 'border_count': 99, 'colsample_bylevel': 0.9503710674621726}


In [49]:
trained_models = []
for model_type, best_param in zip(models, best_params):
    if model_type == "lgbm":
        model = LGBMRegressor(**best_param)
    elif model_type == "xgb":
        model = XGBRegressor(**best_param)
    elif model_type == "catboost":
        model = CatBoostRegressor(**best_param)
    model.fit(x_train, y_train)
    trained_models.append(model)
print(trained_models)

0:	learn: 0.4361679	total: 14.9ms	remaining: 14.3s
1:	learn: 0.4344174	total: 30.6ms	remaining: 14.7s
2:	learn: 0.4328608	total: 44.4ms	remaining: 14.2s
3:	learn: 0.4313129	total: 58.4ms	remaining: 14s
4:	learn: 0.4298702	total: 71.9ms	remaining: 13.7s
5:	learn: 0.4285081	total: 84.6ms	remaining: 13.5s
6:	learn: 0.4272583	total: 97.2ms	remaining: 13.2s
7:	learn: 0.4261111	total: 111ms	remaining: 13.2s
8:	learn: 0.4250303	total: 128ms	remaining: 13.5s
9:	learn: 0.4240131	total: 142ms	remaining: 13.5s
10:	learn: 0.4230932	total: 154ms	remaining: 13.3s
11:	learn: 0.4222198	total: 168ms	remaining: 13.3s
12:	learn: 0.4214116	total: 182ms	remaining: 13.3s
13:	learn: 0.4206900	total: 200ms	remaining: 13.5s
14:	learn: 0.4199659	total: 231ms	remaining: 14.5s
15:	learn: 0.4193280	total: 268ms	remaining: 15.8s
16:	learn: 0.4187047	total: 300ms	remaining: 16.7s
17:	learn: 0.4181381	total: 333ms	remaining: 17.5s
18:	learn: 0.4175983	total: 367ms	remaining: 18.2s
19:	learn: 0.4170941	total: 381ms	re

In [50]:
predict_list = []
for model_name, model in zip(models, trained_models):
    predict_train = model.predict(x_train)
    predict_train = expit(predict_train) # sigmoid 변환

    predict_val = model.predict(x_val)
    predict_val = expit(predict_val) # sigmoid 변환

    predict_test = model.predict(x_test)
    predict_test = expit(predict_test) # sigmoid 변환
    predict_list.append([predict_train, predict_val, predict_test])

In [51]:
### Regressor
for model_name, predict in zip(models, predict_list):
    predict_train, predict_val, predict_test = predict
    print(f'Model : {model_name}')
    print(f'Train AUC : {roc_auc_score(y_train, predict_train)}')
    print(f'Validation AUC : {roc_auc_score(y_val, predict_val)}')
    print(f'Test AUC : {roc_auc_score(y_test, predict_test)}')

Model : lgbm
Train AUC : 0.752025652834561
Validation AUC : 0.7364952009274902
Test AUC : 0.7406006080224894
Model : xgb
Train AUC : 0.747082025183092
Validation AUC : 0.7360565963389789
Test AUC : 0.740905642505254
Model : catboost
Train AUC : 0.7478369716542436
Validation AUC : 0.7364186962002386
Test AUC : 0.7408559015468144


# Stacking

In [56]:
from sklearn.ensemble import StackingRegressor

# 개별 모델들의 예측값을 Feature로 변환
meta_model = LGBMRegressor(n_estimators=500, learning_rate=0.05, verbose=-1)  # LGBM을 메타 모델로 사용
lgbm, xgb, catboost = trained_models
stacking_model = StackingRegressor(
    estimators=[
        ('lgbm', lgbm),
        ('xgb', xgb),
        ('catboost', catboost)
    ],
    final_estimator=meta_model,  # LGBMRegressor를 메타 모델로 사용
    passthrough=True  # ✅ 원본 feature도 함께 사용 가능
)
stacking_model.fit(x_train, y_train)

# Validation Set 성능 평가
val_preds = stacking_model.predict(x_val)
auc_score_val = roc_auc_score(y_val, val_preds)
print(f"Validation AUC (Stacking Model): {auc_score_val:.4f}")

# Test Set 성능 평가
final_preds = stacking_model.predict(x_test)
auc_score_test = roc_auc_score(y_test, final_preds)
print(f"Test AUC (Stacking Model): {auc_score_test:.4f}")


0:	learn: 0.4361679	total: 20.5ms	remaining: 19.6s
1:	learn: 0.4344174	total: 39.7ms	remaining: 19s
2:	learn: 0.4328608	total: 57ms	remaining: 18.2s
3:	learn: 0.4313129	total: 92.7ms	remaining: 22.2s
4:	learn: 0.4298702	total: 113ms	remaining: 21.6s
5:	learn: 0.4285081	total: 138ms	remaining: 21.9s
6:	learn: 0.4272583	total: 164ms	remaining: 22.3s
7:	learn: 0.4261111	total: 197ms	remaining: 23.5s
8:	learn: 0.4250303	total: 222ms	remaining: 23.5s
9:	learn: 0.4240131	total: 243ms	remaining: 23.1s
10:	learn: 0.4230932	total: 261ms	remaining: 22.6s
11:	learn: 0.4222198	total: 277ms	remaining: 21.9s
12:	learn: 0.4214116	total: 290ms	remaining: 21.2s
13:	learn: 0.4206900	total: 303ms	remaining: 20.5s
14:	learn: 0.4199659	total: 316ms	remaining: 19.9s
15:	learn: 0.4193280	total: 328ms	remaining: 19.4s
16:	learn: 0.4187047	total: 340ms	remaining: 18.9s
17:	learn: 0.4181381	total: 355ms	remaining: 18.6s
18:	learn: 0.4175983	total: 368ms	remaining: 18.3s
19:	learn: 0.4170941	total: 382ms	remaini



Validation AUC (Stacking Model): 0.7340




Test AUC (Stacking Model): 0.7395


# Submission

In [59]:
### Regressor
pred = stacking_model.predict(test)
submission = pd.read_csv(file_path + 'Data/sample_submission.csv')
submission['probability'] = pred

submission.to_csv(file_path + 'Data/submission.csv', index=False)

