# 1.행별 공통결측치 지우고 돌리기

In [168]:
import pandas as pd
df = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\preprocessed_tmp.csv')

In [169]:
#결측치가 있는 행 제거 
df = df.dropna()

In [172]:
target = df['임신 성공 여부']
df = df.drop(columns=['임신 성공 여부'],axis=1)

In [174]:
df = df.drop(columns=['Unnamed: 0'],axis=1)

# XGBOOST

In [69]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def optimize_xgboost(df, y):
    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)
    
    def objective(trial):
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss", 
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        }

        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train)

        preds_proba = model.predict_proba(X_valid)[:, 1] 
        auc_score = roc_auc_score(y_valid, preds_proba) 

        return auc_score  

 
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50) 


    print("Best hyperparameters:", study.best_params)


    best_params = study.best_params
    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    preds_proba = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds_proba)
    print(f"Final AUC-ROC: {auc_score:.4f}")

    return model, study.best_params

In [71]:
model, best_params = optimize_xgboost(df,target)

[I 2025-02-10 14:22:27,351] A new study created in memory with name: no-name-90dc24c4-b6b4-48b5-aace-51e674e3adf9
[I 2025-02-10 14:22:29,547] Trial 0 finished with value: 0.7364270111503513 and parameters: {'learning_rate': 0.14582705100844515, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 0.8482673818181203, 'colsample_bytree': 0.9768622587095395, 'reg_alpha': 0.41879233104772606, 'reg_lambda': 0.584109749248326, 'n_estimators': 444}. Best is trial 0 with value: 0.7364270111503513.
[I 2025-02-10 14:22:32,866] Trial 1 finished with value: 0.71136565520416 and parameters: {'learning_rate': 0.19116190340681735, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.7318066313088285, 'colsample_bytree': 0.6566511385702803, 'reg_alpha': 0.05620097574771765, 'reg_lambda': 0.4955684684765236, 'n_estimators': 452}. Best is trial 0 with value: 0.7364270111503513.
[I 2025-02-10 14:22:33,721] Trial 2 finished with value: 0.7334028089765818 and parameters: {'learning_rate': 0.185000767281412

Best hyperparameters: {'learning_rate': 0.038559669778046056, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.927786767214776, 'colsample_bytree': 0.7058848853411166, 'reg_alpha': 0.16706107306297407, 'reg_lambda': 0.5942216185246811, 'n_estimators': 318}
Final AUC-ROC: 0.7398


# CatBoost

In [73]:
import optuna
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

def optimize_catboost(df, y):
    if y.dtype == 'object':  
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 500), 
            "depth": trial.suggest_int("depth", 4, 8), 
            "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.1), 
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "loss_function": "Logloss",
            "task_type": "CPU",  
            "thread_count": 16,  
            "verbose": 0 
        }

        model = cb.CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

        preds_proba = model.predict_proba(X_valid)[:, 1]  
        auc_score = roc_auc_score(y_valid, preds_proba)

        return auc_score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30) 

    print("Best hyperparameters:", study.best_params)

    best_params = study.best_params
    model = cb.CatBoostClassifier(**best_params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

    preds_proba = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds_proba)
    print(f"Final AUC-ROC: {auc_score:.4f}")

    return model, study.best_params

In [75]:
optimize_catboost(df,target)

[I 2025-02-10 14:24:27,673] A new study created in memory with name: no-name-042a8073-9101-406a-aa29-8ec6babe0421
[I 2025-02-10 14:24:30,828] Trial 0 finished with value: 0.738432351114694 and parameters: {'iterations': 190, 'depth': 5, 'learning_rate': 0.07311006840592245, 'l2_leaf_reg': 5.826686928993293}. Best is trial 0 with value: 0.738432351114694.
[I 2025-02-10 14:24:35,051] Trial 1 finished with value: 0.7391012995004602 and parameters: {'iterations': 233, 'depth': 6, 'learning_rate': 0.058740476190534595, 'l2_leaf_reg': 9.57551902260205}. Best is trial 1 with value: 0.7391012995004602.
[I 2025-02-10 14:24:37,119] Trial 2 finished with value: 0.7381438039663895 and parameters: {'iterations': 111, 'depth': 6, 'learning_rate': 0.07486446182036957, 'l2_leaf_reg': 9.010067667898157}. Best is trial 1 with value: 0.7391012995004602.
[I 2025-02-10 14:24:39,649] Trial 3 finished with value: 0.7387593843468064 and parameters: {'iterations': 141, 'depth': 6, 'learning_rate': 0.0926329256

Best hyperparameters: {'iterations': 395, 'depth': 7, 'learning_rate': 0.06341843233816352, 'l2_leaf_reg': 1.9209893310244492}
0:	learn: 0.6514444	test: 0.6516668	best: 0.6516668 (0)	total: 21ms	remaining: 8.28s
1:	learn: 0.6199663	test: 0.6202562	best: 0.6202562 (1)	total: 39.7ms	remaining: 7.8s
2:	learn: 0.5989082	test: 0.5992261	best: 0.5992261 (2)	total: 58.7ms	remaining: 7.67s
3:	learn: 0.5791515	test: 0.5794860	best: 0.5794860 (3)	total: 77.2ms	remaining: 7.54s
4:	learn: 0.5667154	test: 0.5670281	best: 0.5670281 (4)	total: 95.3ms	remaining: 7.43s
5:	learn: 0.5561026	test: 0.5563918	best: 0.5563918 (5)	total: 113ms	remaining: 7.35s
6:	learn: 0.5463260	test: 0.5466251	best: 0.5466251 (6)	total: 132ms	remaining: 7.3s
7:	learn: 0.5384849	test: 0.5387752	best: 0.5387752 (7)	total: 150ms	remaining: 7.27s
8:	learn: 0.5341647	test: 0.5343861	best: 0.5343861 (8)	total: 169ms	remaining: 7.24s
9:	learn: 0.5286428	test: 0.5288532	best: 0.5288532 (9)	total: 188ms	remaining: 7.25s
10:	learn: 0

(<catboost.core.CatBoostClassifier at 0x1aa4efa3f20>,
 {'iterations': 395,
  'depth': 7,
  'learning_rate': 0.06341843233816352,
  'l2_leaf_reg': 1.9209893310244492})

# Cat_Feature에서 시술 당시 나이를 범주형 처리시(결측치 제외)

In [106]:
age_df = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\CatBoost_preprocessed.csv')

In [120]:
#결측치가 있는 행 제거 
age_df = age_df.dropna()

In [122]:
age_target = age_df['임신 성공 여부']
age_df = age_df.drop(columns=['임신 성공 여부'],axis=1)

In [114]:
import optuna
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

def optimize_catboost(df, y):
    if y.dtype == 'object':  
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)

    cat_features = ['시술 당시 나이']
    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 500), 
            "depth": trial.suggest_int("depth", 4, 8), 
            "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.1), 
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "loss_function": "Logloss",
            "cat_features": cat_features,
            "task_type": "CPU",  
            "thread_count": 16,  
            "verbose": 0 
        }

        model = cb.CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

        preds_proba = model.predict_proba(X_valid)[:, 1]  
        auc_score = roc_auc_score(y_valid, preds_proba)

        return auc_score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30) 

    print("Best hyperparameters:", study.best_params)

    best_params = study.best_params
    model = cb.CatBoostClassifier(**best_params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50,cat_features = ['시술 당시 나이'])

    preds_proba = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds_proba)
    print(f"Final AUC-ROC: {auc_score:.4f}")

    return model, study.best_params

In [124]:
optimize_catboost(age_df,age_target)

[I 2025-02-10 14:45:50,193] A new study created in memory with name: no-name-521782bd-dcb2-4415-aa7c-8d8d6e89ac79
[I 2025-02-10 14:46:27,381] Trial 0 finished with value: 0.739305879273113 and parameters: {'iterations': 439, 'depth': 5, 'learning_rate': 0.056903249728466734, 'l2_leaf_reg': 8.441933999315413}. Best is trial 0 with value: 0.739305879273113.
[I 2025-02-10 14:46:58,528] Trial 1 finished with value: 0.7391335896212525 and parameters: {'iterations': 303, 'depth': 8, 'learning_rate': 0.08315071611492689, 'l2_leaf_reg': 5.386484231428273}. Best is trial 0 with value: 0.739305879273113.
[I 2025-02-10 14:47:09,728] Trial 2 finished with value: 0.7391123116662819 and parameters: {'iterations': 128, 'depth': 8, 'learning_rate': 0.08768705310889072, 'l2_leaf_reg': 1.6334601978678434}. Best is trial 0 with value: 0.739305879273113.
[I 2025-02-10 14:47:42,982] Trial 3 finished with value: 0.7392276693641818 and parameters: {'iterations': 273, 'depth': 8, 'learning_rate': 0.0653374618

Best hyperparameters: {'iterations': 315, 'depth': 8, 'learning_rate': 0.06654432435609951, 'l2_leaf_reg': 7.237782374120956}
0:	learn: 0.6523578	test: 0.6525197	best: 0.6525197 (0)	total: 114ms	remaining: 35.7s
1:	learn: 0.6205872	test: 0.6208564	best: 0.6208564 (1)	total: 232ms	remaining: 36.4s
2:	learn: 0.6037694	test: 0.6039884	best: 0.6039884 (2)	total: 345ms	remaining: 35.8s
3:	learn: 0.5860418	test: 0.5862525	best: 0.5862525 (3)	total: 452ms	remaining: 35.2s
4:	learn: 0.5746783	test: 0.5748365	best: 0.5748365 (4)	total: 566ms	remaining: 35.1s
5:	learn: 0.5593597	test: 0.5595459	best: 0.5595459 (5)	total: 674ms	remaining: 34.7s
6:	learn: 0.5498323	test: 0.5499942	best: 0.5499942 (6)	total: 789ms	remaining: 34.7s
7:	learn: 0.5426979	test: 0.5427999	best: 0.5427999 (7)	total: 904ms	remaining: 34.7s
8:	learn: 0.5369425	test: 0.5370002	best: 0.5370002 (8)	total: 1.02s	remaining: 34.7s
9:	learn: 0.5295569	test: 0.5296612	best: 0.5296612 (9)	total: 1.13s	remaining: 34.4s
10:	learn: 0.5

(<catboost.core.CatBoostClassifier at 0x1aa4f10e9f0>,
 {'iterations': 315,
  'depth': 8,
  'learning_rate': 0.06654432435609951,
  'l2_leaf_reg': 7.237782374120956})

# 2.행 별 공통결측치를 평균으로 매꾸고 돌리기 

In [176]:
df.fillna(df.mean(), inplace=True)

# XGBOOST

In [181]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def optimize_xgboost(df, y):
    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)
    
    def objective(trial):
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss", 
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        }

        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train)

        preds_proba = model.predict_proba(X_valid)[:, 1] 
        auc_score = roc_auc_score(y_valid, preds_proba) 

        return auc_score  

 
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50) 


    print("Best hyperparameters:", study.best_params)


    best_params = study.best_params
    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    preds_proba = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds_proba)
    print(f"Final AUC-ROC: {auc_score:.4f}")

    return model, study.best_params

In [187]:
model, best_params = optimize_xgboost(df,target)

[I 2025-02-10 15:48:44,254] A new study created in memory with name: no-name-c644a8e1-3ce8-4c5a-bb53-3ed4ed545e63
[I 2025-02-10 15:48:45,394] Trial 0 finished with value: 0.7381791500850019 and parameters: {'learning_rate': 0.03876229392400183, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.5588881729401745, 'colsample_bytree': 0.7796939865047814, 'reg_alpha': 0.03470096400228828, 'reg_lambda': 0.8846285430769367, 'n_estimators': 144}. Best is trial 0 with value: 0.7381791500850019.
[I 2025-02-10 15:48:46,848] Trial 1 finished with value: 0.730819652181546 and parameters: {'learning_rate': 0.25611028629654975, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.9410892710154601, 'colsample_bytree': 0.6058301537408308, 'reg_alpha': 0.7584962766881315, 'reg_lambda': 0.19690319642792697, 'n_estimators': 189}. Best is trial 0 with value: 0.7381791500850019.
[I 2025-02-10 15:48:48,485] Trial 2 finished with value: 0.7291163242080057 and parameters: {'learning_rate': 0.2756480824143

Best hyperparameters: {'learning_rate': 0.01806187091247416, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.9468182735635396, 'colsample_bytree': 0.5221968457919561, 'reg_alpha': 0.869809225643778, 'reg_lambda': 0.01038946519531564, 'n_estimators': 473}
Final AUC-ROC: 0.7399


In [191]:
import optuna
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

def optimize_catboost(df, y):
    if y.dtype == 'object':  
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 100, 500), 
            "depth": trial.suggest_int("depth", 4, 8), 
            "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.1), 
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "loss_function": "Logloss",
            "task_type": "CPU",  
            "thread_count": 16,  
            "verbose": 0 
        }

        model = cb.CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

        preds_proba = model.predict_proba(X_valid)[:, 1]  
        auc_score = roc_auc_score(y_valid, preds_proba)

        return auc_score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30) 

    print("Best hyperparameters:", study.best_params)

    best_params = study.best_params
    model = cb.CatBoostClassifier(**best_params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

    preds_proba = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds_proba)
    print(f"Final AUC-ROC: {auc_score:.4f}")

    return model, study.best_params

In [193]:
optimize_catboost(df,target)

[I 2025-02-10 15:52:05,913] A new study created in memory with name: no-name-6c1829dc-dda2-4b5b-b696-a17ddbe24976
[I 2025-02-10 15:52:09,755] Trial 0 finished with value: 0.7386715886808055 and parameters: {'iterations': 217, 'depth': 4, 'learning_rate': 0.08793969495390443, 'l2_leaf_reg': 8.145118698357532}. Best is trial 0 with value: 0.7386715886808055.
[I 2025-02-10 15:52:14,723] Trial 1 finished with value: 0.7397006358219085 and parameters: {'iterations': 215, 'depth': 7, 'learning_rate': 0.08659495043661242, 'l2_leaf_reg': 1.8781013032386542}. Best is trial 1 with value: 0.7397006358219085.
[I 2025-02-10 15:52:19,682] Trial 2 finished with value: 0.7391716651978906 and parameters: {'iterations': 194, 'depth': 8, 'learning_rate': 0.07131918557715022, 'l2_leaf_reg': 1.461363269089502}. Best is trial 1 with value: 0.7397006358219085.
[I 2025-02-10 15:52:24,496] Trial 3 finished with value: 0.7394353963584701 and parameters: {'iterations': 241, 'depth': 6, 'learning_rate': 0.0683678

Best hyperparameters: {'iterations': 335, 'depth': 7, 'learning_rate': 0.06578041597022108, 'l2_leaf_reg': 4.734732133666494}
0:	learn: 0.6504429	test: 0.6506593	best: 0.6506593 (0)	total: 22.7ms	remaining: 7.58s
1:	learn: 0.6185242	test: 0.6188148	best: 0.6188148 (1)	total: 46ms	remaining: 7.66s
2:	learn: 0.5970602	test: 0.5973721	best: 0.5973721 (2)	total: 68.8ms	remaining: 7.62s
3:	learn: 0.5773828	test: 0.5777028	best: 0.5777028 (3)	total: 90.9ms	remaining: 7.52s
4:	learn: 0.5648480	test: 0.5651434	best: 0.5651434 (4)	total: 113ms	remaining: 7.48s
5:	learn: 0.5542323	test: 0.5544961	best: 0.5544961 (5)	total: 136ms	remaining: 7.44s
6:	learn: 0.5445946	test: 0.5448663	best: 0.5448663 (6)	total: 157ms	remaining: 7.37s
7:	learn: 0.5370113	test: 0.5372207	best: 0.5372207 (7)	total: 180ms	remaining: 7.35s
8:	learn: 0.5324618	test: 0.5326253	best: 0.5326253 (8)	total: 202ms	remaining: 7.3s
9:	learn: 0.5271007	test: 0.5272496	best: 0.5272496 (9)	total: 225ms	remaining: 7.32s
10:	learn: 0.

(<catboost.core.CatBoostClassifier at 0x1aa4e328d10>,
 {'iterations': 335,
  'depth': 7,
  'learning_rate': 0.06578041597022108,
  'l2_leaf_reg': 4.734732133666494})