In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Load the training set, the test set and the sample submission file
# from the current working directory.
csv_paths = ('./train.csv', './test.csv', './sample_submission.csv')
df_train, df_test, df_sample_submission = map(pd.read_csv, csv_paths)

In [None]:
# Remove the ID column from both frames (in place).
# The test IDs are kept aside because the submission file is built from them.
del df_train['ID']
df_test_ids = df_test.pop('ID')

In [None]:
# Separate the target column from the features.
# y is the target column of df_train; X is df_train without that column.
target_col = 'ÏûÑÏã† ÏÑ±Í≥µ Ïó¨Î∂Ä'
y = df_train[target_col]
X = df_train.drop(columns=target_col)

In [None]:
# Check and handle missing values
print("üìä Í≤∞Ï∏°Ïπò Í∞úÏàò:")
# Print the number of missing values in each feature column of X.
print(X.isnull().sum())

üìä Í≤∞Ï∏°Ïπò Í∞úÏàò:
ÏãúÏà† ÏãúÍ∏∞ ÏΩîÎìú                      0
ÏãúÏà† ÎãπÏãú ÎÇòÏù¥                      0
ÏûÑÏã† ÏãúÎèÑ ÎòêÎäî ÎßàÏßÄÎßâ ÏûÑÏã† Í≤ΩÍ≥º Ïó∞Ïàò    246981
ÏãúÏà† Ïú†Ìòï                         0
ÌäπÏ†ï ÏãúÏà† Ïú†Ìòï                      2
                          ...  
ÎÇúÏûê Ï±ÑÏ∑® Í≤ΩÍ≥ºÏùº                 57488
ÎÇúÏûê Ìï¥Îèô Í≤ΩÍ≥ºÏùº                254915
ÎÇúÏûê ÌòºÌï© Í≤ΩÍ≥ºÏùº                 53735
Î∞∞ÏïÑ Ïù¥Ïãù Í≤ΩÍ≥ºÏùº                 43566
Î∞∞ÏïÑ Ìï¥Îèô Í≤ΩÍ≥ºÏùº                215982
Length: 67, dtype: int64


In [None]:
# Replace missing values in the numeric columns with the column mean.
# The means are learned from X only and then applied to both X and the test frame.
# NOTE(review): X still contains the rows that are later split off for validation,
# so those rows contribute to the fitted means.
numeric_features = X.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy="mean").fit(X[numeric_features])
for frame in (X, df_test):
    frame[numeric_features] = imputer.transform(frame[numeric_features])

In [None]:
# Ordinal-encode the object-dtype columns.
# The encoder is fitted on X only; values it did not see during fit are
# encoded as -1 (handle_unknown='use_encoded_value', unknown_value=-1).
categorical_features = X.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X[categorical_features])
X[categorical_features] = encoder.transform(X[categorical_features])
df_test[categorical_features] = encoder.transform(df_test[categorical_features])

In [None]:
# Feature scaling (StandardScaler) -> applied first
# Standardize the numeric columns with statistics learned from X, then apply
# the same transformation to the test frame.
# numeric_features was computed before the categorical columns were encoded,
# so the ordinal-encoded columns are not scaled here.
scaler = StandardScaler().fit(X[numeric_features])
X[numeric_features] = scaler.transform(X[numeric_features])
df_test[numeric_features] = scaler.transform(df_test[numeric_features])


In [None]:
# Feature selection (LightGBM-based, threshold adjusted)
# Rank the features with a quick LightGBM fit and keep those whose importance
# exceeds the threshold; X and df_test are narrowed to the same columns.
lgbm = LGBMClassifier(n_estimators=200, random_state=42, n_jobs=-1)
lgbm.fit(X, y)
feature_importance = pd.Series(lgbm.feature_importances_, index=X.columns).sort_values(ascending=False)

# LGBMClassifier's default importance_type is 'split', i.e. raw integer split
# counts. A 0.0005 cut-off on the raw counts can only remove features that were
# never used, so convert the counts to each feature's share of the total and
# apply the threshold to that fraction instead.
importance_share = feature_importance / feature_importance.sum()
selected_features = importance_share[importance_share > 0.0005].index.tolist()

X = X[selected_features]
df_test = df_test[selected_features]

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 771
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


In [None]:
# Check for NaN after feature selection and re-impute
# Mean-impute every remaining column (fit on X, applied to X and df_test).
# Note that this rebinds the name `imputer` to a new SimpleImputer.
# The frames are rebuilt from the returned arrays; the original index is passed
# through explicitly so that X keeps the same row labels as y instead of being
# silently renumbered from 0.
# NOTE(review): at this point the frame also holds ordinal category codes, for
# which a mean is not a meaningful fill value - consider 'most_frequent'.
imputer = SimpleImputer(strategy="mean")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
df_test = pd.DataFrame(imputer.transform(df_test), columns=df_test.columns, index=df_test.index)

In [None]:
# Train / validation split (80 % / 20 %, fixed seed).
# The target is imbalanced, so stratify on y to keep the positive rate identical
# in both parts; otherwise the validation ROC-AUC is computed on a class mix
# that can drift from the training one.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Define the Optuna objective function
def objective(trial):
    """Optuna objective: fit one stacking ensemble and score it on the hold-out split.

    Samples hyperparameters for the LightGBM, XGBoost and CatBoost base learners,
    chooses the meta learner, fits a ``StackingClassifier`` on the module-level
    ``X_train`` / ``y_train`` and evaluates it on ``X_val`` / ``y_val``.

    Args:
        trial: the Optuna trial used to sample every hyperparameter.

    Returns:
        The ROC-AUC of the fitted stack on the validation split.
    """
    # Hyperparameter search space of the individual base learners.
    # Learning rates are sampled on a log scale with suggest_float(..., log=True);
    # suggest_loguniform is deprecated and emits a warning on every trial.
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0
    # (its default is 0), so 'subsample' may be a no-op here - confirm.
    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 300, 500),
        'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.003, 0.015, log=True),
        'num_leaves': trial.suggest_int('lgbm_num_leaves', 30, 100),
        'max_depth': trial.suggest_int('lgbm_max_depth', 4, 12),
        'subsample': trial.suggest_float('lgbm_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('lgbm_colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }

    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 300, 500),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.003, 0.015, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 4, 12),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }

    catboost_params = {
        'iterations': trial.suggest_int('catboost_iterations', 300, 500),
        'learning_rate': trial.suggest_float('catboost_learning_rate', 0.003, 0.015, log=True),
        'depth': trial.suggest_int('catboost_depth', 4, 12),
        'l2_leaf_reg': trial.suggest_float('catboost_l2_leaf_reg', 2, 10),
        'random_state': 42,
        'verbose': 0
    }

    # Base learners built from the sampled parameters.
    lgbm = LGBMClassifier(**lgbm_params)
    xgb = XGBClassifier(**xgb_params)
    catboost = CatBoostClassifier(**catboost_params)

    # Meta learner: logistic regression or random forest, chosen by the trial.
    meta_model_choice = trial.suggest_categorical('meta_model', ['logistic', 'random_forest'])

    if meta_model_choice == 'logistic':
        # With passthrough=True the meta learner also receives the original
        # features; the default iteration cap is too low for it to converge,
        # so allow more iterations.
        meta_model = LogisticRegression(max_iter=1000)
    else:
        meta_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Stacking ensemble: base-learner probabilities plus the original features
    # (passthrough=True) are fed to the meta learner.
    stacked_model = StackingClassifier(
        estimators=[
            ('lgbm', lgbm),
            ('xgb', xgb),
            ('catboost', catboost)
        ],
        final_estimator=meta_model,
        stack_method='predict_proba',
        passthrough=True,
        n_jobs=-1
    )

    # Fit on the training split.
    stacked_model.fit(X_train, y_train)

    # Score the positive-class probability on the validation split.
    y_val_pred = stacked_model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, y_val_pred)

In [None]:
# Run the Optuna study
# Maximize the validation ROC-AUC returned by objective() over 30 trials.
# The sampler is seeded so that the hyperparameter search is repeatable from
# run to run as far as the sampler is concerned (TPE is Optuna's default
# sampler; only the seed is new).
# NOTE(review): objective() never calls trial.report(), so the MedianPruner has
# no intermediate values to act on; it is kept unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=30)

[I 2025-02-04 11:18:06,610] A new study created in memory with name: no-name-8daaa62a-b808-435e-930f-815253868960
  'learning_rate': trial.suggest_loguniform('lgbm_learning_rate', 0.003, 0.015),
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 0.003, 0.015),
  'learning_rate': trial.suggest_loguniform('catboost_learning_rate', 0.003, 0.015),
[I 2025-02-04 11:25:15,286] Trial 0 finished with value: 0.7139404937098265 and parameters: {'lgbm_n_estimators': 388, 'lgbm_learning_rate': 0.0042642770385571665, 'lgbm_num_leaves': 49, 'lgbm_max_depth': 6, 'lgbm_subsample': 0.5862004045294973, 'lgbm_colsample_bytree': 0.5609140602881211, 'xgb_n_estimators': 465, 'xgb_learning_rate': 0.0037380970261726653, 'xgb_max_depth': 9, 'xgb_subsample': 0.6714500597558724, 'xgb_colsample_bytree': 0.9465809988655078, 'catboost_iterations': 358, 'catboost_learning_rate': 0.0049790268126221815, 'catboost_depth': 4, 'catboost_l2_leaf_reg': 4.578384298942155, 'meta_model': 'random_forest'}. Best i

In [None]:
# Print the best hyperparameters
# Parameter dict of the best trial; keys are the names passed to trial.suggest_*
# in objective() (e.g. 'lgbm_n_estimators', 'meta_model').
best_params = study.best_params
print(f"üîπ Best Params: {best_params}")

üîπ Best Params: {'lgbm_n_estimators': 436, 'lgbm_learning_rate': 0.014768177339829039, 'lgbm_num_leaves': 87, 'lgbm_max_depth': 12, 'lgbm_subsample': 0.6628572716745862, 'lgbm_colsample_bytree': 0.7129596637133632, 'xgb_n_estimators': 347, 'xgb_learning_rate': 0.006945354657061471, 'xgb_max_depth': 10, 'xgb_subsample': 0.5033839027206597, 'xgb_colsample_bytree': 0.5110990994971739, 'catboost_iterations': 300, 'catboost_learning_rate': 0.011409195641314061, 'catboost_depth': 12, 'catboost_l2_leaf_reg': 9.955027922167531, 'meta_model': 'logistic'}


In [None]:
# Build the best stacking model
# Re-create the three base learners from the best trial.
# Each kwargs dict maps an estimator argument to the value stored in best_params
# under the corresponding prefixed key; the fixed settings (seed, verbosity)
# are passed alongside.
lgbm_kwargs = {
    'n_estimators': best_params['lgbm_n_estimators'],
    'learning_rate': best_params['lgbm_learning_rate'],
    'num_leaves': best_params['lgbm_num_leaves'],
    'max_depth': best_params['lgbm_max_depth'],
    'subsample': best_params['lgbm_subsample'],
    'colsample_bytree': best_params['lgbm_colsample_bytree'],
}
final_lgbm = LGBMClassifier(random_state=42, **lgbm_kwargs)

xgb_kwargs = {
    'n_estimators': best_params['xgb_n_estimators'],
    'learning_rate': best_params['xgb_learning_rate'],
    'max_depth': best_params['xgb_max_depth'],
    'subsample': best_params['xgb_subsample'],
    'colsample_bytree': best_params['xgb_colsample_bytree'],
}
final_xgb = XGBClassifier(random_state=42, **xgb_kwargs)

catboost_kwargs = {
    'iterations': best_params['catboost_iterations'],
    'learning_rate': best_params['catboost_learning_rate'],
    'depth': best_params['catboost_depth'],
    'l2_leaf_reg': best_params['catboost_l2_leaf_reg'],
}
final_catboost = CatBoostClassifier(random_state=42, verbose=0, **catboost_kwargs)

In [None]:
# Configure the best meta model
# Meta learner: whichever option the best trial selected.
if best_params['meta_model'] == 'logistic':
    # With the default iteration cap the solver stopped at the limit without
    # converging on this stack (original features are passed through to the
    # meta learner), so allow more iterations.
    final_meta_model = LogisticRegression(max_iter=1000)
else:
    final_meta_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Assemble the final stacking ensemble: base-learner probabilities plus the
# original features (passthrough=True) feed the meta learner.
final_stacked_model = StackingClassifier(
    estimators=[
        ('lgbm', final_lgbm),
        ('xgb', final_xgb),
        ('catboost', final_catboost)
    ],
    final_estimator=final_meta_model,
    stack_method='predict_proba',
    passthrough=True,
    n_jobs=-1
)

# Fit on the training split only, so the validation split stays unseen
# for the evaluation that follows.
final_stacked_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate on the validation data
# Score the fitted stack on the validation split: take the second column of
# predict_proba (positive class) and report its ROC-AUC to four decimals.
val_proba = final_stacked_model.predict_proba(X_val)
y_val_pred = val_proba[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f'‚úÖ Validation ROC-AUC (Stacking + Optuna): {roc_auc:.4f}')

‚úÖ Validation ROC-AUC (Stacking + Optuna): 0.7353


In [None]:
# Predict on the test data
# Positive-class probability (second predict_proba column) for every test row.
test_proba = final_stacked_model.predict_proba(df_test)
test_preds = test_proba[:, 1]

In [None]:
# 7. Create the submission file: one row per test ID with its predicted
# probability, written to CSV without the index column.
submission = pd.DataFrame({'ID': df_test_ids})
submission['probability'] = test_preds
submission.to_csv('submission_7.csv', index=False)
print("‚úÖ Submission file saved: submission_7.csv")

‚úÖ Submission file saved: submission_7.csv
