# Ensemble CatBoost + XGBoost (Optuna 10-Fold)

This notebook reuses the full feature-engineering pipeline from `catboost_finalsolution.ipynb`, performs fresh 10-fold Optuna tuning for both CatBoost and XGBoost, and then blends the two models to produce `ensemble_submission.csv`.


In [None]:
# Install required packages into the kernel's environment.
# NOTE(review): no versions are pinned, so each run resolves to whatever pip
# currently considers the latest release — consider pinning for reproducibility.
%pip install -q catboost xgboost optuna


## 1. Rebuild feature set via `catboost_finalsolution.ipynb`

Running the original notebook ensures every engineered feature (150+) is available here. The notebook will also train a baseline CatBoost model; we only reuse the engineered `train` and `test` frames together with the `features`, `categorical_features`, and `TARGET` definitions for the fresh tuning below.


In [None]:

# Execute the upstream feature-engineering notebook from this session.
# It is expected to leave `train`, `test`, `features`, `TARGET` and
# `categorical_features` in the global namespace; a later cell verifies this.
%run ./catboost_finalsolution.ipynb


In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import xgboost as xgb

# Seed NumPy's global RNG. The CV splitter, CatBoost and the Optuna samplers
# used later receive their own explicit seeds; the XGBoost parameter dict
# built in this notebook does not set a seed.
np.random.seed(42)



In [None]:
# Names that the upstream notebook run must have placed in the global namespace.
required_vars = ['train', 'test', 'features', 'TARGET', 'categorical_features']

for var in required_vars:
    if var in globals():
        continue
    # Stop at the first absent name so the failure points at the upstream run
    # rather than at a NameError somewhere further down.
    raise ValueError(f"Missing '{var}' from catboost_finalsolution.ipynb run. Please ensure the notebook executed successfully.")

# Quick summary of what was inherited.
print(f"Feature count: {len(features)} | Train shape: {train.shape} | Test shape: {test.shape}")


In [None]:
# Name of the label column in `train`.
# NOTE(review): the validation cell requires `TARGET` to come from the upstream
# notebook, yet it is reassigned here — confirm both values agree.
TARGET = 'Default 12 Flag'

# Model inputs: the engineered feature columns for train/test, and the label.
X, y = train[features], train[TARGET]
X_test = test[features]

# One shuffled, seeded 10-fold stratified splitter shared by tuning and training,
# so every stage sees the same fold assignment.
skf_10 = StratifiedKFold(shuffle=True, n_splits=10, random_state=42)



## 2. CatBoost: 10-Fold Optuna Tuning


In [None]:
def build_cat_params(trial):
    """Assemble CatBoostClassifier keyword arguments for one Optuna trial.

    Six hyperparameters are drawn from `trial` (iterations, learning_rate,
    depth, l2_leaf_reg, min_data_in_leaf, subsample); the remaining entries
    are fixed. GPU training is requested only when a truthy global `USE_GPU`
    exists; otherwise the task runs on CPU and `devices` is None.

    Args:
        trial: object exposing Optuna's `suggest_int` / `suggest_float`.

    Returns:
        dict of constructor arguments, including `cat_features` taken from
        the global `categorical_features`.
    """
    gpu_enabled = bool('USE_GPU' in globals() and USE_GPU)

    # Searched values — the suggest_* call order is kept stable because the
    # sampler's draw sequence depends on it.
    searched = {
        'iterations': trial.suggest_int('iterations', 1200, 2200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2.0, 8.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
    }

    # Settings shared by every trial.
    fixed = {
        'bootstrap_type': 'Bernoulli',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_state': 42,
        'early_stopping_rounds': 200,
        'task_type': 'GPU' if gpu_enabled else 'CPU',
        'devices': '0' if gpu_enabled else None,
        'verbose': False,
        'cat_features': categorical_features,
    }

    return {**searched, **fixed}


def cat_objective(trial):
    """Optuna objective: mean validation AUC of CatBoost across the `skf_10` folds.

    Each fold's validation split is used both as the early-stopping eval set
    and as the scoring set. The trial is pruned as soon as any single fold
    scores an AUC below 0.60.

    Args:
        trial: Optuna trial used to sample the CatBoost hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.

    Raises:
        optuna.TrialPruned: when a fold AUC falls below 0.60.
    """
    params = build_cat_params(trial)
    aucs = []

    for trn_idx, val_idx in skf_10.split(X, y):
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        clf = CatBoostClassifier(**params)
        clf.fit(
            X.iloc[trn_idx], y.iloc[trn_idx],
            eval_set=(X_val, y_val),
            use_best_model=True
        )

        auc = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
        aucs.append(auc)

        # Abandon clearly weak configurations without finishing all folds.
        if auc < 0.60:
            raise optuna.TrialPruned()

    return np.mean(aucs)

# Seeded TPE search over the CatBoost space (8 trials, maximising mean fold AUC).
cat_sampler = optuna.samplers.TPESampler(seed=42)
cat_study = optuna.create_study(direction='maximize', sampler=cat_sampler)
cat_study.optimize(cat_objective, n_trials=8, show_progress_bar=True)

# Replay the winning values through the same builder so the fixed settings
# (loss, seed, device, cat_features, ...) are attached again.
best_cat_params = build_cat_params(optuna.trial.FixedTrial(cat_study.best_params))
# NOTE(review): the fold-training loop passes verbose=False to fit(), which
# presumably takes precedence over this value — confirm which one is intended.
best_cat_params['verbose'] = 200
print(f"Best CatBoost AUC: {cat_study.best_value:.6f}")


In [None]:
# Refit CatBoost with the tuned parameters on every fold, keeping each fitted
# model and its out-of-fold (OOF) predictions.
cat_models = []
cat_oof = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(skf_10.split(X, y), start=1):
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**best_cat_params)
    # The validation fold doubles as the early-stopping eval set.
    model.fit(
        X.iloc[trn_idx], y.iloc[trn_idx],
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=False
    )

    # Positive-class probability for the held-out rows.
    preds = model.predict_proba(X_val)[:, 1]
    cat_oof[val_idx] = preds
    cat_models.append(model)

    fold_auc = roc_auc_score(y_val, preds)
    print(f"Fold {fold:02d} CatBoost AUC: {fold_auc:.6f}")

# AUC over all OOF predictions pooled together (not the mean of fold AUCs).
print(f"\nCatBoost 10-fold CV AUC: {roc_auc_score(y, cat_oof):.6f}")


In [None]:
# Score the test set with every fold model, then average the positive-class
# probabilities row-wise across models.
cat_fold_test_preds = [model.predict_proba(X_test)[:, 1] for model in cat_models]
cat_test_preds = np.mean(cat_fold_test_preds, axis=0)


## 3. XGBoost: 10-Fold Optuna Tuning


In [None]:
def prepare_xgb_matrix(df: pd.DataFrame, reference: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Return a numeric copy of `df` suitable for building an XGBoost DMatrix.

    Every pandas-categorical column is replaced by its integer category code
    (int32); afterwards all remaining missing values are filled with -999.
    Missing categorical entries keep the code -1 that pandas assigns them.

    Args:
        df: frame to convert; it is not modified.
        reference: optional frame whose categorical columns define the
            category -> code mapping. When given, a column that is categorical
            in both frames is re-expressed in the reference's categories
            before encoding, so a level receives the same code in both
            frames. Levels absent from the reference are encoded as -1.
            When omitted, each column is encoded with its own categories.

    Returns:
        A new DataFrame with categorical columns integer-encoded and NaNs
        replaced by -999.
    """
    df_xgb = df.copy()
    for col in df_xgb.columns:
        if not isinstance(df_xgb[col].dtype, pd.CategoricalDtype):
            continue
        column = df_xgb[col]
        if reference is not None and col in reference.columns:
            ref_dtype = reference[col].dtype
            if isinstance(ref_dtype, pd.CategoricalDtype):
                # Codes are positions in the category list, so both frames
                # must share one list for the codes to be comparable.
                column = column.cat.set_categories(ref_dtype.categories)
        df_xgb[col] = column.cat.codes.astype('int32')
    return df_xgb.fillna(-999)

X_xgb = prepare_xgb_matrix(X)
# Encode test categoricals with the training categories; encoding each frame
# on its own can give the same level different codes in train and test.
X_test_xgb = prepare_xgb_matrix(X_test, reference=X)



In [None]:
use_gpu_hist = 'USE_GPU' in globals() and USE_GPU


def build_xgb_params(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 0.01, 0.05, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1.0, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95),
        'lambda': trial.suggest_float('lambda', 1e-3, 10, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10, log=True),
        'max_bin': trial.suggest_int('max_bin', 128, 512),
    }
    if use_gpu_hist:
        params.update({'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'})
    else:
        params.update({'tree_method': 'hist', 'predictor': 'auto'})
    return params


def xgb_objective(trial):
    """Optuna objective: mean validation AUC of XGBoost across the `skf_10` folds.

    Booster parameters come from `build_xgb_params`; the boosting-round budget
    is sampled separately. Each fold's validation split is used for early
    stopping and for scoring, with predictions limited to the best iteration.
    The trial is pruned as soon as any single fold scores an AUC below 0.60.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.

    Raises:
        optuna.TrialPruned: when a fold AUC falls below 0.60.
    """
    params = build_xgb_params(trial)
    num_boost_round = trial.suggest_int('num_boost_round', 600, 1600)
    aucs = []

    for trn_idx, val_idx in skf_10.split(X_xgb, y):
        y_val = y.iloc[val_idx]
        dtrain = xgb.DMatrix(X_xgb.iloc[trn_idx], label=y.iloc[trn_idx])
        dvalid = xgb.DMatrix(X_xgb.iloc[val_idx], label=y_val)

        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        # Predict using only the trees up to the best early-stopping round.
        best_rounds = booster.best_iteration + 1
        auc = roc_auc_score(y_val, booster.predict(dvalid, iteration_range=(0, best_rounds)))
        aucs.append(auc)

        # Abandon clearly weak configurations without finishing all folds.
        if auc < 0.60:
            raise optuna.TrialPruned()

    return np.mean(aucs)

# Seeded TPE search over the XGBoost space (10 trials, maximising mean fold AUC).
xgb_sampler = optuna.samplers.TPESampler(seed=2024)
xgb_study = optuna.create_study(sampler=xgb_sampler, direction='maximize')
xgb_study.optimize(xgb_objective, show_progress_bar=True, n_trials=10)

# The round budget is tuned outside the booster parameter dict, so it is read
# directly; the remaining values are replayed through the builder.
best_xgb_boost = xgb_study.best_params['num_boost_round']
best_xgb_params = build_xgb_params(optuna.trial.FixedTrial(xgb_study.best_params))
print(f"Best XGBoost AUC: {xgb_study.best_value:.6f}")


In [None]:
# Refit XGBoost with the tuned parameters on every fold, keeping each booster
# and its out-of-fold (OOF) predictions.
xgb_models = []
xgb_oof = np.zeros(len(X_xgb))

for fold, (trn_idx, val_idx) in enumerate(skf_10.split(X_xgb, y), start=1):
    dtrain = xgb.DMatrix(X_xgb.iloc[trn_idx], label=y.iloc[trn_idx])
    dvalid = xgb.DMatrix(X_xgb.iloc[val_idx], label=y.iloc[val_idx])
    # The validation fold doubles as the early-stopping eval set.
    booster = xgb.train(
        best_xgb_params,
        dtrain,
        num_boost_round=best_xgb_boost,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=200,
        verbose_eval=False
    )
    # Predict using only the trees up to the best early-stopping round.
    preds = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
    xgb_oof[val_idx] = preds
    xgb_models.append(booster)
    print(f"Fold {fold:02d} XGB AUC: {roc_auc_score(y.iloc[val_idx], preds):.6f}")

# AUC over all OOF predictions pooled together (not the mean of fold AUCs).
print(f"\nXGBoost 10-fold CV AUC: {roc_auc_score(y, xgb_oof):.6f}")

# The test matrix is the same for every fold model, so build it once instead
# of once per booster.
dtest = xgb.DMatrix(X_test_xgb)
xgb_test_preds = np.mean([
    booster.predict(dtest, iteration_range=(0, booster.best_iteration + 1))
    for booster in xgb_models
], axis=0)



## 4. Blend + Export


In [None]:
# Fixed convex blend: CatBoost gets 0.55, XGBoost the remainder.
cat_weight = 0.55
xgb_weight = 1.0 - cat_weight


def _blend(cat_preds, xgb_preds):
    """Weighted sum of the two models' predictions using the weights above."""
    return cat_weight * cat_preds + xgb_weight * xgb_preds


# Evaluate the blend on the out-of-fold predictions.
ensemble_oof = _blend(cat_oof, xgb_oof)
ensemble_score = roc_auc_score(y, ensemble_oof)
print(f"Blend weights -> Cat: {cat_weight:.2f}, XGB: {xgb_weight:.2f}")
print(f"Ensemble OOF AUC: {ensemble_score:.6f}")

# Apply the same weights to the fold-averaged test predictions.
ensemble_test = _blend(cat_test_preds, xgb_test_preds)

# Two-column submission: row identifier and blended prediction.
submission_columns = {
    'ID': test['ID'],
    'Default 12 Flag': ensemble_test,
}
ensemble_submission = pd.DataFrame(submission_columns)

output_path = 'ensemble_submission.csv'
ensemble_submission.to_csv(output_path, index=False)
print(f"Saved {output_path} with shape {ensemble_submission.shape}")
