In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
# Load the training and test CSVs; both paths are relative to the notebook's working directory.
data_os, test_os = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train_os.csv', 'data_test_os.csv')
)

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce memory usage of a DataFrame by downcasting numeric columns
    and converting object columns to category where appropriate
    Datetime columns are preseved(skip)

    Paremeters:
    - df (pd.DataFrame): The input DataFrame to optimize
    - verbose (bool): If True, prints memory usage info
                      If False, just execute the operation

    Returns:
    - df (pd.DataFrame): Optimized DataFrame
    """
    # Calculate memory usage before optimization
    # 'deep=True' ensures object-type columns (e.g. strings) are fully measured
    # Divide by 1024^2 to convert bytes to megabytes
    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    # Iterate through each columns in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Skip datetime columns to avoid corrupting temporal data
        if pd.api.types.is_datetime64_any_dtype(col_type):
            continue

        # If the column is numeric(integer or float)
        elif pd.api.types.is_numeric_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()

            # If the column is an integer type
            if pd.api.types.is_integer_dtype(col_type):
                # Try downcasting to the smallest possible integer type
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)

            # If the column is an float type
            else:
                # Try downcasting to float16, float32, or keep as float64
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                #elif c_min >= np.finfo(np.float64).min and c_max <= np.finfo(np.float64).max:
                else:
                    df[col] = df[col].astype(np.float64)
        # If the column is an object type
        elif pd.api.types.is_object_dtype(col_type):
            num_unique = df[col].nunique()
            num_total = len(df[col])
            # if the unique ratio is below 50%, convert to category for memory
            if num_unique / num_total < 0.5:
                df[col] = df[col].astype('category')

        # Other types are not modified

    # Calculate memory usage after optimization
    end_mem = df.memory_usage(deep=True).sum() / 1024**2

    if verbose:
        # Print summary of memory usage before and after optimization
        print(f"Memory usage before optimization: {start_mem:.2f} MB")
        print(f"Memory usage after optimization: {end_mem:.2f} MB")
        print(f"Reduced by: {100 * (start_mem - end_mem) / start_mem:.1f}%")

    return df

In [4]:
# Downcast both frames; each call prints its own before/after memory summary.
data_os, test_os = reduce_mem_usage(data_os), reduce_mem_usage(test_os)

Memory usage before optimization: 223.87 MB
Memory usage after optimization: 53.53 MB
Reduced by: 76.1%
Memory usage before optimization: 68.66 MB
Memory usage after optimization: 16.59 MB
Reduced by: 75.8%


In [5]:
# Separate the 'isDefault' target column from the remaining feature columns.
X_os = data_os.drop(columns=['isDefault'])
y_os = data_os['isDefault']
print(y_os.shape, X_os.shape, test_os.shape)

(637893,) (637893, 45) (200000, 45)


In [11]:
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# --- Configuration ---
# Shared settings for the staged Optuna search and the final model fit below.
N_SPLITS_CV = 5       # Number of folds for cross-validation
N_TRIALS_PER_STAGE = 50 # Number of Optuna trials per stage (adjust as needed)
# NOTE: every Optuna job fits XGBoost models that each use XGB_N_JOBS threads, so up to
# OPTUNA_N_JOBS * XGB_N_JOBS threads can be busy at once; size both values to the machine.
OPTUNA_N_JOBS = 5    # Number of parallel jobs for Optuna (-1 uses all cores, adjust if needed)
XGB_N_JOBS = 5       # Number of parallel threads for XGBoost model training
RANDOM_SEED = 42     # Seed used for the StratifiedKFold splitters, XGBoost random_state and the final hold-out split


### Stage 1: Tune Tree Structure Parameters (max_depth, min_child_weight, gamma)

In [12]:
print("\n--- Stage 1: Tuning max_depth, min_child_weight, gamma ---")

def objective_stage1(trial):
    """
    Optuna objective for Stage 1: tree-structure parameters.

    Draws max_depth, min_child_weight and gamma from the trial, keeps every
    other XGBoost setting fixed, and scores the configuration with stratified
    K-fold cross-validation over the notebook-level X_os / y_os.

    Parameters:
    - trial: Optuna trial object used to draw the hyperparameters

    Returns:
    - Mean ROC AUC over the validation folds
    """
    # Search space for this stage (suggest calls are made in this order).
    searched = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5, log=False),
    }

    # Everything else is held constant while the tree structure is explored.
    held_constant = {
        'objective': 'binary:logistic',   # binary classification
        'eval_metric': 'auc',             # metric monitored by early stopping
        'booster': 'gbtree',
        'random_state': RANDOM_SEED,
        'n_jobs': XGB_N_JOBS,
        'learning_rate': 0.1,             # fairly high, for faster exploration
        'n_estimators': 1000,             # upper bound; early stopping picks the round
        'subsample': 0.8,                 # tuned in a later stage
        'colsample_bytree': 0.8,          # tuned in a later stage
        'reg_alpha': 0.0,                 # L1 regularization, tuned in a later stage
        'reg_lambda': 1.0,                # L2 regularization, tuned in a later stage
        'early_stopping_rounds': 100,
    }
    xgb_params = {**held_constant, **searched}

    def fold_auc(fit_rows, holdout_rows):
        # Fit on the training part of the fold; the held-out part drives early
        # stopping and is also the data the fold's AUC is computed on.
        X_holdout, y_holdout = X_os.iloc[holdout_rows], y_os.iloc[holdout_rows]
        clf = xgb.XGBClassifier(**xgb_params)
        clf.fit(X_os.iloc[fit_rows], y_os.iloc[fit_rows],
                eval_set=[(X_holdout, y_holdout)],
                verbose=False)
        positive_proba = clf.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_proba)

    splitter = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_SEED)
    # No intermediate values are reported to the trial, so no pruning happens here.
    scores = [fold_auc(fit_rows, holdout_rows)
              for fit_rows, holdout_rows in splitter.split(X_os, y_os)]
    return np.mean(scores)

# Seed the sampler so the search can be repeated. TPE is already Optuna's default
# sampler, so only the seeding changes.
# NOTE: with n_jobs > 1 trials complete in a timing-dependent order, so repeated runs
# can still differ; use n_jobs=1 when exact reproducibility matters.
# NOTE(review): the MedianPruner has no effect as long as objective_stage1 never calls
# trial.report().
study_stage1 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study_stage1.optimize(objective_stage1, n_trials=N_TRIALS_PER_STAGE, n_jobs=OPTUNA_N_JOBS, show_progress_bar=True)

# Keep the winning tree-structure parameters for the following stages.
best_params_stage1 = study_stage1.best_params
print("✅ Best parameters from Stage 1:")
print(best_params_stage1)
print(f"✅ Best AUC from Stage 1: {study_stage1.best_value:.4f}")

[I 2025-05-15 20:54:12,519] A new study created in memory with name: no-name-2954307d-dd6b-49fd-a2df-3f003d522818



--- Stage 1: Tuning max_depth, min_child_weight, gamma ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-15 21:01:56,471] Trial 2 finished with value: 0.7284397905713368 and parameters: {'max_depth': 10, 'min_child_weight': 6.6637722516123965, 'gamma': 3.7713014312115902}. Best is trial 2 with value: 0.7284397905713368.
[I 2025-05-15 21:02:33,717] Trial 0 finished with value: 0.7272144607443277 and parameters: {'max_depth': 10, 'min_child_weight': 1.204221982985744, 'gamma': 0.16761875177925323}. Best is trial 2 with value: 0.7284397905713368.
[I 2025-05-15 21:03:31,349] Trial 3 finished with value: 0.7312161125589098 and parameters: {'max_depth': 7, 'min_child_weight': 3.081866452662735, 'gamma': 4.461573741846035}. Best is trial 3 with value: 0.7312161125589098.
[I 2025-05-15 21:03:36,173] Trial 4 finished with value: 0.7311251989682902 and parameters: {'max_depth': 7, 'min_child_weight': 2.917231132209023, 'gamma': 3.463421854229179}. Best is trial 3 with value: 0.7312161125589098.
[I 2025-05-15 21:07:53,080] Trial 8 finished with value: 0.7276398337808245 and parameters: {'

### Stage 2: Tune Regularization Parameters (reg_alpha, reg_lambda)

In [13]:
print("\n--- Stage 2: Tuning reg_alpha, reg_lambda ---")

def objective_stage2(trial):
    """
    Optuna objective for Stage 2: regularization parameters.

    Reuses the Stage 1 winners from best_params_stage1, draws reg_alpha and
    reg_lambda from the trial, and scores the configuration with stratified
    K-fold cross-validation over the notebook-level X_os / y_os.

    Parameters:
    - trial: Optuna trial object used to draw the hyperparameters

    Returns:
    - Mean ROC AUC over the validation folds
    """
    # Tree-structure parameters carried over unchanged from Stage 1.
    carried_over = {
        key: best_params_stage1[key]
        for key in ('max_depth', 'min_child_weight', 'gamma')
    }

    # Search space for this stage: both penalties are log-uniform on [1e-3, 1.0].
    searched = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0, log=True),
    }

    held_constant = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'random_state': RANDOM_SEED,
        'n_jobs': XGB_N_JOBS,
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'subsample': 0.8,          # tuned in a later stage
        'colsample_bytree': 0.8,   # tuned in a later stage
        'early_stopping_rounds': 100,
    }
    xgb_params = {**held_constant, **carried_over, **searched}

    splitter = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_SEED)
    total_auc, n_folds = 0.0, 0
    scores = []
    for fit_rows, holdout_rows in splitter.split(X_os, y_os):
        X_fit, y_fit = X_os.iloc[fit_rows], y_os.iloc[fit_rows]
        X_holdout, y_holdout = X_os.iloc[holdout_rows], y_os.iloc[holdout_rows]

        clf = xgb.XGBClassifier(**xgb_params)
        # The held-out part of the fold is used for early stopping and for scoring.
        clf.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        scores.append(roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1]))
    return np.mean(scores)

# Seed the sampler so the search can be repeated. TPE is already Optuna's default
# sampler, so only the seeding changes.
# NOTE: with n_jobs > 1 trials complete in a timing-dependent order, so repeated runs
# can still differ; use n_jobs=1 when exact reproducibility matters.
# NOTE(review): the MedianPruner has no effect as long as objective_stage2 never calls
# trial.report().
study_stage2 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study_stage2.optimize(objective_stage2, n_trials=N_TRIALS_PER_STAGE, n_jobs=OPTUNA_N_JOBS, show_progress_bar=True)

best_params_stage2 = study_stage2.best_params
# Combine with previous stage best params (copy so best_params_stage1 stays untouched)
current_best_params = best_params_stage1.copy()
current_best_params.update(best_params_stage2)

print("✅ Best parameters from Stage 2 (alpha, lambda):")
print(best_params_stage2)
print(f"✅ Best AUC from Stage 2: {study_stage2.best_value:.4f}")

[I 2025-05-15 21:57:56,553] A new study created in memory with name: no-name-3ae8d25a-b68a-4a76-b5de-c02ef662f30b



--- Stage 2: Tuning reg_alpha, reg_lambda ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-15 22:04:34,885] Trial 1 finished with value: 0.7325666642569799 and parameters: {'reg_alpha': 0.013047368428776506, 'reg_lambda': 0.010636557353793534}. Best is trial 1 with value: 0.7325666642569799.
[I 2025-05-15 22:04:49,570] Trial 0 finished with value: 0.7328326558681195 and parameters: {'reg_alpha': 0.685782950220429, 'reg_lambda': 0.08441600456996912}. Best is trial 0 with value: 0.7328326558681195.
[I 2025-05-15 22:04:55,786] Trial 2 finished with value: 0.7326926818529526 and parameters: {'reg_alpha': 0.0035806907907943207, 'reg_lambda': 0.380825192327806}. Best is trial 0 with value: 0.7328326558681195.
[I 2025-05-15 22:05:00,047] Trial 3 finished with value: 0.7327399366209594 and parameters: {'reg_alpha': 0.03613145662663032, 'reg_lambda': 0.8391759968881708}. Best is trial 0 with value: 0.7328326558681195.
[I 2025-05-15 22:05:15,225] Trial 4 finished with value: 0.7327326687005788 and parameters: {'reg_alpha': 0.24674116098022147, 'reg_lambda': 0.01430393661855

### Stage 3: Tune Subsampling Parameters (subsample, colsample_bytree)

In [15]:
print("\n--- Stage 3: Tuning subsample, colsample_bytree ---")

def objective_stage3(trial):
    """
    Optuna objective for Stage 3: row and column subsampling.

    Reuses the Stage 1 and Stage 2 winners from current_best_params, draws
    subsample and colsample_bytree from the trial, and scores the
    configuration with stratified K-fold cross-validation over the
    notebook-level X_os / y_os.

    Parameters:
    - trial: Optuna trial object used to draw the hyperparameters

    Returns:
    - Mean ROC AUC over the validation folds
    """
    already_tuned = ('max_depth', 'min_child_weight', 'gamma', 'reg_alpha', 'reg_lambda')

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'random_state': RANDOM_SEED,
        'n_jobs': XGB_N_JOBS,
        # Fixed for this stage
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'early_stopping_rounds': 100,
    }
    # Winners of the earlier stages, passed through unchanged.
    for key in already_tuned:
        xgb_params[key] = current_best_params[key]
    # Search space for this stage: both fractions are uniform on [0.5, 1.0].
    # (colsample_bylevel / colsample_bynode could be added here the same way.)
    xgb_params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    xgb_params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    def fold_auc(fit_rows, holdout_rows):
        # The held-out part of the fold is used for early stopping and for scoring.
        X_holdout, y_holdout = X_os.iloc[holdout_rows], y_os.iloc[holdout_rows]
        clf = xgb.XGBClassifier(**xgb_params)
        clf.fit(X_os.iloc[fit_rows], y_os.iloc[fit_rows],
                eval_set=[(X_holdout, y_holdout)],
                verbose=False)
        return roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1])

    splitter = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_SEED)
    scores = [fold_auc(fit_rows, holdout_rows)
              for fit_rows, holdout_rows in splitter.split(X_os, y_os)]
    return np.mean(scores)

# Seed the sampler so the search can be repeated. TPE is already Optuna's default
# sampler, so only the seeding changes.
# NOTE: with n_jobs > 1 trials complete in a timing-dependent order, so repeated runs
# can still differ; use n_jobs=1 when exact reproducibility matters.
# NOTE(review): the MedianPruner has no effect as long as objective_stage3 never calls
# trial.report().
study_stage3 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study_stage3.optimize(objective_stage3, n_trials=N_TRIALS_PER_STAGE, n_jobs=OPTUNA_N_JOBS, show_progress_bar=True)

best_params_stage3 = study_stage3.best_params
current_best_params.update(best_params_stage3) # Add new best params

print("✅ Best parameters from Stage 3 (subsample, colsample):")
print(best_params_stage3)
print(f"✅ Best AUC from Stage 3: {study_stage3.best_value:.4f}")

[I 2025-05-15 23:13:50,535] A new study created in memory with name: no-name-363f8f1c-8134-48d4-92a5-8665abef0ea3



--- Stage 3: Tuning subsample, colsample_bytree ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-15 23:19:10,580] Trial 0 finished with value: 0.7319845187363054 and parameters: {'subsample': 0.9964490976318493, 'colsample_bytree': 0.9725121221361984}. Best is trial 0 with value: 0.7319845187363054.
[I 2025-05-15 23:20:48,265] Trial 3 finished with value: 0.732285516213359 and parameters: {'subsample': 0.5040592132091599, 'colsample_bytree': 0.9549383616292366}. Best is trial 3 with value: 0.732285516213359.
[I 2025-05-15 23:20:48,701] Trial 2 finished with value: 0.7323496236785866 and parameters: {'subsample': 0.5519305123353417, 'colsample_bytree': 0.9173327233402682}. Best is trial 2 with value: 0.7323496236785866.
[I 2025-05-15 23:21:21,805] Trial 1 finished with value: 0.7327112051714566 and parameters: {'subsample': 0.6848168517523919, 'colsample_bytree': 0.66897114616099}. Best is trial 1 with value: 0.7327112051714566.
[I 2025-05-15 23:21:57,748] Trial 4 finished with value: 0.7327908501224126 and parameters: {'subsample': 0.7281928806929132, 'colsample_bytree'

### Stage 4: Tune Learning Rate (learning_rate) and n_estimators

In [18]:
print("\n--- Stage 4: Tuning learning_rate (and n_estimators via early stopping) ---")

def objective_stage4(trial):
    """
    Optuna objective for Stage 4: learning rate and boosting-round budget.

    Reuses the winners of the earlier stages from current_best_params, draws
    learning_rate and n_estimators from the trial, and scores the
    configuration with stratified K-fold cross-validation over the
    notebook-level X_os / y_os. Early stopping stays enabled, so the drawn
    n_estimators acts as an upper bound on the rounds trained per fold.

    Side effect: stores the mean number of rounds up to each fold's best
    iteration on the trial as user attribute "avg_actual_n_estimators".

    Parameters:
    - trial: Optuna trial object used to draw the hyperparameters

    Returns:
    - Mean ROC AUC over the validation folds
    """
    already_tuned = (
        'max_depth', 'min_child_weight', 'gamma',
        'reg_alpha', 'reg_lambda',
        'subsample', 'colsample_bytree',
    )

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'random_state': RANDOM_SEED,
        'n_jobs': XGB_N_JOBS,
    }
    # Winners of the earlier stages, passed through unchanged.
    xgb_params.update({key: current_best_params[key] for key in already_tuned})
    # Search space for this stage: log-uniform learning rate, then the round budget.
    # An alternative is a fixed high n_estimators with early stopping alone
    # choosing the number of rounds.
    xgb_params['learning_rate'] = trial.suggest_float('learning_rate', 0.005, 0.1, log=True)
    xgb_params['n_estimators'] = trial.suggest_int('n_estimators', 500, 3000, step=100)
    xgb_params['early_stopping_rounds'] = 100

    splitter = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    rounds_used = []

    for fit_rows, holdout_rows in splitter.split(X_os, y_os):
        X_fit, y_fit = X_os.iloc[fit_rows], y_os.iloc[fit_rows]
        X_holdout, y_holdout = X_os.iloc[holdout_rows], y_os.iloc[holdout_rows]

        clf = xgb.XGBClassifier(**xgb_params)
        # The held-out part of the fold is used for early stopping and for scoring.
        clf.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        scores.append(roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1]))

        # best_iteration is 0-indexed, hence the +1; fall back to the configured
        # budget when no best iteration is available.
        if clf.best_iteration is not None:
            rounds_used.append(clf.best_iteration + 1)
        else:
            rounds_used.append(xgb_params['n_estimators'])

    # Keep the average per-fold round count on the trial for later analysis.
    trial.set_user_attr("avg_actual_n_estimators", np.mean(rounds_used))

    return np.mean(scores)

# Seed the sampler so the search can be repeated. TPE is already Optuna's default
# sampler, so only the seeding changes.
# NOTE: with n_jobs > 1 trials complete in a timing-dependent order, so repeated runs
# can still differ; use n_jobs=1 when exact reproducibility matters.
# NOTE(review): the MedianPruner has no effect as long as objective_stage4 never calls
# trial.report().
study_stage4 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study_stage4.optimize(objective_stage4, n_trials=N_TRIALS_PER_STAGE, n_jobs=OPTUNA_N_JOBS, show_progress_bar=True)

# Merge the Stage 4 winners into a copy of the parameters collected so far.
best_params_stage4 = study_stage4.best_params
final_best_params = current_best_params.copy()
final_best_params.update(best_params_stage4)

# The sampled n_estimators is only an upper bound because early stopping was active.
# Replace it with the best trial's average number of rounds up to the best iteration,
# which objective_stage4 stored as a user attribute.
best_trial_stage4 = study_stage4.best_trial
if 'avg_actual_n_estimators' in best_trial_stage4.user_attrs:
    final_best_params['n_estimators'] = int(round(best_trial_stage4.user_attrs['avg_actual_n_estimators']))
    print(f"Setting n_estimators based on early stopping: {final_best_params['n_estimators']}")


print("\n--- Final Tuning Results ---")
print("✅ Final best parameters after all stages:")
print(final_best_params)
print(f"✅ Best AUC from Stage 4 (final): {study_stage4.best_value:.4f}")

print("\nTo train the final model, use all parameters in `final_best_params`:")
print("Example: final_model = xgb.XGBClassifier(**final_best_params, random_state=RANDOM_SEED, n_jobs=XGB_N_JOBS)")
# print("Then fit it on your full training data (X_os, y_os if it's the full training set, or X_train, y_train)")
# print("final_model.fit(X_os, y_os)")

[I 2025-05-16 11:54:12,526] A new study created in memory with name: no-name-dbc871f7-efee-4279-aaef-28a25bd73d8a



--- Stage 4: Tuning learning_rate (and n_estimators via early stopping) ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-16 11:59:31,310] Trial 2 finished with value: 0.719483229317824 and parameters: {'learning_rate': 0.00628184936845356, 'n_estimators': 700}. Best is trial 2 with value: 0.719483229317824.
[I 2025-05-16 12:01:28,881] Trial 3 finished with value: 0.7245319133916258 and parameters: {'learning_rate': 0.008575595966446624, 'n_estimators': 1000}. Best is trial 3 with value: 0.7245319133916258.
[I 2025-05-16 12:01:57,262] Trial 1 finished with value: 0.730185896398741 and parameters: {'learning_rate': 0.022306253682001575, 'n_estimators': 1100}. Best is trial 1 with value: 0.730185896398741.
[I 2025-05-16 12:04:27,426] Trial 0 finished with value: 0.732775092598562 and parameters: {'learning_rate': 0.034188699628264, 'n_estimators': 1500}. Best is trial 0 with value: 0.732775092598562.
[I 2025-05-16 12:05:58,940] Trial 4 finished with value: 0.7288081148870024 and parameters: {'learning_rate': 0.01054133957517877, 'n_estimators': 1700}. Best is trial 0 with value: 0.732775092598562

In [26]:
# Build an (unfitted) classifier from the tuned hyperparameters.
# NOTE(review): `final_model` is assigned again in the final-training cell further
# down before any fit appears in this notebook — confirm this cell is still needed.
final_model = xgb.XGBClassifier(
    random_state=RANDOM_SEED,
    n_jobs=XGB_N_JOBS,
    eval_metric='auc',
    **final_best_params,
)

In [27]:
# Hold out a stratified 20% validation split of the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_os, y_os,
    test_size=0.2,
    random_state=RANDOM_SEED,  # shared seed constant (same value as the previous hard-coded 42)
    stratify=y_os  # keep the class distribution identical in both splits
)

In [31]:
import joblib # For saving/loading model (optional method)
import pickle # For saving/loading model (optional method)

# Final fit: train one XGBoost classifier with the tuned hyperparameters, using a
# stratified 10% hold-out of X_os / y_os for early stopping, then persist the model.
print("Preparing to train the final XGBoost model...")

# 1. Determine the early_stopping_rounds value
chosen_early_stopping_rounds = 100 # Rounds without eval-metric improvement before training stops. Adjust as needed.

# 2. Prepare the final parameters dictionary
params_for_final_model = final_best_params.copy() # Copy so the tuning result dictionary is left untouched
params_for_final_model['early_stopping_rounds'] = chosen_early_stopping_rounds
params_for_final_model['random_state'] = RANDOM_SEED
params_for_final_model['n_jobs'] = XGB_N_JOBS      # For XGBoost itself

# 'objective' and 'eval_metric' were fixed (not searched) during tuning, so they are
# missing from the tuned dictionary; fill them in with the values used in the tuning
# stages unless a caller already set them.
params_for_final_model.setdefault('objective', 'binary:logistic')
params_for_final_model.setdefault('eval_metric', 'auc')

print(f"Final parameters for XGBoost model: {params_for_final_model}")

# 3. Initialize the final model
final_model = xgb.XGBClassifier(**params_for_final_model)

print(f"\nTraining final model on X_os and y_os...")
# XGBoost does not carve out a validation set by itself: early stopping needs an
# explicit eval_set passed to fit(). A hold-out is therefore split off below, which
# means the model is fitted on 90% of X_os / y_os rather than on all of it.

# 4. Split X_os, y_os into a training part and an evaluation part for early stopping
# stratify=y_os keeps the class proportions identical in both parts
X_train_final_fit, X_eval_final_fit, y_train_final_fit, y_eval_final_fit = train_test_split(
    X_os, y_os, test_size=0.1, random_state=RANDOM_SEED, stratify=y_os
)

print(f"\nTraining final model...")
print(f"Using {len(X_train_final_fit)} samples for training and {len(X_eval_final_fit)} for early stopping evaluation.")

# 5. Fit the model, providing the explicit eval_set
final_model.fit(X_train_final_fit, y_train_final_fit,
                eval_set=[(X_eval_final_fit, y_eval_final_fit)], # Pass the evaluation set
                verbose=200) # Or verbose=100, or False

print("\nFinal model training complete.")

# 6. Report how long training ran and whether early stopping cut it short.
# get_num_boosting_rounds() only echoes the configured n_estimators, so the number of
# rounds actually trained is read from the underlying booster instead.
configured_boosting_rounds = final_model.get_num_boosting_rounds()
actual_boosting_rounds = final_model.get_booster().num_boosted_rounds()
print(f"Model was trained for {actual_boosting_rounds} rounds.")

# best_iteration may be absent when early stopping was not active, hence getattr.
best_iteration = getattr(final_model, 'best_iteration', None)
if best_iteration is not None:
    # Note: best_iteration is 0-indexed, so the best model uses best_iteration + 1 trees.
    if actual_boosting_rounds < configured_boosting_rounds:
        # Fewer rounds than the budget means the early-stopping patience ran out.
        print(f"Early stopping was triggered. Best iteration was {best_iteration + 1}.")
    else:
        # The full n_estimators budget was used before the patience ran out.
        print(f"Training stopped at {actual_boosting_rounds} rounds (n_estimators limit or other condition). Best iteration reported by XGBoost (0-indexed): {best_iteration}.")
else:
    print(f"Early stopping might not have been triggered, or n_estimators limit was reached.")

# --- Save the model ---
# Three formats are written. Each save is best-effort: a failure is reported and the
# remaining formats are still attempted.
model_filename_ubj = "final_xgboost_model.ubj"
model_filename_joblib = "final_xgboost_model.joblib"
model_filename_pkl = "final_xgboost_model.pkl"


# Save using XGBoost native method
try:
    final_model.save_model(model_filename_ubj)
    print(f"\nModel saved successfully to {model_filename_ubj} (XGBoost native UBJ format)")
except Exception as e:
    print(f"\nError saving model with XGBoost native save_model: {e}")

# Save using joblib
try:
    joblib.dump(final_model, model_filename_joblib)
    print(f"Model saved successfully to {model_filename_joblib} (joblib format)")
except Exception as e:
    print(f"\nError saving model with joblib: {e}")

# Save using pickle
# NOTE: joblib/pickle files execute code when loaded; only load them from trusted sources.
try:
    with open(model_filename_pkl, "wb") as f:
        pickle.dump(final_model, f)
    print(f"Model saved successfully to {model_filename_pkl} (pickle format)")
except Exception as e:
    print(f"\nError saving model with pickle: {e}")


Preparing to train the final XGBoost model...
Final parameters for XGBoost model: {'max_depth': 4, 'min_child_weight': 1.123473602060215, 'gamma': 1.3432017845835642, 'reg_alpha': 0.9892855839750394, 'reg_lambda': 0.2097958169371025, 'subsample': 0.8980386684264224, 'colsample_bytree': 0.9088349516071941, 'learning_rate': 0.03811999537709678, 'n_estimators': 2765, 'early_stopping_rounds': 100, 'random_state': 42, 'n_jobs': 5, 'objective': 'binary:logistic', 'eval_metric': 'auc'}

Training final model on X_os and y_os...

Training final model...
Using 574103 samples for training and 63790 for early stopping evaluation.
[0]	validation_0-auc:0.68966
[200]	validation_0-auc:0.72287
[400]	validation_0-auc:0.72691
[600]	validation_0-auc:0.72886
[800]	validation_0-auc:0.73000
[1000]	validation_0-auc:0.73088
[1200]	validation_0-auc:0.73149
[1400]	validation_0-auc:0.73208
[1600]	validation_0-auc:0.73242
[1800]	validation_0-auc:0.73274
[2000]	validation_0-auc:0.73292
[2200]	validation_0-auc:0.733