# AttriPredict - Brute-Force Model Search

This notebook performs an exhaustive brute-force search over models, hyperparameters, feature engineering, and random seeds.

> Baseline: `example.ipynb` scored 0.8797 (best model found during EDA)

---

In [None]:
# ============================================================================
# Environment Setup
# ============================================================================
import warnings

warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime


from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import RFECV, SelectKBest, f_classif
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# imbalanced-learn is optional: when it cannot be imported, every sampler /
# ensemble name below is bound to None and HAS_IMB is False, so later cells
# can test for availability instead of failing on a missing name.
try:
    from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
    from imblearn.combine import SMOTETomek, SMOTEENN
    # EasyEnsembleClassifier is imported too so the name is bound on both
    # paths (the fallback below already sets it to None).
    from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
    HAS_IMB = True
except Exception:
    HAS_IMB = False
    SMOTE = ADASYN = BorderlineSMOTE = SMOTETomek = SMOTEENN = None
    BalancedRandomForestClassifier = EasyEnsembleClassifier = None
    print("[WARN] imblearn not available, skipping advanced samplers")


try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False
    lgb = None
    print("[WARN] lightgbm not available")


try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    HAS_XGB = False
    xgb = None
    print("[WARN] xgboost not available")


try:
    import catboost as cb
    HAS_CAT = True
except Exception:
    HAS_CAT = False
    cb = None
    print("[WARN] catboost not available")


# Seeds: RANDOM_STATE is the default for splitters/samplers; RANDOM_SEEDS is
# the list of seeds the search loop iterates over.
RANDOM_STATE = 42
RANDOM_SEEDS = [42, 1337, 2024]
np.random.seed(RANDOM_STATE)


# log_experiment() appends one dict per experiment to this list
EXPERIMENT_LOG = []


_BANNER_RULE = "=" * 80
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(_BANNER_RULE)
print("AttriPredict Production Ultimate Initialized")
print(f"Timestamp: {_started_at}")
print(f"Libraries: IMB={HAS_IMB}, LGB={HAS_LGB}, XGB={HAS_XGB}, CAT={HAS_CAT}")
print(_BANNER_RULE)


AttriPredict Production Ultimate Initialized
Timestamp: 2025-11-06 11:13:42
Libraries: IMB=True, LGB=True, XGB=True, CAT=True


In [None]:

# ============================================================================
# Load Data
# ============================================================================
print("\n[1/8] Loading Data...")

train_data = pd.read_csv('../../data/train.csv')
test_data = pd.read_csv('../../data/test.csv')

print(f"✓ Train shape: {train_data.shape}")
print(f"✓ Test shape: {test_data.shape}")

# Columns removed from both splits before modelling
drop_cols = ['Over18', 'StandardHours', 'EmployeeNumber']
train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

# Separate the 'Attrition' target from the remaining feature columns
y_train_raw = train_data['Attrition']
X_train_raw = train_data.drop(columns='Attrition')
y_test_raw = test_data['Attrition']
X_test_raw = test_data.drop(columns='Attrition')



[1/8] Loading Data...
✓ Train shape: (1100, 31)
✓ Test shape: (350, 31)


In [None]:
# ============================================================================
# Feature Engineering - Level 1: Basic Preprocessing
# ============================================================================
print("\n[2/8] Feature Engineering - Level 1: Basic Preprocessing...")


def get_sampler(sampler_config):
    """Build the imblearn resampler described by ``sampler_config``.

    ``sampler_config`` is a dict with an optional ``'name'`` (default
    ``'smote'``, matched case-insensitively), optional ``'params'`` forwarded
    to the sampler constructor, and an optional ``'random_state'`` that is used
    when ``params`` does not set one (falling back to ``RANDOM_STATE``).

    Returns ``None`` when no config is given or imblearn is unavailable.
    Raises ``ValueError`` when the name is not recognised or the matching
    sampler class is not available.
    """
    if sampler_config is None or not HAS_IMB:
        return None

    requested = sampler_config.get('name', 'smote').lower()
    kwargs = dict(sampler_config.get('params', {}))
    if 'random_state' not in kwargs:
        kwargs['random_state'] = sampler_config.get('random_state', RANDOM_STATE)

    # Name -> class lookup; a class is None when its import failed.
    registry = {
        'smote': SMOTE,
        'adasyn': ADASYN,
        'borderlinesmote': BorderlineSMOTE,
        'smoteenn': SMOTEENN,
        'smotetomek': SMOTETomek,
    }
    sampler_cls = registry.get(requested)
    if sampler_cls is None:
        raise ValueError(f"Unsupported sampler '{requested}' or imblearn component missing.")
    return sampler_cls(**kwargs)


def basic_preprocess(
    X_train,
    extra_datasets=None,
    sampler_config=None,
    y_train=None,
    verbose=True
):
    """
    Level-1 preprocessing: one-hot encode the nominal columns, log1p-transform
    skewed numeric columns, standardize, and optionally resample the training
    rows.

    The encoder, the choice of skewed columns and the scaler are all fitted on
    ``X_train`` only and then applied unchanged to each frame in
    ``extra_datasets``.

    Returns ``(X_train_scaled, scaled_extras, y_train_final)``. When a sampler
    is built from ``sampler_config`` and ``y_train`` is given, the training
    frame and target are the resampled versions (the frame is rebuilt without
    the original index); otherwise ``y_train`` is returned as passed in.
    """
    extras_in = [] if extra_datasets is None else extra_datasets

    def say(message):
        # Progress output can be silenced through ``verbose``.
        if verbose:
            print(message)

    say("  [Basic] OneHot encoding categorical features...")

    nominal_features = ['BusinessTravel', 'Department', 'EducationField',
                        'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    def with_onehot(frame, matrix):
        # Replace the nominal columns of ``frame`` with their one-hot block.
        onehot = pd.DataFrame(
            matrix,  # type: ignore
            columns=encoder.get_feature_names_out(nominal_features),
            index=frame.index
        )
        remainder = frame.drop(nominal_features, axis=1).copy()
        return pd.concat([remainder, onehot], axis=1)

    X_train_proc = with_onehot(X_train, encoder.fit_transform(X_train[nominal_features]))
    processed_extras = [
        with_onehot(frame, encoder.transform(frame[nominal_features]))
        for frame in extras_in
    ]

    feature_columns = X_train_proc.columns
    encoded_count = len(encoder.get_feature_names_out(nominal_features))

    say(f"    [Basic] Encoded features added: {encoded_count}")
    say("  [Basic] Handling skewness with log1p...")

    # Skewness is measured on the training frame only; each column is judged
    # independently, so selecting first and transforming afterwards is safe.
    numeric_cols = X_train_proc.select_dtypes(include=['float64', 'int64']).columns
    skewed_features = [
        col for col in numeric_cols
        if abs(X_train_proc[col].skew()) > 0.5  # type: ignore
    ]
    for col in skewed_features:
        X_train_proc[col] = np.log1p(X_train_proc[col])
        for frame in processed_extras:
            if col in frame.columns:
                frame[col] = np.log1p(frame[col])

    say(f"    [Basic] Log-transformed: {len(skewed_features)} features")
    say("  [Basic] Standardizing features...")

    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_proc),
        columns=feature_columns,
        index=X_train_proc.index
    )
    scaled_extras = [
        pd.DataFrame(
            scaler.transform(frame[feature_columns]),
            columns=feature_columns,
            index=frame.index
        )
        for frame in processed_extras
    ]

    say(f"    [Basic] Scaled feature space: {X_train_scaled.shape[1]} columns")

    sampler = get_sampler(sampler_config)
    if sampler is None or y_train is None:
        # Nothing to resample: hand the target back exactly as received.
        return X_train_scaled, scaled_extras, y_train

    say(f"  [Basic] Applying sampler: {sampler.__class__.__name__} ...")
    X_resampled, y_resampled = sampler.fit_resample(X_train_scaled.values, np.asarray(y_train))  # type: ignore
    say(f"    [Basic] Samples: {len(np.asarray(y_train))} -> {len(y_resampled)}")

    return pd.DataFrame(X_resampled, columns=feature_columns), scaled_extras, y_resampled


print("  ✓ Basic preprocessing function enhanced")




[2/8] Feature Engineering - Level 1: Basic Preprocessing...
  ✓ Basic preprocessing function enhanced


In [None]:
# ============================================================================
# Feature Engineering - Level 2: Advanced Features
# ============================================================================
print("\n[3/8] Feature Engineering - Level 2: Advanced Features...")


def create_advanced_features(X_train, extra_datasets=None, advanced_config=None, verbose=True):
    """Add interaction, ratio, delta and satisfaction-aggregate columns.

    Works on copies, so neither ``X_train`` nor the frames in
    ``extra_datasets`` are modified. ``advanced_config`` may override the
    default specs through the keys ``'interaction_pairs'``,
    ``'ratio_features'`` and ``'delta_features'``. A spec is skipped when the
    training frame lacks one of its source columns.

    Returns ``(X_train_adv, extras_adv)``.
    """
    settings = {} if advanced_config is None else advanced_config
    extras_in = [] if extra_datasets is None else extra_datasets

    X_train_adv = X_train.copy()
    extras_adv = [frame.copy() for frame in extras_in]

    def say(message):
        if verbose:
            print(message)

    def derive(needed, feat_name, compute):
        # Add ``feat_name`` to the training copy, then to every extra copy
        # that also carries all ``needed`` source columns.
        if not all(col in X_train_adv.columns for col in needed):
            return
        X_train_adv[feat_name] = compute(X_train_adv)
        for frame in extras_adv:
            if all(col in frame.columns for col in needed):
                frame[feat_name] = compute(frame)

    interaction_pairs = settings.get('interaction_pairs', [
        ('Age', 'MonthlyIncome'),
        ('DistanceFromHome', 'OverTime_Yes'),
        ('YearsAtCompany', 'JobLevel'),
        ('TotalWorkingYears', 'Age'),
        ('WorkLifeBalance', 'OverTime_Yes'),
        ('JobSatisfaction', 'EnvironmentSatisfaction'),
        ('TotalWorkingYears', 'NumCompaniesWorked'),
        ('MonthlyIncome', 'JobLevel'),
    ])
    say(f"  [Advanced] Creating {len(interaction_pairs)} interaction features...")
    for left, right in interaction_pairs:
        derive((left, right), f"{left}_x_{right}",
               lambda frame, a=left, b=right: frame[a] * frame[b])

    ratio_specs = settings.get('ratio_features', [
        ('MonthlyIncome', 'Age', 'Income_per_Age'),
        ('YearsAtCompany', 'TotalWorkingYears', 'Company_vs_Total_Years'),
        ('YearsSinceLastPromotion', 'YearsAtCompany', 'Promo_Frequency'),
        ('YearsWithCurrManager', 'YearsInCurrentRole', 'Manager_Stability'),
        ('TotalWorkingYears', 'NumCompaniesWorked', 'Years_per_Company'),
    ])
    say(f"  [Advanced] Creating {len(ratio_specs)} ratio features...")
    for numerator, denominator, feat_name in ratio_specs:
        # The small constant keeps a zero denominator from dividing by zero.
        derive((numerator, denominator), feat_name,
               lambda frame, n=numerator, d=denominator: frame[n] / (frame[d] + 1e-5))

    delta_specs = settings.get('delta_features', [
        ('YearsAtCompany', 'YearsInCurrentRole', 'Tenure_vs_Role'),
        ('YearsAtCompany', 'YearsWithCurrManager', 'Tenure_vs_Manager'),
        ('PerformanceRating', 'EnvironmentSatisfaction', 'Performance_vs_Environment'),
    ])
    say(f"  [Advanced] Creating {len(delta_specs)} delta features...")
    for minuend, subtrahend, feat_name in delta_specs:
        derive((minuend, subtrahend), feat_name,
               lambda frame, t=minuend, b=subtrahend: frame[t] - frame[b])

    satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction',
                         'RelationshipSatisfaction', 'WorkLifeBalance']
    if all(col in X_train_adv.columns for col in satisfaction_cols):
        say("  [Advanced] Aggregating satisfaction metrics...")
        # Only the training frame is checked for these columns; the extras
        # are aggregated unconditionally.
        for frame in [X_train_adv, *extras_adv]:
            scores = frame[satisfaction_cols]
            frame['Satisfaction_Mean'] = scores.mean(axis=1)
            frame['Satisfaction_Std'] = scores.std(axis=1)
            frame['Satisfaction_Max'] = scores.max(axis=1)

    say(f"  [Advanced] Total features after expansion: {X_train_adv.shape[1]}")

    return X_train_adv, extras_adv


def apply_polynomial_features(X_train, extra_datasets=None, poly_config=None, verbose=True):
    """Append polynomial terms built from a subset of columns.

    ``poly_config`` keys: ``'columns'`` (explicit base columns; those missing
    from ``X_train`` are dropped), ``'max_base_features'`` (default 12, used
    when ``'columns'`` is absent to take the numeric columns with the largest
    standard deviation), ``'degree'`` (default 2), ``'interaction_only'`` and
    ``'include_bias'`` (both default ``False``).

    The expansion is fitted on ``X_train`` and re-applied to each frame in
    ``extra_datasets``; only generated columns whose names are not already in
    ``X_train`` are appended.

    Returns ``(X_train_poly, extras_poly, new_columns)``. With no config, or
    when no base column survives, the inputs come back unchanged together
    with an empty ``new_columns`` list.
    """
    if poly_config is None:
        return X_train, extra_datasets or [], []

    extras_in = [] if extra_datasets is None else extra_datasets

    base_cols = poly_config.get('columns')
    if base_cols is not None:
        base_cols = [col for col in base_cols if col in X_train.columns]
    else:
        # No explicit list: rank numeric columns by standard deviation.
        numeric_cols = list(X_train.select_dtypes(include=[np.number]).columns)
        ranked = X_train[numeric_cols].std().sort_values(ascending=False)
        base_cols = list(ranked.head(poly_config.get('max_base_features', 12)).index)

    if not base_cols:
        return X_train, extras_in, []

    degree = poly_config.get('degree', 2)
    expander = PolynomialFeatures(
        degree=degree,
        interaction_only=poly_config.get('interaction_only', False),
        include_bias=poly_config.get('include_bias', False)
    )

    train_matrix = expander.fit_transform(X_train[base_cols])
    generated_names = expander.get_feature_names_out(base_cols)

    # Degree-1 terms repeat the base columns, so keep only unseen names.
    new_columns = [name for name in generated_names if name not in X_train.columns]

    def with_poly(frame, matrix):
        expanded = pd.DataFrame(matrix, columns=generated_names, index=frame.index)  # type: ignore
        return pd.concat([frame, expanded[new_columns]], axis=1)

    X_train_poly = with_poly(X_train, train_matrix)
    extras_poly = [
        with_poly(frame, expander.transform(frame[base_cols]))  # type: ignore
        for frame in extras_in
    ]

    if verbose:
        print(f"  [Poly] Degree {degree} expansion on {len(base_cols)} base cols -> {len(new_columns)} new cols")

    return X_train_poly, extras_poly, new_columns


def apply_feature_selector(X_train, y_train, extra_datasets=None, selector_config=None, verbose=True):
    """Fit a feature selector on the training data and apply it everywhere.

    ``selector_config['strategy']`` (default ``'selectk'``, case-insensitive)
    picks the method:

    * ``'selectk'`` - ``SelectKBest`` with ``f_classif``; ``'k'`` defaults to
      120 and is capped at the number of available columns.
    * ``'rfecv'`` - ``RFECV`` scored by ROC AUC over a 3-fold stratified
      split; options ``'step'`` (default 1), ``'min_features'`` (default
      ``max(25, n_columns // 6)``) and ``'estimator'`` (default: an
      L2-regularised logistic regression).

    Returns ``(X_train_selected, extras_selected, selected_columns, summary)``.
    With no config the inputs are returned untouched and ``summary`` is
    ``"none"``. Raises ``ValueError`` for an unknown strategy.
    """
    if selector_config is None:
        return X_train, extra_datasets or [], X_train.columns.tolist(), "none"

    extras_in = [] if extra_datasets is None else extra_datasets
    strategy = selector_config.get('strategy', 'selectk').lower()
    n_available = X_train.shape[1]

    if strategy == 'rfecv':
        step = selector_config.get('step', 1)
        min_features = selector_config.get('min_features', max(25, n_available // 6))
        base_estimator = selector_config.get('estimator')
        if base_estimator is None:
            base_estimator = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=2000)
        selector = RFECV(
            estimator=base_estimator,
            step=step,
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
            scoring='roc_auc',
            min_features_to_select=min_features,
            n_jobs=-1
        )
        summary = f'rfecv_step{step}_min{min_features}'
    elif strategy == 'selectk':
        # Never ask for more columns than the frame actually has.
        k = min(selector_config.get('k', min(120, n_available)), n_available)
        selector = SelectKBest(score_func=f_classif, k=k)
        summary = f'selectk_{k}'
    else:
        raise ValueError(f"Unknown feature selector strategy: {strategy}")

    selector.fit(X_train, y_train)
    selected_columns = X_train.columns[selector.get_support()]

    X_train_selected = X_train[selected_columns].copy()
    extras_selected = [frame[selected_columns].copy() for frame in extras_in]

    if verbose:
        print(f"  [Selector] Strategy={summary} retained {len(selected_columns)} columns")

    return X_train_selected, extras_selected, selected_columns.tolist(), summary


print("  ✓ Advanced feature engineering utilities ready")



[3/8] Feature Engineering - Level 2: Advanced Features...
  ✓ Advanced feature engineering utilities ready


In [48]:
# ============================================================================
# Experiment Tracking System
# ============================================================================
print("\n[4/8] Setting up Experiment Tracking...")


def log_experiment(exp_name, model_name, cv_auc_mean, cv_auc_std, test_auc,
                   features_used, hyperparams, notes="", metadata=None):
    """Append one experiment record to ``EXPERIMENT_LOG`` and return it.

    The record stores the current timestamp, the scores, ``features_used``
    (under the key ``'features_count'``) and the string form of
    ``hyperparams``. Entries from ``metadata``, when given, are merged in last
    and therefore win over fields of the same name. A one-line summary is
    printed.
    """
    logged_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    experiment = dict(
        timestamp=logged_at,
        exp_name=exp_name,
        model=model_name,
        cv_auc_mean=cv_auc_mean,
        cv_auc_std=cv_auc_std,
        test_auc=test_auc,
        features_count=features_used,
        hyperparams=str(hyperparams),
        notes=notes,
    )
    if metadata:
        experiment.update(metadata)

    EXPERIMENT_LOG.append(experiment)

    cv_part = f"{cv_auc_mean:.4f}±{cv_auc_std:.4f}"
    print(f"  -> Logged: {exp_name} | CV: {cv_part} | Test: {test_auc:.4f}")
    return experiment


def get_results_df():
    """Build a DataFrame from ``EXPERIMENT_LOG``, best ``test_auc`` first.

    A log without rows yields the empty frame as-is (no sorting attempted).
    """
    results = pd.DataFrame(EXPERIMENT_LOG)
    if len(results) == 0:
        return results
    return results.sort_values('test_auc', ascending=False)


print("  ✓ Experiment tracking system ready")




[4/8] Setting up Experiment Tracking...
  ✓ Experiment tracking system ready


In [None]:
# ============================================================================
# Cross-Validation Utilities
# ============================================================================
print("  ✓ CV evaluation function enhanced")


def evaluate_with_cv(model, X, y, X_test, y_test, feature_config, cv_override=None):
    """
    Cross-validate ``model`` with the feature pipeline rebuilt inside each fold.

    The splitter is ``cv_override`` when given; otherwise it follows
    ``feature_config['cv_mode']``: ``'repeated_5x2'`` (5 splits, 2 repeats),
    ``'kfold10'`` (10 splits), anything else a shuffled stratified K-fold with
    ``feature_config['n_folds']`` splits (default 5). All built-in splitters
    are seeded with ``RANDOM_STATE``.

    For every fold, ``prepare_feature_set`` is run on the fold's training
    rows, a fresh model with the same parameters is fitted, the validation
    ROC AUC is recorded, and the fold's test-set probabilities are summed.

    Returns ``(cv_mean, cv_std, test_auc, test_preds, feature_count,
    last_metadata)`` where ``test_preds`` is the fold-averaged test
    probability, ``test_auc`` is computed from that average, and the last two
    items come from the final fold.
    """
    splitter = cv_override
    if splitter is None:
        cv_mode = feature_config.get('cv_mode', 'kfold')
        if cv_mode == 'repeated_5x2':
            splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RANDOM_STATE)
        elif cv_mode == 'kfold10':
            splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
        else:
            splitter = StratifiedKFold(
                n_splits=feature_config.get('n_folds', 5),
                shuffle=True,
                random_state=RANDOM_STATE
            )

    # The target may be a pandas object (positional .iloc) or a plain array.
    if hasattr(y, 'iloc'):
        take_target = lambda rows: y.iloc[rows]
    else:
        take_target = lambda rows: y[rows]

    fold_aucs = []
    test_preds = np.zeros(len(X_test))
    feature_count = None
    last_metadata = {}

    for train_idx, val_idx in splitter.split(X, y):
        y_val_array = np.asarray(take_target(val_idx))

        prepared = prepare_feature_set(  # type: ignore
            feature_config,
            X.iloc[train_idx],
            X.iloc[val_idx],
            X_test,
            take_target(train_idx)
        )
        if len(prepared) == 4:
            # Tolerate a 4-item result that carries no metadata dict.
            prepared = (*prepared, {})
        X_train_ready, X_val_ready, X_test_ready, y_train_ready, last_metadata = prepared

        feature_count = X_train_ready.shape[1]

        # Fresh, unfitted copy per fold when the estimator exposes its params.
        if hasattr(model, 'get_params'):
            fold_model = model.__class__(**model.get_params())
        else:
            fold_model = model
        fold_model.fit(X_train_ready, y_train_ready)

        val_scores = fold_model.predict_proba(X_val_ready)[:, 1]
        fold_aucs.append(roc_auc_score(y_val_array, val_scores))

        test_preds += fold_model.predict_proba(X_test_ready)[:, 1]

    if fold_aucs:
        cv_mean = float(np.mean(fold_aucs))
        cv_std = float(np.std(fold_aucs))
    else:
        cv_mean = cv_std = 0.0

    # Turn the summed fold predictions into their average.
    test_preds /= max(len(fold_aucs), 1)
    test_auc = roc_auc_score(np.asarray(y_test), test_preds)

    return cv_mean, cv_std, test_auc, test_preds, feature_count, last_metadata


  ✓ CV evaluation function enhanced


In [None]:
# ============================================================================
# Feature Configurations
# ============================================================================
print("\n[5/8] Preparing Feature Sets...")


# Named feature-pipeline configurations.
#
# Keys read by prepare_feature_set():
#   'use_advanced'     - run create_advanced_features() after basic preprocessing
#   'poly'             - config dict passed to apply_polynomial_features()
#   'feature_selector' - config dict passed to apply_feature_selector()
#   'samplers'         - {'basic': cfg} resamples inside basic_preprocess();
#                        {'advanced': cfg} resamples after all feature steps
#   'description'      - stored in the fold metadata as 'feature_set_name'
# Keys read by evaluate_with_cv():
#   'cv_mode'          - 'repeated_5x2', 'kfold10', or anything else for K-fold
#   'n_folds'          - only used on the plain K-fold path; the other two
#                        modes hard-code their split counts
FEATURE_SETS = {
    'basic': {
        'use_advanced': False,
        'samplers': {'basic': {'name': 'smote'}},
        'description': 'Basic preprocessing with per-fold SMOTE',
        'cv_mode': 'kfold',
        'n_folds': 5
    },
    'advanced': {
        'use_advanced': True,
        'samplers': {'advanced': {'name': 'smote'}},
        'description': 'Advanced features with post-engineering SMOTE',
        'cv_mode': 'kfold',
        'n_folds': 5
    },
    'adv_poly': {
        'use_advanced': True,
        'poly': {'degree': 2, 'interaction_only': False, 'include_bias': False, 'max_base_features': 14},
        'samplers': {'advanced': {'name': 'smote'}},
        'description': 'Advanced + polynomial bump (top variance features)',
        'cv_mode': 'kfold10',
        'n_folds': 10
    },
    'adv_poly_select': {
        'use_advanced': True,
        'poly': {'degree': 2, 'interaction_only': True, 'include_bias': False, 'max_base_features': 12},
        'feature_selector': {'strategy': 'selectk', 'k': 90},
        'samplers': {'advanced': {'name': 'adasyn'}},
        'description': 'Advanced + interaction-only poly + SelectKBest + ADASYN',
        'cv_mode': 'kfold',
        'n_folds': 5
    },
    # Both sampler stages are active here: SMOTE in basic_preprocess(), then
    # SMOTEENN after feature selection.
    'adv_rfecv_dual': {
        'use_advanced': True,
        'feature_selector': {'strategy': 'rfecv', 'step': 2, 'min_features': 40},
        'samplers': {'basic': {'name': 'smote'}, 'advanced': {'name': 'smoteenn', 'params': {'sampling_strategy': 'auto'}}},
        'description': 'Advanced + RFECV + dual-stage resampling (SMOTE -> SMOTEENN)',
        'cv_mode': 'repeated_5x2',
        'n_folds': 5
    }
}


def prepare_feature_set(config, X_train, X_val, X_test, y_train):
    """Run the full feature pipeline for one training split.

    Steps, in order, each fitted on the training split and re-applied to
    ``X_val`` and ``X_test``:

    1. ``basic_preprocess`` (with the optional ``samplers['basic']`` config),
    2. ``create_advanced_features`` when ``config['use_advanced']`` is truthy,
    3. ``apply_polynomial_features`` when ``config['poly']`` is set,
    4. ``apply_feature_selector`` when ``config['feature_selector']`` is set,
    5. resampling with ``samplers['advanced']`` when configured and imblearn
       is available.

    Returns ``(X_train_ready, X_val_ready, X_test_ready, y_train_ready,
    prep_meta)``; ``y_train_ready`` is a NumPy array (or ``None`` when no
    target was supplied) and ``prep_meta`` describes what was actually applied.
    """
    # ``or {}`` also covers an explicit ``'samplers': None`` entry.
    samplers = config.get('samplers') or {}
    basic_sampler_cfg = samplers.get('basic')

    X_train_ready, extras_basic, y_train_ready = basic_preprocess(
        X_train,
        extra_datasets=[X_val, X_test],
        sampler_config=basic_sampler_cfg,
        y_train=y_train,
        verbose=False
    )
    # basic_preprocess returns one processed frame per extra dataset, in
    # order, so exactly two frames come back here. Unpacking directly makes a
    # mismatch fail loudly instead of reusing the validation frame as test.
    X_val_ready, X_test_ready = extras_basic

    # Same conditions under which basic_preprocess/get_sampler resample, so
    # the metadata reports what happened rather than what was requested.
    basic_sampler_applied = (
        basic_sampler_cfg is not None and HAS_IMB and y_train is not None
    )

    prep_meta = {
        'feature_set_name': config.get('description', 'unknown'),
        # get_sampler falls back to 'smote' when no name is given.
        'basic_sampler': basic_sampler_cfg.get('name', 'smote') if basic_sampler_applied else 'none',
        'advanced_sampler': 'none',
        'poly_added': 0,
        'selector': 'none',
        'cv_mode': config.get('cv_mode', 'kfold')
    }

    if config.get('use_advanced'):
        X_train_ready, extras_adv = create_advanced_features(
            X_train_ready,
            extra_datasets=[X_val_ready, X_test_ready],
            advanced_config=config.get('advanced_config'),
            verbose=False
        )
        X_val_ready, X_test_ready = extras_adv

    if config.get('poly'):
        X_train_ready, extras_poly, new_cols = apply_polynomial_features(
            X_train_ready,
            extra_datasets=[X_val_ready, X_test_ready],
            poly_config=config['poly'],
            verbose=False
        )
        if extras_poly:
            X_val_ready, X_test_ready = extras_poly
        prep_meta['poly_added'] = len(new_cols)

    selector_cfg = config.get('feature_selector')
    if selector_cfg and y_train_ready is not None:
        X_train_ready, extras_sel, selected_cols, selector_summary = apply_feature_selector(
            X_train_ready,
            np.asarray(y_train_ready),
            extra_datasets=[X_val_ready, X_test_ready],
            selector_config=selector_cfg,
            verbose=False
        )
        if extras_sel:
            X_val_ready, X_test_ready = extras_sel
        prep_meta['selector'] = selector_summary

    # Second resampling stage: only the training rows are resampled, after
    # every feature step has run.
    advanced_sampler_cfg = samplers.get('advanced')
    if advanced_sampler_cfg and HAS_IMB and y_train_ready is not None:
        sampler = get_sampler(advanced_sampler_cfg)
        X_resampled, y_resampled = sampler.fit_resample(X_train_ready.values, np.asarray(y_train_ready)) # type: ignore
        X_train_ready = pd.DataFrame(X_resampled, columns=X_train_ready.columns)
        y_train_ready = y_resampled
        # Same default name as get_sampler, so the log matches the sampler used.
        prep_meta['advanced_sampler'] = advanced_sampler_cfg.get('name', 'smote')

    return (
        X_train_ready,
        X_val_ready,
        X_test_ready,
        np.asarray(y_train_ready) if y_train_ready is not None else y_train_ready,
        prep_meta
    )


print(f"  ✓ Prepared {len(FEATURE_SETS)} feature configurations:")
for name, cfg in FEATURE_SETS.items():
    print(f"    - {name}: advanced={cfg.get('use_advanced', False)}, samplers={list((cfg.get('samplers') or {}).keys())}, cv={cfg.get('cv_mode', 'kfold')}")




[5/8] Preparing Feature Sets...
  ✓ Prepared 5 feature configurations:
    - basic: advanced=False, samplers=['basic'], cv=kfold
    - advanced: advanced=True, samplers=['advanced'], cv=kfold
    - adv_poly: advanced=True, samplers=['advanced'], cv=kfold10
    - adv_poly_select: advanced=True, samplers=['advanced'], cv=kfold
    - adv_rfecv_dual: advanced=True, samplers=['basic', 'advanced'], cv=repeated_5x2


In [51]:
# ============================================================================
# Model Catalog and Hyperparameter Grids
# ============================================================================
print("\n[6/8] Defining Models and Hyperparameter Grids...")


LR_CONFIGS = [
    {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 4000},
    {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 4000},
    {'C': 5.0, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 4000, 'class_weight': 'balanced'},
    {'C': 10.0, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 4000},
    {'C': 0.5, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 5000},
    {'C': 1.0, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 5000, 'class_weight': 'balanced'}
]


RF_CONFIGS = [
    {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'},
    {'n_estimators': 400, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'},
    {'n_estimators': 600, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'},
    {'n_estimators': 400, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.6},
    {'n_estimators': 800, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.8, 'class_weight': 'balanced'},
    {'n_estimators': 500, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'}
]


ET_CONFIGS = [
    {'n_estimators': 400, 'max_depth': 14, 'min_samples_split': 2, 'max_features': 'sqrt'},
    {'n_estimators': 600, 'max_depth': 18, 'min_samples_split': 2, 'max_features': 0.7},
    {'n_estimators': 800, 'max_depth': None, 'min_samples_split': 2, 'max_features': 0.5},
]


GB_CONFIGS = [
    {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 4, 'subsample': 0.8},
    {'n_estimators': 400, 'learning_rate': 0.03, 'max_depth': 5, 'subsample': 0.9},
    {'n_estimators': 600, 'learning_rate': 0.02, 'max_depth': 6, 'subsample': 0.9},
    {'n_estimators': 800, 'learning_rate': 0.015, 'max_depth': 6, 'subsample': 0.8},
]


HGB_CONFIGS = [
    {'learning_rate': 0.05, 'max_depth': 3, 'max_iter': 300, 'l2_regularization': 0.0},
    {'learning_rate': 0.03, 'max_depth': 4, 'max_iter': 500, 'l2_regularization': 0.1},
    {'learning_rate': 0.02, 'max_depth': 6, 'max_iter': 800, 'l2_regularization': 0.2},
]


MLP_CONFIGS = [
    {'hidden_layer_sizes': (128, 64), 'alpha': 0.0005, 'learning_rate_init': 0.001, 'max_iter': 400},
    {'hidden_layer_sizes': (256, 128, 64), 'alpha': 0.0001, 'learning_rate_init': 0.0005, 'max_iter': 500},
]


SVC_CONFIGS = [
    {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True},
    {'C': 5.0, 'kernel': 'rbf', 'gamma': 0.1, 'probability': True, 'class_weight': 'balanced'},
]


# Search catalog: one (short name, estimator class, list of hyperparameter
# dicts) tuple per model family. Optional families are appended further down
# when their library imported successfully.
MODEL_CONFIGS = [
    ('LR', LogisticRegression, LR_CONFIGS),
    ('RF', RandomForestClassifier, RF_CONFIGS),
    ('ET', ExtraTreesClassifier, ET_CONFIGS),
    ('GB', GradientBoostingClassifier, GB_CONFIGS),
    ('HGB', HistGradientBoostingClassifier, HGB_CONFIGS),
    ('MLP', MLPClassifier, MLP_CONFIGS),
    ('SVC', SVC, SVC_CONFIGS),
]


if HAS_IMB and BalancedRandomForestClassifier is not None:
    BRF_CONFIGS = [
        {'n_estimators': 400, 'max_depth': 10, 'max_features': 'sqrt'},
        {'n_estimators': 600, 'max_depth': 12, 'max_features': 0.7},
        {'n_estimators': 800, 'max_depth': None, 'max_features': 0.8},
    ]
    MODEL_CONFIGS.append(('BRF', BalancedRandomForestClassifier, BRF_CONFIGS))


if HAS_XGB and xgb is not None:
    XGB_CONFIGS = [
        {'n_estimators': 400, 'learning_rate': 0.05, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0},
        {'n_estimators': 600, 'learning_rate': 0.03, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 1.5, 'scale_pos_weight': 1.5},
        {'n_estimators': 800, 'learning_rate': 0.02, 'max_depth': 6, 'subsample': 0.85, 'colsample_bytree': 0.9, 'reg_lambda': 2.0, 'scale_pos_weight': 2.0},
        {'n_estimators': 1000, 'learning_rate': 0.015, 'max_depth': 6, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 2.5, 'scale_pos_weight': 2.5},
    ]
    MODEL_CONFIGS.append(('XGB', xgb.XGBClassifier, XGB_CONFIGS))


if HAS_LGB and lgb is not None:
    # LightGBM only performs row bagging when the bagging frequency is
    # non-zero. With the default subsample_freq=0 the 'subsample' values
    # below would be silently ignored, so each config enables bagging on
    # every iteration explicitly.
    LGB_CONFIGS = [
        {'n_estimators': 400, 'learning_rate': 0.05, 'num_leaves': 48, 'subsample': 0.9, 'subsample_freq': 1, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_lambda': 0.0},
        {'n_estimators': 600, 'learning_rate': 0.03, 'num_leaves': 64, 'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.9, 'reg_alpha': 0.0, 'reg_lambda': 0.5},
        {'n_estimators': 800, 'learning_rate': 0.02, 'num_leaves': 96, 'subsample': 0.85, 'subsample_freq': 1, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.8},
        {'n_estimators': 1000, 'learning_rate': 0.015, 'num_leaves': 120, 'subsample': 0.9, 'subsample_freq': 1, 'colsample_bytree': 0.9, 'reg_alpha': 0.1, 'reg_lambda': 1.0},
    ]
    MODEL_CONFIGS.append(('LGB', lgb.LGBMClassifier, LGB_CONFIGS))


# CatBoost grid: registered only when the optional catboost import succeeded.
if HAS_CAT and cb is not None:
    # Value tuples follow the key order (iterations, learning_rate, depth, l2_leaf_reg).
    CAT_CONFIGS = [
        dict(zip(('iterations', 'learning_rate', 'depth', 'l2_leaf_reg'), values))
        for values in (
            (400, 0.05, 6, 3.0),
            (600, 0.04, 7, 2.0),
            (800, 0.03, 8, 2.5),
            (1000, 0.025, 8, 3.5),
        )
    ]
    MODEL_CONFIGS.append(('CAT', cb.CatBoostClassifier, CAT_CONFIGS))


# Size of the registered search space: model families, hyperparameter
# configurations across all families, and feature-set variants.
total_model_types = len(MODEL_CONFIGS)
config_count = sum([len(grid) for _name, _cls, grid in MODEL_CONFIGS])
feature_config_count = len(FEATURE_SETS)

# One experiment per (configuration, feature set) pair for each seed.
print(
    f"  ✓ Models: {total_model_types} types\n"
    f"  ✓ Total configurations: {config_count}\n"
    f"  ✓ Feature sets: {feature_config_count}\n"
    f"  ✓ Total base experiments (per seed): {config_count * feature_config_count}"
)



[6/8] Defining Models and Hyperparameter Grids...
  ✓ Models: 11 types
  ✓ Total configurations: 41
  ✓ Feature sets: 5
  ✓ Total base experiments (per seed): 205


In [52]:
# Sweep counters: every configuration attempted vs. those that were actually
# evaluated and logged (skipped or failed configs only count as attempted).
experiment_count = 0
logged_count = 0

# Per-model constructor defaults keyed by the short model name used in
# MODEL_CONFIGS. Each builder takes the current global seed and returns a
# fresh dict, so per-config merging below never mutates shared state.
MODEL_BASE_PARAMS = {
    'LR': lambda seed: {'random_state': seed},
    'RF': lambda seed: {'random_state': seed, 'n_jobs': -1},
    'ET': lambda seed: {'random_state': seed, 'n_jobs': -1},
    'GB': lambda seed: {'random_state': seed},
    'HGB': lambda seed: {'random_state': seed},
    'MLP': lambda seed: {'random_state': seed},
    'SVC': lambda seed: {'random_state': seed, 'cache_size': 1000},
    'BRF': lambda seed: {'random_state': seed, 'n_jobs': -1},
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases — confirm against the installed version before removing it.
    'XGB': lambda seed: {'random_state': seed, 'use_label_encoder': False, 'eval_metric': 'auc', 'n_jobs': -1, 'tree_method': 'hist'},
    'LGB': lambda seed: {'random_state': seed, 'n_jobs': -1},
    # The CatBoost entry is seeded via `random_seed`; the loop below strips
    # any `random_state` key for this model.
    'CAT': lambda seed: {'random_seed': seed, 'verbose': 0, 'eval_metric': 'AUC'}
}


for seed in RANDOM_SEEDS:
    print(f"\n{'#'*80}")
    print(f"GLOBAL SEED: {seed}")
    print(f"{'#'*80}")

    # Seed NumPy's legacy global RNG for anything downstream that relies on it.
    np.random.seed(seed)

    for feat_name, feat_config in FEATURE_SETS.items():
        print(f"\n{'='*80}")
        print(f"FEATURE SET: {feat_name.upper()} | Config: advanced={feat_config.get('use_advanced', False)}, CV={feat_config.get('cv_mode', 'kfold')}")
        print(f"{'='*80}")

        for model_name, model_class, configs in MODEL_CONFIGS:
            print(f"\n  [{model_name}] Testing {len(configs)} configurations...")
            model_logged = 0  # configs of this model that were evaluated and logged

            for idx, config in enumerate(configs, 1):
                experiment_count += 1

                # Start from the model's defaults (empty for unknown names).
                base_params = MODEL_BASE_PARAMS.get(model_name, lambda s: {})(seed)
                if model_name == 'CAT':
                    base_params.pop('random_state', None)
                else:
                    base_params.setdefault('random_state', seed)
                if model_name in ('RF', 'ET', 'BRF', 'XGB', 'LGB'):
                    base_params.setdefault('n_jobs', -1)

                # Grid values win over defaults on key collisions.
                merged_params = {**base_params, **config}

                try:
                    model = model_class(**merged_params)
                except TypeError as e:
                    # Constructor rejected a keyword: skip this config only.
                    print(f"    Config {idx} skipped (param error): {e}")
                    continue

                try:
                    # evaluate_with_cv returns a 6-tuple; the fourth element
                    # is not used here.
                    cv_mean, cv_std, test_auc, _, feature_count, prep_meta = evaluate_with_cv(
                        model,
                        X_train_raw,
                        y_train_raw,
                        X_test_raw,
                        y_test_raw,
                        feature_config=feat_config
                    )

                    exp_name = f"seed{seed}_{feat_name}_{model_name}_v{idx}"
                    metadata = {
                        'feature_set_key': feat_name,
                        'random_seed': seed,
                        'poly_added': prep_meta.get('poly_added', 0),
                        'selector': prep_meta.get('selector', 'none'),
                        'basic_sampler': prep_meta.get('basic_sampler', 'none'),
                        'advanced_sampler': prep_meta.get('advanced_sampler', 'none'),
                        'cv_mode': prep_meta.get('cv_mode', feat_config.get('cv_mode', 'kfold'))
                    }

                    log_experiment(
                        exp_name=exp_name,
                        model_name=model_name,
                        cv_auc_mean=cv_mean,
                        cv_auc_std=cv_std,
                        test_auc=test_auc,
                        features_used=feature_count,
                        hyperparams=merged_params,
                        notes=feat_config.get('description', ''),
                        metadata=metadata
                    )

                    model_logged += 1
                    logged_count += 1

                except Exception as e:
                    # Best-effort sweep: report the failure and keep going so
                    # one bad configuration does not abort the whole search.
                    print(f"    Config {idx} failed: {e}")
                    continue

            # Report how many configs actually completed, not just how many
            # were configured.
            print(f"  Completed {model_logged}/{len(configs)} {model_name} experiments")

print(f"\nTotal experiments scheduled: {experiment_count}")
print(f"Logged: {logged_count} | Skipped or failed: {experiment_count - logged_count}")



################################################################################
GLOBAL SEED: 42
################################################################################

FEATURE SET: BASIC | Config: advanced=False, CV=kfold

  [LR] Testing 6 configurations...
  -> Logged: seed42_basic_LR_v1 | CV: 0.8224±0.0245 | Test: 0.8864
  -> Logged: seed42_basic_LR_v2 | CV: 0.8176±0.0255 | Test: 0.8840
  -> Logged: seed42_basic_LR_v3 | CV: 0.8168±0.0253 | Test: 0.8835
  -> Logged: seed42_basic_LR_v4 | CV: 0.8166±0.0254 | Test: 0.8831
  -> Logged: seed42_basic_LR_v5 | CV: 0.8191±0.0256 | Test: 0.8827
  -> Logged: seed42_basic_LR_v6 | CV: 0.8181±0.0254 | Test: 0.8839
  Completed 6 LR experiments

  [RF] Testing 6 configurations...
  -> Logged: seed42_basic_RF_v1 | CV: 0.7944±0.0288 | Test: 0.8218
  -> Logged: seed42_basic_RF_v2 | CV: 0.7910±0.0330 | Test: 0.8256
  -> Logged: seed42_basic_RF_v3 | CV: 0.8007±0.0315 | Test: 0.8238
  -> Logged: seed42_basic_RF_v4 | CV: 0.7666±0.0374 | Test: 0.

In [53]:
# ============================================================================
# Results Consolidation and Reporting
# ============================================================================
def _test_auc_by(frame, column):
    """Return mean/max/count of `test_auc` per value of `column`, highest max first."""
    return (
        frame.groupby(column)['test_auc']
        .agg(['mean', 'max', 'count'])
        .sort_values('max', ascending=False)
    )


print("\n[8/8] Results Analysis and Ranking...")
print("="*80)

results_df = get_results_df()

if len(results_df) == 0:
    print("[WARN] No experiments recorded!")
else:
    # The ranking table and `iloc[0]` below assume best-first ordering by test
    # AUC. Sort explicitly instead of relying on get_results_df(); a stable
    # sort keeps the incoming order for ties and for already-sorted input.
    results_df = results_df.sort_values('test_auc', ascending=False, kind='mergesort')

    results_df.to_csv('experiment_results_full.csv', index=False)
    print("[INFO] Full results saved to: experiment_results_full.csv")

    print("\n" + "="*120)
    print("TOP 25 CONFIGURATIONS BY TEST AUC")
    print("="*120)
    print(f"{'Rank':<6} {'Exp Name':<34} {'Model':<8} {'Seed':<6} {'CV AUC':<18} {'Test AUC':<10} {'Features':<10} {'Poly':<6} {'Selector':<12}")
    print("-"*120)

    for rank, row in enumerate(results_df.head(25).itertuples(), 1):
        cv_str = f"{row.cv_auc_mean:.4f}±{row.cv_auc_std:.4f}"
        # `!s` on the optional metadata columns keeps the alignment spec valid
        # even when a value is None (same treatment as the seed column).
        print(f"{rank:<6} {row.exp_name:<34} {row.model:<8} {getattr(row, 'random_seed', 'na')!s:<6} {cv_str:<18} {row.test_auc:<10.4f} {row.features_count:<10} {getattr(row, 'poly_added', 0)!s:<6} {getattr(row, 'selector', 'none')!s:<12}")

    print("-"*120)

    print("\n" + "="*80)
    print("STATISTICAL SUMMARY")
    print("="*80)

    baseline = 0.8797  # reference test AUC quoted from example.ipynb in the output below
    # NOTE(review): the "best" row is chosen by test AUC, i.e. on the same
    # split whose score is then reported — treat that figure as optimistic.
    best_row = results_df.iloc[0]

    print("\n[Perf] Overall Performance:")
    print(f"  Total Experiments: {len(results_df)}")
    print(f"  Best Test AUC: {results_df['test_auc'].max():.4f}")
    print(f"  Mean Test AUC: {results_df['test_auc'].mean():.4f}")
    print(f"  Median Test AUC: {results_df['test_auc'].median():.4f}")
    print(f"  Std Test AUC: {results_df['test_auc'].std():.4f}")

    print("\n[Perf] Best Configuration:")
    print(f"  Name: {best_row['exp_name']}")
    print(f"  Model: {best_row['model']}")
    print(f"  CV AUC: {best_row['cv_auc_mean']:.4f} ± {best_row['cv_auc_std']:.4f}")
    print(f"  Test AUC: {best_row['test_auc']:.4f}")
    print(f"  Features: {best_row['features_count']}")
    print(f"  Seed: {best_row.get('random_seed', 'na')}")
    print(f"  Feature set: {best_row.get('feature_set_key', 'na')}")
    print(f"  Poly added: {best_row.get('poly_added', 0)} | Selector: {best_row.get('selector', 'none')}")
    print(f"  Hyperparameters: {best_row['hyperparams']}")

    print(f"\n[Baseline] vs Baseline (example.ipynb: {baseline:.4f}):")
    improvement = best_row['test_auc'] - baseline
    if improvement > 0:
        print(f"  BEAT BASELINE by {improvement:.4f} ({improvement/baseline*100:.2f}%)")
    elif improvement > -0.005:
        # Within half an AUC point below the baseline counts as a match.
        print(f"  MATCHED BASELINE (within {abs(improvement):.4f})")
    else:
        print(f"  Below baseline by {abs(improvement):.4f} ({abs(improvement)/baseline*100:.2f}%)")

    print("\n[Model] Performance by Model Type:")
    model_stats = _test_auc_by(results_df, 'model')
    print("\n" + model_stats.to_string())

    # The remaining breakdowns use optional metadata columns, so each one is
    # guarded the same way.
    if 'feature_set_key' in results_df.columns:
        print("\n[Feature] Performance by Feature Set:")
        feat_stats = _test_auc_by(results_df, 'feature_set_key')
        print("\n" + feat_stats.to_string())

    if 'cv_mode' in results_df.columns:
        print("\n[CV] Performance by CV Mode:")
        cv_stats = _test_auc_by(results_df, 'cv_mode')
        print("\n" + cv_stats.to_string())

    if 'random_seed' in results_df.columns:
        print("\n[Seed] Distribution by random seed:")
        seed_stats = _test_auc_by(results_df, 'random_seed')
        print("\n" + seed_stats.to_string())

    # Strictly greater, matching the BEAT BASELINE verdict above (ties are
    # not "beating").
    above_baseline = (results_df['test_auc'] > baseline).sum()
    print(f"\n[Counts] Configurations beating baseline: {above_baseline}/{len(results_df)} ({above_baseline/len(results_df)*100:.1f}%)")



[8/8] Results Analysis and Ranking...
[INFO] Full results saved to: experiment_results_full.csv

TOP 25 CONFIGURATIONS BY TEST AUC
Rank   Exp Name                           Model    Seed   CV AUC             Test AUC   Features   Poly   Selector    
------------------------------------------------------------------------------------------------------------------------
1      seed1337_advanced_LR_v4            LR       1337   0.7962±0.0318      0.8908     67         0      none        
2      seed42_advanced_LR_v4              LR       42     0.7962±0.0318      0.8908     67         0      none        
3      seed2024_advanced_LR_v4            LR       2024   0.7962±0.0318      0.8908     67         0      none        
4      seed2024_advanced_LR_v3            LR       2024   0.7961±0.0320      0.8907     67         0      none        
5      seed1337_advanced_LR_v3            LR       1337   0.7961±0.0320      0.8907     67         0      none        
6      seed42_advanced_LR_v3     

In [None]:
# ============================================================================
# Visualization
# ============================================================================
# Builds a 2x2 summary figure from `results_df` and saves it as a PNG:
#   [0, 0] histogram of test AUC      [0, 1] CV AUC vs test AUC per model
#   [1, 0] test AUC boxplot by model  [1, 1] top-10 configurations
print("\n" + "="*80)
print("VISUALIZATION")
print("="*80)

if len(results_df) > 0:
    baseline = 0.8797  # reference test AUC drawn as the red dashed line in each panel
    fig, axes = plt.subplots(2, 2, figsize=(18, 14))

    # --- Panel 1: distribution of test AUC across all experiments ----------
    ax = axes[0, 0]
    ax.hist(results_df['test_auc'], bins=30, edgecolor='black', alpha=0.7)
    ax.axvline(baseline, color='red', linestyle='--', linewidth=2, label=f'Baseline ({baseline})')
    ax.axvline(results_df['test_auc'].max(), color='green', linestyle='--', linewidth=2, label='Best')
    ax.set_xlabel('Test AUC', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Test AUC Distribution', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # --- Panel 2: CV AUC vs test AUC, one colour per model type -------------
    ax = axes[0, 1]
    for model in results_df['model'].unique():
        model_data = results_df[results_df['model'] == model]
        ax.scatter(model_data['cv_auc_mean'], model_data['test_auc'], label=model, alpha=0.3, s=100)
    ax.plot([0.7, 0.9], [0.7, 0.9], 'k--', alpha=0.3, label='Perfect correlation')
    # Fixed view window: points outside these limits are not shown.
    ax.set_xlim(0.65, 0.84)
    ax.set_ylim(0.75, 0.9)
    ax.set_xlabel('CV AUC (mean)', fontsize=12)
    ax.set_ylabel('Test AUC', fontsize=12)
    ax.set_title('CV vs Test AUC Correlation', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # --- Panel 3: test AUC spread per model type ----------------------------
    ax = axes[1, 0]
    results_df.boxplot(column='test_auc', by='model', ax=ax)
    # pandas adds a figure-level "Boxplot grouped by model" suptitle when `by`
    # is used; clear it so it does not sit on top of the whole 2x2 figure.
    fig.suptitle('')
    ax.axhline(baseline, color='red', linestyle='--', linewidth=2, alpha=0.5)
    ax.set_xlabel('Model Type', fontsize=12)
    ax.set_ylabel('Test AUC', fontsize=12)
    ax.set_title('Test AUC by Model Type', fontsize=14, fontweight='bold')
    plt.sca(ax)
    plt.xticks(rotation=45)

    # --- Panel 4: top-10 configurations -------------------------------------
    # Assumes results_df is ordered best-first, so head(10) is the top 10.
    ax = axes[1, 1]
    top10 = results_df.head(10)
    colors = plt.cm.viridis(np.linspace(0, 1, len(top10))) # type: ignore
    ax.barh(range(len(top10)), top10['test_auc'], color=colors)
    ax.set_yticks(range(len(top10)))
    # Truncate long experiment names to 40 characters for the tick labels.
    labels = [
        name if len(name) <= 40 else name[:37] + '...'
        for name in top10['exp_name']
    ]
    ax.set_yticklabels(labels, fontsize=9)
    # barh draws position 0 at the bottom; flip so rank 1 is at the top.
    ax.invert_yaxis()
    ax.axvline(baseline, color='red', linestyle='--', linewidth=2, label=f'Baseline ({baseline})')
    ax.set_xlabel('Test AUC', fontsize=12)
    ax.set_title('Top 10 Configurations', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig('production_ultimate_results.png', dpi=150, bbox_inches='tight')
    print("\n[INFO] Visualization saved to: production_ultimate_results.png")
    # Close this figure explicitly; it is written to disk, not shown inline.
    plt.close(fig)
else:
    print("[WARN] No results to visualize")



VISUALIZATION

[INFO] Visualization saved to: production_ultimate_results.png


In [None]:
# ============================================================================
# Production Recommendations
# ============================================================================
# Prints a deployment summary for the first row of `results_df`, plus an
# ensemble suggestion built from the first five rows.
print("\n" + "="*80)
print("PRODUCTION DEPLOYMENT RECOMMENDATIONS")
print("="*80)

has_results = len(results_df) > 0

if has_results:
    # Assumes results_df is ordered best-first — TODO confirm against
    # get_results_df().
    # NOTE(review): "Expected Performance" reports the test AUC of the row
    # taken as best; if that row was selected by test AUC, the figure is
    # likely optimistic and the CV mean below is the more conservative one.
    best_config = results_df.iloc[0]

    print("\n[Deploy] Recommended Configuration for Production:")
    print(f"  Primary Model: {best_config['model']} | Seed: {best_config.get('random_seed', 'na')}")
    print(f"  Experiment ID: {best_config['exp_name']}")
    print(f"  Expected Performance: {best_config['test_auc']:.4f} AUC")

    print("\n  Feature Engineering:")
    print(f"    Feature Set Key: {best_config.get('feature_set_key', 'na')}")
    print(f"    Total Features: {best_config['features_count']}")
    print(f"    Poly Added: {best_config.get('poly_added', 0)} | Selector: {best_config.get('selector', 'none')}")
    print(f"    Samplers: basic={best_config.get('basic_sampler', 'none')} -> advanced={best_config.get('advanced_sampler', 'none')}")

    print("\n  Training Strategy:")
    print(f"    CV Mode: {best_config.get('cv_mode', 'kfold')} | CV Mean: {best_config['cv_auc_mean']:.4f}")
    print("    Use selected feature pipeline per fold, then refit on full data with same preprocessing")

    print("\n  Monitoring:")
    # Alert threshold is the reported test AUC minus a fixed 0.02 margin.
    print(f"    Track validation AUC weekly and alert if below {best_config['test_auc'] - 0.02:.4f}")
    print("    Re-run heavy search quarterly or when data drift observed")

    top5 = results_df.head(5)
    if len(top5) >= 2:
        ensemble_candidates = ', '.join(f"{row.model}@{row.test_auc:.3f}" for row in top5.itertuples())
        print("\n  Ensemble Option:")
        print(f"    Soft-vote top 5 models: {ensemble_candidates}")
        print("    Expect +0.001 ~ +0.005 AUC if calibration is consistent")
else:
    # Same empty-results handling as the visualization cell.
    print("[WARN] No results available; no recommendation produced")

print("\n" + "="*80)
print("PRODUCTION ULTIMATE PIPELINE COMPLETED")
if has_results:
    print("Total time: See logs above | Results saved: experiment_results_full.csv")
else:
    # Do not claim a results file exists when there was nothing to report.
    print("Total time: See logs above | Results saved: none (no experiments recorded)")
print("="*80)
