# AttriPredict - Brute-Force Model Search

This notebook performs an exhaustive brute-force search over models, hyperparameters, feature engineering, and random seeds.

> Baseline: example.ipynb: 0.8797 (best model found during EDA)

---

In [None]:

# ============================================================================
# Environment Setup
# ============================================================================
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

import ast
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import joblib


from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import RFECV, SelectKBest, f_classif
from sklearn.metrics import roc_auc_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# Optional dependency: imbalanced-learn. Every imported name is also bound to
# None in the fallback so later code can test `X is not None` safely.
try:
    from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
    from imblearn.combine import SMOTETomek, SMOTEENN
    # EasyEnsembleClassifier is imported here so the name exists on BOTH paths;
    # previously only the fallback assigned it, so referencing it raised
    # NameError precisely when imblearn was installed.
    from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
    HAS_IMB = True
except Exception:
    # Broad on purpose (best-effort optional import): a broken install can
    # fail with errors other than ImportError.
    HAS_IMB = False
    SMOTE = ADASYN = BorderlineSMOTE = SMOTETomek = SMOTEENN = None
    BalancedRandomForestClassifier = EasyEnsembleClassifier = None
    print("[WARN] imblearn not available, skipping advanced samplers")


try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False
    lgb = None
    print("[WARN] lightgbm not available")


try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    HAS_XGB = False
    xgb = None
    print("[WARN] xgboost not available")


try:
    import catboost as cb
    HAS_CAT = True
except Exception:
    HAS_CAT = False
    cb = None
    print("[WARN] catboost not available")


# Global randomness control
# RANDOM_STATE is the default seed used by the samplers, CV splitters and
# selector estimators defined below; it also seeds NumPy's global RNG here.
RANDOM_STATE = 42
# NOTE(review): RANDOM_SEEDS is not consumed anywhere in the visible part of
# this file; presumably the seed sweep further down iterates over it.
RANDOM_SEEDS = [42, 2025]
np.random.seed(RANDOM_STATE)


# Artifact roots (mirrors 02 project layout)
# All three directories are created eagerly (relative to the current working
# directory) so later cells can write into them without checking.
ARTIFACT_ROOT = Path('artifacts/fair_bruteforce')
SUMMARY_DIR = ARTIFACT_ROOT / 'summaries'
GLOBAL_FIG_DIR = ARTIFACT_ROOT / 'global_figures'
for path in [ARTIFACT_ROOT, SUMMARY_DIR, GLOBAL_FIG_DIR]:
    path.mkdir(parents=True, exist_ok=True)
# Filled by init_family_artifacts(): family name -> dict of artifact directories.
FAMILY_ARTIFACTS = {}


# In notebook experiment tracker
# log_experiment() appends one dict per run; get_results_df() reads it back.
EXPERIMENT_LOG = []


# Startup banner: timestamp, which optional libraries imported, artifact root.
print("="*80)
print("AttriPredict Production Ultimate Initialized")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Libraries: IMB={HAS_IMB}, LGB={HAS_LGB}, XGB={HAS_XGB}, CAT={HAS_CAT}")
print(f"Artifacts root: {ARTIFACT_ROOT.resolve()}")
print("="*80)


In [None]:

# ============================================================================
# Load Data
# ============================================================================
print("\n[1/8] Loading Data...")

# Both CSVs are read relative to the notebook's working directory.
train_data = pd.read_csv('../../data/train.csv')
test_data = pd.read_csv('../../data/test.csv')

print(f"✓ Train shape: {train_data.shape}")
print(f"✓ Test shape: {test_data.shape}")

# The same columns are removed from both splits before any modelling.
drop_cols = ['Over18', 'StandardHours', 'EmployeeNumber']
train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

# Separate the 'Attrition' target from the feature frame for each split.
y_train_raw = train_data['Attrition']
X_train_raw = train_data.drop(columns='Attrition')
y_test_raw = test_data['Attrition']
X_test_raw = test_data.drop(columns='Attrition')

# Columns treated as unordered categories by the encoders below.
NOMINAL_CATEGORICALS = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime'
]
# Everything that is not a nominal categorical, in original column order.
NUMERIC_FEATURES = X_train_raw.columns[
    ~X_train_raw.columns.isin(NOMINAL_CATEGORICALS)
].tolist()

print(f"✓ Nominal categorical features: {len(NOMINAL_CATEGORICALS)}")
print(f"✓ Numeric features: {len(NUMERIC_FEATURES)}")

# Persist small dataset summaries next to the other run artifacts.
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)
pd.Series(
    {
        'train_rows': len(train_data),
        'test_rows': len(test_data),
        'feature_count': X_train_raw.shape[1],
    }
).to_frame('value').to_csv(SUMMARY_DIR / 'dataset_overview.csv')
y_train_raw.value_counts().to_csv(SUMMARY_DIR / 'train_class_balance.csv', header=['count'])
train_data.describe(include='all').T.to_csv(SUMMARY_DIR / 'train_describe.csv')
test_data.describe(include='all').T.to_csv(SUMMARY_DIR / 'test_describe.csv')


In [None]:

# ============================================================================
# Feature Engineering - Level 1: Basic Preprocessing
# ============================================================================
print("\n[2/8] Feature Engineering - Level 1: Basic Preprocessing...")


def get_sampler(sampler_config):
    """Instantiate the resampler described by ``sampler_config``.

    Args:
        sampler_config: ``None`` or a dict with optional keys ``'name'``
            (case-insensitive, defaults to ``'smote'``), ``'params'``
            (constructor kwargs) and ``'random_state'``.

    Returns:
        A sampler instance, or ``None`` when no config is given or imblearn
        could not be imported.

    Raises:
        ValueError: if the name is not one of the supported samplers.
    """
    if sampler_config is None or not HAS_IMB:
        return None

    sampler_name = sampler_config.get('name', 'smote').lower()
    # An explicit 'random_state' inside 'params' wins over the config-level /
    # global seed because it is merged last.
    sampler_params = {
        'random_state': sampler_config.get('random_state', RANDOM_STATE),
        **dict(sampler_config.get('params', {})),
    }

    registry = {
        'smote': SMOTE,
        'adasyn': ADASYN,
        'borderlinesmote': BorderlineSMOTE,
        'smoteenn': SMOTEENN,
        'smotetomek': SMOTETomek,
    }
    sampler_cls = registry.get(sampler_name)
    if sampler_cls is None:
        raise ValueError(f"Unsupported sampler '{sampler_name}' or imblearn component missing.")
    return sampler_cls(**sampler_params)


def basic_preprocess(
    X_train,
    extra_datasets=None,
    sampler_config=None,
    y_train=None,
    verbose=True
):
    """
    One-hot encode, log1p-transform skewed columns, standardize, then
    optionally resample the training split.

    The encoder, the skew decision and the scaler are all fitted on
    ``X_train`` only and then applied unchanged to each frame in
    ``extra_datasets``.

    Args:
        X_train: training features; must contain every NOMINAL_CATEGORICALS column.
        extra_datasets: optional list of frames (e.g. validation/test) to transform.
        sampler_config: optional config passed to ``get_sampler``.
        y_train: training labels; resampling only happens when this is given.
        verbose: print progress messages.

    Returns:
        ``(X_train_scaled, scaled_extras, y_train_final)``. After resampling the
        training frame has a fresh default index and ``y_train_final`` is the
        resampled label array; otherwise ``y_train_final`` is ``y_train``.
    """
    def _say(message):
        if verbose:
            print(message)

    extras_in = [] if extra_datasets is None else extra_datasets
    categorical_cols = NOMINAL_CATEGORICALS

    _say("  [Basic] OneHot encoding categorical features...")

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(X_train[categorical_cols])
    onehot_names = encoder.get_feature_names_out(categorical_cols)

    def _encode(frame):
        # Swap the categorical columns for their one-hot expansion.
        onehot = pd.DataFrame(
            encoder.transform(frame[categorical_cols]),  # type: ignore
            columns=onehot_names,
            index=frame.index,
        )
        return pd.concat([frame.drop(categorical_cols, axis=1), onehot], axis=1)

    train_frame = _encode(X_train)
    extra_frames = [_encode(frame) for frame in extras_in]
    feature_columns = train_frame.columns

    _say(f"    [Basic] Encoded features added: {len(onehot_names)}")
    _say("  [Basic] Handling skewness with log1p...")

    # Skewness is measured on the training split only; the same columns are
    # then transformed in every frame.
    skewed_features = []
    for name in train_frame.select_dtypes(include=['float64', 'int64']).columns:
        if not abs(train_frame[name].skew()) > 0.5:  # type: ignore
            continue
        for frame in [train_frame, *extra_frames]:
            if name in frame.columns:
                frame[name] = np.log1p(frame[name])
        skewed_features.append(name)

    _say(f"    [Basic] Log-transformed: {len(skewed_features)} features")
    _say("  [Basic] Standardizing features...")

    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(train_frame),
        columns=feature_columns,
        index=train_frame.index,
    )
    scaled_extras = [
        pd.DataFrame(
            scaler.transform(frame[feature_columns]),
            columns=feature_columns,
            index=frame.index,
        )
        for frame in extra_frames
    ]

    _say(f"    [Basic] Scaled feature space: {X_train_scaled.shape[1]} columns")

    sampler = get_sampler(sampler_config)
    y_train_final = y_train

    if sampler is not None and y_train is not None:
        _say(f"  [Basic] Applying sampler: {sampler.__class__.__name__} ...")
        X_resampled, y_resampled = sampler.fit_resample(  # type: ignore
            X_train_scaled.values, np.asarray(y_train)
        )
        X_train_scaled = pd.DataFrame(X_resampled, columns=feature_columns)
        y_train_final = y_resampled
        _say(f"    [Basic] Samples: {len(np.asarray(y_train))} -> {len(y_resampled)}")

    return X_train_scaled, scaled_extras, y_train_final


print("  ✓ Basic preprocessing function enhanced")


In [None]:

# ============================================================================
# Feature Engineering - Level 2: Advanced Features
# ============================================================================
print("\n[3/8] Feature Engineering - Level 2: Advanced Features...")


def create_advanced_features(X_train, extra_datasets=None, advanced_config=None, verbose=True):
    """Create interaction, ratio, delta, and aggregation features.

    Works on copies; the input frames are never modified. A feature is added
    only when the training frame has all of its input columns, and it is added
    to an extra dataset only when that dataset has them too.

    Args:
        X_train: training feature frame.
        extra_datasets: optional list of frames to receive the same features.
        advanced_config: optional dict overriding the default specs via
            ``'interaction_pairs'`` (``(col1, col2)``), ``'ratio_features'`` and
            ``'delta_features'`` (both ``(col_a, col_b, feature_name)``).
        verbose: print progress messages.

    Returns:
        ``(X_train_adv, extras_adv)``: the expanded training frame and the list
        of expanded extras, in input order.
    """
    if extra_datasets is None:
        extra_datasets = []
    if advanced_config is None:
        advanced_config = {}

    X_train_adv = X_train.copy()
    extras_adv = [dataset.copy() for dataset in extra_datasets]

    def _add_feature(feat_name, required_cols, compute):
        # Add one derived column to train and to each extra that has the inputs.
        if not all(col in X_train_adv.columns for col in required_cols):
            return
        X_train_adv[feat_name] = compute(X_train_adv)
        for dataset in extras_adv:
            if all(col in dataset.columns for col in required_cols):
                dataset[feat_name] = compute(dataset)

    interaction_pairs = advanced_config.get('interaction_pairs', [
        ('Age', 'MonthlyIncome'),
        ('DistanceFromHome', 'OverTime_Yes'),
        ('YearsAtCompany', 'JobLevel'),
        ('TotalWorkingYears', 'Age'),
        ('WorkLifeBalance', 'OverTime_Yes'),
        ('JobSatisfaction', 'EnvironmentSatisfaction'),
        ('TotalWorkingYears', 'NumCompaniesWorked'),
        ('MonthlyIncome', 'JobLevel'),
    ])

    if verbose:
        print(f"  [Advanced] Creating {len(interaction_pairs)} interaction features...")

    for col1, col2 in interaction_pairs:
        _add_feature(
            f"{col1}_x_{col2}",
            (col1, col2),
            lambda frame, a=col1, b=col2: frame[a] * frame[b],
        )

    ratio_specs = advanced_config.get('ratio_features', [
        ('MonthlyIncome', 'Age', 'Income_per_Age'),
        ('YearsAtCompany', 'TotalWorkingYears', 'Company_vs_Total_Years'),
        ('YearsSinceLastPromotion', 'YearsAtCompany', 'Promo_Frequency'),
        ('YearsWithCurrManager', 'YearsInCurrentRole', 'Manager_Stability'),
        ('TotalWorkingYears', 'NumCompaniesWorked', 'Years_per_Company'),
    ])

    if verbose:
        print(f"  [Advanced] Creating {len(ratio_specs)} ratio features...")

    for num_col, denom_col, feat_name in ratio_specs:
        # The 1e-5 offset avoids division by an exact zero denominator.
        _add_feature(
            feat_name,
            (num_col, denom_col),
            lambda frame, n=num_col, d=denom_col: frame[n] / (frame[d] + 1e-5),
        )

    delta_specs = advanced_config.get('delta_features', [
        ('YearsAtCompany', 'YearsInCurrentRole', 'Tenure_vs_Role'),
        ('YearsAtCompany', 'YearsWithCurrManager', 'Tenure_vs_Manager'),
        ('PerformanceRating', 'EnvironmentSatisfaction', 'Performance_vs_Environment'),
    ])

    if verbose:
        print(f"  [Advanced] Creating {len(delta_specs)} delta features...")

    for top_col, bottom_col, feat_name in delta_specs:
        _add_feature(
            feat_name,
            (top_col, bottom_col),
            lambda frame, t=top_col, b=bottom_col: frame[t] - frame[b],
        )

    satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction',
                         'RelationshipSatisfaction', 'WorkLifeBalance']

    if all(col in X_train_adv.columns for col in satisfaction_cols):
        if verbose:
            print("  [Advanced] Aggregating satisfaction metrics...")
        # Routed through _add_feature so an extra dataset that lacks one of the
        # satisfaction columns is skipped (as for every other feature group)
        # instead of raising KeyError.
        _add_feature('Satisfaction_Mean', satisfaction_cols,
                     lambda frame: frame[satisfaction_cols].mean(axis=1))
        _add_feature('Satisfaction_Std', satisfaction_cols,
                     lambda frame: frame[satisfaction_cols].std(axis=1))
        _add_feature('Satisfaction_Max', satisfaction_cols,
                     lambda frame: frame[satisfaction_cols].max(axis=1))

    if verbose:
        print(f"  [Advanced] Total features after expansion: {X_train_adv.shape[1]}")

    return X_train_adv, extras_adv


def apply_polynomial_features(X_train, extra_datasets=None, poly_config=None, verbose=True):
    """Append polynomial terms built from a subset of columns.

    Args:
        X_train: training frame; the expansion is fitted on it.
        extra_datasets: optional frames that receive the same new columns.
        poly_config: ``None`` to do nothing, else a dict with optional keys
            ``'columns'`` (explicit base columns), ``'max_base_features'``
            (default 12; used only when ``'columns'`` is absent, picking the
            numeric columns with the largest standard deviation), ``'degree'``
            (default 2), ``'interaction_only'`` and ``'include_bias'``
            (both default False).
        verbose: print a one-line summary.

    Returns:
        ``(X_train_poly, extras_poly, new_columns)`` where ``new_columns`` lists
        the generated column names that were not already in ``X_train``.
    """
    if poly_config is None:
        return X_train, extra_datasets or [], []

    extras_in = [] if extra_datasets is None else extra_datasets

    requested = poly_config.get('columns')
    if requested is not None:
        base_cols = [name for name in requested if name in X_train.columns]
    else:
        top_n = poly_config.get('max_base_features', 12)
        numeric_cols = list(X_train.select_dtypes(include=[np.number]).columns)
        ranked = X_train[numeric_cols].std().sort_values(ascending=False)
        base_cols = list(ranked.head(top_n).index)

    if not base_cols:
        return X_train, extras_in, []

    degree = poly_config.get('degree', 2)
    poly = PolynomialFeatures(
        degree=degree,
        interaction_only=poly_config.get('interaction_only', False),
        include_bias=poly_config.get('include_bias', False),
    )
    poly.fit(X_train[base_cols])
    poly_feature_names = poly.get_feature_names_out(base_cols)
    # Degree-1 terms reproduce the base columns; keep only the unseen names.
    new_columns = [name for name in poly_feature_names if name not in X_train.columns]

    def _expand(frame):
        generated = pd.DataFrame(
            poly.transform(frame[base_cols]),  # type: ignore
            columns=poly_feature_names,
            index=frame.index,
        )
        return pd.concat([frame, generated[new_columns]], axis=1)

    X_train_poly = _expand(X_train)
    extras_poly = [_expand(frame) for frame in extras_in]

    if verbose:
        print(f"  [Poly] Degree {degree} expansion on {len(base_cols)} base cols -> {len(new_columns)} new cols")

    return X_train_poly, extras_poly, new_columns


def apply_feature_selector(X_train, y_train, extra_datasets=None, selector_config=None, verbose=True):
    """Apply a feature selection strategy and keep aligned columns for extras.

    Args:
        X_train: training frame the selector is fitted on.
        y_train: training labels.
        extra_datasets: optional frames reduced to the same selected columns.
        selector_config: ``None`` to do nothing, else a dict whose
            ``'strategy'`` is one of:

            * ``'selectk'`` (default): ``SelectKBest(f_classif)`` with ``'k'``
              (default 120), capped at the number of columns.
            * ``'rfecv'``: ``RFECV`` scored by ROC AUC on a 3-fold stratified
              split; reads ``'step'``, ``'min_features'`` and ``'estimator'``.
            * ``'l1'``: keep columns whose L1-logistic coefficient magnitude
              exceeds ``'threshold'``, but never fewer than ``'min_features'``;
              also reads ``'C'``, ``'max_iter'`` and ``'random_state'``.
        verbose: print a one-line summary.

    Returns:
        ``(X_train_selected, extras_selected, selected_column_names, summary)``.

    Raises:
        ValueError: for an unknown strategy.
    """
    if selector_config is None:
        return X_train, extra_datasets or [], X_train.columns.tolist(), "none"

    if extra_datasets is None:
        extra_datasets = []

    n_features = X_train.shape[1]
    strategy = selector_config.get('strategy', 'selectk').lower()

    if strategy == 'selectk':
        k = min(selector_config.get('k', min(120, n_features)), n_features)
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X_train, y_train)
        support_mask = selector.get_support()
        summary = f'selectk_{k}'
    elif strategy == 'rfecv':
        step = selector_config.get('step', 1)
        min_features = selector_config.get('min_features', max(25, n_features // 6))
        base_estimator = selector_config.get('estimator')
        if base_estimator is None:
            base_estimator = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=2000)
        selector = RFECV(
            estimator=base_estimator,
            step=step,
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
            scoring='roc_auc',
            min_features_to_select=min_features,
            n_jobs=-1
        )
        selector.fit(X_train, y_train)
        support_mask = selector.get_support()
        summary = f'rfecv_step{step}_min{min_features}'
    elif strategy == 'l1':
        C_value = selector_config.get('C', 0.5)
        max_iter = selector_config.get('max_iter', 4000)
        threshold = selector_config.get('threshold', 1e-6)
        min_features = selector_config.get('min_features', min(40, n_features))
        base_estimator = LogisticRegression(
            penalty='l1',
            solver='saga',
            C=C_value,
            max_iter=max_iter,
            random_state=selector_config.get('random_state', RANDOM_STATE)
        )
        base_estimator.fit(X_train, np.asarray(y_train))
        coefs = np.abs(base_estimator.coef_).ravel()
        support_mask = coefs > threshold
        # Enforce min_features as a real floor. Previously the fallback ran
        # only when NO coefficient survived, so a sparse fit could return far
        # fewer columns than requested. The floor is clamped to
        # [1, n_features] so a slice like [-0:] can never select everything.
        floor = max(1, min(int(min_features), n_features))
        if support_mask.sum() < floor:
            # The surviving columns have the largest magnitudes, so the
            # top-`floor` set always contains them.
            support_mask = np.zeros_like(coefs, dtype=bool)
            support_mask[np.argsort(coefs)[-floor:]] = True
        summary = f'l1_{support_mask.sum()}'
    else:
        raise ValueError(f"Unknown feature selector strategy: {strategy}")

    selected_columns = X_train.columns[support_mask]

    X_train_selected = X_train[selected_columns].copy()
    extras_selected = [dataset[selected_columns].copy() for dataset in extra_datasets]

    if verbose:
        print(f"  [Selector] Strategy={summary} retained {len(selected_columns)} columns")

    return X_train_selected, extras_selected, selected_columns.tolist(), summary


print("  ✓ Advanced feature engineering utilities ready")


In [None]:
# ============================================================================
# Experiment Tracking System
# ============================================================================
print("\n[4/8] Setting up Experiment Tracking...")


def log_experiment(exp_name, model_name, cv_auc_mean, cv_auc_std, test_auc,
                   features_used, hyperparams, notes="", metadata=None):
    """Append one experiment record to EXPERIMENT_LOG and echo a summary line.

    ``features_used`` is stored under the key ``'features_count'`` and
    ``hyperparams`` is stored as its ``str()`` form. Keys in ``metadata`` are
    merged last, so they overwrite core fields of the same name.

    Returns:
        The record dict that was appended.
    """
    logged_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    experiment = dict(
        timestamp=logged_at,
        exp_name=exp_name,
        model=model_name,
        cv_auc_mean=cv_auc_mean,
        cv_auc_std=cv_auc_std,
        test_auc=test_auc,
        features_count=features_used,
        hyperparams=str(hyperparams),
        notes=notes,
    )
    experiment.update(metadata or {})

    EXPERIMENT_LOG.append(experiment)
    print(f"  -> Logged: {exp_name} | CV: {cv_auc_mean:.4f}±{cv_auc_std:.4f} | Test: {test_auc:.4f}")
    return experiment


def get_results_df():
    """Build a DataFrame from EXPERIMENT_LOG, highest ``test_auc`` first.

    An empty log yields an empty, unsorted DataFrame.
    """
    results = pd.DataFrame(EXPERIMENT_LOG)
    if len(results) == 0:
        return results
    return results.sort_values('test_auc', ascending=False)


print("  ✓ Experiment tracking system ready")



In [None]:

# ============================================================================
# Cross-Validation Utilities
# ============================================================================
def _fresh_estimator(model):
    """Return an unfitted copy of ``model`` to train on a single fold."""
    if not hasattr(model, 'get_params'):
        # Not estimator-like: nothing to copy from, reuse the object as-is.
        return model
    from sklearn.base import clone
    try:
        # clone() also handles composite estimators, whose deep get_params()
        # contains nested 'a__b' keys that a constructor call would reject.
        return clone(model)
    except (TypeError, RuntimeError):
        # Best-effort fallback for wrappers clone() refuses: rebuild from the
        # top-level constructor parameters only.
        return model.__class__(**model.get_params(deep=False))


def evaluate_with_cv(model, X, y, X_test, y_test, feature_config, pipeline_key, cv_override=None):
    """
    Perform cross-validation with dynamic feature preparation and aggregate
    test performance using the configured CV strategy.

    For every fold the features are rebuilt by ``prepare_feature_set`` from that
    fold's training rows, a fresh copy of ``model`` is fitted, and both the
    validation rows and the full test set are scored. Test probabilities are
    averaged over folds.

    Args:
        model: estimator exposing ``fit`` and ``predict_proba``.
        X: training features (indexed with ``.iloc``).
        y: training labels (pandas object or array).
        X_test, y_test: held-out features and labels.
        feature_config: pipeline config; ``'cv_mode'`` selects ``'repeated_5x2'``
            (5 splits x 2 repeats), ``'kfold10'``, or plain stratified k-fold
            with ``'n_folds'`` (default 5).
        pipeline_key: pipeline name forwarded to ``prepare_feature_set``.
        cv_override: optional splitter that replaces the configured one.

    Returns:
        ``(cv_mean, cv_std, test_auc, test_preds, feature_count, last_metadata)``
        where the last two describe the final fold.
    """
    cv_mode = feature_config.get('cv_mode', 'kfold')
    n_folds = feature_config.get('n_folds', 5)

    if cv_override is not None:
        splitter = cv_override
    elif cv_mode == 'repeated_5x2':
        splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RANDOM_STATE)
    elif cv_mode == 'kfold10':
        splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
    else:
        splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    cv_scores = []
    test_preds = np.zeros(len(X_test))
    fold_count = 0
    feature_count = None
    last_metadata = {}

    for train_idx, val_idx in splitter.split(X, y):
        fold_count += 1
        X_fold_train = X.iloc[train_idx]
        X_fold_val = X.iloc[val_idx]

        if hasattr(y, 'iloc'):
            y_fold_train = y.iloc[train_idx]
            y_fold_val = y.iloc[val_idx]
        else:
            y_fold_train = y[train_idx]
            y_fold_val = y[val_idx]

        prep_tuple = prepare_feature_set( # type: ignore
            pipeline_key,
            feature_config,
            X_fold_train,
            X_fold_val,
            X_test,
            y_fold_train
        )

        # The preparer may or may not append a metadata dict as a 5th element.
        if len(prep_tuple) == 4:
            X_train_ready, X_val_ready, X_test_ready, y_train_ready = prep_tuple
            prep_meta = {}
        else:
            X_train_ready, X_val_ready, X_test_ready, y_train_ready, prep_meta = prep_tuple

        feature_count = X_train_ready.shape[1]
        last_metadata = prep_meta
        y_val_array = np.asarray(y_fold_val)

        model_clone = _fresh_estimator(model)
        model_clone.fit(X_train_ready, y_train_ready)

        val_pred = model_clone.predict_proba(X_val_ready)[:, 1]
        cv_scores.append(roc_auc_score(y_val_array, val_pred))

        test_preds += model_clone.predict_proba(X_test_ready)[:, 1]

    cv_mean = float(np.mean(cv_scores)) if cv_scores else 0.0
    cv_std = float(np.std(cv_scores)) if cv_scores else 0.0
    # max(..., 1) keeps the division defined if the splitter yielded no folds.
    test_preds /= max(fold_count, 1)
    test_auc = roc_auc_score(np.asarray(y_test), test_preds)

    return cv_mean, cv_std, test_auc, test_preds, feature_count, last_metadata


# Printed after the definition so the message is true when it appears.
print("  ✓ CV evaluation function enhanced")


In [None]:

# ============================================================================
# Feature Pipelines & Routing
# ============================================================================
print("\n[5/8] Building family-specific feature pipelines...")

# Model short-names grouped by family. init_family_artifacts() creates one
# artifact directory tree per key of this dict.
MODEL_FAMILIES = {
    'linear': ['LR', 'SVC'],
    'tree': ['XGB', 'LGB', 'CAT', 'RF', 'GB', 'HGB', 'ET', 'BRF'],
    'neural': ['MLP']
}
# Reverse lookup: model short-name -> family.
MODEL_TO_FAMILY = {model: family for family, models in MODEL_FAMILIES.items() for model in models}

# Pipeline configs share one shape: 'family', 'description', 'cv_mode' and
# 'n_folds' (read by evaluate_with_cv) plus a family-specific 'options' dict.
#
# Linear options are read by prepare_linear_features:
#   'use_advanced'      -> run create_advanced_features
#   'poly'              -> config for apply_polynomial_features
#   'feature_selector'  -> config for apply_feature_selector
#   'samplers'          -> {'basic': ..., 'advanced': ...} configs for get_sampler
LINEAR_PIPELINES = {
    'linear_basic': {
        'family': 'linear',
        'description': 'Scaled OHE + log1p base features (no manual extras)',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {
            'use_advanced': False,
            'samplers': {'basic': {'name': 'smote'}}
        }
    },
    'linear_advanced': {
        'family': 'linear',
        'description': 'Scaled base + engineered interactions',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {
            'use_advanced': True,
            'samplers': {'basic': {'name': 'smote'}}
        }
    },
    'linear_poly': {
        'family': 'linear',
        'description': 'Advanced features + polynomial bump',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {
            'use_advanced': True,
            'poly': {'degree': 2, 'interaction_only': False, 'include_bias': False, 'max_base_features': 16},
            'samplers': {'basic': {'name': 'smote'}}
        }
    },
    'linear_sparse': {
        'family': 'linear',
        'description': 'Advanced features + L1 feature squeeze',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {
            'use_advanced': True,
            'feature_selector': {'strategy': 'l1', 'C': 0.8, 'threshold': 1e-5, 'min_features': 60},
            'samplers': {'basic': {'name': 'smote'}}
        }
    }
}

# Tree options are read by prepare_tree_features: 'encoding' is one of
# 'label' / 'onehot' / 'target'; 'use_smote' toggles resampling (only honoured
# when imblearn is available); 'sampler' is the sampler config.
TREE_PIPELINES = {
    'tree_raw': {
        'family': 'tree',
        'description': 'Label encoding + raw numeric signals',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'encoding': 'label', 'use_smote': False}
    },
    'tree_raw_smote': {
        'family': 'tree',
        'description': 'Label encoding + SMOTE uplift',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'encoding': 'label', 'use_smote': True, 'sampler': {'name': 'smote', 'params': {'k_neighbors': 4}}}
    },
    'tree_target': {
        'family': 'tree',
        'description': 'Target encoding on categoricals',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'encoding': 'target', 'use_smote': False}
    },
    'tree_onehot': {
        'family': 'tree',
        'description': 'One-hot encoding without scaling',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'encoding': 'onehot', 'use_smote': False}
    }
}

# NOTE(review): the consumer of the neural options ('scaler', 'use_smote',
# 'sampler') is not visible in this part of the file; presumably a
# prepare_neural_features counterpart reads them - confirm.
NEURAL_PIPELINES = {
    'neural_standard': {
        'family': 'neural',
        'description': 'One-hot + StandardScaler + SMOTE',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'scaler': 'standard', 'use_smote': True, 'sampler': {'name': 'smote', 'params': {'k_neighbors': 5}}}
    },
    'neural_minmax': {
        'family': 'neural',
        'description': 'One-hot + MinMaxScaler + SMOTE',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'scaler': 'minmax', 'use_smote': True, 'sampler': {'name': 'smote', 'params': {'k_neighbors': 6}}}
    },
    'neural_raw': {
        'family': 'neural',
        'description': 'One-hot + StandardScaler (no sampling)',
        'cv_mode': 'kfold',
        'n_folds': 5,
        'options': {'scaler': 'standard', 'use_smote': False}
    }
}

# Flat registry of every pipeline, keyed by pipeline name.
FEATURE_PIPELINES = {**LINEAR_PIPELINES, **TREE_PIPELINES, **NEURAL_PIPELINES}

print(f"  ✓ Linear pipelines: {', '.join(LINEAR_PIPELINES.keys())}")
print(f"  ✓ Tree pipelines: {', '.join(TREE_PIPELINES.keys())}")
print(f"  ✓ Neural pipelines: {', '.join(NEURAL_PIPELINES.keys())}")




def init_family_artifacts():
    """Reset FAMILY_ARTIFACTS and create each family's artifact directories.

    For every key of MODEL_FAMILIES this creates ``<ARTIFACT_ROOT>/<family>``
    with ``data``, ``figures`` and ``models`` sub-directories (existing ones are
    left alone) and records the four paths under the keys ``'base'``,
    ``'data'``, ``'figures'`` and ``'models'``.
    """
    global FAMILY_ARTIFACTS
    FAMILY_ARTIFACTS = {}
    for family in MODEL_FAMILIES:
        family_root = ARTIFACT_ROOT / family
        layout = {
            'base': family_root,
            'data': family_root / 'data',
            'figures': family_root / 'figures',
            'models': family_root / 'models',
        }
        for directory in layout.values():
            directory.mkdir(parents=True, exist_ok=True)
        FAMILY_ARTIFACTS[family] = layout


init_family_artifacts()
def prepare_linear_features(pipeline_name, config, X_train, X_val, X_test, y_train):
    """Build train/validation/test matrices for one linear-family pipeline.

    Steps, in order: ``basic_preprocess`` (with the optional ``'basic'``
    sampler), then - depending on ``config['options']`` - advanced features,
    polynomial expansion, feature selection, and finally the optional
    ``'advanced'`` sampler on the training split.

    Args:
        pipeline_name: name recorded in the returned metadata.
        config: pipeline config dict (see LINEAR_PIPELINES).
        X_train, X_val, X_test: raw feature frames.
        y_train: training labels.

    Returns:
        ``(X_train_ready, X_val_ready, X_test_ready, y_train_ready, prep_meta)``
        where ``y_train_ready`` is a NumPy array and ``prep_meta`` describes
        what was applied.
    """
    options = dict(config.get('options', {}))
    samplers = options.get('samplers', {})
    basic_sampler_cfg = samplers.get('basic')
    advanced_sampler_cfg = samplers.get('advanced')

    # basic_preprocess returns one processed frame per extra dataset, in order,
    # so exactly two come back here.
    X_train_ready, extras_basic, y_train_ready = basic_preprocess(
        X_train,
        extra_datasets=[X_val, X_test],
        sampler_config=basic_sampler_cfg,
        y_train=y_train,
        verbose=False
    )
    X_val_ready, X_test_ready = extras_basic

    poly_added = 0
    selector_summary = 'none'
    manual_features = 'none'

    if options.get('use_advanced'):
        X_train_ready, extras_adv = create_advanced_features(
            X_train_ready,
            extra_datasets=[X_val_ready, X_test_ready],
            advanced_config=options.get('advanced_config'),
            verbose=False
        )
        X_val_ready, X_test_ready = extras_adv
        manual_features = 'advanced19'

    poly_cfg = options.get('poly')
    if poly_cfg:
        X_train_ready, extras_poly, new_cols = apply_polynomial_features(
            X_train_ready,
            extra_datasets=[X_val_ready, X_test_ready],
            poly_config=poly_cfg,
            verbose=False
        )
        X_val_ready, X_test_ready = extras_poly
        poly_added = len(new_cols)

    selector_cfg = options.get('feature_selector')
    if selector_cfg and y_train_ready is not None:
        X_train_ready, extras_sel, _, selector_summary = apply_feature_selector(
            X_train_ready,
            np.asarray(y_train_ready),
            extra_datasets=[X_val_ready, X_test_ready],
            selector_config=selector_cfg,
            verbose=False
        )
        X_val_ready, X_test_ready = extras_sel

    adv_sampler = 'none'
    if advanced_sampler_cfg and y_train_ready is not None:
        sampler = get_sampler(advanced_sampler_cfg)
        # get_sampler returns None when imblearn is unavailable; skip the step
        # instead of crashing on None.fit_resample.
        if sampler is not None:
            X_resampled, y_resampled = sampler.fit_resample(X_train_ready.values, np.asarray(y_train_ready)) # type: ignore
            X_train_ready = pd.DataFrame(X_resampled, columns=X_train_ready.columns)
            y_train_ready = y_resampled
            adv_sampler = advanced_sampler_cfg.get('name', 'smote')

    # 'name' is optional in a sampler config (get_sampler defaults it to
    # 'smote'), so read it with the same default rather than indexing.
    basic_sampler = basic_sampler_cfg.get('name', 'smote') if basic_sampler_cfg else 'none'

    prep_meta = {
        'feature_pipeline': pipeline_name,
        'family': config.get('family', 'linear'),
        'description': config.get('description', ''),
        'poly_added': poly_added,
        'selector': selector_summary,
        'basic_sampler': basic_sampler,
        'advanced_sampler': adv_sampler,
        'sampler_summary': f"basic:{basic_sampler} -> advanced:{adv_sampler}",
        'cv_mode': config.get('cv_mode', 'kfold'),
        'encoder': 'onehot',
        'scaler': 'standard',
        'manual_features': manual_features
    }

    return X_train_ready, X_val_ready, X_test_ready, np.asarray(y_train_ready), prep_meta


def prepare_tree_features(pipeline_name, config, X_train, X_val, X_test, y_train):
    """Build train/val/test matrices for a tree-family pipeline.

    The nominal categorical columns (entries of ``NOMINAL_CATEGORICALS`` that
    are present in ``X_train``) are encoded as requested by
    ``config['options']['encoding']``:

    * ``'label'``  - ordinal codes fitted on train; unseen categories map to -1.
    * ``'onehot'`` - dense one-hot columns fitted on train; unseen categories
      are ignored by the encoder.
    * ``'target'`` - every column is replaced by ``<col>_target`` holding the
      per-category mean of ``y_train``; unseen categories get the global mean.

    If ``options['use_smote']`` is truthy and ``HAS_IMB`` is set, the encoded
    training data is resampled with the sampler built by ``get_sampler`` from
    ``options['sampler']`` (default ``{'name': 'smote'}``).

    Returns:
        ``(X_train, X_val, X_test, y_train, prep_meta)``: encoded copies of
        the three inputs, the labels as a NumPy array (``None`` when no labels
        were given) and a dict describing the preprocessing applied.

    Raises:
        ValueError: unknown encoding name, or target encoding without labels.
    """
    opts = dict(config.get('options', {}))
    encoding = opts.get('encoding', 'label')
    resample = bool(opts.get('use_smote', False)) and HAS_IMB
    sampler_cfg = opts.get('sampler', {'name': 'smote'})
    nominal = [name for name in NOMINAL_CATEGORICALS if name in X_train.columns]

    # Work on copies so the caller's frames are never modified.
    train_df = X_train.copy()
    val_df = X_val.copy()
    test_df = X_test.copy()
    encoder_label = 'none'

    if nominal:
        if encoding == 'label':
            codes = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            train_df[nominal] = codes.fit_transform(train_df[nominal])
            for frame in (val_df, test_df):
                frame[nominal] = codes.transform(frame[nominal])
        elif encoding == 'onehot':
            onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            onehot.fit(train_df[nominal])

            def _expand(frame):
                # Swap the nominal columns for their one-hot indicator columns.
                dummies = pd.DataFrame(
                    onehot.transform(frame[nominal]),
                    columns=onehot.get_feature_names_out(nominal),  # type: ignore
                    index=frame.index
                )
                return pd.concat([frame.drop(columns=nominal), dummies], axis=1)

            train_df, val_df, test_df = [_expand(frame) for frame in (train_df, val_df, test_df)]
        elif encoding == 'target':
            if y_train is None:
                raise ValueError("Target encoding requires y_train")
            labels = pd.Series(np.asarray(y_train), index=train_df.index)
            fallback = float(labels.mean())
            category_means = {}
            for name in nominal:
                category_means[name] = labels.groupby(train_df[name]).mean()

            def _swap_for_means(frame):
                # Replace each nominal column by its training-set target mean.
                encoded = frame.drop(columns=nominal).copy()
                for name in nominal:
                    encoded[f"{name}_target"] = frame[name].map(category_means[name]).fillna(fallback)
                return encoded

            train_df, val_df, test_df = [_swap_for_means(frame) for frame in (train_df, val_df, test_df)]
        else:
            raise ValueError(f"Unknown tree encoding: {encoding}")
        # Only reached for a recognised encoding, so the name doubles as label.
        encoder_label = encoding

    y_ready = None if y_train is None else np.asarray(y_train)
    sampler_summary = 'none'
    if resample and y_ready is not None:
        sampler = get_sampler(sampler_cfg)
        X_balanced, y_balanced = sampler.fit_resample(train_df.values, y_ready)  # type: ignore
        # The sampler returns a bare array; restore the column names.
        train_df = pd.DataFrame(X_balanced, columns=train_df.columns)
        y_ready = y_balanced
        sampler_summary = sampler_cfg.get('name', 'smote')

    prep_meta = {
        'feature_pipeline': pipeline_name,
        'family': config.get('family', 'tree'),
        'description': config.get('description', ''),
        'poly_added': 0,
        'selector': 'none',
        'basic_sampler': sampler_summary,
        'advanced_sampler': 'none',
        'sampler_summary': sampler_summary,
        'cv_mode': config.get('cv_mode', 'kfold'),
        'encoder': encoder_label,
        'scaler': 'none',
        'manual_features': 'none'
    }

    return train_df, val_df, test_df, y_ready, prep_meta


def prepare_neural_features(pipeline_name, config, X_train, X_val, X_test, y_train):
    """Prepare one-hot encoded, scaled matrices for neural-family models.

    Steps: one-hot encode the nominal categoricals present in ``X_train``
    (encoder fitted on train only), scale every resulting column with a scaler
    fitted on train, then optionally resample the training set.

    Options read from ``config['options']``:
        scaler: ``'standard'`` (default, case-insensitive) selects
            ``StandardScaler``; any other value selects ``MinMaxScaler``.
        use_smote: enable resampling (honoured only when ``HAS_IMB`` is set).
        sampler: config handed to ``get_sampler``; default ``{'name': 'smote'}``.

    Returns:
        ``(X_train, X_val, X_test, y_train, prep_meta)``: the scaled frames,
        the labels as a NumPy array (``None`` when no labels were given) and a
        dict describing the preprocessing. ``prep_meta['scaler']`` names the
        scaler that was actually fitted.
    """
    options = dict(config.get('options', {}))
    scaler_name = options.get('scaler', 'standard').lower()
    use_smote = bool(options.get('use_smote', False)) and HAS_IMB
    sampler_cfg = options.get('sampler', {'name': 'smote'})
    cat_cols = [col for col in NOMINAL_CATEGORICALS if col in X_train.columns]

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(X_train[cat_cols])

    def _encode(dataset):
        # Swap the nominal columns for their one-hot indicator columns.
        encoded = encoder.transform(dataset[cat_cols])
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_cols), index=dataset.index) # type: ignore
        base = dataset.drop(columns=cat_cols)
        return pd.concat([base, encoded_df], axis=1)

    X_train_encoded = _encode(X_train)
    X_val_encoded = _encode(X_val)
    X_test_encoded = _encode(X_test)

    # A single flag drives both the scaler choice and the reported label, so
    # the metadata cannot disagree with the scaler that was fitted.
    use_standard = scaler_name == 'standard'
    scaler = StandardScaler() if use_standard else MinMaxScaler()
    feature_columns = X_train_encoded.columns
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_encoded),
        columns=feature_columns,
        index=X_train_encoded.index
    )
    X_val_scaled = pd.DataFrame(
        scaler.transform(X_val_encoded),
        columns=feature_columns,
        index=X_val_encoded.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_encoded),
        columns=feature_columns,
        index=X_test_encoded.index
    )

    y_ready = np.asarray(y_train) if y_train is not None else None
    sampler_summary = 'none'
    if use_smote and y_ready is not None:
        sampler = get_sampler(sampler_cfg)
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled.values, y_ready) # type: ignore
        # The sampler returns a bare array; restore the column names.
        X_train_scaled = pd.DataFrame(X_resampled, columns=feature_columns)
        y_ready = y_resampled
        sampler_summary = sampler_cfg.get('name', 'smote')

    prep_meta = {
        'feature_pipeline': pipeline_name,
        'family': config.get('family', 'neural'),
        'description': config.get('description', ''),
        'poly_added': 0,
        'selector': 'none',
        'basic_sampler': sampler_summary,
        'advanced_sampler': 'none',
        'sampler_summary': sampler_summary,
        'cv_mode': config.get('cv_mode', 'kfold'),
        'encoder': 'onehot',
        'scaler': 'standard' if use_standard else 'minmax',
        'manual_features': 'none'
    }

    return X_train_scaled, X_val_scaled, X_test_scaled, y_ready, prep_meta


def prepare_feature_set(pipeline_name, config, X_train, X_val, X_test, y_train):
    """Dispatch to the feature-preparation routine for the pipeline's family.

    The family is read from ``config['family']`` (default ``'linear'``) and
    must be ``'linear'``, ``'tree'`` or ``'neural'``. The chosen routine's
    return value is passed through unchanged.

    Raises:
        ValueError: if the family is not one of the three known names.
    """
    family = config.get('family', 'linear')
    if family == 'linear':
        builder = prepare_linear_features
    elif family == 'tree':
        builder = prepare_tree_features
    elif family == 'neural':
        builder = prepare_neural_features
    else:
        raise ValueError(f"Unknown model family '{family}' for pipeline '{pipeline_name}'")
    return builder(pipeline_name, config, X_train, X_val, X_test, y_train)


In [None]:

# Progress banner for the model-grid configuration stage.
print("\n[6/8] Configuring Model Grids...")

# Each *_CONFIGS list below holds keyword-argument dicts. The experiment loop
# merges every dict with the per-model base parameters and passes the result
# to the estimator class registered under the same short name in MODEL_CONFIGS.

# Logistic regression: L2 (lbfgs), L1 (liblinear) and elastic-net (saga) variants.
LR_CONFIGS = [
    {'C': 0.3, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 3000},
    {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 4000},
    {'C': 2.0, 'penalty': 'l2', 'solver': 'lbfgs', 'class_weight': 'balanced', 'max_iter': 4000},
    {'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 3500},
    {'C': 1.0, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.3, 'max_iter': 4500}
]

# RBF-kernel SVC; probability=True is set on every entry.
SVC_CONFIGS = [
    {'C': 0.5, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True},
    {'C': 2.0, 'kernel': 'rbf', 'gamma': 0.05, 'probability': True, 'class_weight': 'balanced'}
]

# Random forest grid (paired with RandomForestClassifier).
RF_CONFIGS = [
    {'n_estimators': 600, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2},
    {'n_estimators': 900, 'max_depth': 12, 'max_features': 0.6, 'min_samples_leaf': 2},
    {'n_estimators': 1100, 'max_depth': 14, 'max_features': 0.5, 'min_samples_leaf': 2},
    {'n_estimators': 900, 'max_depth': None, 'max_features': 0.7, 'min_samples_leaf': 3}
]

# Extra-trees grid (paired with ExtraTreesClassifier).
ET_CONFIGS = [
    {'n_estimators': 500, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2},
    {'n_estimators': 800, 'max_depth': 12, 'max_features': 0.6, 'min_samples_leaf': 1},
    {'n_estimators': 1000, 'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 2}
]

# Gradient boosting grid (paired with GradientBoostingClassifier).
GB_CONFIGS = [
    {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.9},
    {'n_estimators': 800, 'learning_rate': 0.03, 'max_depth': 4, 'subsample': 0.85},
    {'n_estimators': 900, 'learning_rate': 0.025, 'max_depth': 4, 'subsample': 0.85},
    {'n_estimators': 1100, 'learning_rate': 0.02, 'max_depth': 5, 'subsample': 0.8}
]

# Histogram gradient boosting grid (paired with HistGradientBoostingClassifier).
HGB_CONFIGS = [
    {'learning_rate': 0.05, 'max_depth': 6, 'max_iter': 600, 'l2_regularization': 0.1},
    {'learning_rate': 0.04, 'max_depth': 8, 'max_iter': 800, 'l2_regularization': 0.1},
    {'learning_rate': 0.03, 'max_depth': 10, 'max_iter': 900, 'l2_regularization': 0.05},
    {'learning_rate': 0.02, 'max_depth': 12, 'max_iter': 1100, 'l2_regularization': 0.05}
]

# XGBoost grid; registered in MODEL_CONFIGS only when xgboost is available.
XGB_CONFIGS = [
    {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 8, 'subsample': 0.9, 'colsample_bytree': 0.75, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0},
    {'n_estimators': 1400, 'learning_rate': 0.04, 'max_depth': 9, 'subsample': 0.85, 'colsample_bytree': 0.8, 'reg_lambda': 1.2, 'scale_pos_weight': 1.2},
    {'n_estimators': 1600, 'learning_rate': 0.03, 'max_depth': 10, 'subsample': 0.85, 'colsample_bytree': 0.8, 'reg_lambda': 1.5, 'scale_pos_weight': 1.5},
    {'n_estimators': 1800, 'learning_rate': 0.025, 'max_depth': 11, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.8, 'scale_pos_weight': 1.8}
]

# LightGBM grid; registered in MODEL_CONFIGS only when lightgbm is available.
LGB_CONFIGS = [
    {'n_estimators': 1000, 'learning_rate': 0.05, 'num_leaves': 64, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 0.0},
    {'n_estimators': 1300, 'learning_rate': 0.035, 'num_leaves': 80, 'subsample': 0.85, 'colsample_bytree': 0.8, 'reg_lambda': 0.3},
    {'n_estimators': 1500, 'learning_rate': 0.03, 'num_leaves': 96, 'subsample': 0.85, 'colsample_bytree': 0.85, 'reg_lambda': 0.5},
    {'n_estimators': 1700, 'learning_rate': 0.025, 'num_leaves': 120, 'subsample': 0.8, 'colsample_bytree': 0.85, 'reg_lambda': 0.8}
]

# CatBoost grid; registered in MODEL_CONFIGS only when catboost is available.
CAT_CONFIGS = [
    {'iterations': 1000, 'learning_rate': 0.05, 'depth': 7, 'l2_leaf_reg': 3.0},
    {'iterations': 1300, 'learning_rate': 0.04, 'depth': 8, 'l2_leaf_reg': 2.5},
    {'iterations': 1500, 'learning_rate': 0.035, 'depth': 9, 'l2_leaf_reg': 2.0},
    {'iterations': 1700, 'learning_rate': 0.03, 'depth': 9, 'l2_leaf_reg': 2.5}
]

# MLP grid: one-, two- and three-layer networks, all with early stopping on.
MLP_CONFIGS = [
    {'hidden_layer_sizes': (512,), 'alpha': 1e-4, 'learning_rate_init': 0.001, 'max_iter': 1500, 'early_stopping': True, 'n_iter_no_change': 30},
    {'hidden_layer_sizes': (256,), 'alpha': 4e-4, 'learning_rate_init': 0.001, 'max_iter': 1500, 'early_stopping': True, 'n_iter_no_change': 30},
    {'hidden_layer_sizes': (512, 256), 'alpha': 1e-4, 'learning_rate_init': 0.0008, 'max_iter': 1700, 'early_stopping': True, 'n_iter_no_change': 25},
    {'hidden_layer_sizes': (256, 128), 'alpha': 2e-4, 'learning_rate_init': 0.0008, 'max_iter': 1700, 'early_stopping': True, 'n_iter_no_change': 25},
    {'hidden_layer_sizes': (512, 256, 128), 'alpha': 7e-5, 'learning_rate_init': 0.0005, 'max_iter': 1800, 'early_stopping': True, 'n_iter_no_change': 30},
    {'hidden_layer_sizes': (256, 128, 64), 'alpha': 1.5e-4, 'learning_rate_init': 0.0005, 'max_iter': 1800, 'early_stopping': True, 'n_iter_no_change': 30}
]

# Registry of (short name, estimator class, hyperparameter grid) triples that
# the experiment loop iterates over.
MODEL_CONFIGS = [
    ('LR', LogisticRegression, LR_CONFIGS),
    ('RF', RandomForestClassifier, RF_CONFIGS),
    ('ET', ExtraTreesClassifier, ET_CONFIGS),
    ('GB', GradientBoostingClassifier, GB_CONFIGS),
    ('HGB', HistGradientBoostingClassifier, HGB_CONFIGS),
    ('MLP', MLPClassifier, MLP_CONFIGS),
    ('SVC', SVC, SVC_CONFIGS)
]

# Optional estimators are registered only when their library imported cleanly.
if HAS_IMB and BalancedRandomForestClassifier is not None:
    BRF_CONFIGS = [
        {'n_estimators': 600, 'max_depth': 10, 'max_features': 'sqrt'},
        {'n_estimators': 900, 'max_depth': 12, 'max_features': 0.6},
        {'n_estimators': 1100, 'max_depth': 14, 'max_features': 0.5}
    ]
    MODEL_CONFIGS.append(('BRF', BalancedRandomForestClassifier, BRF_CONFIGS))

if HAS_XGB and xgb is not None:
    MODEL_CONFIGS.append(('XGB', xgb.XGBClassifier, XGB_CONFIGS))

if HAS_LGB and lgb is not None:
    MODEL_CONFIGS.append(('LGB', lgb.LGBMClassifier, LGB_CONFIGS))

if HAS_CAT and cb is not None:
    MODEL_CONFIGS.append(('CAT', cb.CatBoostClassifier, CAT_CONFIGS))

# Number of feature pipelines per family. `.get('family')` matches how the
# experiment loop reads the key: a pipeline without a family is simply not
# counted (and not run) instead of raising KeyError here.
family_pipeline_counts = {
    family: sum(1 for cfg in FEATURE_PIPELINES.values() if cfg.get('family') == family)
    for family in MODEL_FAMILIES
}

# Experiments per seed = sum over models of (grid size x pipelines of the
# model's family); models without a family mapping contribute nothing.
per_seed_experiments = 0
for model_name, _, configs in MODEL_CONFIGS:
    family = MODEL_TO_FAMILY.get(model_name)
    if family is None:
        continue
    per_seed_experiments += len(configs) * family_pipeline_counts.get(family, 0)

print(f"  ✓ Model families: {', '.join(MODEL_FAMILIES.keys())}")
print(f"  ✓ Total pipelines: {len(FEATURE_PIPELINES)}")
print(f"  ✓ Total model configs: {sum(len(cfgs) for _, _, cfgs in MODEL_CONFIGS)}")
print(f"  ✓ Base experiments per seed: {per_seed_experiments}")


In [None]:

# ============================================================================
# Artifact Utilities
# ============================================================================
# Progress banner for the cell that defines the table-export, plotting and
# model-persistence helpers called at the end of the results analysis.
print("\n[6b/8] Preparing artifact utility helpers...")


def export_family_tables(results_df: pd.DataFrame, top_n: int = 10) -> None:
    """Write per-family result tables plus a manifest listing them.

    For every family in ``FAMILY_ARTIFACTS`` that has rows in ``results_df``,
    two CSV files are written into that family's ``data`` directory:

    * ``all_results.csv`` - all rows of the family, in their incoming order;
    * ``top{top_n}_results.csv`` - the ``top_n`` rows with the highest
      ``test_auc``.

    A manifest of the written files is saved as ``family_data_manifest.csv``
    under ``SUMMARY_DIR``.

    Args:
        results_df: Experiment results; must contain a ``family`` column.
        top_n: Number of best rows kept in each family's "top" table.
    """
    if len(results_df) == 0:
        print("  [WARN] No results to export for families")
        return
    summary_rows = []
    for family, dirs in FAMILY_ARTIFACTS.items():
        fam_df = results_df[results_df['family'] == family]
        if fam_df.empty:
            continue
        all_path = dirs['data'] / 'all_results.csv'
        top_path = dirs['data'] / f'top{top_n}_results.csv'
        fam_df.to_csv(all_path, index=False)
        # Rank explicitly instead of trusting the incoming row order, as the
        # other artifact helpers do. A stable sort keeps the original order
        # for ties, so input that is already sorted is written unchanged.
        ranked = fam_df
        if 'test_auc' in fam_df.columns:
            ranked = fam_df.sort_values('test_auc', ascending=False, kind='mergesort')
        ranked.head(top_n).to_csv(top_path, index=False)
        summary_rows.append({
            'family': family,
            'all_results_path': str(all_path),
            'top_results_path': str(top_path),
            'experiment_count': len(fam_df)
        })
    if summary_rows:
        pd.DataFrame(summary_rows).to_csv(SUMMARY_DIR / 'family_data_manifest.csv', index=False)
        print(f"  ✓ Stored family tables for {len(summary_rows)} families")
    else:
        print("  [WARN] No family tables created (no results)")


def plot_family_val_vs_test(results_df: pd.DataFrame, top_n: int = 10) -> None:
    """Plot CV vs. test AUC bars for the best runs of every model family.

    One subplot is drawn per family in ``MODEL_FAMILIES``, showing that
    family's ``top_n`` rows by ``test_auc``. The figure is saved to
    ``GLOBAL_FIG_DIR`` and then copied into each family's ``figures``
    directory; a failed copy is reported but does not abort the run.

    Args:
        results_df: Experiment results with ``family``, ``model``,
            ``cv_auc_mean`` and ``test_auc`` columns.
        top_n: Maximum number of runs shown per family.
    """
    if len(results_df) == 0:
        return
    families = list(MODEL_FAMILIES.keys())
    fig, axes = plt.subplots(1, len(families), figsize=(6 * len(families), 6), sharey=True)
    if len(families) == 1:
        # With a single column plt.subplots returns a bare Axes, not an array.
        axes = [axes]
    for ax, family in zip(axes, families):
        fam_df = results_df[results_df['family'] == family].sort_values('test_auc', ascending=False).head(top_n)
        if fam_df.empty:
            ax.set_title(f"{family.title()} (no runs)")
            continue
        idx = np.arange(len(fam_df))
        width = 0.35
        # Paired bars: CV on the left of each tick, test on the right.
        ax.bar(idx - width / 2, fam_df['cv_auc_mean'], width, label='CV AUC', color='#4c72b0')
        ax.bar(idx + width / 2, fam_df['test_auc'], width, label='Test AUC', color='#dd8452')
        ax.set_xticks(idx)
        ax.set_xticklabels(fam_df['model'], rotation=45, ha='right')
        ax.set_ylabel('AUC')
        ax.set_ylim(0.7, 0.92)
        ax.set_title(f"{family.title()} top{len(fam_df)}")
        ax.grid(axis='y', linestyle='--', alpha=0.4)
    axes[0].legend()
    fig.tight_layout()
    fig_path = GLOBAL_FIG_DIR / 'family_val_vs_test.png'
    fig.savefig(fig_path, dpi=200, bbox_inches='tight')
    plt.close(fig)
    for dirs in FAMILY_ARTIFACTS.values():
        target = dirs['figures'] / 'family_val_vs_test.png'
        try:
            shutil.copy(fig_path, target)
        except Exception as exc:
            # The per-family copy is a convenience: keep going, but say so
            # instead of hiding the failure.
            print(f"  [WARN] Could not copy {fig_path} to {target}: {exc}")
    print(f"  ✓ Saved family CV/Test comparison: {fig_path}")


def plot_pipeline_contrast(results_df: pd.DataFrame) -> None:
    """Save a bar chart of mean CV and test AUC for each feature pipeline.

    Rows are averaged per ``feature_pipeline`` and the pipelines are ordered
    by mean test AUC, best first. The chart is written to ``GLOBAL_FIG_DIR``
    as ``pipeline_val_test_comparison.png``. Nothing happens for empty input.
    """
    if not len(results_df):
        return
    per_pipeline = results_df.groupby('feature_pipeline')[['cv_auc_mean', 'test_auc']].mean()
    per_pipeline = per_pipeline.sort_values('test_auc', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(per_pipeline))
    bar_width = 0.35
    # (column, bar-centre offset, legend label, colour) for the two bar series.
    series = (
        ('cv_auc_mean', -(bar_width / 2), 'CV AUC', '#55a868'),
        ('test_auc', bar_width / 2, 'Test AUC', '#c44e52'),
    )
    for column, offset, label, colour in series:
        ax.bar(positions + offset, per_pipeline[column], bar_width, label=label, color=colour)

    ax.set_xticks(positions)
    ax.set_xticklabels(per_pipeline.index, rotation=45, ha='right')
    ax.set_ylabel('AUC')
    ax.set_title('Average performance by feature pipeline')
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.4)
    fig.tight_layout()

    fig_path = GLOBAL_FIG_DIR / 'pipeline_val_test_comparison.png'
    fig.savefig(fig_path, dpi=200, bbox_inches='tight')
    plt.close(fig)
    print(f"  ✓ Saved pipeline comparison: {fig_path}")


def plot_model_scatter(results_df: pd.DataFrame) -> None:
    """Save a CV-AUC vs. test-AUC scatter plot coloured by model family.

    Each experiment is one point; its colour code is the position of its
    family in ``MODEL_FAMILIES`` (-1 for a family not listed there). The
    figure is written to ``GLOBAL_FIG_DIR`` as ``cv_vs_test_scatter.png``.
    Nothing happens for empty input.
    """
    if not len(results_df):
        return
    fig, ax = plt.subplots(figsize=(8, 6))

    family_codes = {}
    for position, name in enumerate(MODEL_FAMILIES.keys()):
        family_codes[name] = position
    point_codes = [family_codes.get(name, -1) for name in results_df['family']]

    points = ax.scatter(
        results_df['cv_auc_mean'],
        results_df['test_auc'],
        c=point_codes,
        cmap='tab10',
        alpha=0.65,
        s=45
    )
    # Diagonal reference line where CV AUC equals test AUC.
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray', alpha=0.3)
    ax.set_xlim(0.74, 0.83)
    ax.set_ylim(0.8, 0.89)
    ax.set_xlabel('CV AUC')
    ax.set_ylabel('Test AUC')
    ax.set_title('CV vs Test AUC (colored by family)')

    # Legend markers reuse the scatter's own colormap and normalisation.
    legend_handles = [
        plt.Line2D([], [], marker='o', linestyle='', color=points.cmap(points.norm(code)))  # type: ignore
        for code in family_codes.values()
    ]
    legend_labels = [name.title() for name in family_codes]
    ax.legend(legend_handles, legend_labels, title='Family', loc='lower right')
    ax.grid(alpha=0.4)
    fig.tight_layout()

    fig_path = GLOBAL_FIG_DIR / 'cv_vs_test_scatter.png'
    fig.savefig(fig_path, dpi=200, bbox_inches='tight')
    plt.close(fig)
    print(f"  ✓ Saved CV/Test scatter: {fig_path}")


def persist_top_models(results_df: pd.DataFrame, top_n: int = 10) -> None:
    """Refit and serialize the best ``top_n`` experiments of every family.

    For each family in ``FAMILY_ARTIFACTS`` the rows with the highest
    ``test_auc`` are selected. Each one is rebuilt from its recorded
    hyperparameters, refitted on the prepared training data and dumped with
    ``joblib`` into the family's ``models`` directory, together with the
    metadata needed to reuse it. A manifest of the saved files is written to
    ``SUMMARY_DIR / 'top_model_manifest.csv'``.

    Rows are skipped with a warning when their hyperparameters cannot be
    recovered as a dict, or when the model or pipeline name is no longer
    registered. They are skipped rather than refitted with default settings
    because the artifact would then not match the metrics stored with it.

    Args:
        results_df: Experiment results as recorded by the search loop.
        top_n: Number of models to persist per family.
    """
    if len(results_df) == 0:
        print("  [WARN] No models available for persistence")
        return
    saved_rows = []
    for family, dirs in FAMILY_ARTIFACTS.items():
        fam_df = results_df[results_df['family'] == family].sort_values('test_auc', ascending=False).head(top_n)
        if fam_df.empty:
            continue
        for row in fam_df.itertuples():
            params = row.hyperparams
            if isinstance(params, str):
                # Hyperparameters arrive as a repr string when the results
                # were round-tripped through CSV.
                try:
                    params = ast.literal_eval(params)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as exc:
                    print(f"  [WARN] Skipping {row.exp_name}: unreadable hyperparams ({exc})")
                    continue
            if not isinstance(params, dict):
                print(f"  [WARN] Skipping {row.exp_name}: hyperparams are not a dict")
                continue
            model_cls = MODEL_CLASS_LOOKUP.get(row.model)
            pipeline_cfg = FEATURE_PIPELINES.get(row.feature_pipeline)
            if model_cls is None or pipeline_cfg is None:
                print(f"  [WARN] Skipping {row.exp_name}: unknown model or feature pipeline")
                continue
            seed = getattr(row, 'random_seed', RANDOM_STATE)
            np.random.seed(seed)
            # The training frame is passed for the validation slot as well;
            # only the prepared training matrix and labels are used below.
            prep_tuple = prepare_feature_set(
                row.feature_pipeline,
                pipeline_cfg,
                X_train_raw,
                X_train_raw,
                X_test_raw,
                y_train_raw
            )
            # Slicing tolerates a preparer that returns extra trailing items.
            X_ready, _, _, y_ready, prep_meta = prep_tuple[:5]  # type: ignore
            model = model_cls(**params)
            model.fit(X_ready, y_ready)
            payload = {
                'model': model,
                'model_name': row.model,
                'family': family,
                'feature_pipeline': row.feature_pipeline,
                'feature_columns': list(X_ready.columns),
                'hyperparams': params,
                'training_seed': seed,
                'cv_auc_mean': row.cv_auc_mean,
                'cv_auc_std': row.cv_auc_std,
                'test_auc': row.test_auc,
                'feature_count': row.features_count,
                'prep_meta': prep_meta
            }
            model_path = dirs['models'] / f"{row.exp_name}.joblib"
            joblib.dump(payload, model_path)
            saved_rows.append({
                'family': family,
                'exp_name': row.exp_name,
                'model_path': str(model_path),
                'cv_auc_mean': row.cv_auc_mean,
                'test_auc': row.test_auc
            })
    if saved_rows:
        manifest_path = SUMMARY_DIR / 'top_model_manifest.csv'
        pd.DataFrame(saved_rows).to_csv(manifest_path, index=False)
        print(f"  ✓ Persisted {len(saved_rows)} trained models -> {manifest_path}")
    else:
        print("  [WARN] No models persisted (check filters)")


In [None]:

# Counts every configuration that is attempted, including ones that are later
# skipped or fail.
experiment_count = 0

# Per-model factories for seed-dependent constructor defaults. Grid entries
# are merged on top of these, so a grid value wins over a base value.
MODEL_BASE_PARAMS = {
    'LR': lambda seed: {'random_state': seed},
    'RF': lambda seed: {'random_state': seed, 'n_jobs': -1},
    'ET': lambda seed: {'random_state': seed, 'n_jobs': -1},
    'GB': lambda seed: {'random_state': seed},
    'HGB': lambda seed: {'random_state': seed},
    'MLP': lambda seed: {'random_state': seed},
    'SVC': lambda seed: {'random_state': seed, 'cache_size': 1000},
    'BRF': lambda seed: {'random_state': seed, 'n_jobs': -1},
    'XGB': lambda seed: {'random_state': seed, 'eval_metric': 'auc', 'n_jobs': -1, 'tree_method': 'hist', 'verbosity': 0},
    'LGB': lambda seed: {'random_state': seed, 'n_jobs': -1, 'verbosity': -1},
    'CAT': lambda seed: {'random_seed': seed,'verbose': 0,'eval_metric': 'AUC','train_dir': None}
}

# Short model name -> estimator class; used later when refitting top models.
MODEL_CLASS_LOOKUP = {name: cls for name, cls, _ in MODEL_CONFIGS}

# Full sweep: seed -> feature pipeline -> model -> hyperparameter config.
for seed in RANDOM_SEEDS:
    print(f"\n{'#'*80}")
    print(f"GLOBAL SEED: {seed}")
    print(f"{'#'*80}")

    np.random.seed(seed)

    for pipeline_name, pipeline_config in FEATURE_PIPELINES.items():
        # A pipeline is only paired with models mapped to the same family.
        family = pipeline_config.get('family')
        eligible_models = [entry for entry in MODEL_CONFIGS if MODEL_TO_FAMILY.get(entry[0]) == family]
        if not eligible_models:
            continue

        print(f"\n{'='*80}")
        print(f"PIPELINE: {pipeline_name.upper()} | Family: {family} | CV: {pipeline_config.get('cv_mode', 'kfold')}")
        print(pipeline_config.get('description', ''))
        print(f"{'='*80}")

        for model_name, model_class, configs in eligible_models:
            print(f"\n  [{model_name}] Testing {len(configs)} configurations...")

            for idx, config in enumerate(configs, 1):
                experiment_count += 1

                # Models without a factory entry start from an empty dict.
                base_params = MODEL_BASE_PARAMS.get(model_name, lambda s: {})(seed)
                base_params.setdefault('random_state', seed)
                if model_name in ['RF', 'ET', 'BRF', 'XGB', 'LGB']:
                    base_params.setdefault('n_jobs', -1)
                if model_name == 'CAT':
                    # The CAT factory supplies 'random_seed'; drop the
                    # 'random_state' default that was just added.
                    base_params.pop('random_state', None)

                # Grid values override base values on key collisions.
                merged_params = {**base_params, **config}

                try:
                    model = model_class(**merged_params)
                except TypeError as e:
                    print(f"    Config {idx} skipped (param error): {e}")
                    continue

                try:
                    # The fourth return value is not used here.
                    cv_mean, cv_std, test_auc, _, feature_count, prep_meta = evaluate_with_cv(
                        model,
                        X_train_raw,
                        y_train_raw,
                        X_test_raw,
                        y_test_raw,
                        feature_config=pipeline_config,
                        pipeline_key=pipeline_name
                    )

                    # Extra columns stored alongside the scores for this run.
                    metadata = {
                        'feature_pipeline': pipeline_name,
                        'family': family,
                        'feature_description': pipeline_config.get('description', ''),
                        'random_seed': seed,
                        'cv_mode': pipeline_config.get('cv_mode', 'kfold'),
                        'sampler_summary': prep_meta.get('sampler_summary', 'none'),
                        'encoder': prep_meta.get('encoder', 'onehot'),
                        'scaler': prep_meta.get('scaler', 'standard'),
                        'manual_features': prep_meta.get('manual_features', 'none'),
                        'poly_added': prep_meta.get('poly_added', 0),
                        'selector': prep_meta.get('selector', 'none'),
                        'basic_sampler': prep_meta.get('basic_sampler', 'none'),
                        'advanced_sampler': prep_meta.get('advanced_sampler', 'none')
                    }

                    # Experiment id: seed, pipeline, model and 1-based config index.
                    exp_name = f"seed{seed}_{pipeline_name}_{model_name}_v{idx}"

                    log_experiment(
                        exp_name=exp_name,
                        model_name=model_name,
                        cv_auc_mean=cv_mean,
                        cv_auc_std=cv_std,
                        test_auc=test_auc,
                        features_used=feature_count,
                        hyperparams=merged_params,
                        notes=pipeline_config.get('description', ''),
                        metadata=metadata
                    )

                except Exception as e:
                    # One failing configuration must not stop the sweep.
                    print(f"    Config {idx} failed: {str(e)}")
                    continue

            # Reports the grid size, not the number of successful runs.
            print(f"  Completed {len(configs)} {model_name} experiments")

print(f"\nTotal experiments scheduled: {experiment_count}")


In [None]:

# ============================================================================
# Results Consolidation and Reporting
# ============================================================================
print("\n[8/8] Results Analysis and Ranking...")
print("="*80)

# NOTE(review): the head()/iloc[0] lookups below assume get_results_df()
# returns rows ordered best-first by test AUC - confirm against its definition.
results_df = get_results_df()

if len(results_df) == 0:
    print("[WARN] No experiments recorded!")
else:
    # --- CSV exports -------------------------------------------------------
    full_path = SUMMARY_DIR / 'experiment_results_fair_full.csv'
    results_df.to_csv(full_path, index=False)
    print(f"[INFO] Full results saved to: {full_path}")

    family_top = results_df.sort_values('test_auc', ascending=False).groupby('family', group_keys=False).head(10)
    best_family_path = SUMMARY_DIR / 'best_by_family.csv'
    family_top.to_csv(best_family_path, index=False)
    print(f"[INFO] Per-family Top 10 saved to: {best_family_path}")

    overall_top = results_df.head(50)
    overall_path = SUMMARY_DIR / 'overall_ranking.csv'
    overall_top.to_csv(overall_path, index=False)
    print(f"[INFO] Overall leaderboard saved to: {overall_path}")

    # --- Champions ---------------------------------------------------------
    champion = results_df.iloc[0]
    print("\n========== OVERALL CHAMPION ==========")
    print(f"Winner: {champion['model']} | Pipeline: {champion['family']}::{champion['feature_pipeline']} | Test AUC: {champion['test_auc']:.4f}")

    print("\n========== FAMILY CHAMPIONS ==========")
    for fam_key in MODEL_FAMILIES.keys():
        fam_df = results_df[results_df['family'] == fam_key]
        if len(fam_df) == 0:
            continue
        fam_best = fam_df.iloc[0]
        print(f"{fam_key.capitalize():<7}: {fam_best['model']} ({fam_best['feature_pipeline']})  {fam_best['test_auc']:.4f}")

    counts = results_df['family'].value_counts()
    print("\n========== FAIRNESS CHECK ==========")
    print(f"✅ Linear: Scaled + manual feature boosts ({counts.get('linear', 0)} exps)")
    print(f"✅ Tree: Raw signal + minimal encoding ({counts.get('tree', 0)} exps)")
    print(f"✅ Neural: Scaled raw inputs + early stop ({counts.get('neural', 0)} exps)")
    # Report the seeds the sweep iterated over rather than a hard-coded pair.
    print(f"✅ Experiments run under seeds {' & '.join(map(str, RANDOM_SEEDS))}")

    # --- Leaderboard table -------------------------------------------------
    print("\n" + "="*120)
    print("TOP 25 CONFIGURATIONS BY TEST AUC")
    print("="*120)
    print(f"{'Rank':<6} {'Exp Name':<34} {'Model':<8} {'Family':<8} {'Pipeline':<16} {'Seed':<6} {'CV AUC':<18} {'Test AUC':<10} {'Features':<10} {'Poly':<6} {'Selector':<12}")
    print("-"*120)

    for rank, row in enumerate(results_df.head(25).itertuples(), 1):
        cv_str = f"{row.cv_auc_mean:.4f}±{row.cv_auc_std:.4f}"
        print(f"{rank:<6} {row.exp_name:<34} {row.model:<8} {getattr(row, 'family', 'na'):<8} {getattr(row, 'feature_pipeline', 'na'):<16} {getattr(row, 'random_seed', 'na')!s:<6} {cv_str:<18} {row.test_auc:<10.4f} {row.features_count:<10} {getattr(row, 'poly_added', 0):<6} {getattr(row, 'selector', 'none'):<12}")

    print("-"*120)

    # --- Summary statistics ------------------------------------------------
    print("\n" + "="*80)
    print("STATISTICAL SUMMARY")
    print("="*80)

    # Reference score quoted in the notebook header (example.ipynb).
    baseline = 0.8797
    best_row = results_df.iloc[0]

    print(f"\n[Perf] Overall Performance:")
    print(f"  Total Experiments: {len(results_df)}")
    print(f"  Best Test AUC: {results_df['test_auc'].max():.4f}")
    print(f"  Mean Test AUC: {results_df['test_auc'].mean():.4f}")
    print(f"  Median Test AUC: {results_df['test_auc'].median():.4f}")
    print(f"  Std Test AUC: {results_df['test_auc'].std():.4f}")

    print(f"\n[Perf] Best Configuration:")
    print(f"  Name: {best_row['exp_name']}")
    print(f"  Model: {best_row['model']}")
    print(f"  CV AUC: {best_row['cv_auc_mean']:.4f} ± {best_row['cv_auc_std']:.4f}")
    print(f"  Test AUC: {best_row['test_auc']:.4f}")
    print(f"  Features: {best_row['features_count']}")
    print(f"  Seed: {best_row.get('random_seed', 'na')}")
    print(f"  Pipeline: {best_row.get('family', 'na')}::{best_row.get('feature_pipeline', 'na')}")
    print(f"  Poly added: {best_row.get('poly_added', 0)} | Selector: {best_row.get('selector', 'none')}")
    print(f"  Hyperparameters: {best_row['hyperparams']}")

    # A shortfall of less than 0.005 AUC is reported as a match.
    print(f"\n[Baseline] vs Baseline (example.ipynb: {baseline:.4f}):")
    improvement = best_row['test_auc'] - baseline
    if improvement > 0:
        print(f"  BEAT BASELINE by {improvement:.4f} ({improvement/baseline*100:.2f}%)")
    elif improvement > -0.005:
        print(f"  MATCHED BASELINE (within {abs(improvement):.4f})")
    else:
        print(f"  Below baseline by {abs(improvement):.4f} ({abs(improvement)/baseline*100:.2f}%)")

    # --- Grouped breakdowns of test AUC ------------------------------------
    print(f"\n[Model] Performance by Model Type:")
    model_stats = results_df.groupby('model')['test_auc'].agg(['mean', 'max', 'count']).sort_values('max', ascending=False)
    print("\n" + model_stats.to_string())

    print(f"\n[Feature] Performance by Feature Pipeline:")
    feat_stats = results_df.groupby('feature_pipeline')['test_auc'].agg(['mean', 'max', 'count']).sort_values('max', ascending=False)
    print("\n" + feat_stats.to_string())

    if 'cv_mode' in results_df.columns:
        print(f"\n[CV] Performance by CV Mode:")
        cv_stats = results_df.groupby('cv_mode')['test_auc'].agg(['mean', 'max', 'count']).sort_values('max', ascending=False)
        print("\n" + cv_stats.to_string())

    if 'random_seed' in results_df.columns:
        print(f"\n[Seed] Distribution by random seed:")
        seed_stats = results_df.groupby('random_seed')['test_auc'].agg(['mean', 'max', 'count']).sort_values('max', ascending=False)
        print("\n" + seed_stats.to_string())

    # Ties with the baseline are included in this count (>=).
    above_baseline = (results_df['test_auc'] >= baseline).sum()
    print(f"\n[Counts] Configurations beating baseline: {above_baseline}/{len(results_df)} ({above_baseline/len(results_df)*100:.1f}%)")

    # Persist supporting artifacts so reruns are optional
    export_family_tables(results_df)
    plot_family_val_vs_test(results_df)
    plot_pipeline_contrast(results_df)
    plot_model_scatter(results_df)
    persist_top_models(results_df)


In [None]:

# ============================================================================
# Production Recommendations
# ============================================================================
print("\n" + "="*80)
print("PRODUCTION DEPLOYMENT RECOMMENDATIONS")
print("="*80)

if len(results_df) > 0:
    # The first row of results_df is treated as the best configuration.
    best_config = results_df.iloc[0]

    print(f"\n[Deploy] Recommended Configuration for Production:")
    print(f"  Primary Model: {best_config['model']} | Seed: {best_config.get('random_seed', 'na')}")
    print(f"  Experiment ID: {best_config['exp_name']}")
    print(f"  Feature Pipeline: {best_config.get('family', 'na')}::{best_config.get('feature_pipeline', 'na')}")
    print(f"  Expected Performance: {best_config['test_auc']:.4f} AUC")

    print(f"\n  Feature Engineering:")
    print(f"    Pipeline Description: {best_config.get('feature_description', 'na')}")
    print(f"    Total Features: {best_config['features_count']}")
    print(f"    Poly Added: {best_config.get('poly_added', 0)} | Selector: {best_config.get('selector', 'none')}")
    print(f"    Samplers: {best_config.get('sampler_summary', 'none')}")

    print(f"\n  Training Strategy:")
    print(f"    CV Mode: {best_config.get('cv_mode', 'kfold')} | CV Mean: {best_config['cv_auc_mean']:.4f}")
    print("    Keep family-specific preprocessing when refitting on full data")

    # The suggested alert threshold is the recorded test AUC minus 0.02.
    print(f"\n  Monitoring:")
    print(f"    Track validation AUC weekly and alert if below {best_config['test_auc'] - 0.02:.4f}")
    print("    Re-run family search quarterly or with data drift signals")

    top5 = results_df.head(5)
    if len(top5) >= 2:
        ensemble_candidates = ', '.join(f"{row.model}@{row.test_auc:.3f}" for row in top5.itertuples())
        print(f"\n  Ensemble Option:")
        # State the real number of candidates; fewer than five rows may exist.
        print(f"    Soft-vote top {len(top5)} models: {ensemble_candidates}")
        print(f"    Expect +0.001 ~ +0.005 AUC if calibration is consistent")

    # Mention the artifact locations only when the manifest file exists.
    manifest_path = SUMMARY_DIR / 'top_model_manifest.csv'
    if manifest_path.exists():
        family_roots = ', '.join(str(ARTIFACT_ROOT / fam) for fam in MODEL_FAMILIES.keys())
        print(f"\n  Model Artifacts Manifest: {manifest_path}")
        print(f"    Per-family binaries stored under: {family_roots}")

print("\n" + "="*80)
print("PRODUCTION ULTIMATE PIPELINE COMPLETED")
print(f"Total time: see logs above | Results saved under: {SUMMARY_DIR}")
print(f"Artifact root (models/data/figures): {ARTIFACT_ROOT.resolve()}")
print("="*80)
