In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from scipy import stats
from sklearn.svm import SVC
import xgboost as xgb
import shap
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.feature_selection import mutual_info_regression


from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import *

from scipy.stats import chi2_contingency, ttest_ind 
import optuna


from typing import Dict, List, Optional, Tuple
import pandas as pd

# scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# from sklearn.preprocessing import StandardScaler  # only if you uncomment it

# target encoding
import category_encoders as ce  # pip install category-encoders

%matplotlib inline

In [2]:
def load_tables(data_path="data/"):
    """Simple function to load all Home Credit tables"""
    
    tables = {}
    
    # Load each file with encoding handling
    files = ['application_train.csv', 'bureau.csv', 'bureau_balance.csv', 
             'previous_application.csv', 'installments_payments.csv',
             'credit_card_balance.csv', 'POS_CASH_balance.csv']
    
    for filename in files:
        table_name = filename.replace('.csv', '')
        try:
            # Try common encodings
             tables[table_name] = pd.read_csv(f"{data_path}/{filename}", encoding=encoding)
        except:
            tables[table_name] = pd.read_csv(f"{data_path}/{filename}", encoding='windows-1252')
    
    return tables

tables = load_tables("data/")

In [3]:
import pandas as pd
import numpy as np

class HomeCreditFeatureEngineer:
    def __init__(self, tables: dict):
        self.tables = tables
        self.features = {}

    # -------------------------
    # Application Features
    # -------------------------
    def create_application_features(self):
        df = self.tables['application_train'].copy()
        print('Creating application features...')

        # Credit/income ratios
        df['CREDIT_INCOME_PERCENT']  = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] + 1e-8)
        df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / (df['AMT_INCOME_TOTAL'] + 1e-8)
        df['CREDIT_TERM']            = df['AMT_ANNUITY'] / (df['AMT_CREDIT'] + 1e-8)
        df['GOODS_PRICE_CREDIT_PCT'] = df['AMT_GOODS_PRICE'] / (df['AMT_CREDIT'] + 1e-8)

        # Age/employment
        df['DAYS_BIRTH'] = df['DAYS_BIRTH'].abs()
        df['AGE_YEARS']  = df['DAYS_BIRTH'] / 365.25
        df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / (df['DAYS_BIRTH'] + 1e-8)
        df['INCOME_PER_PERSON']     = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1e-8)
        df['CHILDREN_RATIO']        = df['CNT_CHILDREN'] / (df['CNT_FAM_MEMBERS'] + 1e-8)

        # External sources
        ext = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
        df['EXT_SOURCES_MEAN'] = df[ext].mean(axis=1, skipna=True)
        df['EXT_SOURCES_STD']  = df[ext].std(axis=1,  skipna=True)

        # Documents
        doc_cols = [c for c in df.columns if 'FLAG_DOCUMENT' in c]
        df['DOCUMENT_COUNT'] = df[doc_cols].sum(axis=1)

        # Contact info
        df['PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / (df['DAYS_BIRTH'] + 1e-8)

        self.features['application'] = df
        print(f"Application features created: {df.shape}")
        return df

    # -------------------------
    # Bureau + Bureau Balance
    # -------------------------
    def create_bureau_features(self):
        bureau     = self.tables['bureau'].copy()
        bureau_bal = self.tables['bureau_balance'].copy()
        print("Creating bureau features...")

        # Map STATUS to numeric delinquency
        status_map = {'C':0,'X':0,'0':0,'1':1,'2':2,'3':3,'4':4,'5':5}
        bureau_bal['STATUS_NUM'] = bureau_bal['STATUS'].map(status_map).fillna(0)

        bb_agg = bureau_bal.groupby('SK_ID_BUREAU').agg(
            MONTHS_BALANCE_MIN=('MONTHS_BALANCE','min'),
            MONTHS_BALANCE_MAX=('MONTHS_BALANCE','max'),
            MONTHS_BALANCE_SIZE=('MONTHS_BALANCE','count'),
            STATUS_C_COUNT=('STATUS', lambda s: (s=='C').sum()),
            STATUS_BAD_COUNT=('STATUS_NUM', lambda s: (s>=1).sum()),
            MAX_DELQ=('STATUS_NUM','max'),
            MEAN_DELQ=('STATUS_NUM','mean')
        ).reset_index()

        bureau = bureau.merge(bb_agg, on='SK_ID_BUREAU', how='left')

        # Credit utilization
        bureau['CREDIT_UTILIZATION'] = bureau['AMT_CREDIT_SUM_DEBT'] / (bureau['AMT_CREDIT_SUM_LIMIT'] + 1e-8)

        # Aggregations per client
        num_aggs = {
            'DAYS_CREDIT':['mean','min','max'],
            'CREDIT_DAY_OVERDUE':['sum','max'],
            'DAYS_CREDIT_ENDDATE':['mean','min','max'],
            'DAYS_ENDDATE_FACT':['mean','min','max'],
            'AMT_CREDIT_MAX_OVERDUE':['mean','max'],
            'CNT_CREDIT_PROLONG':['sum','mean'],
            'AMT_CREDIT_SUM':['sum','mean','max'],
            'AMT_CREDIT_SUM_DEBT':['sum','mean'],
            'AMT_CREDIT_SUM_LIMIT':['sum','mean','max'],
            'DAYS_CREDIT_UPDATE':['mean','max'],
            'AMT_ANNUITY':['mean','sum'],
            'CREDIT_UTILIZATION':['mean','max'],
            'MAX_DELQ':['max'],
            'MEAN_DELQ':['mean']
        }
        b_num = bureau.groupby('SK_ID_CURR').agg(num_aggs)
        b_num.columns = ['BUREAU__' + '_'.join(c).upper() for c in b_num.columns.to_flat_index()]

        b_cat = bureau.groupby('SK_ID_CURR').agg(
            CREDIT_ACTIVE_COUNT=('CREDIT_ACTIVE','count'),
            CREDIT_ACTIVE_ACTIVE=('CREDIT_ACTIVE', lambda s: (s=='Active').sum()),
            CREDIT_TYPE_NUNIQUE=('CREDIT_TYPE','nunique')
        )

        bureau_df = b_num.join(b_cat, how='left').reset_index()

        # Recent 2 years
        recent = bureau[bureau['DAYS_CREDIT'] >= -730]
        if not recent.empty:
            recent_agg = recent.groupby('SK_ID_CURR').agg(
                RECENT_CREDIT_COUNT=('SK_ID_BUREAU','count'),
                RECENT_CREDIT_SUM=('AMT_CREDIT_SUM','sum')
            )
            bureau_df = bureau_df.merge(recent_agg.reset_index(), on='SK_ID_CURR', how='left')

        self.features['bureau'] = bureau_df
        print(f"Bureau features created: {bureau_df.shape}")
        return bureau_df

    # -------------------------
    # Previous Applications
    # -------------------------
    def create_previous_application_features(self):
        prev = self.tables['previous_application'].copy()
        print("Creating previous application features...")

        prev['APP_CREDIT_RATIO'] = prev['AMT_APPLICATION'] / (prev['AMT_CREDIT'] + 1e-8)

        num_aggs = {
            'AMT_ANNUITY':['mean','max','min'],
            'AMT_APPLICATION':['mean','max','min'],
            'AMT_CREDIT':['mean','max','min'],
            'AMT_DOWN_PAYMENT':['mean','max','min'],
            'AMT_GOODS_PRICE':['mean','max','min'],
            'HOUR_APPR_PROCESS_START':['mean','max','min'],
            'RATE_DOWN_PAYMENT':['mean','max','min'],
            'DAYS_DECISION':['mean','max'],
            'CNT_PAYMENT':['mean','max','min'],
            'APP_CREDIT_RATIO':['mean','max','min']
        }
        p_num = prev.groupby('SK_ID_CURR').agg(num_aggs)
        p_num.columns = ['PREV__' + '_'.join(c).upper() for c in p_num.columns.to_flat_index()]

        p_cat = prev.groupby('SK_ID_CURR').agg(
            PREV_APP_COUNT=('SK_ID_PREV','count'),
            PREV_APP_APPROVED=('NAME_CONTRACT_STATUS', lambda s: (s=='Approved').sum()),
            PREV_APP_REFUSED=('NAME_CONTRACT_STATUS', lambda s: (s=='Refused').sum())
        )

        prev_df = p_num.join(p_cat, how='left').reset_index()
        self.features['previous'] = prev_df
        print(f"Previous application features created: {prev_df.shape}")
        return prev_df

    # -------------------------
    # Installments
    # -------------------------
    def create_installments_features(self):
        ins = self.tables['installments_payments'].copy()
        print("Creating installment features...")

        ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / (ins['AMT_INSTALMENT'] + 1e-8)
        ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
        ins['DPD'] = (ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']).clip(lower=0)
        ins['LATE_PAYMENT'] = (ins['DPD'] > 0).astype(int)

        ins_aggs = {
            'PAYMENT_PERC':['mean','std'],
            'PAYMENT_DIFF':['mean'],
            'DPD':['mean','max','var'],
            'AMT_PAYMENT':['mean','max'],
            'SK_ID_PREV':['count']
        }
        i_num = ins.groupby('SK_ID_CURR').agg(ins_aggs)
        i_num.columns = ['INSTALL__' + '_'.join(c).upper() for c in i_num.columns.to_flat_index()]

        i_cat = ins.groupby('SK_ID_CURR').agg(
            INSTALL_LATE_PAYMENT_COUNT=('LATE_PAYMENT','sum'),
            INSTALL_LATE_PAYMENT_RATE=('LATE_PAYMENT','mean')
        )

        ins_df = i_num.join(i_cat, how='left').reset_index()
        self.features['installments'] = ins_df
        print(f"Installments features created: {ins_df.shape}")
        return ins_df

    # -------------------------
    # Credit Card
    # -------------------------
    def create_credit_card_features(self):
        cc = self.tables['credit_card_balance'].copy()
        print("Creating credit card features...")

        cc['BALANCE_LIMIT_RATIO'] = cc['AMT_BALANCE'] / (cc['AMT_CREDIT_LIMIT_ACTUAL'] + 1e-8)
        cc['PAYMENT_MIN_RATIO']   = cc['AMT_PAYMENT_CURRENT'] / (cc['AMT_INST_MIN_REGULARITY'] + 1e-8)
        cc['LATE_PAYMENT']        = (cc['SK_DPD'] > 0).astype(int)

        cc_aggs = {
            'AMT_BALANCE':['mean','max'],
            'AMT_CREDIT_LIMIT_ACTUAL':['mean'],
            'BALANCE_LIMIT_RATIO':['mean'],
            'PAYMENT_MIN_RATIO':['mean'],
            'SK_DPD':['mean','max'],
            'SK_ID_PREV':['nunique']
        }
        c_num = cc.groupby('SK_ID_CURR').agg(cc_aggs)
        c_num.columns = ['CC__' + '_'.join(c).upper() for c in c_num.columns.to_flat_index()]

        c_cat = cc.groupby('SK_ID_CURR').agg(
            CC_LATE_PAYMENT_COUNT=('LATE_PAYMENT','sum')
        )

        cc_df = c_num.join(c_cat, how='left').reset_index()
        self.features['credit_card'] = cc_df
        print(f"Credit card features created: {cc_df.shape}")
        return cc_df

    # -------------------------
    # POS / Cash
    # -------------------------
    def create_pos_cash_features(self):
        pos = self.tables['POS_CASH_balance'].copy()
        print("Creating POS/Cash features...")

        pos['LATE_PAYMENT'] = (pos['SK_DPD'] > 0).astype(int)

        pos_aggs = {
            'CNT_INSTALMENT':['mean','max'],
            'SK_DPD':['mean','max'],
            'SK_ID_PREV':['nunique']
        }
        p_num = pos.groupby('SK_ID_CURR').agg(pos_aggs)
        p_num.columns = ['POS__' + '_'.join(c).upper() for c in p_num.columns.to_flat_index()]

        p_cat = pos.groupby('SK_ID_CURR').agg(
            POS_LATE_PAYMENT_COUNT=('LATE_PAYMENT','sum')
        )

        pos_df = p_num.join(p_cat, how='left').reset_index()
        self.features['pos_cash'] = pos_df
        print(f"POS/Cash features created: {pos_df.shape}")
        return pos_df

    # -------------------------
    # Merge All
    # -------------------------
    def create_all_features(self):
        app_features  = self.create_application_features()
        bureau_feats  = self.create_bureau_features()
        prev_feats    = self.create_previous_application_features()
        ins_feats     = self.create_installments_features()
        cc_feats      = self.create_credit_card_features()
        pos_feats     = self.create_pos_cash_features()

        final_df = app_features.copy()
        print("Merging all features...")

        for name, f in [
            ('bureau', bureau_feats),
            ('previous', prev_feats),
            ('installments', ins_feats),
            ('credit_card', cc_feats),
            ('pos_cash', pos_feats)
        ]:
            final_df = final_df.merge(f, on='SK_ID_CURR', how='left')
            print(f"After {name}: {final_df.shape}")

        self.features['final'] = final_df
        print(f"Final feature set: {final_df.shape}")
        return final_df


In [4]:
# Advanced Feature Engineering Class
class AdvancedFeatureEngineer:
    def __init__(self, df):
        self.df = df
    
    def create_polynomial_features(self, feature_pairs, degree=2):
        """Create polynomial combinations of important features"""
        for feat1, feat2 in feature_pairs:
            if feat1 in self.df.columns and feat2 in self.df.columns:
                self.df[f'{feat1}_{feat2}_POLY'] = self.df[feat1] * self.df[feat2]
    
    def create_ratio_features(self, feature_pairs):
        """Create ratio features from feature pairs"""
        for feat1, feat2 in feature_pairs:
            if feat1 in self.df.columns and feat2 in self.df.columns:
                self.df[f'{feat1}_{feat2}_RATIO'] = self.df[feat1] / (self.df[feat2] + 1e-8)
    
    def create_rank_features(self, features):
        """Create percentile rank features"""
        for feat in features:
            if feat in self.df.columns:
                self.df[f'{feat}_RANK'] = self.df[feat].rank(pct=True)
    
    def apply_advanced_engineering(self):
        """Apply all advanced feature engineering"""
        
        # Important feature pairs for interactions
        important_pairs = [
            ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
            ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
            ('EXT_SOURCE_1', 'EXT_SOURCE_2'),
            ('EXT_SOURCE_2', 'EXT_SOURCE_3'),
            ('BUREAU_AMT_CREDIT_SUM_MEAN', 'AMT_INCOME_TOTAL'),
            ('CC_BALANCE_MEAN', 'AMT_INCOME_TOTAL')
        ]
        
        self.create_polynomial_features(important_pairs)
        self.create_ratio_features(important_pairs)
        
        # Rank features for important variables
        rank_features = ['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'EXT_SOURCES_MEAN', 
                        'BUREAU_AMT_CREDIT_SUM_MEAN', 'CREDIT_INCOME_PERCENT']
        self.create_rank_features(rank_features)
        
        return self.df


In [5]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import mutual_info_classif
from heapq import nlargest
from itertools import combinations

class AutoFeatureEngineerTransformer(BaseEstimator, TransformerMixin):
    """
    Memory-safe AutoFE:
      - Preselect top-K numeric base features by MI (or variance if y is None).
      - Generate pairwise products & ratios one-by-one (streamed), no big dense matrix.
      - Keep top-M by MI (or variance) for each type.
    """
    def __init__(self,
                 cols=None,                 # optional numeric columns to consider
                 base_top_k=30,             # how many base numeric cols to keep before crossing
                 max_poly_features=60,      # how many product terms to keep
                 max_ratio_features=60,     # how many ratio terms to keep
                 use_mi=True,               # use mutual info with y to rank (else variance)
                 random_state=42,
                 dtype='float32'):
        self.cols = cols
        self.base_top_k = base_top_k
        self.max_poly_features = max_poly_features
        self.max_ratio_features = max_ratio_features
        self.use_mi = use_mi
        self.random_state = random_state
        self.dtype = dtype

        # learned
        self.num_cols_ = None
        self.base_cols_ = None
        self.medians_ = None
        self.keep_poly_pairs_ = []   # list of (f1, f2)
        self.keep_ratio_pairs_ = []  # list of (f1, f2)

    def _numeric_cols(self, X):
        if self.cols is None:
            return X.select_dtypes(include=[np.number]).columns.tolist()
        return [c for c in self.cols if c in X.columns]

    def _score_series(self, s, y):
        # s: 1D series (float), y may be None
        if self.use_mi and (y is not None):
            # mutual_info_classif expects 2D
            return float(mutual_info_classif(s.fillna(0).to_frame(), y,
                                             discrete_features=False,
                                             random_state=self.random_state)[0])
        else:
            return float(np.nanvar(s))

    def fit(self, X, y=None):
        X = X.copy()
        self.num_cols_ = self._numeric_cols(X)
        if len(self.num_cols_) < 2:
            # nothing to do
            self.base_cols_ = self.num_cols_
            self.medians_ = X[self.num_cols_].median() if self.num_cols_ else pd.Series(dtype=float)
            return self

        # medians for NaN handling
        self.medians_ = X[self.num_cols_].median()

        # --- preselect base numeric cols (top-K) ---
        # score each numeric col
        scores = {}
        for c in self.num_cols_:
            s = X[c].fillna(self.medians_[c]).astype(self.dtype)
            scores[c] = self._score_series(s, y)
        # pick top-K
        self.base_cols_ = [c for c, _ in nlargest(self.base_top_k, scores.items(), key=lambda kv: kv[1])]

        # --- stream products & ratios across base cols ---
        poly_heap = []   # list of tuples (score, (f1, f2))
        ratio_heap = []  # list of tuples (score, (f1, f2))

        for f1, f2 in combinations(self.base_cols_, 2):
            # product
            s_prod = (X[f1].fillna(self.medians_[f1]).astype(self.dtype) *
                      X[f2].fillna(self.medians_[f2]).astype(self.dtype))
            sc_prod = self._score_series(s_prod, y)
            poly_heap.append((sc_prod, (f1, f2)))

            # ratio (f1 / f2)
            s_ratio = X[f1].fillna(self.medians_[f1]).astype(self.dtype) / (
                      X[f2].fillna(self.medians_[f2]).astype(self.dtype) + 1e-8)
            sc_ratio = self._score_series(s_ratio, y)
            ratio_heap.append((sc_ratio, (f1, f2)))

        # keep top-M by score
        self.keep_poly_pairs_  = [pair for _, pair in nlargest(self.max_poly_features,  poly_heap,  key=lambda t: t[0])]
        self.keep_ratio_pairs_ = [pair for _, pair in nlargest(self.max_ratio_features, ratio_heap, key=lambda t: t[0])]

        return self

    def transform(self, X):
        X = X.copy()
        if not self.base_cols_:
            return X

        # Recreate products
        for f1, f2 in self.keep_poly_pairs_:
            if f1 in X.columns and f2 in X.columns:
                name = f"{f1}__x__{f2}"
                X[name] = (X[f1].fillna(self.medians_.get(f1, 0)).astype(self.dtype) *
                           X[f2].fillna(self.medians_.get(f2, 0)).astype(self.dtype))

        # Recreate ratios
        for f1, f2 in self.keep_ratio_pairs_:
            if f1 in X.columns and f2 in X.columns:
                name = f"{f1}__div__{f2}"
                X[name] = X[f1].fillna(self.medians_.get(f1, 0)).astype(self.dtype) / (
                          X[f2].fillna(self.medians_.get(f2, 0)).astype(self.dtype) + 1e-8)

        return X


In [6]:
class HomeCreditPipeline:
    def __init__(self, tables: dict):
        self.tables = tables

    def fit_transform(self):
        # Base features
        base_fe = HomeCreditFeatureEngineer(self.tables)
        base_df = base_fe.create_all_features()

        # Advanced features
        adv_fe = AdvancedFeatureEngineer(base_df)
        final_df = adv_fe.apply_advanced_engineering()

        print(f"Pipeline complete: {final_df.shape}")
        return final_df


In [7]:
# pip install scikit-learn category_encoders numpy pandas
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional

from sklearn.base import BaseEstimator, TransformerMixin

# -----------------------------
# Utility / Custom Transformers
# -----------------------------

class IdentityDropper(BaseEstimator, TransformerMixin):
    """Drop columns (e.g., IDs) from X; pass-thru others."""
    def __init__(self, drop_cols: List[str]):
        self.drop_cols = drop_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Assumes a DataFrame (used before ColumnTransformer)
        cols = [c for c in X.columns if c not in self.drop_cols]
        return X[cols]


class CyclicalHourEncoder(BaseEstimator, TransformerMixin):
    """Encode an hour-of-day column (0-23) into sin/cos; drops the original."""
    def __init__(self, hour_col: str):
        self.hour_col = hour_col
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Assumes a DataFrame (used before ColumnTransformer)
        X = X.copy()
        if self.hour_col not in X.columns:
            return X
        h = (X[self.hour_col].fillna(0).astype(float) % 24)
        X[self.hour_col + "_SIN"] = np.sin(2 * np.pi * h / 24)
        X[self.hour_col + "_COS"] = np.cos(2 * np.pi * h / 24)
        X = X.drop(columns=[self.hour_col])
        return X


class QuantileClipper(BaseEstimator, TransformerMixin):
    """
    Clip numeric columns to [q_low, q_high] percentiles to reduce outliers.
    Works with both pandas DataFrames and NumPy ndarrays (ColumnTransformer safe).
    """
    def __init__(self, q_low=0.01, q_high=0.99, cols: Optional[List[str]] = None):
        self.q_low = q_low
        self.q_high = q_high
        self.cols = cols
        # fitted state
        self.bounds_ = None      # dict(col -> (lo, hi)) when DataFrame
        self.lo_ = None          # ndarray quantile lows when ndarray
        self.hi_ = None          # ndarray quantile highs when ndarray
        self._df_mode = None

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self._df_mode = True
            use_cols = self.cols or X.columns.tolist()
            self.bounds_ = {}
            for c in use_cols:
                s = pd.to_numeric(X[c], errors='coerce')
                lo = s.quantile(self.q_low)
                hi = s.quantile(self.q_high)
                self.bounds_[c] = (lo, hi)
            self.lo_, self.hi_ = None, None
        else:
            self._df_mode = False
            X = np.asarray(X, dtype=float)
            self.lo_ = np.nanquantile(X, self.q_low, axis=0).astype(np.float32, copy=False)
            self.hi_ = np.nanquantile(X, self.q_high, axis=0).astype(np.float32, copy=False)
            self.bounds_ = None
        return self

    def transform(self, X):
        if self._df_mode or isinstance(X, pd.DataFrame):
            # DF path
            X = X.copy()
            for c, (lo, hi) in self.bounds_.items():
                if c in X.columns:
                    X[c] = pd.to_numeric(X[c], errors='coerce').clip(lower=lo, upper=hi)
            return X
        else:
            # ndarray path
            X = np.asarray(X, dtype=float)
            return np.clip(X, self.lo_, self.hi_)


class Log1pTransformer(BaseEstimator, TransformerMixin):
    """
    Apply np.log1p to specified numeric columns (DataFrame) or to ALL columns (ndarray).
    Safely shifts columns if any values < -1 to keep inputs valid for log1p.
    Column-wise shifts are learned in fit() and reused in transform().
    """
    def __init__(self, cols: Optional[List[str]] = None):
        self.cols = cols
        # fitted state
        self._df_mode = None
        self.shifts_ = None      # dict(col -> shift) when DataFrame
        self.arr_shift_ = None   # ndarray shifts when ndarray

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self._df_mode = True
            self.shifts_ = {}
            cols = self.cols or X.columns.tolist()
            for c in cols:
                if c not in X.columns:
                    continue
                s = pd.to_numeric(X[c], errors='coerce')
                min_val = np.nanmin(s.values) if s.size else 0.0
                self.shifts_[c] = 0.0 if (np.isnan(min_val) or min_val >= -1.0) else (-1.0 - min_val + 1e-8)
            self.arr_shift_ = None
        else:
            self._df_mode = False
            X = np.asarray(X, dtype=float)
            mins = np.nanmin(X, axis=0)
            shift = np.where(mins >= -1.0, 0.0, (-1.0 - mins + 1e-8))
            self.arr_shift_ = shift.astype(np.float32, copy=False)
            self.shifts_ = None
        return self

    def transform(self, X):
        if self._df_mode or isinstance(X, pd.DataFrame):
            # DF path
            X = X.copy()
            cols = self.cols or X.columns.tolist()
            for c in cols:
                if c in X.columns:
                    v = pd.to_numeric(X[c], errors='coerce').values
                    shift = self.shifts_.get(c, 0.0)
                    X[c] = np.log1p(v + shift)
            return X
        else:
            # ndarray path
            X = np.asarray(X, dtype=float)
            return np.log1p(X + self.arr_shift_)


class BinaryCleaner(BaseEstimator, TransformerMixin):
    """
    Convert common Home Credit binary flags (0/1, 'Y'/'N', 'Yes'/'No', True/False) into 0/1.
    Only applies to listed columns that exist. (Runs before ColumnTransformer.)
    """
    def __init__(self, cols: List[str]):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Assumes a DataFrame (used before ColumnTransformer)
        X = X.copy()
        for c in self.cols:
            if c not in X.columns:
                continue
            s = X[c]
            if s.dtype == 'bool':
                X[c] = s.astype(int)
            else:
                X[c] = (
                    s.replace({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0, 'True': 1, 'False': 0})
                     .astype(str).str.lower()
                     .replace({'true': 1, 'false': 0, 'y': 1, 'n': 0})
                )
                X[c] = pd.to_numeric(X[c], errors='coerce')
        return X


In [8]:
# -----------------------------
# Column detection helpers
# -----------------------------

def detect_column_groups(
    X: pd.DataFrame,
    known_ordinal: Dict[str, List[str]] = None,
    known_binary: List[str] = None,
    high_card_threshold: int = 12,
    force_categorical_like: List[str] = None
) -> Dict[str, List[str]]:
    """
    Detect numeric vs categorical and split cat into low/high cardinality.
    Also supports overrides for ordinal and "categorical-looking numeric" codes.
    """
    known_ordinal = known_ordinal or {}
    known_binary = known_binary or []
    force_categorical_like = force_categorical_like or []  # e.g., ['SELLERPLACE_AREA']

    id_cols = [c for c in X.columns if c in ('SK_ID_CURR','SK_ID_PREV','SK_ID_BUREAU')]
    hour_cols = [c for c in X.columns if c == 'HOUR_APPR_PROCESS_START']

    # Base detection
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    # Move forced categorical-like from numeric -> cat
    for c in force_categorical_like:
        if c in numeric_cols:
            numeric_cols.remove(c)

    # Categorical = objects + forced
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    categorical_cols += [c for c in force_categorical_like if c in X.columns]

    # Remove IDs and hour col from groups
    for c in id_cols + hour_cols:
        if c in numeric_cols: numeric_cols.remove(c)
        if c in categorical_cols: 
            try: categorical_cols.remove(c)
            except ValueError: pass

    # Binary flags (override)
    binary_cols = [c for c in known_binary if c in X.columns]

    # Ordinal (override with provided order lists)
    ordinal_cols = [c for c in known_ordinal.keys() if c in X.columns]

    # Non-ordinal categorical (excluding binary & ordinal)
    cat_nominal = [c for c in categorical_cols if c not in binary_cols + ordinal_cols]

    # Split nominal into low/high cardinality
    low_card_cats, high_card_cats = [], []
    for c in cat_nominal:
        nuni = X[c].nunique(dropna=True)
        (low_card_cats if nuni <= high_card_threshold else high_card_cats).append(c)

    return {
        'ids': id_cols,
        'hour': hour_cols,
        'numeric': numeric_cols,
        'binary': binary_cols,
        'ordinal': ordinal_cols,
        'ordinal_orders': known_ordinal,
        'low_card_nominal': low_card_cats,
        'high_card_nominal': high_card_cats
    }





In [9]:
# -----------------------------
# Build the preprocessing pipeline
# -----------------------------

def build_homecredit_preprocessor(
    X: pd.DataFrame,
    *,
    known_ordinal: Dict[str, List[str]] = None,
    known_binary: List[str] = None,
    force_categorical_like: List[str] = None,
    high_card_threshold: int = 12,
    log1p_cols: Optional[List[str]] = None,
    clip_quantiles: Tuple[float,float] = (0.01, 0.99),
    onehot_drop: str = 'if_binary',
    target_smoothing: float = 0.2
) -> Tuple[Pipeline, Dict[str, List[str]]]:
    """
    Returns a sklearn Pipeline that:
      - drops IDs,
      - encodes hour cyclically,
      - imputes + clips + log1p numerics,
      - encodes binary, ordinal, low-card (OHE), high-card (TargetEncoder).
    """
    log1p_cols = log1p_cols or []
    col_groups = detect_column_groups(
        X,
        known_ordinal=known_ordinal,
        known_binary=known_binary,
        force_categorical_like=force_categorical_like,
        high_card_threshold=high_card_threshold
    )

    # 1) First stage: column-level wrangling (drop IDs, cyclic hour, binary normalization)
    initial_steps = []
    if col_groups['ids']:
        initial_steps.append(("drop_ids", IdentityDropper(drop_cols=col_groups['ids'])))
    if col_groups['hour']:
        # apply cyclical to the known hour column
        initial_steps.append(("hour_cyc", CyclicalHourEncoder(hour_col=col_groups['hour'][0])))
    if col_groups['binary']:
        initial_steps.append(("binary_clean", BinaryCleaner(cols=col_groups['binary'])))

    initial = Pipeline(initial_steps) if initial_steps else 'passthrough'

    # 2) ColumnTransformer for typed processing
    transformers = []

    # Numeric pipeline
    num_pipe = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('clip', QuantileClipper(q_low=clip_quantiles[0], q_high=clip_quantiles[1], cols=None)),
        ('log1p', Log1pTransformer(cols=log1p_cols)),
        # (Optional scaler can be added for linear/NN models)
        # ('scaler', StandardScaler())
    ])
    if col_groups['numeric']:
        transformers.append(('num', num_pipe, col_groups['numeric']))

    # Binary (already normalized to 0/1) – impute if any missing
    if col_groups['binary']:
        bin_pipe = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent'))
        ])
        transformers.append(('bin', bin_pipe, col_groups['binary']))

    # Ordinal pipeline (with explicit order)
    if col_groups['ordinal']:
        # Build categories in the required order for each ordinal col
        categories = [col_groups['ordinal_orders'][c] for c in col_groups['ordinal']]
        ord_pipe = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('ord', OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1))
        ])
        transformers.append(('ord', ord_pipe, col_groups['ordinal']))

    # Low-card nominal → OneHot
    if col_groups['low_card_nominal']:
        ohe_pipe = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore', drop=onehot_drop, sparse_output=False))
        ])
        transformers.append(('ohe', ohe_pipe, col_groups['low_card_nominal']))

    # High-card nominal → TargetEncoder (CV-safe when used inside sklearn CV)
    if col_groups['high_card_nominal']:
        te_pipe = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('te', ce.TargetEncoder(smoothing=target_smoothing))
        ])
        transformers.append(('te', te_pipe, col_groups['high_card_nominal']))

    ct = ColumnTransformer(
        transformers=transformers,
        remainder='drop',
        verbose_feature_names_out=False
    )

    # Final pipeline: initial wrangling → typed transformer
    pipe = Pipeline(steps=[
        ('initial', initial),
        ('ct', ct)
    ])

    return pipe, col_groups

In [10]:
# # 1) Manual FE from your tables
# fe = HomeCreditPipeline(tables)         # your earlier class that merges & engineers
# full_df = fe.fit_transform()            # one row per SK_ID_CURR
# y = full_df['TARGET'].values
# X = full_df.drop(columns=['TARGET'])

# # 2) Build the preprocessing pipeline (from our previous step)
# preprocessor, groups = build_homecredit_preprocessor(
#     X,
#     known_ordinal={'NAME_YIELD_GROUP': ['low_normal','low_action','middle','high','unknown']},
#     known_binary=['FLAG_OWN_CAR','FLAG_OWN_REALTY','NFLAG_LAST_APPL_IN_DAY','NFLAG_INSURED_ON_APPROVAL'],
#     force_categorical_like=['SELLERPLACE_AREA'],
#     log1p_cols=['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','AMT_DOWN_PAYMENT']
# )

# # 3) Define AutoFE (let it auto-detect numeric cols)
# auto_fe = AutoFeatureEngineerTransformer(
#     cols=None,            # or pass a curated numeric list
#     base_top_k=30,        # shrink to 30 best base numerics before crossing
#     max_poly_features=60,
#     max_ratio_features=60,
#     use_mi=True,
#     random_state=42
# )


# # 4) Build the master pipeline (AutoFE -> Preprocessor -> Model)
# from sklearn.pipeline import Pipeline
# from lightgbm import LGBMClassifier

# master = Pipeline(steps=[
#     ('auto_fe', auto_fe),
#     ('prep', preprocessor),    # your ColumnTransformer with imputers/encoders
#     ('clf', LGBMClassifier(
#         n_estimators=2000, learning_rate=0.03,
#         num_leaves=64, subsample=0.8, colsample_bytree=0.8,
#         random_state=42
#     ))
# ])


# # 5) CV / training (CV-safe: AutoFE and TargetEncoder see only train folds)
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_val_score(master, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
# print("CV AUC:", scores.mean(), "+/-", scores.std())


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import scipy.sparse as sp

class FiniteCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, max_abs=1e6, force_dense=True, dtype='float32'):
        self.max_abs = max_abs
        self.force_dense = force_dense
        self.dtype = dtype
    def fit(self, X, y=None): return self
    def transform(self, X):
        # to dense if requested
        if sp.issparse(X):
            X = X.toarray() if self.force_dense else X.tocsr()
        X = np.asarray(X, dtype=self.dtype)
        X[~np.isfinite(X)] = 0.0
        np.clip(X, -self.max_abs, self.max_abs, out=X)
        return X

In [26]:

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

class HomeCreditPreprocessorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, builder, **builder_kwargs):
        self.builder = builder                  # e.g., build_homecredit_preprocessor
        self.builder_kwargs = builder_kwargs
        self.ct_ = None
        self.groups_ = None
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        self.feature_names_in_ = list(X_df.columns)
        # Build the ColumnTransformer using the *current* columns (after AutoFE)
        ct, groups = self.builder(X_df, **self.builder_kwargs)
        self.ct_ = ct.fit(X_df, y)
        self.groups_ = groups
        return self

    def transform(self, X):
        X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=self.feature_names_in_)
        return self.ct_.transform(X_df)



fe = HomeCreditPipeline(tables)
full_df = fe.fit_transform()            # one row per SK_ID_CURR
y = full_df['TARGET'].values
X = full_df.drop(columns=['TARGET'])


auto_fe = AutoFeatureEngineerTransformer(
    cols=None,
    base_top_k=30,
    max_poly_features=60,
    max_ratio_features=60,
    use_mi=True,
    random_state=42
)


prep_tf = HomeCreditPreprocessorTransformer(
    builder=build_homecredit_preprocessor,   # <-- pass the callable, don't call it here
    known_ordinal={'NAME_YIELD_GROUP': ['low_normal','low_action','middle','high','unknown']},
    known_binary=['FLAG_OWN_CAR','FLAG_OWN_REALTY','NFLAG_LAST_APPL_IN_DAY','NFLAG_INSURED_ON_APPROVAL'],
    force_categorical_like=['SELLERPLACE_AREA'],
    log1p_cols=['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','AMT_DOWN_PAYMENT']
)



master = Pipeline(steps=[
    ('auto_fe', auto_fe),
    ('prep',    prep_tf),                     # now sees & processes AutoFE columns
    ('clean',   FiniteCleaner(max_abs=1e6)),  # optional safety
    ('clf',     LGBMClassifier(
        n_estimators=2000, learning_rate=0.03,
        num_leaves=64, subsample=0.8, colsample_bytree=0.8,
        random_state=42
    ))
])


from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(master, X, y, cv=cv, scoring='roc_auc', n_jobs=-1, error_score='raise')
print("CV AUC:", scores.mean(), "+/-", scores.std())


Creating application features...
Application features created: (307511, 134)
Creating bureau features...
Bureau features created: (305811, 37)
Creating previous application features...
Previous application features created: (338857, 33)
Creating installment features...
Installments features created: (339587, 12)
Creating credit card features...
Credit card features created: (103558, 10)
Creating POS/Cash features...
POS/Cash features created: (337252, 7)
Merging all features...
After bureau: (307511, 170)
After previous: (307511, 202)
After installments: (307511, 213)
After credit_card: (307511, 222)
After pos_cash: (307511, 228)
Final feature set: (307511, 228)
Pipeline complete: (307511, 240)
CV AUC: 0.7832860947013209 +/- 0.0028697116415542364


In [27]:
# !pip install flaml lightgbm xgboost catboost --quiet

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

from flaml import AutoML


class FLAMLClassifier(BaseEstimator, ClassifierMixin):
    """
    sklearn-compatible classifier that uses FLAML under the hood.
    Works inside sklearn Pipelines (so your AutoFE+Preprocessor are CV-safe).
    """
    def __init__(self,
                 time_budget=1800,     # seconds
                 metric="roc_auc",
                 eval_method="cv",     # "cv" or "holdout"
                 n_splits=5,
                 task="classification",
                 verbose=1,
                 seed=42,
                 estimator_list=None,  # None = let FLAML choose freely
                 **flaml_kwargs):
        self.time_budget = time_budget
        self.metric = metric
        self.eval_method = eval_method
        self.n_splits = n_splits
        self.task = task
        self.verbose = verbose
        self.seed = seed
        self.estimator_list = estimator_list
        self.flaml_kwargs = flaml_kwargs
        self.automl_ = None

    def fit(self, X, y):
        automl = AutoML()
        fit_kwargs = dict(
            X_train=X,
            y_train=y,
            task=self.task,
            time_budget=self.time_budget,
            metric=self.metric,
            eval_method=self.eval_method,
            n_splits=self.n_splits if self.eval_method == "cv" else None,
            verbose=self.verbose,
            seed=self.seed,
        )
        if self.estimator_list is not None:
            fit_kwargs["estimator_list"] = self.estimator_list
        # pass any extra FLAML args (e.g., constraints)
        fit_kwargs.update(self.flaml_kwargs)

        automl.fit(**fit_kwargs)
        self.automl_ = automl
        return self

    def predict_proba(self, X):
        if self.automl_ is None:
            raise RuntimeError("Call fit() before predict_proba().")
        return self.automl_.predict_proba(X)

    def predict(self, X):
        if self.automl_ is None:
            raise RuntimeError("Call fit() before predict().")
        return self.automl_.predict(X)

    @property
    def best_estimator_(self):
        return None if self.automl_ is None else self.automl_.best_estimator

    @property
    def best_config_(self):
        return None if self.automl_ is None else self.automl_.best_config

    @property
    def best_model_(self):
        return None if self.automl_ is None else self.automl_.model


In [31]:
# Your existing instances (safe defaults for 16GB)
auto_fe = AutoFeatureEngineerTransformer(
    cols=None, base_top_k=16, max_poly_features=20, max_ratio_features=20,
    use_mi=True, random_state=42, dtype='float32'
)

prep_tf = HomeCreditPreprocessorTransformer(
    builder=build_homecredit_preprocessor,   # <-- pass the callable, don't call it here
    known_ordinal={'NAME_YIELD_GROUP': ['low_normal','low_action','middle','high','unknown']},
    known_binary=['FLAG_OWN_CAR','FLAG_OWN_REALTY','NFLAG_LAST_APPL_IN_DAY','NFLAG_INSURED_ON_APPROVAL'],
    force_categorical_like=['SELLERPLACE_AREA'],
    log1p_cols=['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','AMT_DOWN_PAYMENT']
)

fe_prep = Pipeline([
    ('auto_fe', auto_fe),
    ('prep',    prep_tf),
    ('clean',   FiniteCleaner(max_abs=1e6, force_dense=True, dtype='float32')),
])

automl_clf = FLAMLClassifier(
    time_budget=1800,
    metric="roc_auc",
    eval_method="cv",
    n_splits=5,
    verbose=2,
    seed=42,
    estimator_list=None
)

master_auto = Pipeline([
    ('fe', fe_prep),
    ('automl', automl_clf)
])

In [32]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = cross_val_score(estimator=master_auto, X=X, y=y, cv=cv, scoring='roc_auc', n_jobs=-1, error_score='raise')
print("CV AUC:", scores.mean(), "+/-", scores.std())


ValueError: Pipeline should either be a classifier to be used with response_method=predict_proba or the response_method should be 'predict'. Got a regressor with response_method=predict_proba instead.

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = cross_val_score(master, X, y, cv=cv, scoring='roc_auc',
                         n_jobs=-1, error_score='raise')
print(scores.mean(), scores.std())