# FPD Rule Mining Pipeline

End-to-end workflow to surface compact FPD mitigation rules from pre-application data.

## 0. Config & Setup
Update the configuration variables in the next cell to point at the appropriate CSV and tweak modeling thresholds if needed.

In [1]:
# Configuration
FILEPATH = "risk_analytics_case_2025.csv"  # path to source CSV
echo = None
ID_COL = "user_uuid"
TARGET_COL = "is_fpd"
UW_SCORE_COL = "acquisition_uw_score"
DROP_POST_APP_COLS = [
    "removed_nelo_app_between_first_loan_and_first_due_date",
    "days_since_most_recent_app_event_prior_to_first_due_date",
    "total_mobile_app_events_between_first_loan_and_first_due_date",
]
LOW_MISS = 0.01
HIGH_MISS = 0.99
N_SPLITS = 5
ALPHA_TE = 200.0
RANDOM_SEED = 42
MIN_RULE_IMPROVEMENT_PP = 0.2
MAX_DEPTH = 4
MIN_SAMPLES_LEAF_FRAC = 0.01
TEST_SIZE = 0.3
PLOT_STYLE = "whitegrid"
WARNING_FILTER = "ignore"
LEAF_MIN_LIFT_PP = 2.0
RARE_LEVEL_FRACTION = 0.005
BAND_GUARDRAIL_DELTA = 0.3


In [2]:
# Imports & global setup
import json
import warnings
from pathlib import Path
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text

plt.rcParams["figure.figsize"] = (10, 6)
warnings.filterwarnings(WARNING_FILTER)
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(RANDOM_SEED)
sns.set_style(PLOT_STYLE)

FILEPATH = Path(FILEPATH)
PLOTS_DIR = Path("plots") / "fpd_rule_dashboard"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR = Path("outputs") / "fpd_rule_mining"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

pd.options.display.max_columns = 200
pd.options.display.float_format = "{:,.4f}".format


### Helper functions

In [3]:
from typing import List, Sequence, Optional

def drop_columns_with_missingness(df, low_threshold, high_threshold, protected=None):
    """Drop columns with missing fraction below/above the configured thresholds."""
    if protected is None:
        protected = []
    protected_set = set(protected)
    missing_rate = df.isna().mean()
    to_drop_low = [col for col, rate in missing_rate.items() if rate < low_threshold and col not in protected_set]
    to_drop_high = [col for col, rate in missing_rate.items() if rate > high_threshold and col not in protected_set]
    drop_cols = list(dict.fromkeys(to_drop_low + to_drop_high))
    df_clean = df.drop(columns=drop_cols, errors="ignore")
    log = {"low_missing": to_drop_low, "high_missing": to_drop_high}
    return df_clean, log

def is_binary_series(series):
    """Return True if the non-null values of the series are binary (0/1)."""
    unique_vals = pd.unique(series.dropna())
    if len(unique_vals) == 0:
        return True
    try:
        unique_vals = set(np.round(unique_vals.astype(float), 8))
    except Exception:
        return False
    return unique_vals <= {0.0, 1.0}

def build_missing_flags(df, numeric_cols, categorical_cols):
    """Generate missingness indicator columns for eligible features."""
    flag_frames = []
    created_cols = []
    for col in numeric_cols:
        if col not in df.columns:
            continue
        if is_binary_series(df[col]):
            continue
        if df[col].isna().any():
            flag_col = f"{col}_missing"
            flag_frames.append(df[col].isna().astype("uint8").rename(flag_col))
            created_cols.append(flag_col)
    for col in categorical_cols:
        if col not in df.columns:
            continue
        if df[col].isna().any():
            flag_col = f"{col}_missing"
            flag_frames.append(df[col].isna().astype("uint8").rename(flag_col))
            created_cols.append(flag_col)
    if flag_frames:
        flags_df = pd.concat(flag_frames, axis=1)
    else:
        flags_df = pd.DataFrame(index=df.index)
    return flags_df, created_cols

def frequency_encode(series):
    """Frequency encode a categorical series."""
    freq = series.value_counts(normalize=True)
    encoded = series.map(freq).astype(float)
    encoded.name = f"{series.name}_freq"
    return encoded

def cross_fitted_target_encode(series, target, n_splits, alpha, random_state):
    """Cross-fitted target encoding with additive smoothing."""
    series = series.astype("category")
    target = target.astype(float)
    global_mean = target.mean()
    encoded = pd.Series(index=series.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, valid_idx in kf.split(series):
        train_series = series.iloc[train_idx]
        train_target = target.iloc[train_idx]
        stats = train_target.groupby(train_series).agg(["sum", "count"])
        smooth = (stats["sum"] + alpha * global_mean) / (stats["count"] + alpha)
        enc_map = smooth.to_dict()
        fold_values = series.iloc[valid_idx].map(enc_map).fillna(global_mean)
        encoded.iloc[valid_idx] = fold_values
    encoded.name = f"{series.name}_te"
    return encoded.astype(float)

def encode_categoricals(df, categorical_cols, target, n_splits, alpha, random_state, rare_fraction):
    """Encode categorical columns following tiered strategy."""
    encoded_parts = []
    log_rows = []
    for col in categorical_cols:
        col_series = df[col].fillna("<missing>").astype(str)
        cardinality = col_series.nunique(dropna=False)
        log_entry = {"column": col, "cardinality": cardinality}
        if cardinality <= 5:
            log_entry["encoding"] = "one-hot"
            one_hot = pd.get_dummies(col_series, prefix=col, drop_first=True, dtype="uint8")
            if not one_hot.empty:
                encoded_parts.append(one_hot)
        elif cardinality <= 30:
            log_entry["encoding"] = "one-hot + freq + target"
            value_freq = col_series.value_counts(normalize=True)
            collapsed = col_series.where(value_freq.ge(rare_fraction), other="Other")
            one_hot = pd.get_dummies(collapsed, prefix=col, drop_first=True, dtype="uint8")
            if not one_hot.empty:
                encoded_parts.append(one_hot)
            freq_enc = frequency_encode(collapsed)
            encoded_parts.append(freq_enc.to_frame())
            te_enc = cross_fitted_target_encode(collapsed, target, n_splits=n_splits, alpha=alpha, random_state=random_state)
            encoded_parts.append(te_enc.to_frame())
        else:
            log_entry["encoding"] = "freq + target"
            freq_enc = frequency_encode(col_series)
            encoded_parts.append(freq_enc.to_frame())
            te_enc = cross_fitted_target_encode(col_series, target, n_splits=n_splits, alpha=alpha, random_state=random_state)
            encoded_parts.append(te_enc.to_frame())
        log_rows.append(log_entry)
    if encoded_parts:
        encoded_df = pd.concat(encoded_parts, axis=1)
    else:
        encoded_df = pd.DataFrame(index=df.index)
    encoding_log = pd.DataFrame(log_rows)
    return encoded_df, encoding_log

def create_risk_band(series, q=6):
    """Create quantile-based risk bands from the UW score."""
    labels = list(range(1, q + 1))
    try:
        bands = pd.qcut(series, q=q, labels=labels)
    except ValueError:
        bands = pd.qcut(series, q=q, labels=labels, duplicates="drop")
        categories = [cat for cat in bands.cat.categories]
        relabel = {cat: idx + 1 for idx, cat in enumerate(categories)}
        bands = bands.map(relabel)
        return bands.astype(int)
    return bands.astype(int)

def extract_tree_paths(clf, feature_names):
    """Extract decision rules for each leaf of the trained tree."""
    tree = clf.tree_
    paths = {}
    def recurse(node, depth, path_clauses):
        if tree.feature[node] != -2:
            feature = feature_names[tree.feature[node]]
            threshold = tree.threshold[node]
            left_clause = f"{feature} <= {threshold:.6f}"
            right_clause = f"{feature} > {threshold:.6f}"
            recurse(tree.children_left[node], depth + 1, path_clauses + [left_clause])
            recurse(tree.children_right[node], depth + 1, path_clauses + [right_clause])
        else:
            paths[node] = {
                "depth": depth,
                "clauses": path_clauses,
                "rule": " & ".join(path_clauses),
            }
    recurse(0, 0, [])
    return paths

def extract_rule_columns(rule):
    """Identify column names referenced inside a rule string."""
    pattern = r"([A-Za-z0-9_]+)\s*(?:<=|>=|==|<|>)"
    return list(dict.fromkeys(re.findall(pattern, rule)))

def mask_from_rule(rule, df_like):
    """Return boolean mask of rows satisfying the rule expression."""
    if not rule:
        return pd.Series(False, index=df_like.index)
    columns = extract_rule_columns(rule)
    missing = [col for col in columns if col not in df_like.columns]
    if missing:
        raise KeyError(f"Columns not found for rule '{rule}': {missing}")
    mask = df_like.eval(rule, engine="python")
    if mask.dtype != bool:
        mask = mask.astype(bool)
    return mask

def evaluate_rules(df_like, target, rules):
    """Evaluate each rule independently and compute its marginal impact."""
    columns = [
        "rule",
        "rule_rank",
        "removed_n",
        "removed_pct",
        "kept_n",
        "new_fpd_pct",
        "fpd_reduction_pp",
        "efficiency",
        "purity_pct",
    ]
    if not rules:
        return pd.DataFrame(columns=columns)
    total = len(df_like)
    base_rate = target.mean() * 100
    records = []
    for idx, rule in enumerate(rules, start=1):
        mask = mask_from_rule(rule, df_like)
        removed_n = int(mask.sum())
        removed_pct = removed_n / total * 100 if total else 0.0
        kept_mask = ~mask
        kept_n = int(kept_mask.sum())
        if kept_n == 0:
            new_rate = np.nan
            reduction = np.nan
        else:
            new_rate = target[kept_mask].mean() * 100
            reduction = base_rate - new_rate
        purity = target[mask].mean() * 100 if removed_n > 0 else np.nan
        efficiency = reduction / removed_pct if removed_pct else np.nan
        records.append({
            "rule": rule,
            "rule_rank": idx,
            "removed_n": removed_n,
            "removed_pct": removed_pct,
            "kept_n": kept_n,
            "new_fpd_pct": new_rate,
            "fpd_reduction_pp": reduction,
            "efficiency": efficiency,
            "purity_pct": purity,
        })
    result_df = pd.DataFrame(records)
    result_df = result_df.sort_values(by=["efficiency", "fpd_reduction_pp"], ascending=False).reset_index(drop=True)
    return result_df

def evaluate_rules_cumulative(df_like, target, rules, bands=None, band_universe=None):
    """Apply rules cumulatively and track stage-level metrics."""
    total = len(df_like)
    rows = []
    baseline_rate = target.mean() * 100
    if bands is not None:
        bands = bands.astype(int)
        if band_universe is None:
            band_universe = sorted({int(x) for x in bands.dropna().unique()})
    else:
        band_universe = []
    def stage_row(stage_idx, stage_label, mask, marginal_mask):
        kept_mask = ~mask
        kept_n = int(kept_mask.sum())
        removed_n = int(mask.sum())
        kept_pct = kept_n / total * 100 if total else 0.0
        removed_pct = removed_n / total * 100 if total else 0.0
        bad_n = int(target[kept_mask].sum())
        good_n = kept_n - bad_n
        overall_fpd = target[kept_mask].mean() * 100 if kept_n else np.nan
        row = {
            "stage": stage_idx,
            "stage_label": stage_label,
            "kept_n": kept_n,
            "kept_pct": kept_pct,
            "removed_n": removed_n,
            "removed_pct": removed_pct,
            "overall_fpd_pct": overall_fpd,
            "fpd_reduction_pp": baseline_rate - overall_fpd if kept_n else np.nan,
            "bad_n": bad_n,
            "good_n": good_n,
            "marginal_removed_n": int(marginal_mask.sum()),
            "marginal_removed_pct": marginal_mask.sum() / total * 100 if total else 0.0,
            "marginal_bad_n": int(target[marginal_mask].sum()),
            "marginal_good_n": int(marginal_mask.sum() - target[marginal_mask].sum()),
            "marginal_purity_pct": target[marginal_mask].mean() * 100 if marginal_mask.sum() else np.nan,
        }
        if bands is not None:
            kept_bands = bands[kept_mask]
            kept_target = target[kept_mask]
            band_rates = kept_target.groupby(kept_bands).mean() if kept_n else pd.Series(dtype=float)
            band_rates = band_rates.reindex(band_universe, fill_value=np.nan)
            for band_id, value in band_rates.items():
                row[f"band_{band_id}_fpd_pct"] = value * 100 if pd.notna(value) else np.nan
        return row
    empty_mask = pd.Series(False, index=df_like.index)
    rows.append(stage_row(0, "Baseline", empty_mask, empty_mask))
    cumulative_mask = empty_mask.copy()
    for idx, rule in enumerate(rules, start=1):
        rule_mask = mask_from_rule(rule, df_like)
        marginal_mask = rule_mask & (~cumulative_mask)
        cumulative_mask = cumulative_mask | rule_mask
        stage_label = "After " + "+".join([f"R{i}" for i in range(1, idx + 1)])
        rows.append(stage_row(idx, stage_label, cumulative_mask, marginal_mask))
    return pd.DataFrame(rows)

def automatic_rule_selection(rules, df_like, target, bands, min_delta_pp, band_guardrail_pp=None, band_universe=None):
    """Sequentially add rules until marginal improvement falls below threshold or guardrails trigger."""
    selected = []
    if not rules:
        return selected
    cumulative_mask = pd.Series(False, index=df_like.index)
    prev_rate = target.mean() * 100
    if bands is not None:
        bands = bands.astype(int)
        if band_universe is None:
            band_universe = sorted({int(x) for x in bands.dropna().unique()})
        prev_band_rates = target.groupby(bands).mean().reindex(band_universe, fill_value=np.nan) * 100
    else:
        prev_band_rates = pd.Series(dtype=float)
    for rule in rules:
        rule_mask = mask_from_rule(rule, df_like)
        new_mask = cumulative_mask | rule_mask
        kept_mask = ~new_mask
        kept_n = kept_mask.sum()
        if kept_n == 0:
            break
        new_rate = target[kept_mask].mean() * 100
        improvement = prev_rate - new_rate
        if improvement < min_delta_pp:
            break
        if bands is not None:
            new_band_rates = target[kept_mask].groupby(bands[kept_mask]).mean().reindex(band_universe, fill_value=np.nan) * 100
            if band_guardrail_pp is not None:
                increases = new_band_rates - prev_band_rates
                if (increases > band_guardrail_pp).any():
                    break
            prev_band_rates = new_band_rates
        selected.append(rule)
        cumulative_mask = new_mask
        prev_rate = new_rate
    return selected

def shorten_rule(rule, max_terms=3):
    """Create a shorter display version of a rule string."""
    clauses = [clause.strip() for clause in rule.split("&")]
    clauses = [clause for clause in clauses if clause]
    if len(clauses) <= max_terms:
        return " & ".join(clauses)
    return " & ".join(clauses[:max_terms]) + " ..."


## 1. Load & Basic Sanity

In [4]:
df_raw = pd.read_csv(FILEPATH)
if df_raw.columns[0].lower().startswith("unnamed"):
    df_raw = pd.read_csv(FILEPATH, index_col=0)

initial_shape = df_raw.shape
missing_subset = df_raw[TARGET_COL].isna() | df_raw[UW_SCORE_COL].isna()
dropped_rows = int(missing_subset.sum())
df_raw = df_raw.dropna(subset=[TARGET_COL, UW_SCORE_COL])
post_filter_shape = df_raw.shape

print(f"Initial shape: {initial_shape}")
print(f"Rows dropped due to missing target/UW score: {dropped_rows}")
print(f"Shape after filter: {post_filter_shape}")
print(f"Baseline FPD rate: {df_raw[TARGET_COL].mean() * 100:.2f}%")

display(df_raw.head())


Initial shape: (182462, 218)
Rows dropped due to missing target/UW score: 0
Shape after filter: (182462, 218)
Baseline FPD rate: 25.00%


Unnamed: 0,user_uuid,first_due_date_vintage,first_loan_principal,credit_limit,limit_utilization,uw_bucket,acquisition_uw_score,fraud_score,is_fpd,first_loan_type,acquisition_channel,bureau_1_credit_lines_in_use,bureau_1_credit_lines_total,bureau_1_current_credit_balance,bureau_1_credit_lines_past_due,bureau_1_current_balance_past_due,bureau_1_credit_lines_past12_months,bureau_1_credit_amount_past12_months,bureau_1_credit_lines_past24_months,bureau_1_credit_amount_past24_months,bureau_1_recent_credit_inquiries,bureau_1_max_external_credit_limit,bureau_1_active_lines,bureau_1_loan_types_count,bureau_1_avg_days_credit,bureau_1_oldest_account_duration,bureau_1_newest_account_duration,bureau_1_pl_credits_count,bureau_1_cc_credits_count,bureau_1_pn_credits_count,bureau_1_cl_credits_count,bureau_1_af_credits_count,bureau_1_cac_credits_count,bureau_1_max_external_active_limit,bureau_1_min_external_active_limit,bureau_1_total_external_active_limit,bureau_1_avg_loan_types,bureau_1_perc_active_lines,bureau_1_perc_lines_in_use,bureau_1_debt_ratio_max_limit,bureau_1_debt_ratio_total_limit,bureau_1_over_utilized,bureau_1_utilization,bureau_2_credit_lines_in_use,bureau_2_credit_lines_total,bureau_2_current_credit_balance,bureau_2_credit_lines_past_due,bureau_2_current_balance_past_due,bureau_2_credit_lines_past12_months,bureau_2_credit_amount_past12_months,bureau_2_credit_lines_past24_months,bureau_2_credit_amount_past24_months,bureau_2_recent_credit_inquiries,bureau_2_max_external_credit_limit,bureau_2_active_lines,bureau_2_loan_types_count,bureau_2_avg_days_credit,bureau_2_oldest_account_duration,bureau_2_newest_account_duration,bureau_2_pl_credits_count,bureau_2_cc_credits_count,bureau_2_pn_credits_count,bureau_2_cl_credits_count,bureau_2_af_credits_count,bureau_2_max_external_active_limit,bureau_2_min_external_active_limit,bureau_2_total_external_active_limit,bureau_2_avg_loan_types,bureau_2_perc_active_lines,bureau_2_perc_lines_in_use,bureau_2_debt_ratio_max_limit,bureau_2_debt_ratio_total_limit,bureau_2_over_utilized,bureau_2_utilization,bureau_2_consulted,device_manufacturer,hotspots_overdue_ratio,hotspots_overdue_ratio_near,pi_age_years,pi_occupation,pi_has_signup_merchant,pi_reported_income,referrer_unique_referrers,renapo_registration_year_null,renapo_curp_status_rcn,renapo_curp_status_bd,renapo_curp_status_ah,renapo_curp_status_an,renapo_curp_status_rcc,renapo_curp_status_bsu,renapo_curp_status_null,shared_devices_count,shared_devices_overdue,vendor_2_has_users_similar_address,vendor_2_email_age_category,vendor_2_has_digit_normalized_email_address,vendor_2_has_latest_name,vendor_2_has_raw_ip,vendor_2_has_network,vendor_2_has_email_domain,...,apps_has_instagram,apps_has_discord,apps_has_twitter,apps_has_outlook,apps_has_chrome,apps_has_amazon,apps_has_alibaba,apps_has_mercadolibre,apps_has_liverpool,apps_has_tencent,apps_has_uber,apps_has_didi,apps_has_banamex,apps_has_bbva,apps_has_nubank,apps_has_santander,apps_has_amex,apps_shady_credit_apps_count,imss_employments,imss_min_days,imss_max_days,imss_min_monthly_income,imss_max_monthly_income,imss_total_monthly_income,imss_min_mode,imss_max_mode,avg_imss_income,avg_self_reported_income,history_tc_max_good_standing_strike,credits_months_since_earliest_recent_fecha_cierre_cuenta,credits_total_count,credits_months_since_earliest_recent_fecha_ultima_compra,credits_months_since_earliest_recent_fecha_peor_atraso,history_tc_active_max_good_standing_strike,history_tc_total_good_standing_reports,credits_cc_saldo_vencido,credits_pp_limite_credito,credits_cc_limite_credito,history_cc_max_good_standing_strike,credit_check_r_months_since_earliest_credit_check,history_pp_max_good_standing_strike,credits_pp_saldo_vencido,credits_total_limite_credito,credit_check_r_total_approved_amount,credit_check_soc_fin_ob_mult_months_since_earliest_credit_check,credits_tc_max_peor_atraso,credit_check_f_in_last_3_months_approved_amount,credit_check_f_in_last_12_months_approved_amount,credit_check_r_months_since_last_credit_check,credits_cc_max_peor_atraso,credits_tc_limite_credito,credit_check_microfinanciera_total_approved_amount,credit_check_sic_in_last_12_months,credit_check_compania_prestamo_personal_total_approved_amount,credits_total_active_monto_pagar,credit_check_r_in_last_3_months_approved_amount,history_tc_max_reported_events,credits_pp_credito_maximo,credit_check_f_total_approved_amount,credits_saldo_vencido,credits_tc_credito_maximo,credits_tc_saldo_vencido,history_tc_active_max_reported_events,credits_months_since_most_recent_fecha_peor_atraso,credits_pp_max_peor_atraso,credits_total_monto_pagar,history_pp_total_good_standing_reports,credit_check_microfinanciera_months_since_last_credit_check,credit_check_bancos_months_since_earliest_credit_check,credit_check_q_in_last_3_months_approved_amount,history_cc_active_max_reported_events,credit_check_q_months_since_earliest_credit_check,credit_check_bancos_months_since_last_credit_check,credit_check_soc_fin_de_obj_multiple_total_approved_amount,credit_check_sic_months_since_earliest_credit_check,credits_total_credito_maximo,credits_tc_active_credito_maximo,credits_pp_active_credito_maximo,vendor_1_telegram_privacy_status,vendor_1_whatsapp_privacy_status,vendor_1_registered_profiles,vendor_1_registered_ecommerce_profiles,vendor_1_registered_email_provider_profiles,vendor_1_registered_messaging_profiles,vendor_1_registered_professional_profiles,vendor_1_registered_social_media_profiles,vendor_1_number_of_photos_returned,vendor_1_google_registered,vendor_1_telegram_registered,vendor_1_whatsapp_registered,vendor_1_microsoft_registered,vendor_1_facebook_registered,vendor_1_instagram_registered,vendor_1_twitter_registered,vendor_1_amazon_registered,vendor_1_linkedin_registered,user_state,removed_nelo_app_between_first_loan_and_first_due_date,days_since_most_recent_app_event_prior_to_first_due_date,total_mobile_app_events_between_first_loan_and_first_due_date
0,9df3f75a-3006-40ba-aa60-9d1384b65bbe,2024-04-15,300.0,350.0,0.8571,[0.88‑0.92),0.8801,0.4189,0,loan_type_1,source_a,6.0,26.0,66180.0,2.0,2136.0,20.0,99849.0,23.0,106983.0,6.0,2700.0,8.0,3.0,176.36,4447.0,38.0,21.0,0.0,0.0,3.0,0.0,2.0,2700.0,0.0,3699.0,0.1154,0.3077,0.2308,24.5111,17.8913,1.0,17.8913,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,oppo,0.2122,0.2267,31.0,private_employee,0.0,20000.0,,,,,,,,,,0.0,0.0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,12233.69,12919.7468,,1.0,26.0,1.0,2.0,,0.0,0.0,26200.0,0.0,23.0,,14.0,1579.0,27199.0,,8.0,0.0,0.0,0.0,,1.0,0.0,6000.0,1.0,2000.0,6070.0,,,45369.0,0.0,2136.0,0.0,0.0,,35.0,6.0,6070.0,62.0,22.0,0.0,,33.0,,6.0,1000.0,11.0,110383.0,0.0,15069.0,,PRIVATE,5.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,,1.0,0.0,1.0,1.0,0.0,1.0,,MEX,0.0,0.0,614.0
1,64551791-0c2e-4cbf-aa10-c1194587fec2,2024-05-31,350.0,750.0,0.4667,[0.88‑0.92),0.9198,0.4016,0,loan_type_2,source_d,9.0,38.0,135224.0,5.0,40349.0,6.0,44367.0,12.0,222829.0,6.0,25000.0,9.0,4.0,67.6216,2528.0,26.0,26.0,6.0,0.0,4.0,0.0,2.0,25000.0,0.0,55640.0,0.1053,0.2368,0.2368,5.409,2.4303,1.0,2.4303,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,apple,0.2727,0.2389,36.0,public_employee,0.0,24000.0,,,,,,,,,,0.0,0.0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,10528.26,49296.0,18.0,0.0,38.0,0.0,0.0,18.0,64.0,0.0,59550.0,3000.0,40.0,,40.0,25.0,123290.0,,10.0,7.0,2000.0,2400.0,,1.0,48500.0,3000.0,5.0,2000.0,59088.0,,24.0,351814.0,6400.0,40349.0,50596.0,37491.0,24.0,65.0,6.0,59088.0,435.0,18.0,,,,,,4400.0,0.0,490097.0,19480.0,162296.0,,PRIVATE,3.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,,1.0,0.0,0.0,1.0,0.0,0.0,,MEX,0.0,2.0,963.0
2,26b2b091-20da-43a8-a2d1-e0a0e2c8282f,2024-05-31,457.58,500.0,0.9152,[0.84‑0.88),0.8637,0.4471,1,loan_type_3,source_b,3.0,31.0,5996.0,5.0,11228.0,0.0,0.0,3.0,3137.0,2.0,0.0,3.0,5.0,213.8667,6859.0,443.0,21.0,1.0,0.0,0.0,3.0,1.0,3000.0,1200.0,6200.0,0.1613,0.0968,0.0968,,0.9671,0.9671,0.9671,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,xiaomi,0.2781,0.2625,53.0,self_employed,0.0,7000.0,,,,,,,,,,0.0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,12539.2,13928.8823,0.0,2.0,26.0,14.0,0.0,0.0,0.0,486.0,3600.0,3000.0,1.0,,24.0,7848.0,54500.0,,,13.0,,,,27.0,2000.0,,0.0,,5672.0,,24.0,270512.0,,11228.0,2310.0,2894.0,24.0,28.0,36.0,5672.0,251.0,,,,29.0,,,,,319459.0,2310.0,1200.0,,PUBLIC,4.0,1.0,0.0,1.0,0.0,2.0,1.0,,,1.0,0.0,1.0,1.0,0.0,1.0,,VER,0.0,1.0,1080.0
3,ed80a4a7-4384-40ef-9b23-2f639f308eb9,2024-04-30,500.0,950.0,0.5263,[0.88‑0.92),0.9047,0.5024,1,loan_type_2,source_a,7.0,47.0,50765.0,1.0,11864.0,6.0,36800.0,7.0,46800.0,7.0,11100.0,7.0,8.0,195.2174,9022.0,42.0,7.0,4.0,0.0,2.0,0.0,6.0,16000.0,0.0,32801.0,0.1702,0.1489,0.1489,4.5734,1.5477,1.0,1.5477,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,samsung,0.2273,0.1733,47.0,self_employed,0.0,3000.0,,,,,,,,,,0.0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,11088.8,12635.2137,24.0,26.0,19.0,0.0,70.0,24.0,35.0,0.0,15500.0,0.0,10.0,8.0,15.0,0.0,36801.0,0.0,,13.0,0.0,0.0,8.0,0.0,21001.0,3000.0,1.0,2000.0,14449.0,,24.0,43012.0,0.0,11864.0,31236.0,11864.0,24.0,70.0,0.0,14449.0,34.0,23.0,8.0,,,,8.0,,3.0,113915.0,28854.0,15500.0,,PRIVATE,4.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,,1.0,0.0,1.0,1.0,0.0,0.0,,CHIS,0.0,0.0,1225.0
4,e91919b2-fdb3-4ae8-ad80-2760e35b3b77,2024-02-15,250.0,550.0,0.4545,[0.92‑0.96),0.9414,0.4019,0,loan_type_2,source_a,8.0,19.0,25175.0,3.0,18471.0,16.0,16900.0,16.0,16900.0,4.0,2800.0,7.0,2.0,162.1667,2934.0,15.0,15.0,4.0,0.0,0.0,0.0,0.0,2800.0,0.0,10200.0,0.1053,0.3684,0.4211,8.9911,2.4681,1.0,2.4681,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,samsung,0.2275,0.2567,34.0,self_employed,0.0,16000.0,,,,,,,,,,0.0,0.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,30.0,11587.5,11587.5,11587.5,10.0,10.0,15448.09,18187.4879,16.0,0.0,19.0,0.0,9.0,16.0,36.0,0.0,15600.0,0.0,,,6.0,0.0,30100.0,,2.0,13.0,200.0,200.0,,0.0,14500.0,6000.0,2.0,1000.0,8822.0,,24.0,16400.0,200.0,18471.0,19029.0,18471.0,21.0,26.0,0.0,13133.0,39.0,23.0,,,,,,1200.0,9.0,35429.0,2419.0,6800.0,,PRIVATE,4.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,,1.0,0.0,1.0,0.0,0.0,1.0,,NL,0.0,1.0,300.0


## 2. Cleaning & Missing Flags

In [5]:
df_work = df_raw.copy()
if ID_COL and ID_COL in df_work.columns:
    df_work[ID_COL] = df_work[ID_COL].astype(str)

post_app_to_drop = [col for col in DROP_POST_APP_COLS if col in df_work.columns]
df_work = df_work.drop(columns=post_app_to_drop, errors="ignore")

protected_cols = [TARGET_COL, UW_SCORE_COL]
if ID_COL:
    protected_cols.append(ID_COL)

df_work, missing_drop_log = drop_columns_with_missingness(
    df_work,
    low_threshold=LOW_MISS,
    high_threshold=HIGH_MISS,
    protected=protected_cols,
)

numeric_cols = df_work.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df_work.select_dtypes(include=["object", "category"]).columns.tolist()

flags_df, flag_cols = build_missing_flags(df_work, numeric_cols, categorical_cols)
if not flags_df.empty:
    df_work = pd.concat([df_work, flags_df], axis=1)

column_governance_log = {
    "post_app_dropped": post_app_to_drop,
    "low_missing_dropped": missing_drop_log["low_missing"],
    "high_missing_dropped": missing_drop_log["high_missing"],
    "missing_flags_created": flag_cols,
}

df_work[TARGET_COL] = df_work[TARGET_COL].astype(int)

print("Column governance summary:")
for key, values in column_governance_log.items():
    print(f"  {key}: {len(values)}")


Column governance summary:
  post_app_dropped: 3
  low_missing_dropped: 63
  high_missing_dropped: 11
  missing_flags_created: 105


## 3. Risk Banding

In [6]:
df_work[UW_SCORE_COL] = pd.to_numeric(df_work[UW_SCORE_COL], errors="coerce")
df_work = df_work.dropna(subset=[UW_SCORE_COL])
df_work["risk_band"] = create_risk_band(df_work[UW_SCORE_COL], q=6)

band_summary = (
    df_work.groupby("risk_band")[TARGET_COL]
    .agg(count="count", bad_rate="mean")
    .assign(fpd_pct=lambda d: d["bad_rate"] * 100)
)

print("Risk band counts:")
print(df_work["risk_band"].value_counts().sort_index())
print()
print("FPD by band:")
display(band_summary)
print(f"Monotonic FPD by band: {band_summary['bad_rate'].is_monotonic_increasing}")


Risk band counts:
risk_band
1    30411
2    30410
3    30410
4    30410
5    30410
6    30411
Name: count, dtype: int64

FPD by band:


Unnamed: 0_level_0,count,bad_rate,fpd_pct
risk_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,30411,0.4119,41.1923
2,30410,0.2858,28.5761
3,30410,0.2463,24.6333
4,30410,0.2193,21.9336
5,30410,0.1887,18.8688
6,30411,0.1479,14.794


Monotonic FPD by band: False


## 4. Categorical Encoders & Feature Assembly

In [7]:
categorical_cols = [
    col for col in df_work.select_dtypes(include=["object", "category"]).columns
    if col not in {TARGET_COL}
]
if ID_COL in categorical_cols:
    categorical_cols.remove(ID_COL)

encoded_cats, encoding_log = encode_categoricals(
    df_work,
    categorical_cols,
    df_work[TARGET_COL],
    n_splits=N_SPLITS,
    alpha=ALPHA_TE,
    random_state=RANDOM_SEED,
    rare_fraction=RARE_LEVEL_FRACTION,
)

numeric_cols = [
    col for col in df_work.select_dtypes(include=[np.number]).columns
    if col not in {TARGET_COL, "risk_band"}
]
if ID_COL in numeric_cols:
    numeric_cols.remove(ID_COL)

X_num = df_work[numeric_cols]
X_cat = encoded_cats
X = pd.concat([X_num, X_cat], axis=1)
y = df_work[TARGET_COL].astype(int)
bands = df_work["risk_band"].astype(int)

print(f"Numeric features: {X_num.shape[1]}")
print(f"Categorical encodings: {X_cat.shape[1]}")
print(f"Total feature matrix shape: {X.shape}")

display(encoding_log.head())


Numeric features: 241
Categorical encodings: 6
Total feature matrix shape: (182462, 247)


Unnamed: 0,column,cardinality,encoding
0,phone_carrier,207,freq + target
1,vendor_1_telegram_privacy_status,3,one-hot
2,vendor_1_whatsapp_privacy_status,3,one-hot


## 5. Exploratory Signals

In [8]:
band_universe = list(range(1, 7))

numeric_for_corr = X_num.select_dtypes(include=[np.number])
corr_series = numeric_for_corr.apply(lambda col: col.corr(y)).dropna()
corr_df = (
    pd.DataFrame({"corr": corr_series, "abs_corr": corr_series.abs()})
    .sort_values("abs_corr", ascending=False)
)

mi_input = numeric_for_corr.fillna(numeric_for_corr.median())
mi_scores = mutual_info_classif(mi_input, y, random_state=RANDOM_SEED)
mi_df = pd.DataFrame({"feature": mi_input.columns, "mutual_info": mi_scores}).set_index("feature")

ranking = corr_df.join(mi_df, how="outer")
ranking["corr_rank"] = ranking["abs_corr"].rank(ascending=False, method="dense")
ranking["mi_rank"] = ranking["mutual_info"].rank(ascending=False, method="dense")
ranking["high_both"] = (ranking["corr_rank"] <= 15) & (ranking["mi_rank"] <= 15)

print("Top features by correlation / mutual information overlap:")
display(ranking.sort_values(["high_both", "abs_corr"], ascending=[False, False]).head(20))

top_features = ranking.sort_values("abs_corr", ascending=False).head(12).index.tolist()
band_corr_records = {}
for band_id in band_universe:
    mask = bands == band_id
    if mask.sum() < 10:
        continue
    band_corr_records[band_id] = {
        feature: X_num.loc[mask, feature].corr(y[mask])
        for feature in top_features
    }

band_corr_df = pd.DataFrame(band_corr_records).T[top_features]
plt.figure(figsize=(12, 6))
sns.heatmap(band_corr_df, cmap="coolwarm", center=0, annot=True, fmt=".2f")
plt.title("Correlation of Top Numerical Features vs FPD by Risk Band")
plt.xlabel("Feature")
plt.ylabel("Risk Band")
plt.tight_layout()
plt.savefig(PLOTS_DIR / "band_feature_correlation_heatmap.png", dpi=150)
plt.close()


Top features by correlation / mutual information overlap:


Unnamed: 0,corr,abs_corr,mutual_info,corr_rank,mi_rank,high_both
bureau_2_cc_credits_count,-0.134,0.134,0.0281,2.0,5.0,True
bureau_2_avg_loan_types,0.1196,0.1196,0.0263,3.0,13.0,True
bureau_2_perc_lines_in_use,0.0995,0.0995,0.0263,4.0,12.0,True
acquisition_uw_score,-0.2169,0.2169,0.0246,1.0,25.0,False
history_tc_total_good_standing_reports,-0.0862,0.0862,0.0045,5.0,119.0,False
bureau_2_oldest_account_duration,-0.0859,0.0859,0.015,6.0,83.0,False
bureau_1_avg_loan_types,0.0844,0.0844,0.0055,7.0,115.0,False
bureau_2_credit_lines_total,-0.0833,0.0833,0.0213,8.0,60.0,False
bureau_2_perc_active_lines,0.0817,0.0817,0.0225,9.0,50.0,False
history_tc_active_max_good_standing_strike_missing,0.0796,0.0796,0.0122,10.0,90.0,False


## 6. Decision Tree & Leaf Rules

In [9]:
imputer = SimpleImputer(strategy="median")
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

min_samples_leaf = max(int(MIN_SAMPLES_LEAF_FRAC * len(X_train)), 1)
clf = DecisionTreeClassifier(
    max_depth=MAX_DEPTH,
    min_samples_leaf=min_samples_leaf,
    random_state=RANDOM_SEED,
    class_weight="balanced",
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

tree_metrics = {
    "auc": roc_auc_score(y_test, y_proba),
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred, zero_division=0),
    "recall": recall_score(y_test, y_pred, zero_division=0),
}
print("Tree metrics:")
for metric, value in tree_metrics.items():
    print(f"  {metric}: {value:.3f}")

print()
print("Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test, y_pred))

print()
print("Classification report:")
print(classification_report(y_test, y_pred, digits=3))

tree_text = export_text(clf, feature_names=list(X_imputed.columns), decimals=2)
print("Decision tree structure:")
print(tree_text)

leaf_ids = clf.apply(X_imputed)
leaf_paths = extract_tree_paths(clf, list(X_imputed.columns))
base_rate_pct = y.mean() * 100

leaf_records = []
for leaf_id in np.unique(leaf_ids):
    node_info = leaf_paths.get(leaf_id, {"rule": "", "depth": 0, "clauses": []})
    mask = leaf_ids == leaf_id
    n_samples = int(mask.sum())
    bad_n = int(y[mask].sum())
    good_n = n_samples - bad_n
    fpd_rate_pct = (bad_n / n_samples) * 100 if n_samples else np.nan
    kept_mask = ~mask
    kept_rate_pct = y[kept_mask].mean() * 100 if kept_mask.sum() else np.nan
    fpd_reduction_pp = base_rate_pct - kept_rate_pct if kept_mask.sum() else np.nan
    removed_pct = n_samples / len(X_imputed) * 100 if len(X_imputed) else 0.0
    efficiency = fpd_reduction_pp / removed_pct if removed_pct else np.nan
    leaf_records.append({
        "leaf_id": int(leaf_id),
        "rule": node_info["rule"],
        "depth": int(node_info["depth"]),
        "n_samples": n_samples,
        "n_bad": bad_n,
        "n_good": good_n,
        "fpd_rate_pct": fpd_rate_pct,
        "removed_pct": removed_pct,
        "fpd_reduction_pp": fpd_reduction_pp,
        "efficiency": efficiency,
        "is_candidate": bool(
            (fpd_rate_pct - base_rate_pct) >= LEAF_MIN_LIFT_PP and
            (n_samples / len(X_imputed)) >= MIN_SAMPLES_LEAF_FRAC
        ),
    })

leaf_summary_df = pd.DataFrame(leaf_records).sort_values("efficiency", ascending=False)
display(leaf_summary_df)


Tree metrics:
  auc: 0.633
  accuracy: 0.674
  precision: 0.369
  recall: 0.427

Confusion matrix (rows=true, cols=pred):
[[31081  9973]
 [ 7848  5837]]

Classification report:
              precision    recall  f1-score   support

           0      0.798     0.757     0.777     41054
           1      0.369     0.427     0.396     13685

    accuracy                          0.674     54739
   macro avg      0.584     0.592     0.586     54739
weighted avg      0.691     0.674     0.682     54739

Decision tree structure:
|--- acquisition_uw_score <= 0.84
|   |--- acquisition_uw_score <= 0.69
|   |   |--- acquisition_uw_score <= 0.60
|   |   |   |--- acquisition_uw_score <= 0.52
|   |   |   |   |--- class: 1
|   |   |   |--- acquisition_uw_score >  0.52
|   |   |   |   |--- class: 1
|   |   |--- acquisition_uw_score >  0.60
|   |   |   |--- phone_carrier_te <= 0.23
|   |   |   |   |--- class: 1
|   |   |   |--- phone_carrier_te >  0.23
|   |   |   |   |--- class: 1
|   |--- acquisitio

Unnamed: 0,leaf_id,rule,depth,n_samples,n_bad,n_good,fpd_rate_pct,removed_pct,fpd_reduction_pp,efficiency,is_candidate
0,4,acquisition_uw_score <= 0.838233 & acquisition...,4,2002,1353,649,67.5824,1.0972,0.4724,0.4306,True
1,5,acquisition_uw_score <= 0.838233 & acquisition...,4,1841,1071,770,58.1749,1.009,0.3381,0.3351,True
6,14,acquisition_uw_score <= 0.838233 & acquisition...,4,2365,1247,1118,52.7273,1.2962,0.3641,0.2809,True
3,8,acquisition_uw_score <= 0.838233 & acquisition...,4,2285,1193,1092,52.2101,1.2523,0.3451,0.2756,True
2,7,acquisition_uw_score <= 0.838233 & acquisition...,4,1879,866,1013,46.0883,1.0298,0.2194,0.2131,True
7,15,acquisition_uw_score <= 0.838233 & acquisition...,4,4420,1871,2549,42.3303,2.4224,0.4302,0.1776,True
4,11,acquisition_uw_score <= 0.838233 & acquisition...,4,6977,2612,4365,37.4373,3.8238,0.4945,0.1293,True
10,22,acquisition_uw_score > 0.838233 & acquisition_...,4,7564,2632,4932,34.7964,4.1455,0.4237,0.1022,True
5,12,acquisition_uw_score <= 0.838233 & acquisition...,4,16609,4835,11774,29.1107,9.1027,0.4117,0.0452,True
11,23,acquisition_uw_score > 0.838233 & acquisition_...,4,6372,1714,4658,26.8989,3.4922,0.0687,0.0197,False


## 7. Rule Evaluation Engine

In [10]:
df_for_rules = X_imputed.copy()
df_for_rules["risk_band"] = bands

candidate_rules = [rule for rule in leaf_summary_df.loc[leaf_summary_df["is_candidate"], "rule"] if rule]

single_rule_impact_df = evaluate_rules(df_for_rules, y, candidate_rules)
single_rule_impact_df = single_rule_impact_df.merge(
    leaf_summary_df[["rule", "n_samples", "n_bad", "n_good", "depth", "fpd_rate_pct"]],
    on="rule",
    how="left",
)

single_rule_impact_df["rule_label"] = [f"R{i+1}" for i in range(len(single_rule_impact_df))]
single_rule_impact_df["rule_short"] = single_rule_impact_df["rule"].apply(shorten_rule)

print(f"Candidate rules evaluated: {len(candidate_rules)}")
display(single_rule_impact_df.head(10))

selected_rules = automatic_rule_selection(
    single_rule_impact_df["rule"].tolist(),
    df_for_rules,
    y,
    bands,
    min_delta_pp=MIN_RULE_IMPROVEMENT_PP,
    band_guardrail_pp=BAND_GUARDRAIL_DELTA,
    band_universe=band_universe,
)

print(f"Selected rules (auto-stopping): {len(selected_rules)}")
for idx, rule in enumerate(selected_rules, start=1):
    print(f"R{idx}: {rule}")

cumulative_eval_df = evaluate_rules_cumulative(
    df_for_rules,
    y,
    selected_rules,
    bands=bands,
    band_universe=band_universe,
)

display(cumulative_eval_df)

band_cols = [col for col in cumulative_eval_df.columns if col.startswith("band_")]
band_fpd_matrix = cumulative_eval_df[["stage_label"] + band_cols].set_index("stage_label")


Candidate rules evaluated: 9


Unnamed: 0,rule,rule_rank,removed_n,removed_pct,kept_n,new_fpd_pct,fpd_reduction_pp,efficiency,purity_pct,n_samples,n_bad,n_good,depth,fpd_rate_pct,rule_label,rule_short
0,acquisition_uw_score <= 0.838233 & acquisition...,1,2002,1.0972,180460,24.5273,0.4724,0.4306,67.5824,2002,1353,649,4,67.5824,R1,acquisition_uw_score <= 0.838233 & acquisition...
1,acquisition_uw_score <= 0.838233 & acquisition...,2,1841,1.009,180621,24.6616,0.3381,0.3351,58.1749,1841,1071,770,4,58.1749,R2,acquisition_uw_score <= 0.838233 & acquisition...
2,acquisition_uw_score <= 0.838233 & acquisition...,3,2365,1.2962,180097,24.6356,0.3641,0.2809,52.7273,2365,1247,1118,4,52.7273,R3,acquisition_uw_score <= 0.838233 & acquisition...
3,acquisition_uw_score <= 0.838233 & acquisition...,4,2285,1.2523,180177,24.6546,0.3451,0.2756,52.2101,2285,1193,1092,4,52.2101,R4,acquisition_uw_score <= 0.838233 & acquisition...
4,acquisition_uw_score <= 0.838233 & acquisition...,5,1879,1.0298,180583,24.7803,0.2194,0.2131,46.0883,1879,866,1013,4,46.0883,R5,acquisition_uw_score <= 0.838233 & acquisition...
5,acquisition_uw_score <= 0.838233 & acquisition...,6,4420,2.4224,178042,24.5695,0.4302,0.1776,42.3303,4420,1871,2549,4,42.3303,R6,acquisition_uw_score <= 0.838233 & acquisition...
6,acquisition_uw_score <= 0.838233 & acquisition...,7,6977,3.8238,175485,24.5052,0.4945,0.1293,37.4373,6977,2612,4365,4,37.4373,R7,acquisition_uw_score <= 0.838233 & acquisition...
7,acquisition_uw_score > 0.838233 & acquisition_...,8,7564,4.1455,174898,24.576,0.4237,0.1022,34.7964,7564,2632,4932,4,34.7964,R8,acquisition_uw_score > 0.838233 & acquisition_...
8,acquisition_uw_score <= 0.838233 & acquisition...,9,16609,9.1027,165853,24.588,0.4117,0.0452,29.1107,16609,4835,11774,4,29.1107,R9,acquisition_uw_score <= 0.838233 & acquisition...


Selected rules (auto-stopping): 9
R1: acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score <= 0.597289 & acquisition_uw_score <= 0.516786
R2: acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score <= 0.597289 & acquisition_uw_score > 0.516786
R3: acquisition_uw_score <= 0.838233 & acquisition_uw_score > 0.689887 & apps_installed_count_missing > 0.500000 & acquisition_uw_score <= 0.808913
R4: acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score > 0.597289 & phone_carrier_te > 0.233899
R5: acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score > 0.597289 & phone_carrier_te <= 0.233899
R6: acquisition_uw_score <= 0.838233 & acquisition_uw_score > 0.689887 & apps_installed_count_missing > 0.500000 & acquisition_uw_score > 0.808913
R7: acquisition_uw_score <= 0.838233 & acquisition_uw_score > 0.689887 & apps_installed_count_missing <= 0.500000 &

Unnamed: 0,stage,stage_label,kept_n,kept_pct,removed_n,removed_pct,overall_fpd_pct,fpd_reduction_pp,bad_n,good_n,marginal_removed_n,marginal_removed_pct,marginal_bad_n,marginal_good_n,marginal_purity_pct,band_1_fpd_pct,band_2_fpd_pct,band_3_fpd_pct,band_4_fpd_pct,band_5_fpd_pct,band_6_fpd_pct
0,0,Baseline,182462,100.0,0,0.0,24.9997,0.0,45615,136847,0,0.0,0,0,,41.1923,28.5761,24.6333,21.9336,18.8688,14.794
1,1,After R1,180460,98.9028,2002,1.0972,24.5273,0.4724,44262,136198,2002,1.0972,1353,649,67.5824,39.3326,28.5761,24.6333,21.9336,18.8688,14.794
2,2,After R1+R2,178619,97.8938,3843,2.1062,24.1805,0.8192,43191,135428,1841,1.009,1071,770,58.1749,38.0269,28.5761,24.6333,21.9336,18.8688,14.794
3,3,After R1+R2+R3,176254,96.5976,6208,3.4024,23.7975,1.2023,41944,134310,2365,1.2962,1247,1118,52.7273,36.5905,28.5761,24.6333,21.9336,18.8688,14.794
4,4,After R1+R2+R3+R4,173969,95.3453,8493,4.6547,23.4243,1.5754,40751,133218,2285,1.2523,1193,1092,52.2101,34.9621,28.5761,24.6333,21.9336,18.8688,14.794
5,5,After R1+R2+R3+R4+R5,172090,94.3155,10372,5.6845,23.1768,1.8229,39885,132205,1879,1.0298,866,1013,46.0883,33.9189,28.5761,24.6333,21.9336,18.8688,14.794
6,6,After R1+R2+R3+R4+R5+R6,167670,91.8931,14792,8.1069,22.6719,2.3278,38014,129656,4420,2.4224,1871,2549,42.3303,32.8169,27.5266,24.6333,21.9336,18.8688,14.794
7,7,After R1+R2+R3+R4+R5+R6+R7,160693,88.0693,21769,11.9307,22.0308,2.9689,35402,125291,6977,3.8238,2612,4365,37.4373,29.8596,27.5266,24.6333,21.9336,18.8688,14.794
8,8,After R1+R2+R3+R4+R5+R6+R7+R8,153129,83.9238,29333,16.0762,21.4003,3.5995,32770,120359,7564,4.1455,2632,4932,34.7964,29.8596,26.0228,23.5561,21.9336,18.8688,14.794
9,9,After R1+R2+R3+R4+R5+R6+R7+R8+R9,136520,74.8211,45942,25.1789,20.4622,4.5375,27935,108585,16609,9.1027,4835,11774,29.1107,,25.509,23.5561,21.9336,18.8688,14.794


## 8. Visualizations

In [11]:
stage_labels = cumulative_eval_df["stage_label"]

# Plot 1: Trade-off bars
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].bar(stage_labels, cumulative_eval_df["overall_fpd_pct"], color="#d62728")
axes[0].set_title("Overall FPD (%) by Stage")
axes[0].set_ylabel("FPD %")
axes[0].set_xticklabels(stage_labels, rotation=45, ha="right")
for idx, val in enumerate(cumulative_eval_df["overall_fpd_pct"]):
    axes[0].text(idx, val + 0.05, f"{val:.2f}", ha="center", va="bottom", fontsize=9)

axes[1].bar(stage_labels, cumulative_eval_df["kept_pct"], color="#1f77b4")
axes[1].set_title("Kept Volume (%) by Stage")
axes[1].set_ylabel("Kept %")
axes[1].set_xticklabels(stage_labels, rotation=45, ha="right")
for idx, val in enumerate(cumulative_eval_df["kept_pct"]):
    axes[1].text(idx, val + 0.05, f"{val:.2f}", ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.savefig(PLOTS_DIR / "tradeoff_fpdk_kept.png", dpi=150)
plt.close()

# Plot 2: Single rule impact
if not single_rule_impact_df.empty:
    plot_df = single_rule_impact_df.head(10).copy()
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(plot_df["rule_label"], plot_df["fpd_reduction_pp"], color="#ff7f0e")
    ax.set_title("Top Rule FPD Reduction (pp)")
    ax.set_ylabel("FPD Reduction (pp)")
    for idx, row in plot_df.iterrows():
        ax.text(row["rule_label"], row["fpd_reduction_pp"] + 0.05, f"{row['removed_pct']:.1f}% removed", ha="center", va="bottom", fontsize=8)
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "single_rule_impact.png", dpi=150)
    plt.close()

# Plot 3: Stacked marginal removals
stage_positive = cumulative_eval_df[cumulative_eval_df["stage"] > 0]
if not stage_positive.empty:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    axes[0].bar(stage_positive["stage_label"], stage_positive["marginal_bad_n"], label="Bad", color="#d62728")
    axes[0].bar(stage_positive["stage_label"], stage_positive["marginal_good_n"], bottom=stage_positive["marginal_bad_n"], label="Good", color="#2ca02c")
    axes[0].set_title("Marginal Removals by Stage")
    axes[0].set_ylabel("Applicants")
    axes[0].legend()
    axes[0].set_xticklabels(stage_positive["stage_label"], rotation=45, ha="right")

    purity_pct = stage_positive["marginal_purity_pct"].fillna(0)
    axes[1].bar(stage_positive["stage_label"], purity_pct, color="#9467bd")
    axes[1].set_title("Marginal Purity (Bad %)")
    axes[1].set_ylabel("% Bad")
    axes[1].set_xticklabels(stage_positive["stage_label"], rotation=45, ha="right")
    for idx, val in enumerate(purity_pct):
        axes[1].text(idx, val + 0.5, f"{val:.1f}%", ha="center", va="bottom", fontsize=9)
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "marginal_removals.png", dpi=150)
    plt.close()

# Plot 4: FPD by risk band
if band_cols:
    band_long = cumulative_eval_df.melt(id_vars=["stage", "stage_label"], value_vars=band_cols, var_name="band", value_name="fpd_pct")
    band_long["band"] = band_long["band"].str.extract(r"band_(\d+)").astype(int)
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=band_long, x="band", y="fpd_pct", hue="stage_label", marker="o", ax=ax)
    ax.set_title("FPD % by Risk Band Across Stages")
    ax.set_xlabel("Risk Band")
    ax.set_ylabel("FPD %")
    ax.set_xticks(sorted(band_long["band"].unique()))
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "fpd_by_band.png", dpi=150)
    plt.close()

# Plot 5: Overall FPD trend
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(stage_labels, cumulative_eval_df["overall_fpd_pct"], marker="o", color="#d62728")
ax.set_title("Overall FPD Progression")
ax.set_ylabel("FPD %")
ax.set_xticklabels(stage_labels, rotation=45, ha="right")
for idx, val in enumerate(cumulative_eval_df["overall_fpd_pct"]):
    ax.text(idx, val + 0.05, f"{val:.2f}", ha="center", va="bottom", fontsize=8)
plt.tight_layout()
plt.savefig(PLOTS_DIR / "overall_fpd_progression.png", dpi=150)
plt.close()

# Plot 6: % change in approvals by class
baseline_bad = cumulative_eval_df.loc[cumulative_eval_df["stage"] == 0, "bad_n"].iloc[0]
baseline_good = cumulative_eval_df.loc[cumulative_eval_df["stage"] == 0, "good_n"].iloc[0]
if baseline_bad > 0 and baseline_good > 0 and not stage_positive.empty:
    change_rows = []
    for _, row in stage_positive.iterrows():
        change_rows.append({
            "stage_label": row["stage_label"],
            "segment": "is_fpd=1",
            "pct_change": (row["bad_n"] - baseline_bad) / baseline_bad * 100,
        })
        change_rows.append({
            "stage_label": row["stage_label"],
            "segment": "is_fpd=0",
            "pct_change": (row["good_n"] - baseline_good) / baseline_good * 100,
        })
    change_df = pd.DataFrame(change_rows)
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=change_df, x="stage_label", y="pct_change", hue="segment", ax=ax)
    ax.set_title("% Change in Approved Volume vs Baseline")
    ax.set_ylabel("% Change")
    ax.set_xticklabels(stage_positive["stage_label"], rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "approved_volume_change.png", dpi=150)
    plt.close()

# Plot 7: Waterfall of filtered volume
if not stage_positive.empty:
    total_applicants = len(df_for_rules)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    axes[0].bar(stage_positive["stage_label"], stage_positive["marginal_removed_n"], color="#8c564b")
    axes[0].set_title("Marginal Removed Applicants")
    axes[0].set_ylabel("Count")
    axes[0].set_xticklabels(stage_positive["stage_label"], rotation=45, ha="right")
    axes[1].bar(stage_positive["stage_label"], stage_positive["marginal_removed_pct"], color="#e377c2")
    axes[1].set_title("Marginal Removed (% of baseline)")
    axes[1].set_ylabel("% of baseline")
    axes[1].set_xticklabels(stage_positive["stage_label"], rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "waterfall_removed_volume.png", dpi=150)
    plt.close()


## 9. QA, Logs & Artifact Export

In [12]:
baseline_row = cumulative_eval_df.iloc[0]
final_row = cumulative_eval_df.iloc[-1]

print("Baseline metrics:")
print(baseline_row[["kept_n", "kept_pct", "overall_fpd_pct", "bad_n", "good_n"]])
print()
print("Final metrics:")
print(final_row[["kept_n", "kept_pct", "overall_fpd_pct", "bad_n", "good_n"]])

leaf_summary_path = ARTIFACTS_DIR / "leaf_summary.csv"
single_rule_path = ARTIFACTS_DIR / "single_rule_impact.csv"
cumulative_path = ARTIFACTS_DIR / "cumulative_stages.csv"
band_matrix_path = ARTIFACTS_DIR / "band_fpd_matrix.csv"

leaf_summary_df.to_csv(leaf_summary_path, index=False)
single_rule_impact_df.to_csv(single_rule_path, index=False)
cumulative_eval_df.to_csv(cumulative_path, index=False)
band_fpd_matrix.to_csv(band_matrix_path)

print("Artifacts saved:")
print(f"  Leaf summary -> {leaf_summary_path}")
print(f"  Single rule impact -> {single_rule_path}")
print(f"  Cumulative stages -> {cumulative_path}")
print(f"  Band FPD matrix -> {band_matrix_path}")


Baseline metrics:
kept_n              182462
kept_pct          100.0000
overall_fpd_pct    24.9997
bad_n                45615
good_n              136847
Name: 0, dtype: object

Final metrics:
kept_n             136520
kept_pct          74.8211
overall_fpd_pct   20.4622
bad_n               27935
good_n             108585
Name: 9, dtype: object
Artifacts saved:
  Leaf summary -> outputs/fpd_rule_mining/leaf_summary.csv
  Single rule impact -> outputs/fpd_rule_mining/single_rule_impact.csv
  Cumulative stages -> outputs/fpd_rule_mining/cumulative_stages.csv
  Band FPD matrix -> outputs/fpd_rule_mining/band_fpd_matrix.csv


## 10. Why Each Step Matters

- Missing indicators surface predictive signal tied to absent data rather than magnitude.
- Tiered categorical encoders balance fidelity and dimensionality without leaking outcomes.
- Risk bands align analytics with underwriting intuition and expose Simpson's paradox risks.
- Compact decision trees translate model decisions into interpretable, auditable rules.
- Leaf-to-rule translation delivers actionable filter clauses rather than opaque scores.
- Single and cumulative evaluations make the FPD vs approvals trade-off explicit.
- Automatic stopping keeps the rule set surgical and prevents over-pruning good applicants.
- Visual dashboards communicate impacts quickly for stakeholders and governance.

## 11. Save Summary & Clean Rules

In [13]:
band_cols = [col for col in cumulative_eval_df.columns if col.startswith("band_")]
band_summary_final = {col.replace("band_", "").replace("_fpd_pct", ""): final_row[col] for col in band_cols}

summary_dict = {
    "total_applicants": int(len(y)),
    "rules_applied": selected_rules,
    "baseline": {
        "kept_pct": float(baseline_row["kept_pct"]),
        "overall_fpd_pct": float(baseline_row["overall_fpd_pct"]),
        "bad_n": int(baseline_row["bad_n"]),
        "good_n": int(baseline_row["good_n"]),
    },
    "final": {
        "kept_pct": float(final_row["kept_pct"]),
        "overall_fpd_pct": float(final_row["overall_fpd_pct"]),
        "bad_n": int(final_row["bad_n"]),
        "good_n": int(final_row["good_n"]),
    },
    "band_fpd_pct_final": band_summary_final,
}

metrics_path = Path("metrics.json")
metrics_path.write_text(json.dumps(summary_dict, indent=2))
print("Summary metrics saved ->", metrics_path)
print(json.dumps(summary_dict, indent=2))

rules_lines = ["# Clean Rules", "", "Final rules ready for A/B test:"]
if selected_rules:
    for idx, rule in enumerate(selected_rules, start=1):
        rules_lines.append(f"{idx}. `{rule}`")
else:
    rules_lines.append("No rules selected under current thresholds.")

clean_rules_path = Path("clean_rules.md")
clean_rules_path.write_text("\n".join(rules_lines))
print("Clean rules saved ->", clean_rules_path)


Summary metrics saved -> metrics.json
{
  "total_applicants": 182462,
  "rules_applied": [
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score <= 0.597289 & acquisition_uw_score <= 0.516786",
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score <= 0.597289 & acquisition_uw_score > 0.516786",
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score > 0.689887 & apps_installed_count_missing > 0.500000 & acquisition_uw_score <= 0.808913",
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score > 0.597289 & phone_carrier_te > 0.233899",
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score <= 0.689887 & acquisition_uw_score > 0.597289 & phone_carrier_te <= 0.233899",
    "acquisition_uw_score <= 0.838233 & acquisition_uw_score > 0.689887 & apps_installed_count_missing > 0.500000 & acquisition_uw_score > 0.808913",
    "acquisition_uw_score <= 0.838233 & 