In [1]:
!pip install catboost



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from imblearn.over_sampling import SMOTE

In [3]:
df = pd.read_csv("/content/df_feature_engineered.csv")

In [4]:
df

Unnamed: 0,partner_id,role,gender,age_group,city_tier,earnings_avg,on_time_rate,cancel_rate,customer_rating,complaints,...,engagement_ratio,weekly_trip_ratio,reliability_index,complaint_rate,accident_rate,txn_freq_per_day,wallet_txn_flag,vehicle_utilization,earnings_vs_city_avg,rating_vs_age_avg
0,1,driver,M,<=25,2,638.443420,0.964244,0.135795,4.837981,0,...,0.513514,2.600000,0.833304,0.000000,0.0,8.001422,1,9.744333,0.913992,1.023845
1,2,merchant,M,<=25,2,444.440684,0.931376,0.061321,4.778687,0,...,1.333333,2.120000,0.874263,0.000000,0.0,4.188749,1,8.175693,0.636259,1.011297
2,3,merchant,M,26-35,1,659.666189,0.960345,0.045608,4.418110,1,...,0.451613,1.206897,0.916545,0.027778,0.0,4.623777,1,4.948950,0.942219,0.934736
3,4,driver,F,<=25,3,503.720971,0.850885,0.053521,4.782128,0,...,0.529412,3.052632,0.805345,0.000000,0.0,14.130121,1,11.814075,0.721940,1.012025
4,5,driver,M,50+,3,637.742060,0.970612,0.150525,4.905054,0,...,0.678571,2.850000,0.824510,0.000000,0.0,0.000000,0,12.631052,0.914021,1.038896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7996,driver,M,50+,2,651.340236,0.963597,0.009458,4.877334,0,...,1.130435,1.370370,0.954483,0.000000,0.0,10.193239,1,6.665293,0.932455,1.033024
7996,7997,driver,M,26-35,3,804.831806,0.932786,0.109542,4.864437,0,...,1.769231,1.291667,0.830607,0.000000,0.0,13.467973,1,31.000000,1.153497,1.029165
7997,7998,driver,M,26-35,2,648.224239,0.916002,0.130103,4.602213,0,...,1.500000,2.000000,0.796827,0.000000,0.0,10.934323,1,15.436476,0.927994,0.973687
7998,7999,merchant,M,26-35,1,861.418504,0.932769,0.124617,4.975657,0,...,0.434783,3.809524,0.816530,0.000000,0.0,16.806311,1,14.420394,1.230387,1.052696


In [5]:
def group_metrics(y_true, y_pred, y_prob, groups):
    """
    Returns per-group metrics and disparity measures.
    groups: array-like group labels (same length as y_true)
    """
    groups = np.array(groups)
    uniq = np.unique(groups)
    out = {}
    for g in uniq:
        mask = groups == g
        yt = np.array(y_true)[mask]
        yp = np.array(y_pred)[mask]
        yp_prob = np.array(y_prob)[mask]
        tn, fp, fn, tp = confusion_matrix(yt, yp, labels=[0,1]).ravel()
        tpr = tp / (tp + fn) if (tp + fn) else np.nan
        fpr = fp / (fp + tn) if (fp + tn) else np.nan
        ppv = tp / (tp + fp) if (tp + fp) else np.nan
        pos_rate = yp.mean()
        out[int(g)] = {"TPR": float(tpr), "FPR": float(fpr), "PPV": float(ppv),
                       "PositiveRate": float(pos_rate), "Count": int(mask.sum())}
    return out

def disparity(metric_name, table):
    vals = [v[metric_name] for v in table.values() if not np.isnan(v[metric_name])]
    if len(vals) <= 1:
        return 0.0
    return float(max(vals) - min(vals))

def evaluate_credit(y_true, y_prob, y_pred, groups=None, show_plots=True, name="Model"):
    auc = roc_auc_score(y_true, y_prob)
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    ks = float(max(tpr - fpr))
    gini = 2 * auc - 1

    print(f"\n=== {name} ===")
    print(f"AUC: {auc:.3f} | KS: {ks:.3f} | Gini: {gini:.3f}")

    # decile lift
    df_lift = pd.DataFrame({"y": y_true, "p": y_prob})
    # handle identical probs possibly: qcut may fail; use rank then quantile
    df_lift["decile"] = pd.qcut(df_lift["p"].rank(method="first"), 10, labels=False)
    dec = df_lift.groupby("decile").agg(total=("y","count"), events=("y","sum")).sort_index(ascending=False)
    dec["event_rate"] = dec["events"] / dec["total"]
    print("\nDecile-wise lift (top decile = highest prob):")
    print(dec)

    if groups is not None:
        gm = group_metrics(y_true, y_pred, y_prob, groups)
        dpd = disparity("PositiveRate", gm)   # Demographic parity difference
        eod = disparity("TPR", gm)            # Equal opportunity difference
        ppd = disparity("PPV", gm)            # Predictive parity difference
        print(f"\nFairness (by {sensitive_col}): DPD={dpd:.3f}, EOD={eod:.3f}, PPD={ppd:.3f}")
        print("Group breakdown:")
        pretty = pd.DataFrame(gm).T
        print(pretty)

    if show_plots:
        plt.figure(figsize=(6,6))
        plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
        plt.plot([0,1],[0,1],"k--")
        plt.title(f"ROC - {name}")
        plt.xlabel("FPR"); plt.ylabel("TPR")
        plt.legend(); plt.show()

        # KS curve
        plt.figure(figsize=(6,4))
        plt.plot(thresholds, tpr, label="TPR")
        plt.plot(thresholds, fpr, label="FPR")
        plt.plot(thresholds, tpr - fpr, label=f"KS={ks:.2f}")
        plt.xlabel("Threshold"); plt.legend(); plt.title(f"KS curve - {name}")
        plt.show()

    return {"AUC": auc, "KS": ks, "Gini": gini, "decile_table": dec, "group_metrics": (gm if groups is not None else None)}

In [6]:
features = [
    'role', 'gender', 'age_group', 'city_tier',
       'earnings_avg', 'on_time_rate', 'cancel_rate', 'customer_rating',
       'complaints', 'accidents', 'night_shift_pct', 'cashless_ratio',
       'past_due_history','vehicle_age_bin', 'earnings_cv',
       'productivity_stability', 'earnings_stability', 'wallet_txn_bin',
       'tenure_x_earnings', 'active_x_reliability', 'engagement_ratio',
       'weekly_trip_ratio', 'reliability_index', 'complaint_rate',
       'accident_rate', 'txn_freq_per_day', 'wallet_txn_flag',
       'vehicle_utilization', 'earnings_vs_city_avg', 'rating_vs_age_avg'
]

target = "good_repayment"

In [7]:
sensitive_col = "city_tier"   # fairness group
calibrate = True              # whether to calibrate model probabilities
use_smote = False             # set True to apply SMOTE on training set (option)
random_state = 42

In [8]:
X_raw = df[features].copy()
y = df[target].copy()

In [9]:
cat_cols = [c for c in ['role','gender','age_group','city_tier','vehicle_age_bin','wallet_txn_bin'] if c in X_raw.columns]

In [10]:
# One-hot for tree models that prefer numeric matrix (XGBoost/LightGBM + sklearn)
X = pd.get_dummies(X_raw, drop_first=True)

In [11]:
# Train/test split (stratify by target)
X_train, X_test, y_train, y_test, Xraw_train, Xraw_test = train_test_split(
    X, y, X_raw, test_size=0.25, random_state=random_state, stratify=y
)

In [12]:
# scaling numeric columns for linear model (we'll scale for XGBoost/LGB too - optional)
numeric_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Reweighting function for fairness (works on X_raw train with original city_tier)
def reweight_samples_raw(xraw_df, sensitive_col="city_tier"):
    group_counts = xraw_df[sensitive_col].value_counts()
    weights = xraw_df[sensitive_col].map(lambda x: 1.0 / group_counts[x])
    return (weights / weights.mean()).values

w_train = reweight_samples_raw(Xraw_train, sensitive_col=sensitive_col)

In [14]:
if use_smote:
    sm = SMOTE(random_state=random_state)
    X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
else:
    X_train_res, y_train_res = X_train_scaled, y_train

In [27]:
X_train.columns

Index(['city_tier', 'earnings_avg', 'on_time_rate', 'cancel_rate',
       'customer_rating', 'complaints', 'accidents', 'night_shift_pct',
       'cashless_ratio', 'past_due_history', 'earnings_cv',
       'productivity_stability', 'earnings_stability', 'tenure_x_earnings',
       'active_x_reliability', 'engagement_ratio', 'weekly_trip_ratio',
       'reliability_index', 'complaint_rate', 'accident_rate',
       'txn_freq_per_day', 'wallet_txn_flag', 'vehicle_utilization',
       'earnings_vs_city_avg', 'rating_vs_age_avg', 'role_merchant',
       'gender_M', 'age_group_36-50', 'age_group_50+', 'age_group_<=25',
       'vehicle_age_bin_new', 'vehicle_age_bin_old', 'wallet_txn_bin_High',
       'wallet_txn_bin_Low', 'wallet_txn_bin_Medium',
       'wallet_txn_bin_Very High'],
      dtype='object')

In [28]:
# Convert all columns to strings, replace problematic characters
X_train.columns = (
    X_train.columns
    .astype(str)
    .str.replace(r'[<>=\s]', '_', regex=True)  # replace <, >, =, and spaces with _
)

# Optional: remove consecutive underscores
X_train.columns = X_train.columns.str.replace(r'_+', '_', regex=True)

In [29]:
models = {}

In [30]:
# 1) Logistic Regression baseline (calibrated)
lr = LogisticRegression(max_iter=1000, class_weight=None, random_state=random_state)
if calibrate:
    lr_clf = CalibratedClassifierCV(lr, cv=5)
else:
    lr_clf = lr
models['Logistic'] = {"est": lr_clf, "fit_X": X_train_res, "fit_y": y_train_res, "use_sample_weight": False}

In [31]:
# 2) XGBoost (use scale_pos_weight if imbalanced) - using the sklearn wrapper
# scale_pos_weight = n_neg / n_pos on train
neg = (y_train==0).sum()
pos = (y_train==1).sum()
scale_pos_weight = (neg / pos) if pos>0 else 1.0
xgb = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.05,
                    use_label_encoder=False, eval_metric='logloss',
                    scale_pos_weight=scale_pos_weight, random_state=random_state)
if calibrate:
    xgb_clf = CalibratedClassifierCV(xgb, cv=5)
else:
    xgb_clf = xgb
models['XGBoost'] = {"est": xgb_clf, "fit_X": X_train, "fit_y": y_train, "use_sample_weight": False}  # xgb can take original one-hot X

In [32]:
# 3) LightGBM
lgb = LGBMClassifier(n_estimators=500, max_depth=6, learning_rate=0.05, class_weight=None, random_state=random_state)
if calibrate:
    lgb_clf = CalibratedClassifierCV(lgb, cv=5)
else:
    lgb_clf = lgb
models['LightGBM'] = {"est": lgb_clf, "fit_X": X_train, "fit_y": y_train, "use_sample_weight": False}


In [33]:
# 4) CatBoost - keep original categorical columns; CatBoost handles categories natively
# prepare Pool objects later when training CatBoost directly without CalibratedClassifierCV (we can still calibrate)
cat = CatBoostClassifier(iterations=800, depth=6, learning_rate=0.05,
                         eval_metric='Logloss', verbose=0, random_seed=random_state)
# For CatBoost we'll fit separately (not through same dict flow), but include placeholder
models['CatBoost'] = {"est": cat, "fit_X": Xraw_train, "fit_y": y_train, "use_sample_weight": False, "cat_features": cat_cols}

In [34]:
# 5) Fairness-aware Logistic (sample_weight = w_train)
lr_fair = LogisticRegression(max_iter=1000, random_state=random_state)
if calibrate:
    lr_fair_clf = CalibratedClassifierCV(lr_fair, cv=5)
else:
    lr_fair_clf = lr_fair
models['Logistic_Fair'] = {"est": lr_fair_clf, "fit_X": X_train_scaled, "fit_y": y_train, "use_sample_weight": True, "sample_weight": w_train}

In [35]:
# 6) Fairness-aware XGBoost (pass sample_weight)
xgb_fair = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.05,
                         use_label_encoder=False, eval_metric='logloss', random_state=random_state)
models['XGBoost_Fair'] = {"est": xgb_fair, "fit_X": X_train, "fit_y": y_train, "use_sample_weight": True, "sample_weight": w_train}


In [36]:
results = {}
for name, spec in models.items():
    print(f"\nTraining -> {name}")
    est = spec["est"]
    fit_X = spec["fit_X"]
    fit_y = spec["fit_y"]
    use_w = spec.get("use_sample_weight", False)
    w = spec.get("sample_weight", None)

    # Special handling for CatBoost (uses original Xraw with categorical columns)
    if name == "CatBoost":
        cb = spec["est"]
        pool = Pool(data=spec["fit_X"], label=spec["fit_y"], cat_features=spec.get("cat_features", []))
        # fairness sample weights for catboost if desired
        if use_w:
            cb.fit(pool, sample_weight=w)
        else:
            cb.fit(pool)
        # predict on Xraw_test using pool
        pool_test = Pool(data=Xraw_test, cat_features=spec.get("cat_features", []))
        y_prob = cb.predict_proba(pool_test)[:,1]
        y_pred = (y_prob >= 0.5).astype(int)
    else:
        # if fit_X is scaled (np.array), pass as-is; if it's DataFrame (one-hot), convert as needed
        X_for_fit = fit_X
        # If the estimator is CalibratedClassifierCV wrapper, it expects arrays
        if hasattr(est, "fit"):
            if use_w:
                est.fit(X_for_fit, fit_y, **({"sample_weight": w} if w is not None else {}))
            else:
                est.fit(X_for_fit, fit_y)
        # For sklearn wrappers like CalibratedClassifierCV, the fitted object supports predict_proba on same-format test data.
        # Choose appropriate test matrix format:
        if isinstance(X_for_fit, np.ndarray):
            X_test_for_pred = X_test_scaled
        else:
            # fit_X likely DataFrame -> use one-hot test matrix
            X_test_for_pred = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
        y_prob = est.predict_proba(X_test_for_pred)[:,1]
        y_pred = (y_prob >= 0.5).astype(int)

    # Evaluate
    groups_for_eval = Xraw_test[sensitive_col].values if sensitive_col in Xraw_test.columns else None
    res = evaluate_credit(y_test.values, y_prob, y_pred, groups=groups_for_eval, show_plots=False, name=name)
    results[name] = res


Training -> Logistic

=== Logistic ===
AUC: 0.647 | KS: 0.225 | Gini: 0.294

Decile-wise lift (top decile = highest prob):
        total  events  event_rate
decile                           
9         200     197       0.985
8         200     193       0.965
7         200     197       0.985
6         200     188       0.940
5         200     191       0.955
4         200     190       0.950
3         200     190       0.950
2         200     185       0.925
1         200     183       0.915
0         200     178       0.890

Fairness (by city_tier): DPD=0.000, EOD=0.000, PPD=0.015
Group breakdown:
   TPR  FPR       PPV  PositiveRate  Count
1  1.0  1.0  0.947727           1.0  880.0
2  1.0  1.0  0.949930           1.0  719.0
3  1.0  1.0  0.935162           1.0  401.0

Training -> XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
AUC: 0.599 | KS: 0.188 | Gini: 0.198

Decile-wise lift (top decile = highest prob):
        total  events  event_rate
decile                           
9         200     192       0.960
8         200     191       0.955
7         200     194       0.970
6         200     194       0.970
5         200     188       0.940
4         200     194       0.970
3         200     190       0.950
2         200     183       0.915
1         200     183       0.915
0         200     183       0.915

Fairness (by city_tier): DPD=0.000, EOD=0.000, PPD=0.015
Group breakdown:
   TPR  FPR       PPV  PositiveRate  Count
1  1.0  1.0  0.947727           1.0  880.0
2  1.0  1.0  0.949930           1.0  719.0
3  1.0  1.0  0.935162           1.0  401.0

Training -> LightGBM
[LightGBM] [Info] Number of positive: 4540, number of negative: 260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001594 seconds.
You can set `force_col_wise=true` to remove the ov




=== LightGBM ===
AUC: 0.574 | KS: 0.125 | Gini: 0.147

Decile-wise lift (top decile = highest prob):
        total  events  event_rate
decile                           
9         200     195       0.975
8         200     192       0.960
7         200     189       0.945
6         200     190       0.950
5         200     187       0.935
4         200     194       0.970
3         200     184       0.920
2         200     190       0.950
1         200     188       0.940
0         200     183       0.915

Fairness (by city_tier): DPD=0.000, EOD=0.000, PPD=0.015
Group breakdown:
   TPR  FPR       PPV  PositiveRate  Count
1  1.0  1.0  0.947727           1.0  880.0
2  1.0  1.0  0.949930           1.0  719.0
3  1.0  1.0  0.935162           1.0  401.0

Training -> CatBoost

=== CatBoost ===
AUC: 0.564 | KS: 0.133 | Gini: 0.128

Decile-wise lift (top decile = highest prob):
        total  events  event_rate
decile                           
9         200     195       0.975
8         200    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost_Fair ===
AUC: 0.593 | KS: 0.156 | Gini: 0.186

Decile-wise lift (top decile = highest prob):
        total  events  event_rate
decile                           
9         200     193       0.965
8         200     194       0.970
7         200     192       0.960
6         200     188       0.940
5         200     191       0.955
4         200     192       0.960
3         200     190       0.950
2         200     188       0.940
1         200     181       0.905
0         200     183       0.915

Fairness (by city_tier): DPD=0.002, EOD=0.003, PPD=0.015
Group breakdown:
        TPR  FPR       PPV  PositiveRate  Count
1  1.000000  1.0  0.947727      1.000000  880.0
2  1.000000  1.0  0.949930      1.000000  719.0
3  0.997333  1.0  0.935000      0.997506  401.0


In [37]:
summary = []
for name, r in results.items():
    summary.append({"model": name, "AUC": r["AUC"], "KS": r["KS"], "Gini": r["Gini"]})
summary_df = pd.DataFrame(summary).sort_values("AUC", ascending=False)
print("\n=== Summary ===")
print(summary_df)

# Optional: show fairness disparities per model (DPD, EOD, PPD)
fairness_summary = []
for name, r in results.items():
    gm = r.get("group_metrics", {})
    if gm is None:
        fairness_summary.append({"model": name, "DPD": np.nan, "EOD": np.nan, "PPD": np.nan})
    else:
        fairness_summary.append({
            "model": name,
            "DPD": disparity("PositiveRate", gm),
            "EOD": disparity("TPR", gm),
            "PPD": disparity("PPV", gm)
        })
fair_df = pd.DataFrame(fairness_summary).sort_values("DPD")
print("\n=== Fairness Summary (lower DPD better) ===")
print(fair_df)


=== Summary ===
           model       AUC        KS      Gini
0       Logistic  0.647218  0.225452  0.294437
4  Logistic_Fair  0.644116  0.235827  0.288231
1        XGBoost  0.598901  0.188141  0.197802
5   XGBoost_Fair  0.592972  0.155861  0.185944
2       LightGBM  0.573580  0.125010  0.147160
3       CatBoost  0.564198  0.133055  0.128396

=== Fairness Summary (lower DPD better) ===
           model       DPD       EOD       PPD
0       Logistic  0.000000  0.000000  0.014768
1        XGBoost  0.000000  0.000000  0.014768
2       LightGBM  0.000000  0.000000  0.014768
4  Logistic_Fair  0.000000  0.000000  0.014768
5   XGBoost_Fair  0.002494  0.002667  0.014930
3       CatBoost  0.004988  0.005333  0.015093


In [38]:
best_model_name = summary_df.iloc[0]["model"]
print("\nBest model by AUC:", best_model_name)
# pick its predicted probs (from results above)
# NOTE: to reuse predicted probs you might store them in results[name]["y_prob"] earlier; here re-predict quickly for chosen model
# for simplicity we take logistic_fair if present else first
chosen = "Logistic_Fair" if "Logistic_Fair" in results else best_model_name
# If you want the probs used above, please store them in results dict when evaluating
# Quick mapping function:
def prob_to_score(p, min_score=300, max_score=900):
    return min_score + (max_score - min_score) * p

# Example create scorecard for test set using summary_df top model's saved probs if available:
# (If not saved, re-run predict_proba on the chosen model similar to above and compute scores)
print("\nDone. See 'summary_df' and 'fair_df' for performance and fairness tradeoffs.")


Best model by AUC: Logistic

Done. See 'summary_df' and 'fair_df' for performance and fairness tradeoffs.
