In [6]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# Metrics

In [7]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics


def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc


def pr_auc_from_probs(y_true, y_prob):
    # Sort by predicted probability descending
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    
    tp = 0
    fp = 0
    pos = np.sum(y_true == 1)
    
    precision = [1.0]  # starts at 1 when recall=0
    recall = [0.0]
    
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        prec = tp / (tp + fp)
        rec = tp / pos
        precision.append(prec)
        recall.append(rec)
    
    # ensure it ends at recall=1
    precision = np.array(precision)
    recall = np.array(recall)
    
    # integrate area under curve
    auc_pr = np.trapz(precision, recall)
    return auc_pr

# LDA Cross Validation

In [19]:
def cv_lda(data, feature_cols, target_col, threshold = 0.5, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    all_preds = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        lda_model = LinearDiscriminantAnalysis()
        lda_model.fit(X_train, y_train)

        y_prob = lda_model.predict_proba(X_test)[:, 1]
        y_train_prob = lda_model.predict_proba(X_train)[:, 1]
        y_pred = (y_prob > threshold).astype(int)

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['pr_auc'] = pr_auc_from_probs(y_test, y_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)

        fold_preds = pd.DataFrame({
            'fold': f,
            'y_true': y_test.values,
            'y_prob': y_prob
        })
        all_preds.append(fold_preds)


    results_df = pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)
    preds_df = pd.concat(all_preds, ignore_index=True)

    return results_df, preds_df


        


# Import Data

In [10]:
#Stratified data set
apps_cv_strat = pd.read_csv("data/apps_cv_strat.csv")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.csv")

#Random data set
apps_cv_rand = pd.read_csv("data/apps_cv_random.csv")
apps_holdout_rand = pd.read_csv("data/apps_holdout_random.csv")

#Multi-stratified data set
apps_cv_multi = pd.read_csv("data/apps_cv_multi.csv")
apps_holdout_multi = pd.read_csv("data/apps_holdout_multi.csv")

target_col = 'TARGET'
feature_cols_strat = [col for col in apps_cv_strat.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]

feature_cols_rand = [col for col in apps_cv_rand.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]


# Stratified Results

In [20]:
strat_results, strat_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,0.241865,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,0.244496,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,0.228715,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,0.244284,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,0.238328,5


# Random Results

In [36]:
rand_results, rand_preds = cv_lda(apps_cv_rand, feature_cols_rand, target_col, params = None)
rand_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,172,44960,218,3806,0.918138,0.519206,0.441026,0.043238,0.995175,0.078755,0.760461,0.765584,1
1,49155,202,44990,225,3738,0.919377,0.523146,0.473068,0.051269,0.995024,0.092512,0.761883,0.764656,2
2,49155,186,45075,197,3697,0.920781,0.521775,0.48564,0.047901,0.995649,0.087201,0.761395,0.764668,3
3,49155,199,44980,218,3758,0.919113,0.522734,0.477218,0.050291,0.995177,0.090992,0.764674,0.763945,4
4,49155,176,44935,200,3844,0.91773,0.519675,0.468085,0.043781,0.995569,0.080073,0.762706,0.764829,5


# Multi-Stratified Results

In [37]:
multi_results, multi_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,5


# Comparison Between Results

In [38]:
print(f"Stratified Avg ROC-AUC: {strat_results['roc_auc'].mean():.4f}")
print(f"Random Avg ROC-AUC: {rand_results['roc_auc'].mean():.4f}")
print(f"Multi-Stratified Avg ROC-AUC: {multi_results['roc_auc'].mean():.4f}")

Stratified Avg ROC-AUC: 0.7602
Random Avg ROC-AUC: 0.7622
Multi-Stratified Avg ROC-AUC: 0.7602


# Threshold Tuning
The random set had the best ROC-AUC out of all of the data sets so we will tune the threshold using the random set.

In [21]:
thresholds = np.linspace(0, 1, 200)
scores = []
for t in thresholds:
    y_pred = (strat_preds["y_prob"] >= t).astype(int)
    tp = ((y_pred == 1) & (strat_preds["y_true"] == 1)).sum()
    fp = ((y_pred == 1) & (strat_preds["y_true"] == 0)).sum()
    tn = ((y_pred == 0) & (strat_preds["y_true"] == 0)).sum()
    fn = ((y_pred == 0) & (strat_preds["y_true"] == 1)).sum()

    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    f1   = 2 * prec * rec / max(prec + rec, 1e-12)
    spec = tn / max(tn + fp, 1)
    bal_acc = (rec + spec) / 2

    scores.append((t, prec, rec, f1, bal_acc))

#Sort by F1-Score to determine the best threshold
scores_df = pd.DataFrame(scores, columns=["threshold", "precision", "recall", "f1", "bal_acc"]).sort_values("f1", ascending = False).reset_index(drop=True)
best_threshold = scores_df.loc[scores_df["f1"].idxmax(), "threshold"]
scores_df

Unnamed: 0,threshold,precision,recall,f1,bal_acc
0,0.155779,0.242190,0.419573,0.307108,0.652132
1,0.145729,0.233843,0.447087,0.307075,0.659215
2,0.150754,0.237734,0.432675,0.306862,0.655412
3,0.140704,0.229552,0.461197,0.306533,0.662621
4,0.160804,0.245925,0.406773,0.306530,0.648611
...,...,...,...,...,...
195,0.979899,0.272727,0.000151,0.000302,0.500058
196,0.974874,0.250000,0.000151,0.000302,0.500056
197,0.969849,0.230769,0.000151,0.000302,0.500053
198,0.994975,0.250000,0.000101,0.000201,0.500037


# Data Leakage Check

In [47]:
shuffled = apps_cv_strat.copy()
shuffled['TARGET'] = np.random.permutation(shuffled['TARGET'].values)
fold_results_shuffled, preds_shuffled = cv_lda(shuffled, feature_cols_rand, target_col, threshold = best_threshold, params = None)
print("Shuffled mean AUC:", fold_results_shuffled["roc_auc"].mean())

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Shuffled mean AUC: 0.4978999206075014


# Threshold Tuning Part 2

In [22]:
def thresh_pair_metrics(y_true, y_prob, t_low, t_high):

    # make decisions (0=Approve, Deny=1, Review=2)
    decision = np.where(y_prob <= t_low, 0, np.where(y_prob >= t_high, 1, -1)) 

    # compute some metrics for each class
    metrics = {}
    for name, k in [("approve",0), ("review",-1), ("deny",1)]:

        # how many applicants in this group (and %)
        idx = (decision==k)
        n = idx.sum() 
        metrics[f"{name}_n"] = n
        metrics[f"{name}_rate"] = n / len(y_true)

        # default rate in this group
        metrics[f"{name}_default_rate"] = y_true[idx].mean() if n else np.nan

    # how often approved loans did not default (precision, high=good)
    metrics["approve_nondefault_precision"] = 1 - metrics["approve_default_rate"]  

    # how often denied loans did default (precision, high=good)
    metrics["deny_default_precision"] = metrics["deny_default_rate"]    

    # recall of defaults caught by deny
    pos = (y_true==1)
    metrics["deny_default_recall"] = ((decision==1) & pos).sum() / max(pos.sum(),1)

    return metrics

In [23]:
def search_thresholds(y_true, y_prob, target_approve_prec=0.95, target_deny_prec=0.45, target_review_rate=0.25):

    # loop through possible threshold pairs
    rows = []
    for t_low in np.linspace(0.05, 0.40, 71):      # approve threshold grid
        for t_high in np.linspace(0.50, 0.90, 81):  # deny threshold grid
            if t_low >= t_high: 
                continue

            # get metrics for this threshold pair
            metrics = thresh_pair_metrics(y_true, y_prob, t_low, t_high)

            # check if meets each target constraint
            approve_ok = (metrics["approve_nondefault_precision"] >= target_approve_prec)
            deny_ok = (metrics["deny_default_precision"] >= target_deny_prec)
            review_ok = (metrics["review_rate"] <= target_review_rate)

            # if all are good, compute a overall score of how good it did (kind of arbitrary)
            ## weight good denials heaviest because want to avoid approving bad loans
            if approve_ok and deny_ok and review_ok:
                rows.append({"t_low":t_low, "t_high":t_high, **metrics})
                
    return pd.DataFrame(rows)

In [24]:
strat_preds

Unnamed: 0,fold,y_true,y_prob
0,5,0,0.012725
1,5,0,0.008971
2,5,0,0.029379
3,5,0,0.013363
4,5,0,0.073626
...,...,...,...
245772,2,0,0.046333
245773,2,0,0.045540
245774,2,0,0.010014
245775,2,0,0.099751


In [16]:

thresh_pair_results = search_thresholds(strat_preds.y_true,
                                        strat_preds.y_prob,
                                        target_approve_prec = 0.95,
                                        target_deny_prec = 0.5,
                                        target_review_rate = 0.22)

thresh_pair_results


Unnamed: 0,t_low,t_high,approve_n,approve_rate,approve_default_rate,review_n,review_rate,review_default_rate,deny_n,deny_rate,deny_default_rate,approve_nondefault_precision,deny_default_precision,deny_default_recall
0,0.11,0.575,191052,0.777339,0.045987,53610,0.218125,0.195822,1115,0.004537,0.502242,0.954013,0.502242,0.028220
1,0.11,0.580,191052,0.777339,0.045987,53673,0.218381,0.196225,1052,0.004280,0.500000,0.954013,0.500000,0.026507
2,0.11,0.595,191052,0.777339,0.045987,53805,0.218918,0.196952,920,0.003743,0.501087,0.954013,0.501087,0.023231
3,0.11,0.600,191052,0.777339,0.045987,53838,0.219052,0.197091,887,0.003609,0.503946,0.954013,0.503946,0.022526
4,0.11,0.605,191052,0.777339,0.045987,53863,0.219154,0.197223,862,0.003507,0.504640,0.954013,0.504640,0.021921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.13,0.750,201369,0.819316,0.049988,44207,0.179866,0.218812,201,0.000818,0.522388,0.950012,0.522388,0.005291
146,0.13,0.755,201369,0.819316,0.049988,44217,0.179907,0.218898,191,0.000777,0.518325,0.950012,0.518325,0.004989
147,0.13,0.760,201369,0.819316,0.049988,44224,0.179935,0.218976,184,0.000749,0.510870,0.950012,0.510870,0.004737
148,0.13,0.765,201369,0.819316,0.049988,44237,0.179988,0.219093,171,0.000696,0.502924,0.950012,0.502924,0.004334


In [17]:
results = thresh_pair_results[thresh_pair_results.deny_rate > 0.0045]
results

Unnamed: 0,t_low,t_high,approve_n,approve_rate,approve_default_rate,review_n,review_rate,review_default_rate,deny_n,deny_rate,deny_default_rate,approve_nondefault_precision,deny_default_precision,deny_default_recall
0,0.11,0.575,191052,0.777339,0.045987,53610,0.218125,0.195822,1115,0.004537,0.502242,0.954013,0.502242,0.02822
10,0.115,0.575,193927,0.789036,0.0471,50735,0.206427,0.200059,1115,0.004537,0.502242,0.9529,0.502242,0.02822
45,0.12,0.575,196530,0.799627,0.047967,48132,0.195836,0.204791,1115,0.004537,0.502242,0.952033,0.502242,0.02822
80,0.125,0.575,199040,0.80984,0.04896,45622,0.185624,0.209088,1115,0.004537,0.502242,0.95104,0.502242,0.02822
115,0.13,0.575,201369,0.819316,0.049988,43293,0.176147,0.212921,1115,0.004537,0.502242,0.950012,0.502242,0.02822


In [25]:
thresh_pair_metrics(strat_preds.y_true, strat_preds.y_prob, 0.11, 0.57)

{'approve_n': 191052,
 'approve_rate': 0.7773388071300407,
 'approve_default_rate': 0.04598747984841823,
 'review_n': 53560,
 'review_rate': 0.21792112362019228,
 'review_default_rate': 0.19564973861090365,
 'deny_n': 1165,
 'deny_rate': 0.004740069249767065,
 'deny_default_rate': 0.49699570815450644,
 'approve_nondefault_precision': 0.9540125201515818,
 'deny_default_precision': 0.49699570815450644,
 'deny_default_recall': 0.029177585164281396}