In [24]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# Metrics

In [None]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics


def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc


def pr_auc_from_probs(y_true, y_prob):
    # Sort by predicted probability descending
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    
    tp = 0
    fp = 0
    pos = np.sum(y_true == 1)
    
    precision = [1.0]  # starts at 1 when recall=0
    recall = [0.0]
    
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        prec = tp / (tp + fp)
        rec = tp / pos
        precision.append(prec)
        recall.append(rec)
    
    # ensure it ends at recall=1
    precision = np.array(precision)
    recall = np.array(recall)
    
    # integrate area under curve
    auc_pr = np.trapz(precision, recall)
    return auc_pr

# LDA Cross Validation

In [33]:
def cv_lda(data, feature_cols, target_col, threshold = 0.5, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    all_preds = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        lda_model = LinearDiscriminantAnalysis()
        lda_model.fit(X_train, y_train)

        y_prob = lda_model.predict_proba(X_test)[:, 1]
        y_train_prob = lda_model.predict_proba(X_train)[:, 1]
        y_pred = (y_prob > threshold).astype(int)

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)

        fold_preds = pd.DataFrame({
            'fold': f,
            'y_true': y_test.values,
            'y_prob': y_prob
        })
        all_preds.append(fold_preds)


    results_df = pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)
    preds_df = pd.concat(all_preds, ignore_index=True)

    return results_df, preds_df


        


# Import Data

In [34]:
#Stratified data set
apps_cv_strat = pd.read_csv("data/apps_cv_strat.txt")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.txt")

#Random data set
apps_cv_rand = pd.read_csv("data/apps_cv_random.txt")
apps_holdout_rand = pd.read_csv("data/apps_holdout_random.txt")

#Multi-stratified data set
apps_cv_multi = pd.read_csv("data/apps_cv_multi.txt")
apps_holdout_multi = pd.read_csv("data/apps_holdout_multi.txt")

target_col = 'TARGET'
feature_cols_strat = [col for col in apps_cv_strat.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]

feature_cols_rand = [col for col in apps_cv_rand.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]


# Stratified Results

In [35]:
strat_results, strat_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,5


# Random Results

In [36]:
rand_results, rand_preds = cv_lda(apps_cv_rand, feature_cols_rand, target_col, params = None)
rand_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,172,44960,218,3806,0.918138,0.519206,0.441026,0.043238,0.995175,0.078755,0.760461,0.765584,1
1,49155,202,44990,225,3738,0.919377,0.523146,0.473068,0.051269,0.995024,0.092512,0.761883,0.764656,2
2,49155,186,45075,197,3697,0.920781,0.521775,0.48564,0.047901,0.995649,0.087201,0.761395,0.764668,3
3,49155,199,44980,218,3758,0.919113,0.522734,0.477218,0.050291,0.995177,0.090992,0.764674,0.763945,4
4,49155,176,44935,200,3844,0.91773,0.519675,0.468085,0.043781,0.995569,0.080073,0.762706,0.764829,5


# Multi-Stratified Results

In [37]:
multi_results, multi_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,5


# Comparison Between Results

In [38]:
print(f"Stratified Avg ROC-AUC: {strat_results['roc_auc'].mean():.4f}")
print(f"Random Avg ROC-AUC: {rand_results['roc_auc'].mean():.4f}")
print(f"Multi-Stratified Avg ROC-AUC: {multi_results['roc_auc'].mean():.4f}")

Stratified Avg ROC-AUC: 0.7602
Random Avg ROC-AUC: 0.7622
Multi-Stratified Avg ROC-AUC: 0.7602


# Threshold Tuning
The random set had the best ROC-AUC out of all of the data sets so we will tune the threshold using the random set.

In [39]:
thresholds = np.linspace(0, 1, 200)
scores = []
for t in thresholds:
    y_pred = (rand_preds["y_prob"] >= t).astype(int)
    tp = ((y_pred == 1) & (rand_preds["y_true"] == 1)).sum()
    fp = ((y_pred == 1) & (rand_preds["y_true"] == 0)).sum()
    tn = ((y_pred == 0) & (rand_preds["y_true"] == 0)).sum()
    fn = ((y_pred == 0) & (rand_preds["y_true"] == 1)).sum()

    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    f1   = 2 * prec * rec / max(prec + rec, 1e-12)
    spec = tn / max(tn + fp, 1)
    bal_acc = (rec + spec) / 2

    scores.append((t, prec, rec, f1, bal_acc))

#Sort by F1-Score to determine the best threshold
scores_df = pd.DataFrame(scores, columns=["threshold", "precision", "recall", "f1", "bal_acc"]).sort_values("f1", ascending = False).reset_index(drop=True)
best_threshold = scores_df.loc[scores_df["f1"].idxmax(), "threshold"]
scores_df

Unnamed: 0,threshold,precision,recall,f1,bal_acc
0,0.155779,0.241254,0.420518,0.306606,0.652389
1,0.160804,0.245284,0.407625,0.306272,0.648931
2,0.165829,0.249514,0.395793,0.306074,0.645805
3,0.145729,0.232951,0.445950,0.306037,0.658722
4,0.150754,0.236497,0.432602,0.305812,0.655190
...,...,...,...,...,...
195,0.979899,0.357143,0.000253,0.000505,0.500106
196,0.984925,0.333333,0.000202,0.000404,0.500083
197,0.994975,0.285714,0.000101,0.000202,0.500039
198,0.989950,0.250000,0.000101,0.000202,0.500037


# Holdout Evaluation

In [None]:
X_train, y_train = apps_cv_strat[feature_cols_strat], apps_cv_strat[target_col]
X_test, y_test = apps_holdout_strat[feature_cols_strat], apps_holdout_strat[target_col]

pca = PCA(n_components = 0.95)
pca.fit(X_train)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

y_prob = lda.predict_proba(X_test)[:, 1]
y_train_prob = lda.predict_proba(X_train)[:, 1]
y_pred = (y_prob > best_threshold).astype(int)

metrics = classification_metrics(y_test, y_pred)
metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
metrics['pr_roc_auc'] = pr_auc_from_probs(y_test, y_prob)
metrics

  auc = np.trapz(tpr, fpr)


{'n': 61443,
 'tp': np.int64(2093),
 'tn': np.int64(50104),
 'fp': np.int64(6379),
 'fn': np.int64(2867),
 'acc': np.float64(0.8495190664518334),
 'bal_acc': np.float64(0.6545195853248451),
 'prec': np.float64(0.2470491029272899),
 'rec': np.float64(0.4219758064516129),
 'spec': np.float64(0.8870633641980773),
 'f1': np.float64(0.3116438356164384),
 'roc_auc': np.float64(0.7663528292555053),
 'train_roc_auc': np.float64(0.7624949512474828)}

# Data Leakage Check

In [47]:
shuffled = apps_cv_strat.copy()
shuffled['TARGET'] = np.random.permutation(shuffled['TARGET'].values)
fold_results_shuffled, preds_shuffled = cv_lda(shuffled, feature_cols_rand, target_col, threshold = best_threshold, params = None)
print("Shuffled mean AUC:", fold_results_shuffled["roc_auc"].mean())

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Shuffled mean AUC: 0.4978999206075014


# Threshold Tuning Part 2

In [68]:
def thresh_pair_metrics(y_true, y_prob, t_low, t_high):

    # make decisions (0=Approve, Deny=1, Review=2)
    decision = np.where(y_prob <= t_low, 0, np.where(y_prob >= t_high, 1, -1)) 

    # compute some metrics for each class
    metrics = {}
    for name, k in [("approve",0), ("review",-1), ("deny",1)]:

        # how many applicants in this group (and %)
        idx = (decision==k)
        n = idx.sum() 
        metrics[f"{name}_n"] = n
        metrics[f"{name}_rate"] = n / len(y_true)

        # default rate in this group
        metrics[f"{name}_default_rate"] = y_true[idx].mean() if n else np.nan

    # how often approved loans did not default (precision, high=good)
    metrics["approve_nondefault_precision"] = 1 - metrics["approve_default_rate"]  

    # how often denied loans did default (precision, high=good)
    metrics["deny_default_precision"] = metrics["deny_default_rate"]    

    # recall of defaults caught by deny
    pos = (y_true==1)
    metrics["deny_default_recall"] = ((decision==1) & pos).sum() / max(pos.sum(),1)

    return metrics

In [71]:
def search_thresholds(y_true, y_prob, target_approve_prec=0.95, target_deny_prec=0.45, target_review_rate=0.25):

    # loop through possible threshold pairs
    rows = []
    for t_low in np.linspace(0.05, 0.40, 71):      # approve threshold grid
        for t_high in np.linspace(0.50, 0.90, 81):  # deny threshold grid
            if t_low >= t_high: 
                continue

            # get metrics for this threshold pair
            metrics = thresh_pair_metrics(y_true, y_prob, t_low, t_high)

            # check if meets each target constraint
            approve_ok = (metrics["approve_nondefault_precision"] >= target_approve_prec)
            deny_ok = (metrics["deny_default_precision"] >= target_deny_prec)
            review_ok = (metrics["review_rate"] <= target_review_rate)

            # if all are good, compute a overall score of how good it did (kind of arbitrary)
            ## weight good denials heaviest because want to avoid approving bad loans
            if approve_ok and deny_ok and review_ok:
                rows.append({"t_low":t_low, "t_high":t_high, **metrics})
                
    return pd.DataFrame(rows)

In [91]:
rand_preds

Unnamed: 0,fold,y_true,y_prob
0,5,0,0.017344
1,5,0,0.070515
2,5,0,0.028619
3,5,0,0.089661
4,5,0,0.103879
...,...,...,...
245771,2,0,0.032629
245772,2,0,0.167567
245773,2,0,0.041493
245774,2,0,0.049403


In [82]:

thresh_pair_results = search_thresholds(rand_preds.y_true,
                                        rand_preds.y_prob,
                                        target_approve_prec = 0.95,
                                        target_deny_prec = 0.5,
                                        target_review_rate = 0.22)

thresh_pair_results


Unnamed: 0,t_low,t_high,approve_n,approve_rate,approve_default_rate,review_n,review_rate,review_default_rate,deny_n,deny_rate,deny_default_rate,approve_nondefault_precision,deny_default_precision,deny_default_recall
0,0.11,0.565,191021,0.777216,0.045524,53603,0.218097,0.195959,1152,0.004687,0.501736,0.954476,0.501736,0.029224
1,0.11,0.570,191021,0.777216,0.045524,53658,0.218321,0.196150,1097,0.004463,0.507748,0.954476,0.507748,0.028163
2,0.11,0.575,191021,0.777216,0.045524,53699,0.218488,0.196428,1056,0.004297,0.505682,0.954476,0.505682,0.027000
3,0.11,0.580,191021,0.777216,0.045524,53737,0.218642,0.196550,1018,0.004142,0.510806,0.954476,0.510806,0.026292
4,0.11,0.585,191021,0.777216,0.045524,53788,0.218850,0.196847,967,0.003934,0.510858,0.954476,0.510858,0.024977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,0.13,0.785,201558,0.820088,0.049703,44083,0.179363,0.219790,135,0.000549,0.525926,0.950297,0.525926,0.003590
205,0.13,0.790,201558,0.820088,0.049703,44092,0.179399,0.219881,126,0.000513,0.515873,0.950297,0.515873,0.003286
206,0.13,0.795,201558,0.820088,0.049703,44092,0.179399,0.219881,126,0.000513,0.515873,0.950297,0.515873,0.003286
207,0.13,0.800,201558,0.820088,0.049703,44097,0.179419,0.219902,121,0.000492,0.520661,0.950297,0.520661,0.003185


In [87]:
results = thresh_pair_results[thresh_pair_results.deny_rate > 0.0045]
results

Unnamed: 0,t_low,t_high,approve_n,approve_rate,approve_default_rate,review_n,review_rate,review_default_rate,deny_n,deny_rate,deny_default_rate,approve_nondefault_precision,deny_default_precision,deny_default_recall
0,0.11,0.565,191021,0.777216,0.045524,53603,0.218097,0.195959,1152,0.004687,0.501736,0.954476,0.501736,0.029224
13,0.115,0.565,193991,0.7893,0.046549,50633,0.206013,0.200857,1152,0.004687,0.501736,0.953451,0.501736,0.029224
62,0.12,0.565,196658,0.800151,0.047616,47966,0.195161,0.205062,1152,0.004687,0.501736,0.952384,0.501736,0.029224
111,0.125,0.565,199266,0.810763,0.048699,45358,0.18455,0.209357,1152,0.004687,0.501736,0.951301,0.501736,0.029224
160,0.13,0.565,201558,0.820088,0.049703,43066,0.175225,0.213208,1152,0.004687,0.501736,0.950297,0.501736,0.029224


In [90]:
apps_holdout_rand = apps_holdout_rand.iloc[:len(y_prob)].copy()
apps_holdout_rand['y_prob'] = y_prob

thresh_pair_metrics(apps_holdout_rand.TARGET, apps_holdout_rand.y_prob, 0.11, 0.57)

{'approve_n': np.int64(47783),
 'approve_rate': np.float64(0.7776801262959165),
 'approve_default_rate': np.float64(0.08111671514973945),
 'review_n': np.int64(13384),
 'review_rate': np.float64(0.21782790553846654),
 'review_default_rate': np.float64(0.08390615660490137),
 'deny_n': np.int64(276),
 'deny_rate': np.float64(0.004491968165616913),
 'deny_default_rate': np.float64(0.09782608695652174),
 'approve_nondefault_precision': np.float64(0.9188832848502606),
 'deny_default_precision': np.float64(0.09782608695652174),
 'deny_default_recall': np.float64(0.005372065260644648)}