In [24]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# Metrics

In [32]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics


def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc

# LDA Cross Validation

In [33]:
def cv_lda(data, feature_cols, target_col, threshold = 0.5, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    all_preds = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        lda_model = LinearDiscriminantAnalysis()
        lda_model.fit(X_train, y_train)

        y_prob = lda_model.predict_proba(X_test)[:, 1]
        y_train_prob = lda_model.predict_proba(X_train)[:, 1]
        y_pred = (y_prob > threshold).astype(int)

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)

        fold_preds = pd.DataFrame({
            'fold': f,
            'y_true': y_test.values,
            'y_prob': y_prob
        })
        all_preds.append(fold_preds)


    results_df = pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)
    preds_df = pd.concat(all_preds, ignore_index=True)

    return results_df, preds_df


        


# Import Data

In [34]:
#Stratified data set
apps_cv_strat = pd.read_csv("data/apps_cv_strat.txt")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.txt")

#Random data set
apps_cv_rand = pd.read_csv("data/apps_cv_random.txt")
apps_holdout_rand = pd.read_csv("data/apps_holdout_random.txt")

#Multi-stratified data set
apps_cv_multi = pd.read_csv("data/apps_cv_multi.txt")
apps_holdout_multi = pd.read_csv("data/apps_holdout_multi.txt")

target_col = 'TARGET'
feature_cols_strat = [col for col in apps_cv_strat.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]

feature_cols_rand = [col for col in apps_cv_rand.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]


# Stratified Results

In [35]:
strat_results, strat_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,5


# Random Results

In [36]:
rand_results, rand_preds = cv_lda(apps_cv_rand, feature_cols_rand, target_col, params = None)
rand_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,172,44960,218,3806,0.918138,0.519206,0.441026,0.043238,0.995175,0.078755,0.760461,0.765584,1
1,49155,202,44990,225,3738,0.919377,0.523146,0.473068,0.051269,0.995024,0.092512,0.761883,0.764656,2
2,49155,186,45075,197,3697,0.920781,0.521775,0.48564,0.047901,0.995649,0.087201,0.761395,0.764668,3
3,49155,199,44980,218,3758,0.919113,0.522734,0.477218,0.050291,0.995177,0.090992,0.764674,0.763945,4
4,49155,176,44935,200,3844,0.91773,0.519675,0.468085,0.043781,0.995569,0.080073,0.762706,0.764829,5


# Multi-Stratified Results

In [37]:
multi_results, multi_preds = cv_lda(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,190,44984,203,3779,0.918993,0.521689,0.483461,0.047871,0.995508,0.087116,0.763452,0.761945,1
1,49156,203,44952,235,3766,0.918606,0.522973,0.46347,0.051146,0.994799,0.092126,0.764456,0.761871,2
2,49156,189,44951,236,3780,0.918301,0.521198,0.444706,0.047619,0.994777,0.086026,0.75253,0.764671,3
3,49155,200,44971,215,3769,0.91895,0.522816,0.481928,0.050391,0.995242,0.091241,0.762864,0.761982,4
4,49154,204,44947,239,3764,0.918562,0.523061,0.460497,0.051411,0.994711,0.092496,0.75781,0.763469,5


# Comparison Between Results

In [38]:
print(f"Stratified Avg ROC-AUC: {strat_results['roc_auc'].mean():.4f}")
print(f"Random Avg ROC-AUC: {rand_results['roc_auc'].mean():.4f}")
print(f"Multi-Stratified Avg ROC-AUC: {multi_results['roc_auc'].mean():.4f}")

Stratified Avg ROC-AUC: 0.7602
Random Avg ROC-AUC: 0.7622
Multi-Stratified Avg ROC-AUC: 0.7602


# Threshold Tuning
The random set had the best ROC-AUC out of all of the data sets so we will tune the threshold using the random set.

In [39]:
thresholds = np.linspace(0, 1, 200)
scores = []
for t in thresholds:
    y_pred = (rand_preds["y_prob"] >= t).astype(int)
    tp = ((y_pred == 1) & (rand_preds["y_true"] == 1)).sum()
    fp = ((y_pred == 1) & (rand_preds["y_true"] == 0)).sum()
    tn = ((y_pred == 0) & (rand_preds["y_true"] == 0)).sum()
    fn = ((y_pred == 0) & (rand_preds["y_true"] == 1)).sum()

    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    f1   = 2 * prec * rec / max(prec + rec, 1e-12)
    spec = tn / max(tn + fp, 1)
    bal_acc = (rec + spec) / 2

    scores.append((t, prec, rec, f1, bal_acc))

#Sort by F1-Score to determine the best threshold
scores_df = pd.DataFrame(scores, columns=["threshold", "precision", "recall", "f1", "bal_acc"]).sort_values("f1", ascending = False).reset_index(drop=True)
best_threshold = scores_df.loc[scores_df["f1"].idxmax(), "threshold"]
scores_df

Unnamed: 0,threshold,precision,recall,f1,bal_acc
0,0.155779,0.241254,0.420518,0.306606,0.652389
1,0.160804,0.245284,0.407625,0.306272,0.648931
2,0.165829,0.249514,0.395793,0.306074,0.645805
3,0.145729,0.232951,0.445950,0.306037,0.658722
4,0.150754,0.236497,0.432602,0.305812,0.655190
...,...,...,...,...,...
195,0.979899,0.357143,0.000253,0.000505,0.500106
196,0.984925,0.333333,0.000202,0.000404,0.500083
197,0.994975,0.285714,0.000101,0.000202,0.500039
198,0.989950,0.250000,0.000101,0.000202,0.500037


# Holdout Evaluation

In [40]:
X_train, y_train = apps_cv_strat[feature_cols_strat], apps_cv_strat[target_col]
X_test, y_test = apps_holdout_strat[feature_cols_strat], apps_holdout_strat[target_col]

pca = PCA(n_components = 0.95)
pca.fit(X_train)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

y_prob = lda.predict_proba(X_test)[:, 1]
y_train_prob = lda.predict_proba(X_train)[:, 1]
y_pred = (y_prob > best_threshold).astype(int)

metrics = classification_metrics(y_test, y_pred)
metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
metrics

  auc = np.trapz(tpr, fpr)


{'n': 61443,
 'tp': np.int64(2093),
 'tn': np.int64(50104),
 'fp': np.int64(6379),
 'fn': np.int64(2867),
 'acc': np.float64(0.8495190664518334),
 'bal_acc': np.float64(0.6545195853248451),
 'prec': np.float64(0.2470491029272899),
 'rec': np.float64(0.4219758064516129),
 'spec': np.float64(0.8870633641980773),
 'f1': np.float64(0.3116438356164384),
 'roc_auc': np.float64(0.7663528292555053),
 'train_roc_auc': np.float64(0.7624949512474828)}

# Data Leakage Check

In [47]:
shuffled = apps_cv_strat.copy()
shuffled['TARGET'] = np.random.permutation(shuffled['TARGET'].values)
fold_results_shuffled, preds_shuffled = cv_lda(shuffled, feature_cols_rand, target_col, threshold = best_threshold, params = None)
print("Shuffled mean AUC:", fold_results_shuffled["roc_auc"].mean())

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Shuffled mean AUC: 0.4978999206075014
