In [23]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression



# Metrics

In [24]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics


def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc


def pr_auc_from_probs(y_true, y_prob):
    # Sort by predicted probability descending
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    
    tp = 0
    fp = 0
    pos = np.sum(y_true == 1)
    
    precision = [1.0]  # starts at 1 when recall=0
    recall = [0.0]
    
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        prec = tp / (tp + fp)
        rec = tp / pos
        precision.append(prec)
        recall.append(rec)
    
    # ensure it ends at recall=1
    precision = np.array(precision)
    recall = np.array(recall)
    
    # integrate area under curve
    auc_pr = np.trapz(precision, recall)
    return auc_pr

# LR Cross Validation

In [25]:
def cv_lr(data, feature_cols, target_col, threshold = 0.5, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    all_preds = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        #Create the logistic regression model with equal class weights
        lr_model = LogisticRegression(class_weight="balanced", 
                                      max_iter=500
                                      )
        
        lr_model.fit(X_train, y_train)
        y_prob = lr_model.predict_proba(X_test)[:, 1]
        y_train_prob = lr_model.predict_proba(X_train)[:, 1]

        #Set the default threshold to 0.5, will below
        y_pred = (y_prob > threshold).astype(int)

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['pr_roc_auc'] = pr_auc_from_probs(y_test, y_prob)
        metrics['fold'] = int(f)
        metrics['threshold'] = threshold
        fold_metrics.append(metrics)

        #Save the probabilities to later tune the threshold
        fold_preds = pd.DataFrame({
            'fold': f,
            'y_true': y_test.values,
            'y_prob': y_prob
        })
        all_preds.append(fold_preds)


    results_df = pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)
    preds_df = pd.concat(all_preds, ignore_index=True)

    return results_df, preds_df

        


# Import Data

In [26]:
#Stratified data set
apps_cv_strat = pd.read_csv("data/apps_cv_strat.csv")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.csv")

#Random data set
apps_cv_rand = pd.read_csv("data/apps_cv_random.csv")
apps_holdout_rand = pd.read_csv("data/apps_holdout_random.csv")

#Multi-stratified data set
apps_cv_multi = pd.read_csv("data/apps_cv_multi.csv")
apps_holdout_multi = pd.read_csv("data/apps_holdout_multi.csv")

target_col = 'TARGET'
feature_cols_strat = [col for col in apps_cv_strat.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]

feature_cols_rand = [col for col in apps_cv_rand.columns if col not in
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500', 'AGE_INT', 'CODE_GENDER_M',
                 'CODE_GENDER_XNA', 'DAYS_BIRTH',
                 'NAME_FAMILY_STATUS_Previously Married', 'NAME_FAMILY_STATUS_Single']]


# Stratified Results

In [27]:
strat_results, strat_preds = cv_lr(apps_cv_strat, feature_cols_strat, target_col, params = None)
strat_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_roc_auc,fold,threshold
0,49156,2136,29309,15878,1833,0.639698,0.593393,0.118574,0.538171,0.648616,0.194332,0.629409,0.632667,0.119185,1,0.5
1,49156,2102,29916,15271,1867,0.651355,0.595827,0.120992,0.529604,0.662049,0.196982,0.633406,0.633279,0.12222,2,0.5
2,49156,2161,28894,16293,1808,0.631764,0.591951,0.117102,0.54447,0.639432,0.192749,0.622193,0.633949,0.118589,3,0.5
3,49155,2138,29232,15954,1831,0.638185,0.5928,0.118174,0.538675,0.646926,0.193826,0.631869,0.631726,0.118427,4,0.5
4,49154,2202,29682,15504,1766,0.648655,0.605912,0.124365,0.55494,0.656885,0.203193,0.642852,0.632312,0.125337,5,0.5


# Random Results

In [28]:
rand_results, rand_preds = cv_lr(apps_cv_rand, feature_cols_rand, target_col, params = None)
rand_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_roc_auc,fold,threshold
0,49156,2166,28831,16347,1812,0.630584,0.59133,0.116999,0.544495,0.638165,0.19261,0.625577,0.633525,0.121292,1,0.5
1,49155,2093,29952,15263,1847,0.651917,0.596827,0.120592,0.531218,0.662435,0.196563,0.636674,0.632877,0.120439,2,0.5
2,49155,2173,28462,16810,1710,0.623233,0.594154,0.114471,0.559619,0.628689,0.190064,0.631284,0.631873,0.119116,3,0.5
3,49155,2151,29807,15391,1806,0.650147,0.601535,0.12262,0.543594,0.659476,0.200102,0.639989,0.633408,0.124125,4,0.5
4,49155,2235,28601,16534,1785,0.627322,0.594823,0.119079,0.55597,0.633677,0.196147,0.628971,0.633674,0.120913,5,0.5


# Multi-Stratified Results

In [29]:
multi_results, multi_preds = cv_lr(apps_cv_multi, feature_cols_strat, target_col, params = None)
multi_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_roc_auc,fold,threshold
0,49158,2218,29382,15806,1752,0.642825,0.604454,0.123058,0.55869,0.650217,0.201691,0.637011,0.635453,0.121915,1,0.5
1,49157,2139,29352,15835,1831,0.640621,0.594179,0.119005,0.538791,0.649567,0.194951,0.631294,0.637141,0.122909,2,0.5
2,49155,2114,29902,15285,1854,0.651327,0.597251,0.121501,0.532762,0.661739,0.197875,0.638195,0.637815,0.124756,3,0.5
3,49154,2198,28649,16537,1770,0.627558,0.593978,0.117321,0.553931,0.634024,0.193631,0.629695,0.635517,0.122008,4,0.5
4,49154,2230,29451,15735,1738,0.644525,0.606884,0.12413,0.561996,0.651773,0.203347,0.641863,0.635254,0.125252,5,0.5


# Comparison Between Results

In [30]:
print(f"Stratified Avg ROC-AUC: {strat_results['roc_auc'].mean():.4f}")
print(f"Random Avg ROC-AUC: {rand_results['roc_auc'].mean():.4f}")
print(f"Multi-Stratified Avg ROC-AUC: {multi_results['roc_auc'].mean():.4f}")

Stratified Avg ROC-AUC: 0.6319
Random Avg ROC-AUC: 0.6325
Multi-Stratified Avg ROC-AUC: 0.6356


# Tune Threshold
Since the highest ROC-AUC was with the multi-stratified set, we will tune the threshold on that set.

In [31]:
thresholds = np.linspace(0, 1, 200)
scores = []
for t in thresholds:
    y_pred = (strat_preds["y_prob"] >= t).astype(int)
    tp = ((y_pred == 1) & (strat_preds["y_true"] == 1)).sum()
    fp = ((y_pred == 1) & (strat_preds["y_true"] == 0)).sum()
    tn = ((y_pred == 0) & (strat_preds["y_true"] == 0)).sum()
    fn = ((y_pred == 0) & (strat_preds["y_true"] == 1)).sum()

    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    f1   = 2 * prec * rec / max(prec + rec, 1e-12)
    spec = tn / max(tn + fp, 1)
    bal_acc = (rec + spec) / 2

    scores.append((t, prec, rec, f1, bal_acc))

#Sort by F1-Score to determine the best threshold
scores_df = pd.DataFrame(scores, columns=["threshold", "precision", "recall", "f1", "bal_acc"]).sort_values("f1", ascending = False).reset_index(drop=True)
best_threshold = scores_df.loc[scores_df["f1"].idxmax(), "threshold"]
scores_df

Unnamed: 0,threshold,precision,recall,f1,bal_acc
0,0.512563,0.124125,0.482665,0.197468,0.591762
1,0.507538,0.122464,0.506148,0.197212,0.593797
2,0.517588,0.125354,0.457569,0.196795,0.588577
3,0.522613,0.126982,0.433128,0.196388,0.585791
4,0.502513,0.120482,0.528623,0.196238,0.594843
...,...,...,...,...,...
195,0.989950,0.269231,0.000353,0.000705,0.500134
196,0.979899,0.259259,0.000353,0.000705,0.500132
197,0.974874,0.225806,0.000353,0.000704,0.500123
198,0.994975,0.263158,0.000252,0.000503,0.500095


# LR With Penalty

In [41]:
def cv_lr_penalty(data, feature_cols, target_col, threshold, penalty, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        lr_model = LogisticRegression(penalty="l2", 
                                      C=penalty,
                                      class_weight="balanced", 
                                      max_iter=500
                                      )
        
        lr_model.fit(X_train, y_train)
        y_prob = lr_model.predict_proba(X_test)[:, 1]
        y_train_prob = lr_model.predict_proba(X_train)[:, 1]

        y_pred = (y_prob > threshold).astype(int)

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['pr_roc_auc'] = pr_auc_from_probs(y_test, y_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)

    return pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)

        


# Hyperparameter Tuning

In [73]:
lambda_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

lam_results = []

for lam in lambda_values:
    results = cv_lr_penalty(apps_cv_multi, feature_cols_strat, target_col, threshold = best_threshold, penalty = lam)
    avg_roc_auc = results['roc_auc'].mean()

    lam_results.append({
        "Lambda_Value": lam,
        "avg_roc_auc": avg_roc_auc
    })

results_df = pd.DataFrame(lam_results).sort_values("avg_roc_auc", ascending = False).reset_index(drop=True)
best_lambda = results_df.loc[results_df["avg_roc_auc"].idxmax(), "Lambda_Value"]
results_df






STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  auc = np.trapz(tpr, fpr)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  auc = np.trapz(tpr, fpr)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the 

Unnamed: 0,Lambda_Value,avg_roc_auc
0,0.1,0.635764
1,1000.0,0.63566
2,100.0,0.63564
3,10.0,0.635593
4,1.0,0.63546
5,0.01,0.635188
6,0.001,0.634078


In [42]:
tuned_results = cv_lr_penalty(apps_cv_strat, feature_cols_strat, target_col, threshold = 0.5, penalty = 0.1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [43]:
tuned_results

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_roc_auc,fold
0,49156,2113,29803,15384,1856,0.64928,0.595962,0.120764,0.532376,0.659548,0.196869,0.632073,0.635214,0.120492,1
1,49156,2104,29841,15346,1865,0.64987,0.595249,0.120573,0.530108,0.660389,0.196461,0.632253,0.632403,0.121154,2
2,49156,2080,29745,15442,1889,0.647429,0.591163,0.118708,0.524061,0.658265,0.193569,0.62196,0.635165,0.118766,3
3,49155,2167,28831,16355,1802,0.630617,0.592016,0.116996,0.545981,0.638052,0.192699,0.630946,0.631278,0.119275,4
4,49154,2118,30465,14721,1850,0.662876,0.603992,0.125779,0.53377,0.674213,0.203585,0.64329,0.632556,0.125114,5


# Data Leakage Check

In [33]:
shuffled = apps_cv_strat.copy()
shuffled['TARGET'] = np.random.permutation(shuffled['TARGET'].values)
fold_results_shuffled = cv_lr_penalty(shuffled, feature_cols_strat, target_col, threshold = 0.5, penalty = 0.1)
print("Shuffled mean AUC:", fold_results_shuffled.roc_auc.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Shuffled mean AUC: 0.4950246444941664
