In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Metrics

In [8]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    metrics = {
        "tp":tp, "tn": tn, "fp":fp, "fn":fn, "acc":acc, "prec":prec, "rec": rec, "f1":f1
    }
    return metrics

# Train, Test, Validation

In [9]:
# 1. randomly sampled split
def train_test_index_split_random(n, k, seed=42):
    rng = np.random.default_rng(seed)
    idx = np.arange(n)
    rng.shuffle(idx)
    folds = np.array_split(idx, k)
    return [
        (np.concatenate(folds[:i] + folds[i+1:]), folds[i])
        for i in range(k)
    ]

In [10]:
# 2. Stratified split
def train_test_index_split_stratified(y, k, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    folds = [[] for _ in range(k)]
    # distribute classes evenly across folds
    for cls in np.unique(y):
        indexes = rng.permutation(np.where(y == cls)[0])
        split = np.array_split(indexes, k)
        for i in range(k):
            folds[i].extend(split[i])
    splits = []
    for i in range(k):
        test_idx = np.array(folds[i])
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((train_idx, test_idx))
    return splits

In [11]:
#3. multi-stratified split
def train_test_index_split_multi(x, y, k, seed=42):
    rng = np.random.default_rng(seed)
    x, y = np.asarray(x), np.asarray(y)
    folds = [[] for _ in range(k)]
    # distribute classes evenly across folds
    for cls in np.unique(y):
        indexes = rng.permutation(np.where(y == cls)[0])
        split = np.array_split(indexes, k)
        for i in range(k):
            folds[i].extend(split[i])
    splits = []
    for i in range(k):
        test_idx = np.array(folds[i])
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((
            x[train_idx], x[test_idx],
            y[train_idx], y[test_idx]
            ))
    return splits

# Cross Validation

In [12]:
import numpy as np

def cross_validate(X, y, k=5, split="stratified", seed=42, model_fn=None):
    X = np.asarray(X)
    y = np.asarray(y)

    # split method
    if split == "stratified":
        splits = train_test_index_split_stratified(y, k, seed)
    elif split == "random":
        splits = train_test_index_split_random(len(y), k, seed)
    elif split == "nonrandom":
        splits = train_test_index_split_multi(X, y, k, seed)
    else:
        raise ValueError("split type not found")

    results = []
    for fold, (train_idx, test_idx) in enumerate(splits, 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        if model_fn is None:
            results.append({
                "fold": fold,
                "train_idx": train_idx, "test_idx": test_idx,
                "X_train": X_train, "y_train": y_train,
                "X_test": X_test, "y_test": y_test,
            })
        else:
            # TODO: create model_fn for our four methods
            metrics = model_fn(X_train, y_train, X_test, y_test)  
            metrics["fold"] = fold
            results.append(metrics)
    return results

# Import Data

In [13]:
import pandas as pd
data = pd.read_csv("apps_cv.txt")
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,prev_last_decision_status_internal_Canceled,prev_last_decision_status_internal_Refused,prev_last_decision_status_internal_Unused offer,most_recent_loan_active_external_group_Closed,most_recent_loan_active_external_group_Problematic,most_recent_loan_type_external_group_Business/Other,most_recent_loan_type_external_group_Consumer/Personal,most_recent_loan_type_external_group_Mortgage/Real estate,prev_last_decision_status_internal_group_Canceled,prev_last_decision_status_internal_group_Refused
0,384635,0,1,135000.0,1078200.0,34780.5,900000.0,0.010966,-14899,-4019,...,False,False,False,False,False,False,True,False,False,False
1,384639,0,0,157500.0,900000.0,26316.0,900000.0,0.035792,-20671,365243,...,True,False,False,False,False,False,True,False,True,False
2,384641,0,0,180000.0,490495.5,27387.0,454500.0,0.018029,-20168,-1161,...,False,False,False,False,False,False,True,False,False,False
3,384642,0,0,135000.0,508495.5,38146.5,454500.0,0.072508,-14048,-3569,...,False,False,False,True,False,False,True,False,False,False
4,384645,1,0,180000.0,225000.0,20637.0,225000.0,0.010006,-11025,-4194,...,True,False,False,False,False,False,True,False,True,False


In [14]:
y = data["TARGET"]
X = data.drop(columns=["TARGET", "SK_ID_CURR"])
cross_validate(X, y, split="random")

[{'fold': 1,
  'train_idx': array([134280, 147447, 114405, ..., 131009, 129997,  75400],
        shape=(196621,)),
  'test_idx': array([231792, 211940, 166946, ..., 216518, 113238, 167319],
        shape=(49156,)),
  'X_train': array([[1, 81000.0, 254700.0, ..., False, True, False],
         [0, 157500.0, 427500.0, ..., False, False, False],
         [0, 216000.0, 2250000.0, ..., False, False, False],
         ...,
         [1, 135000.0, 106659.0, ..., False, False, False],
         [0, 157500.0, 301464.0, ..., False, False, False],
         [0, 315000.0, 497520.0, ..., False, False, False]],
        shape=(196621, 224), dtype=object),
  'y_train': array([0, 1, 0, ..., 0, 0, 0], shape=(196621,)),
  'X_test': array([[0, 135000.0, 394303.5, ..., False, False, True],
         [0, 270000.0, 1211049.0, ..., False, True, False],
         [0, 198000.0, 755190.0, ..., False, False, False],
         ...,
         [1, 135000.0, 127350.0, ..., False, False, False],
         [3, 103500.0, 270000.0

# Helper Functions

In [15]:
def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc

In [16]:
def inner_stratified_split(y, val_frac=0.2, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    val_idx = []
    for cls in np.unique(y):
        cls_idx = np.where(y==cls)[0]
        cls_idx = rng.permutation(cls_idx)
        cut = max(1, int(len(cls_idx)*val_frac))
        val_idx.extend(cls_idx[:cut])
    val_idx = np.array(val_idx)
    train_idx = np.setdiff1d(np.arange(len(y)), val_idx)
    return train_idx, val_idx

In [17]:
def best_threshold(y_true, y_prob, metric="f1"):
    grid = np.linspace(0, 1, 101)
    best_t, best_score = 0.5, -1.0
    for t in grid:
        y_pred = (y_prob >= t).astype(int)
        m = classification_metrics(y_true, y_pred)
        score = m[metric]
        if score > best_score:
            best_score = score
            best_t = t
    return float(best_t)

# LDA Function

In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
def model_fn_lda(X_train, y_train, X_test, y_test, pca_arg=0.90, metric_for_thr="f1"):
    # inner train/val to tune for best prob threshold
    inner_train_idx, inner_val_idx = inner_stratified_split(y_train, val_frac=0.2, seed=42)
   
    # preprocessing
    steps = [("scale", StandardScaler())]
    if pca_arg is not None:
        steps.append(("pca", PCA(n_components=pca_arg)))
    preproc = Pipeline(steps)

    # tune and get best threshold on train/val set
    X_inner_train = preproc.fit_transform(X_train[inner_train_idx])
    X_inner_val = preproc.transform(X_train[inner_val_idx])
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_inner_train, y_train[inner_train_idx])
    y_prob_val = lda.predict_proba(X_inner_val)[:, 1]
    best_thr = best_threshold(y_train[inner_val_idx], y_prob_val, metric=metric_for_thr)

    # refit model on outer train/test 
    X_outer_train = preproc.fit_transform(X_train)
    X_outer_test = preproc.transform(X_test)
    lda.fit(X_outer_train, y_train)

    y_prob_test = lda.predict_proba(X_outer_test)[:, 1]
    y_pred_test = (y_prob_test >= best_thr).astype(int)
    
    m = classification_metrics(y_test, y_pred_test)
    m["roc_auc"] = roc_auc_from_probs(y_test, y_prob_test)
    m["model"] = "LR"
    m["pca"] = pca_arg
    m["best_thr"] = best_thr
    return m


# Run Cross Validation

In [19]:
lda_results = cross_validate(
    X.values, y.values,
    k=5, split="stratified", seed=42,
    model_fn=lambda X_train, y_train, X_test, y_test : model_fn_lda(X_train, y_train, X_test, y_test, pca_arg=0.90)
)

lda_results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


[{'tp': np.int64(1664),
  'tn': np.int64(39680),
  'fp': np.int64(5507),
  'fn': np.int64(2305),
  'acc': np.float64(0.8410773862804134),
  'prec': np.float64(0.23204573978524612),
  'rec': np.float64(0.41924918115394305),
  'f1': np.float64(0.2987432675044883),
  'roc_auc': np.float64(0.755709248501634),
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.15,
  'fold': 1},
 {'tp': np.int64(1726),
  'tn': np.int64(39523),
  'fp': np.int64(5664),
  'fn': np.int64(2243),
  'acc': np.float64(0.8391447636097323),
  'prec': np.float64(0.23355886332882272),
  'rec': np.float64(0.4348702443940539),
  'f1': np.float64(0.3038999911964081),
  'roc_auc': np.float64(0.7531265318924434),
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.15,
  'fold': 2},
 {'tp': np.int64(1560),
  'tn': np.int64(40216),
  'fp': np.int64(4971),
  'fn': np.int64(2409),
  'acc': np.float64(0.849865733582879),
  'prec': np.float64(0.2388608176389527),
  'rec': np.float64(0.3930461073318216),
  'f1': np.float64(0.297142857142857