In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

### Metrics

In [18]:
def classification_metrics(y_true, y_pred):
    """
    Computes confusion matrix + accuracy, precision, recall, F1, and balanced accuracy.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Confusion matrix components
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Metrics
    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2 * prec * rec / max((prec + rec), 1e-12)) if (prec + rec) > 0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics

### Test, Train, Validation

Test, Training, and Validation Sets
- A completely randomly sampled split
 							
- A stratified split
 							
- A split that is chosen in a non-random way, so that your test and/or validation sets can be considered to more accurately represent the data that will be seen when the system is deployed


In [3]:
# 1. randomly sampled split
def train_test_index_split_random(n, k, seed=42):
    rng = np.random.default_rng(seed)
    idx = np.arange(n)
    rng.shuffle(idx)
    folds = np.array_split(idx, k)
    return [
        (np.concatenate(folds[:i] + folds[i+1:]), folds[i])
        for i in range(k)
    ]

In [4]:
#2. stratified split
def train_test_index_split_stratified(y, k, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    folds = [[] for _ in range(k)]
    # distribute classes evenly across folds
    for cls in np.unique(y):
        indexes = rng.permutation(np.where(y == cls)[0])
        split = np.array_split(indexes, k)
        for i in range(k):
            folds[i].extend(split[i])
    splits = []
    for i in range(k):
        test_idx = np.array(folds[i])
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((train_idx, test_idx))
    return splits

In [5]:
#3. multiple stratfied split
def train_test_index_split_multi_stratified(stratifiers, k, seed=42):
    rng = np.random.default_rng(seed)
    S = np.asarray(stratifiers)
    if S.ndim == 1:
        S = S.reshape(-1, 1)

    n = S.shape[0]
    S = S.astype('U64')  # convert to string for concatenation
    keys = S[:, 0]
    for j in range(1, S.shape[1]):
        keys = np.char.add(np.char.add(keys, '¦'), S[:, j])

    unique_keys = np.unique(keys)
    folds = [[] for _ in range(k)]

    # distribute samples with the same key evenly across folds
    for uk in unique_keys:
        idxs = np.where(keys == uk)[0]
        idxs = rng.permutation(idxs)
        for t, idx in enumerate(idxs):
            folds[t % k].append(idx)

    # convert to arrays
    folds = [np.array(f, dtype=int) for f in folds]

    # build splits (train = all other folds)
    splits = []
    for i in range(k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i]) if k > 1 else np.array([], dtype=int)
        splits.append((train_idx, test_idx))
    return splits

## cross val

In [6]:
import numpy as np

def cross_validate(X, y, k=5, split="stratified", seed=42, model_fn=None, stratifiers=None):
    X = np.asarray(X)
    y = np.asarray(y)

    # split method
    if split == "stratified":
        splits = train_test_index_split_stratified(y, k, seed)
    elif split == "random":
        splits = train_test_index_split_random(len(y), k, seed)
    elif split == "multi-stratified":
        if stratifiers is None:
            raise ValueError("stratifiers must be provided for multi-stratified CV")
        
        splits = train_test_index_split_multi_stratified(stratifiers, k, seed)
    else:
        raise ValueError("split type not found")
    results = []
    for fold, (train_idx, test_idx) in enumerate(splits, 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        if model_fn is None:
            results.append({
                "fold": fold,
                "train_idx": train_idx, "test_idx": test_idx,
                "X_train": X_train, "y_train": y_train,
                "X_test": X_test, "y_test": y_test,
            })
        else:
            metrics = model_fn(X_train, y_train, X_test, y_test)  
            metrics["fold"] = fold
            results.append(metrics)
    return results

In [7]:
import pandas as pd
data = pd.read_csv("data/apps_all_background.csv")
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,prev_last_decision_status_internal_Canceled,prev_last_decision_status_internal_Refused,prev_last_decision_status_internal_Unused offer,most_recent_loan_active_external_group_Closed,most_recent_loan_active_external_group_Problematic,most_recent_loan_type_external_group_Business/Other,most_recent_loan_type_external_group_Consumer/Personal,most_recent_loan_type_external_group_Mortgage/Real estate,prev_last_decision_status_internal_group_Canceled,prev_last_decision_status_internal_group_Refused
0,384635,0,1,135000.0,1078200.0,34780.5,900000.0,0.010966,-14899,-4019,...,False,False,False,False,False,False,True,False,False,False
1,384638,0,0,180000.0,2013840.0,53253.0,1800000.0,0.035792,-23202,-12384,...,False,False,False,True,False,False,False,False,False,False
2,384639,0,0,157500.0,900000.0,26316.0,900000.0,0.035792,-20671,365243,...,True,False,False,False,False,False,True,False,True,False
3,384641,0,0,180000.0,490495.5,27387.0,454500.0,0.018029,-20168,-1161,...,False,False,False,False,False,False,True,False,False,False
4,384642,0,0,135000.0,508495.5,38146.5,454500.0,0.072508,-14048,-3569,...,False,False,False,True,False,False,True,False,False,False


In [8]:
data.columns

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       ...
       'prev_last_decision_status_internal_Canceled',
       'prev_last_decision_status_internal_Refused',
       'prev_last_decision_status_internal_Unused offer',
       'most_recent_loan_active_external_group_Closed',
       'most_recent_loan_active_external_group_Problematic',
       'most_recent_loan_type_external_group_Business/Other',
       'most_recent_loan_type_external_group_Consumer/Personal',
       'most_recent_loan_type_external_group_Mortgage/Real estate',
       'prev_last_decision_status_internal_group_Canceled',
       'prev_last_decision_status_internal_group_Refused'],
      dtype='object', length=226)

In [30]:
y = data["TARGET"]
X = data.drop(columns=["TARGET", "SK_ID_CURR"])
cross_validate(X, y, split="random")

[{'fold': 1,
  'train_idx': array([190417,  37011, 229432, ..., 131009, 129997,  75400]),
  'test_idx': array([182872, 188689,  62292, ..., 233989, 215798,  57681]),
  'X_train': array([[3, 157500.0, 550980.0, ..., False, False, False],
         [0, 225000.0, 904500.0, ..., False, False, False],
         [0, 112500.0, 81504.0, ..., False, False, False],
         ...,
         [3, 90000.0, 270000.0, ..., False, False, False],
         [0, 288000.0, 2250000.0, ..., False, False, False],
         [0, 153000.0, 462694.5, ..., False, False, True]], dtype=object),
  'y_train': array([0, 0, 0, ..., 0, 0, 0]),
  'X_test': array([[0, 270000.0, 693301.5, ..., False, False, True],
         [0, 90000.0, 284256.0, ..., False, False, False],
         [0, 72000.0, 76410.0, ..., False, False, False],
         ...,
         [0, 36000.0, 76410.0, ..., False, False, False],
         [3, 135000.0, 918468.0, ..., False, False, False],
         [0, 90000.0, 180000.0, ..., False, False, False]], dtype=object

## helper functions

In [9]:
def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc

In [10]:
# stratified split for inner validation (ONE FEATURE)
def inner_stratified_split(y, val_frac=0.2, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    val_idx = []
    for cls in np.unique(y):
        cls_idx = np.where(y==cls)[0]
        cls_idx = rng.permutation(cls_idx)
        cut = max(1, int(len(cls_idx)*val_frac))
        val_idx.extend(cls_idx[:cut])
    val_idx = np.array(val_idx)
    train_idx = np.setdiff1d(np.arange(len(y)), val_idx)
    return train_idx, val_idx

# stratified split for inner validation (MULTIPLE FEATURES)
def inner_stratified_split_multi(stratifiers, val_frac=0.2, seed=42):

    rng = np.random.default_rng(seed)
    S = np.asarray(stratifiers)
    if S.ndim == 1:
        S = S.reshape(-1, 1) 
    n = S.shape[0]
    S = S.astype('U64')  # safe uniform string type

    keys = S[:, 0]
    for j in range(1, S.shape[1]):
        keys = np.char.add(np.char.add(keys, '¦'), S[:, j])

    val_idx = []
    for k in np.unique(keys):
        stratum_idx = np.where(keys == k)[0]
        stratum_idx = rng.permutation(stratum_idx)

        cut = max(1, int(len(stratum_idx) * val_frac))
        if len(stratum_idx) > 1:
            cut = min(cut, len(stratum_idx) - 1)

        val_idx.extend(stratum_idx[:cut])

    val_idx = np.array(sorted(val_idx))
    train_idx = np.setdiff1d(np.arange(n), val_idx, assume_unique=False)
    return train_idx, val_idx

In [11]:
def best_threshold(y_true, y_prob, metric="f1"):
    grid = np.linspace(0, 1, 101)
    best_t, best_score = 0.5, -1.0
    for t in grid:
        y_pred = (y_prob >= t).astype(int)
        m = classification_metrics(y_true, y_pred)
        score = m[metric]
        if score > best_score:
            best_score = score
            best_t = t
    return float(best_t)

## modeling functions

In [12]:
# log regression
from sklearn.linear_model import LogisticRegression
def model_fn_lr(X_train, y_train, X_test, y_test, pca_arg=0.90, metric_for_thr="f1"):
    # inner train/val to tune for best prob threshold
    inner_train_idx, inner_val_idx = inner_stratified_split(y_train, val_frac=0.2, seed=42)
   
    # preprocessing
    steps = [("scale", StandardScaler())]
    if pca_arg is not None:
        steps.append(("pca", PCA(n_components=pca_arg)))
    preproc = Pipeline(steps)

    # tune and get best threshold on train/val set
    X_inner_train = preproc.fit_transform(X_train[inner_train_idx])
    X_inner_val = preproc.transform(X_train[inner_val_idx])
    clf = LogisticRegression(class_weight="balanced", max_iter=200)
    clf.fit(X_inner_train, y_train[inner_train_idx])
    y_prob_val = clf.predict_proba(X_inner_val)[:, 1]
    best_thr = best_threshold(y_train[inner_val_idx], y_prob_val, metric=metric_for_thr)

    # refit model on outer train/test 
    X_outer_train = preproc.fit_transform(X_train)
    X_outer_test = preproc.transform(X_test)
    clf.fit(X_outer_train, y_train)

    y_prob_test = clf.predict_proba(X_outer_test)[:, 1]
    y_pred_test = (y_prob_test >= best_thr).astype(int)
    
    m = classification_metrics(y_test, y_pred_test)
    m["roc_auc"] = roc_auc_from_probs(y_test, y_prob_test)
    m["model"] = "LR"
    m["pca"] = pca_arg
    m["best_thr"] = best_thr
    return m


## test cross val

To run cross-validation:
1. Write a model function (ex: model_fn_lr) taking in (X_train, y_train, X_test, y_test) and return metrics like accuracy, F1, ROC-AUC, etc.
2. Pass it into cross_validate() using a lambda to pass in the params needed for ur model function.

In [35]:
lr_results = cross_validate(
    X.values, y.values,
    k=5, split="stratified", seed=42,
    model_fn=lambda X_train, y_train, X_test, y_test : model_fn_lr(X_train, y_train, X_test, y_test, pca_arg=0.90)
)

In [None]:
lr_results

[{'tp': 2168,
  'tn': 49143,
  'fp': 7341,
  'fn': 2793,
  'acc': 0.8350720156237286,
  'prec': 0.22799453149647703,
  'rec': 0.43700866760733725,
  'f1': 0.29965445749827224,
  'roc_auc': 0.7506848439426564,
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.65,
  'fold': 1},
 {'tp': 2258,
  'tn': 48710,
  'fp': 7773,
  'fn': 2703,
  'acc': 0.8295032875463837,
  'prec': 0.22510218323198086,
  'rec': 0.4551501713364241,
  'f1': 0.3012273212379936,
  'roc_auc': 0.754015713443531,
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.64,
  'fold': 2},
 {'tp': 2019,
  'tn': 50186,
  'fp': 6297,
  'fn': 2942,
  'acc': 0.8496354404010156,
  'prec': 0.2427849927849928,
  'rec': 0.40697440032251564,
  'f1': 0.30413497024930336,
  'roc_auc': 0.7602001559082929,
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.67,
  'fold': 3},
 {'tp': 2360,
  'tn': 48140,
  'fp': 8343,
  'fn': 2601,
  'acc': 0.8218865959247444,
  'prec': 0.22049892553489675,
  'rec': 0.47571054222938924,
  'f1': 0.3013278855975485,
  'roc_

# Make Folds and Holdout Set

For each cross-validation method, split into data for cross-validation and holdout data for testing. Then, split the cross-validation data into folds. Make a new column assigning these folds so we all use the same folds for fair comparison across model developement processes. 

In [13]:
# read in data
data = pd.read_csv("data/apps_all_background.csv")
data.drop(columns=['neighbors_target_mean_500'], inplace=True) # remove this column because of data leakage concerns

In [14]:
# random holdout set 
train_rand_idx = np.random.choice(data.index, size=int(0.8*len(data)), replace=False)
test_rand_idx = np.setdiff1d(data.index, train_rand_idx)
train_rand_data = data.loc[train_rand_idx]
test_rand_data = data.loc[test_rand_idx]

# assign folds
splits_rand = train_test_index_split_random(len(train_rand_data), k=5, seed=42)
folds_rand = np.zeros(len(train_rand_data), dtype=int)
for fold_num, (train_idx, test_idx) in enumerate(splits_rand, 1):
    folds_rand[test_idx] = fold_num
train_rand_data = train_rand_data.reset_index(drop=True)
train_rand_data["fold"] = folds_rand
train_rand_data.shape, test_rand_data.shape

((245776, 226), (61444, 225))

In [15]:
# stratified holdout set
train_strat, test_strat = inner_stratified_split(data["TARGET"], val_frac=0.2, seed=42)
train_strat_data, test_strat_data = data.iloc[train_strat], data.iloc[test_strat]

# assign folds
splits_strat = train_test_index_split_stratified(train_strat_data["TARGET"], k=5, seed=42)
folds_strat = np.zeros(len(train_strat_data), dtype=int)
for fold_num, (train_idx, test_idx) in enumerate(splits_strat, 1):
    folds_strat[test_idx] = fold_num
train_strat_data = train_strat_data.reset_index(drop=True)
train_strat_data["fold"] = folds_strat

# verify that the stratification worked
# Overall proportions
print("Overall class distribution:")
print(train_strat_data["TARGET"].value_counts(normalize=True))

# Distribution per fold
print("\nDistribution by fold:")
print(train_strat_data.groupby("fold")["TARGET"].value_counts(normalize=True).unstack().round(3))


Overall class distribution:
TARGET
0    0.91926
1    0.08074
Name: proportion, dtype: float64

Distribution by fold:
TARGET      0      1
fold                
1       0.919  0.081
2       0.919  0.081
3       0.919  0.081
4       0.919  0.081
5       0.919  0.081


In [16]:
# multiple stratified holdout set
train_multi_strat, test_multi_strat = inner_stratified_split_multi(data[['TARGET', 'CODE_GENDER_M']], val_frac=0.2, seed=42)
train_multi_strat_data, test_multi_strat_data = data.iloc[train_multi_strat], data.iloc[test_multi_strat]
train_multi_strat_data.shape, test_multi_strat_data.shape

# assign folds
splits_multi_strat = train_test_index_split_multi_stratified(train_multi_strat_data[["TARGET", "CODE_GENDER_M"]], k=5, seed=42)
folds_multi_strat = np.zeros(len(train_multi_strat_data), dtype=int)
for fold_num, (train_idx, test_idx) in enumerate(splits_multi_strat, 1):
    folds_multi_strat[test_idx] = fold_num
train_multi_strat_data = train_multi_strat_data.reset_index(drop=True)
train_multi_strat_data["fold"] = folds_multi_strat

# verify that multiple stratification worked
print("Overall class distribution:")
print(train_multi_strat_data["TARGET"].value_counts(normalize=True))
print("\nOverall gender distribution:")
print(train_multi_strat_data["CODE_GENDER_M"].value_counts(normalize=True))

# distribution per fold
print("\nDistribution by fold:")
print(train_multi_strat_data.groupby("fold")["TARGET"].value_counts(normalize=True).unstack().round(3))
print(train_multi_strat_data.groupby("fold")["CODE_GENDER_M"].value_counts(normalize=True).unstack().round(3))


Overall class distribution:
TARGET
0    0.91926
1    0.08074
Name: proportion, dtype: float64

Overall gender distribution:
CODE_GENDER_M
False    0.658338
True     0.341662
Name: proportion, dtype: float64

Distribution by fold:
TARGET      0      1
fold                
1       0.919  0.081
2       0.919  0.081
3       0.919  0.081
4       0.919  0.081
5       0.919  0.081
CODE_GENDER_M  False  True 
fold                       
1              0.658  0.342
2              0.658  0.342
3              0.658  0.342
4              0.658  0.342
5              0.658  0.342


In [17]:
train_rand_data.to_csv('data/apps_cv_random.csv', index=False)
test_rand_data.to_csv('data/apps_holdout_random.csv', index=False)
train_strat_data.to_csv('data/apps_cv_strat.csv', index=False)
test_strat_data.to_csv('data/apps_holdout_strat.csv', index=False)
train_multi_strat_data.to_csv('data/apps_cv_multi.csv', index=False)
test_multi_strat_data.to_csv('data/apps_holdout_multi.csv', index=False)