In [23]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

### Metrics

In [57]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    metrics = {
        "tp":tp, "tn": tn, "fp":fp, "fn":fn, "acc":acc, "prec":prec, "rec": rec, "f1":f1
    }
    return metrics

### Test, Train, Validation

Test, Training, and Validation Sets
- A completely randomly sampled split
 							
- A stratified split
 							
- A split that is chosen in a non-random way, so that your test and/or validation sets can be considered to more accurately represent the data that will be seen when the system is deployed


In [58]:
# 1. randomly sampled split
def train_test_index_split_random(n, k, seed=42):
    rng = np.random.default_rng(seed)
    idx = np.arange(n)
    rng.shuffle(idx)
    folds = np.array_split(idx, k)
    return [
        (np.concatenate(folds[:i] + folds[i+1:]), folds[i])
        for i in range(k)
    ]

In [59]:
#2. stratified split
def train_test_index_split_stratified(y, k, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    folds = [[] for _ in range(k)]
    # distribute classes evenly across folds
    for cls in np.unique(y):
        indexes = rng.permutation(np.where(y == cls)[0])
        split = np.array_split(indexes, k)
        for i in range(k):
            folds[i].extend(split[i])
    splits = []
    for i in range(k):
        test_idx = np.array(folds[i])
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((train_idx, test_idx))
    return splits

In [60]:
#TODO: the nonrandom split

## cross val

In [61]:
import numpy as np

def cross_validate(X, y, k=5, split="stratified", seed=42, model_fn=None):
    X = np.asarray(X)
    y = np.asarray(y)

    # split method
    if split == "stratified":
        splits = train_test_index_split_stratified(y, k, seed)
    elif split == "random":
        splits = train_test_index_split_random(len(y), k, seed)
    elif split == "nonrandom":
        pass
    else:
        raise ValueError("split type not found")

    results = []
    for fold, (train_idx, test_idx) in enumerate(splits, 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        if model_fn is None:
            results.append({
                "fold": fold,
                "train_idx": train_idx, "test_idx": test_idx,
                "X_train": X_train, "y_train": y_train,
                "X_test": X_test, "y_test": y_test,
            })
        else:
            # TODO: create model_fn for our four methods
            metrics = model_fn(X_train, y_train, X_test, y_test)  
            metrics["fold"] = fold
            results.append(metrics)
    return results

In [62]:
import pandas as pd
data = pd.read_csv("data/apps_all_background.csv")
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,prev_last_decision_status_internal_Canceled,prev_last_decision_status_internal_Refused,prev_last_decision_status_internal_Unused offer,most_recent_loan_active_external_group_Closed,most_recent_loan_active_external_group_Problematic,most_recent_loan_type_external_group_Business/Other,most_recent_loan_type_external_group_Consumer/Personal,most_recent_loan_type_external_group_Mortgage/Real estate,prev_last_decision_status_internal_group_Canceled,prev_last_decision_status_internal_group_Refused
0,384635,0,1,135000.0,1078200.0,34780.5,900000.0,0.010966,-14899,-4019,...,False,False,False,False,False,False,True,False,False,False
1,384638,0,0,180000.0,2013840.0,53253.0,1800000.0,0.035792,-23202,-12384,...,False,False,False,True,False,False,False,False,False,False
2,384639,0,0,157500.0,900000.0,26316.0,900000.0,0.035792,-20671,365243,...,True,False,False,False,False,False,True,False,True,False
3,384641,0,0,180000.0,490495.5,27387.0,454500.0,0.018029,-20168,-1161,...,False,False,False,False,False,False,True,False,False,False
4,384642,0,0,135000.0,508495.5,38146.5,454500.0,0.072508,-14048,-3569,...,False,False,False,True,False,False,True,False,False,False


In [89]:
data.columns

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       ...
       'prev_last_decision_status_internal_Canceled',
       'prev_last_decision_status_internal_Refused',
       'prev_last_decision_status_internal_Unused offer',
       'most_recent_loan_active_external_group_Closed',
       'most_recent_loan_active_external_group_Problematic',
       'most_recent_loan_type_external_group_Business/Other',
       'most_recent_loan_type_external_group_Consumer/Personal',
       'most_recent_loan_type_external_group_Mortgage/Real estate',
       'prev_last_decision_status_internal_group_Canceled',
       'prev_last_decision_status_internal_group_Refused'],
      dtype='object', length=226)

In [93]:
y = data["TARGET"]
X = data.drop(columns=["TARGET", "SK_ID_CURR"])
cross_validate(X, y, split="random")

[{'fold': 1,
  'train_idx': array([190417,  37011, 229432, ..., 131009, 129997,  75400],
        shape=(245776,)),
  'test_idx': array([182872, 188689,  62292, ..., 233989, 215798,  57681],
        shape=(61444,)),
  'X_train': array([[0, 112500.0, 450000.0, ..., False, False, False],
         [1, 135000.0, 450000.0, ..., False, True, False],
         [0, 157500.0, 540000.0, ..., False, True, False],
         ...,
         [1, 450000.0, 983299.5, ..., False, False, False],
         [0, 54000.0, 76410.0, ..., False, False, False],
         [0, 360000.0, 983160.0, ..., False, True, False]],
        shape=(245776, 224), dtype=object),
  'y_train': array([0, 0, 0, ..., 0, 0, 0], shape=(245776,)),
  'X_test': array([[0, 180000.0, 296280.0, ..., False, True, False],
         [0, 90000.0, 474048.0, ..., False, False, False],
         [0, 180000.0, 144801.0, ..., False, False, False],
         ...,
         [1, 90000.0, 270000.0, ..., False, False, False],
         [2, 180000.0, 497520.0, ...,

## helper functions

In [75]:
def to_float(X):
    X = np.asarray(X, dtype=float)
    np.nan_to_num(X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    return X

In [76]:
from sklearn.metrics import roc_auc_score
def roc_auc_from_probs(y_true, y_prob):
    try:
        return float(roc_auc_score(y_true, y_prob))
    except ValueError:
        return float("nan")

In [81]:
def inner_stratified_split(y, val_frac=0.2, seed=42):
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    val_idx = []
    for cls in np.unique(y):
        cls_idx = np.where(y==cls)[0]
        cls_idx = rng.permutation(cls_idx)
        cut = max(1, int(len(cls_idx)*val_frac))
        val_idx.extend(cls_idx[:cut])
    val_idx = np.array(val_idx)
    train_idx = np.setdiff1d(np.arange(len(y)), val_idx)
    return train_idx, val_idx

In [91]:
def best_threshold(y_true, y_prob, metric="f1"):
    grid = np.linspace(0, 1, 101)
    best_t, best_score = 0.5, -1.0
    for t in grid:
        y_pred = (y_prob >= t).astype(int)
        m = classification_metrics(y_true, y_pred)
        score = m[metric]
        if score > best_score:
            best_score = score
            best_t = t
    return float(best_t)

## modeling functions

In [85]:
# log regression
from sklearn.linear_model import LogisticRegression
def model_fn_lr(X_train, y_train, X_test, y_test, pca_arg=0.90, metric_for_thr="f1"):
    # inner train/val to tune for best prob threshold
    inner_train_idx, inner_val_idx = inner_stratified_split(y_train, val_frac=0.2, seed=42)
   
    # preprocessing
    steps = [("scale", StandardScaler())]
    if pca_arg is not None:
        steps.append(("pca", PCA(n_components=pca_arg)))
    preproc = Pipeline(steps)

    # tune and get best threshold on train/val set
    X_inner_train = preproc.fit_transform(X_train[inner_train_idx])
    X_inner_val = preproc.transform(X_train[inner_val_idx])
    clf = LogisticRegression(class_weight="balanced", max_iter=200)
    clf.fit(X_inner_train, y_train[inner_train_idx])
    y_prob_val = clf.predict_proba(X_inner_val)[:, 1]
    best_thr = best_threshold(y_train[inner_val_idx], y_prob_val, metric=metric_for_thr)

    # refit model on outer train/test 
    X_outer_train = preproc.fit_transform(X_train)
    X_outer_test = preproc.transform(X_test)
    clf.fit(X_outer_train, y_train)

    y_prob_test = clf.predict_proba(X_outer_test)[:, 1]
    y_pred_test = (y_prob_test >= best_thr).astype(int)
    
    m = classification_metrics(y_test, y_pred_test)
    m["roc_auc"] = roc_auc_from_probs(y_test, y_prob_test)
    m["model"] = "LR"
    m["pca"] = pca_arg
    m["best_thr"] = best_thr
    return m


## run cross val

To run cross-validation:
1. Write a model function (ex: model_fn_lr) taking in (X_train, y_train, X_test, y_test) and return metrics like accuracy, F1, ROC-AUC, etc.
2. Pass it into cross_validate() using a lambda to pass in the params needed for ur model function.

In [None]:
lr_results = cross_validate(
    X.values, y.values,
    k=5, split="stratified", seed=42,
    model_fn=lambda X_train, y_train, X_test, y_test : model_fn_lr(X_train, y_train, X_test, y_test, pca_arg=0.90)
)

In [90]:
lr_results

[{'tp': np.int64(2081),
  'tn': np.int64(49649),
  'fp': np.int64(6835),
  'fn': np.int64(2880),
  'acc': np.float64(0.8418911221417528),
  'prec': np.float64(0.23340062808434275),
  'rec': np.float64(0.4194718806692199),
  'f1': np.float64(0.2999207321467176),
  'roc_auc': 0.7504966363154879,
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.66,
  'fold': 1},
 {'tp': np.int64(2253),
  'tn': np.int64(48700),
  'fp': np.int64(7783),
  'fn': np.int64(2708),
  'acc': np.float64(0.829259162814921),
  'prec': np.float64(0.2244918294141092),
  'rec': np.float64(0.4541423100181415),
  'f1': np.float64(0.3004600920184037),
  'roc_auc': 0.7535783270050274,
  'model': 'LR',
  'pca': 0.9,
  'best_thr': 0.64,
  'fold': 2},
 {'tp': np.int64(2034),
  'tn': np.int64(50177),
  'fp': np.int64(6306),
  'fn': np.int64(2927),
  'acc': np.float64(0.8497330902936007),
  'prec': np.float64(0.24388489208633093),
  'rec': np.float64(0.40999798427736345),
  'f1': np.float64(0.3058416660401474),
  'roc_auc': 0.7602