In [24]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# Metrics

In [25]:
def classification_metrics(y_true, y_pred):
    """
    computes conf matrix + acc, prec, rec, and f1
    
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # conf matrix
    tp = np.sum((y_true==1) & (y_pred==1))
    tn = np.sum((y_true==0) & (y_pred==0))
    fp = np.sum((y_true==0) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))

    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2*prec*rec / max((prec+rec), 1e-12)) if (prec+rec)>0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics


def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapz(tpr, fpr)
    return auc

# LDA Cross Validation

In [26]:
def cv_lda(data, feature_cols, target_col, params = None):
    if params == None:
        params = {}
    
    fold_metrics = []
    for f in data.fold.unique():

        #split data into train and test splits based on folds
        train = data[data.fold != f]
        test = data[data.fold == f]
        X_train, y_train = train[feature_cols], train[target_col]
        X_test, y_test = test[feature_cols], test[target_col]

        pca = PCA(n_components = 0.95)
        pca.fit(X_train)

        lda_model = LinearDiscriminantAnalysis()
        lda_model.fit(X_train, y_train)
        y_pred = lda_model.predict(X_test)

        y_prob = lda_model.predict_proba(X_test)[:, 1]
        y_train_prob = lda_model.predict_proba(X_train)[:, 1]

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)

    return pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)

        


# Import Data

In [27]:
apps_cv_strat = pd.read_csv("data/apps_cv_strat.txt")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.txt")
target_col = 'TARGET'
feature_cols = [col for col in apps_cv_strat.columns if col not in 
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500']]

# Results

In [28]:
results = cv_lda(apps_cv_strat, feature_cols, target_col, params = None)
results

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,193,44973,214,3776,0.91883,0.521945,0.474201,0.048627,0.995264,0.088208,0.763468,0.763918,1
1,49156,198,44965,222,3771,0.918769,0.522487,0.471429,0.049887,0.995087,0.090226,0.761945,0.76509,2
2,49156,188,44941,246,3781,0.918077,0.520962,0.43318,0.047367,0.994556,0.085396,0.762443,0.764113,3
3,49155,195,44947,239,3774,0.91836,0.521921,0.449309,0.049131,0.994711,0.088576,0.763293,0.763995,4
4,49154,205,44960,226,3763,0.918847,0.523331,0.475638,0.051663,0.994998,0.093203,0.759434,0.765144,5


# Holdout Evaluation

In [30]:
X_train, y_train = apps_cv_strat[feature_cols], apps_cv_strat[target_col]
X_test, y_test = apps_holdout_strat[feature_cols], apps_holdout_strat[target_col]

pca = PCA(n_components = 0.95)
pca.fit(X_train)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

y_prob = lda.predict_proba(X_test)[:, 1]
y_train_prob = lda.predict_proba(X_train)[:, 1]
y_pred = lda.predict(X_test)

metrics = classification_metrics(y_test, y_pred)
metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
metrics

  auc = np.trapz(tpr, fpr)


{'n': 61443,
 'tp': np.int64(239),
 'tn': np.int64(56200),
 'fp': np.int64(283),
 'fn': np.int64(4721),
 'acc': np.float64(0.9185586641277281),
 'bal_acc': np.float64(0.5215875633861858),
 'prec': np.float64(0.4578544061302682),
 'rec': np.float64(0.048185483870967744),
 'spec': np.float64(0.994989642901404),
 'f1': np.float64(0.08719445457862095),
 'roc_auc': np.float64(0.7647177383660398),
 'train_roc_auc': np.float64(0.764138868194914)}

# Data Leakage Check

In [31]:
shuffled = apps_cv_strat.copy()
shuffled['TARGET'] = np.random.permutation(shuffled['TARGET'].values)
fold_results_shuffled = cv_lda(shuffled, feature_cols, target_col, params = None)
print("Shuffled mean AUC:", fold_results_shuffled.roc_auc.mean())

  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)
  auc = np.trapz(tpr, fpr)


Shuffled mean AUC: 0.4974010862544927
