# Tuning an LR Model

copied from `xgb_fitting.ipynb`

The goal of this notebook is to train and evaluate an LR model, comparing its performance on a holdout set against other types of models (LDA, XGBoost).

To ensure reproducibility and consistent evaluation across models, all datasets were **pre-split into cross-val data and holdout data** as below:

| Split type           | CV training file     | Holdout file              | Description                              |
| -------------------- | -------------------- | ------------------------- | ---------------------------------------- |
| **Random**           | `apps_cv_random.csv` | `apps_holdout_random.csv` | Simple random sampling                   |
| **Stratified**       | `apps_cv_strat.csv`  | `apps_holdout_strat.csv`  | Stratified by `TARGET`                   |
| **Multi-Stratified** | `apps_cv_multi.csv`  | `apps_holdout_multi.csv`  | Stratified by `TARGET` + `CODE_GENDER_M` |

Each dataset for cross-validation (`apps_cv_*.csv`) also contains a column, `fold`, with pre-assigned folds from 1-5 using the corresponding splitting method to ensure consistent evaluation. Therefore, no additional splitting is needed inside this notebook; cross-validation simply loops through the pre-assigned folds.


## Evaluation Functions

#### Metric calculators:

Copied from `cross_val.ipynb`

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [2]:
# METRICS 

def classification_metrics(y_true, y_pred):
    """
    Confusion-matrix counts and the threshold metrics derived from them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 count as positive, entries equal to 0 as negative.
    y_pred : array-like
        Predicted labels, compared against 1 and 0 in the same way.

    Returns
    -------
    dict
        Keys, in order: "n", "tp", "tn", "fp", "fn", "acc", "bal_acc", "prec", "rec", "spec", "f1".
        Every ratio uses a denominator floored at 1, so an empty class yields 0 instead of
        a division error.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    actual_pos, actual_neg = (actual == 1), (actual == 0)
    pred_pos, pred_neg = (predicted == 1), (predicted == 0)

    # Confusion matrix cells
    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)

    def _safe_ratio(numerator, denominator):
        # Floor the denominator at 1 so a zero count gives 0 rather than raising.
        return numerator / max(denominator, 1)

    acc = _safe_ratio(tp + tn, tp + tn + fp + fn)
    prec = _safe_ratio(tp, tp + fp)
    rec = _safe_ratio(tp, tp + fn)    # sensitivity / true positive rate
    spec = _safe_ratio(tn, tn + fp)   # true negative rate

    # F1 is defined as 0 when precision and recall are both 0.
    prec_plus_rec = prec + rec
    if prec_plus_rec > 0:
        f1 = 2 * prec * rec / max(prec_plus_rec, 1e-12)
    else:
        f1 = 0.0

    # Balanced accuracy: mean of recall and specificity.
    bal_acc = 0.5 * (rec + spec)

    return {
        "n": len(actual),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc": acc,
        "bal_acc": bal_acc,
        "prec": prec,
        "rec": rec,
        "spec": spec,
        "f1": f1,
    }

def roc_auc_from_probs(y_true, y_prob):
    """
    Area under the ROC curve, computed from predicted scores with the trapezoidal rule.

    Rows that share the same score are treated as a single threshold, so the result
    does not depend on the order in which tied rows appear in the input.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives, everything else is
        counted as a negative.
    y_prob : array-like
        Predicted score for the positive class (higher = more likely positive).
        Lists, NumPy arrays and pandas Series are all accepted.

    Returns
    -------
    float
        ROC AUC, or NaN when either class is absent (the curve is undefined).

    Raises
    ------
    ValueError
        If `y_true` and `y_prob` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    # Rank rows from highest to lowest score (stable sort keeps the result deterministic).
    order = np.argsort(-y_prob, kind="mergesort")
    scores = y_prob[order]
    is_pos = (y_true[order] == 1)

    pos = int(is_pos.sum())
    neg = int(is_pos.size - pos)
    if pos == 0 or neg == 0:
        # TPR or FPR has a zero denominator: AUC is undefined.
        return np.nan

    # One ROC point per distinct score: keep the last row of every run of equal scores.
    group_ends = np.r_[np.flatnonzero(np.diff(scores)), scores.size - 1]
    tpr = np.r_[0.0, np.cumsum(is_pos)[group_ends] / pos]
    fpr = np.r_[0.0, np.cumsum(~is_pos)[group_ends] / neg]

    # Trapezoidal rule written out explicitly (np.trapz is deprecated in NumPy 2.0).
    auc = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)
    return auc

def pr_auc_from_probs(y_true, y_prob):
    """
    Area under the precision-recall curve, computed with the trapezoidal rule.

    The curve starts at the conventional anchor (recall=0, precision=1) and has one
    point per distinct score, so rows with tied scores contribute a single threshold
    and the result does not depend on their order in the input.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives.
    y_prob : array-like
        Predicted score for the positive class (higher = more likely positive).
        Lists, NumPy arrays and pandas Series are all accepted.

    Returns
    -------
    float
        PR AUC, or NaN when there are no positives (recall is undefined).

    Raises
    ------
    ValueError
        If `y_true` and `y_prob` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    # Rank rows from highest to lowest score (stable sort keeps the result deterministic).
    order = np.argsort(-y_prob, kind="mergesort")
    scores = y_prob[order]
    is_pos = (y_true[order] == 1)

    pos = int(is_pos.sum())
    if pos == 0:
        return np.nan

    # One curve point per distinct score: keep the last row of every run of equal scores.
    group_ends = np.r_[np.flatnonzero(np.diff(scores)), scores.size - 1]
    tp_at_cut = np.cumsum(is_pos)[group_ends]
    n_predicted_pos = group_ends + 1  # rows ranked at or above each cut

    precision = np.r_[1.0, tp_at_cut / n_predicted_pos]  # precision is 1 at recall=0
    recall = np.r_[0.0, tp_at_cut / pos]

    # Trapezoidal rule written out explicitly (np.trapz is deprecated in NumPy 2.0).
    auc_pr = np.sum(np.diff(recall) * (precision[1:] + precision[:-1]) / 2.0)
    return auc_pr

In [3]:
def cv_lr(data, feature_cols, target_col, params=None):
    """
    Cross-validate a scale -> PCA -> logistic-regression pipeline on pre-assigned folds.

    For every distinct value of `data.fold`, the pipeline is fitted on all other
    folds and scored on the held-out fold.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a `fold` column plus `feature_cols` and `target_col`.
    feature_cols : list
        Columns used as model inputs.
    target_col : str
        Name of the label column.
    params : dict, optional
        Keyword arguments forwarded to `LogisticRegression`. When None, the
        estimator is built with no arguments (library defaults).

    Returns
    -------
    pd.DataFrame
        One row per fold, sorted by "fold": the `classification_metrics` fields
        plus "roc_auc" (held-out fold), "train_roc_auc" (training folds) and "fold".
    """
    lr_kwargs = {} if params is None else params

    per_fold_rows = []
    for fold_id in sorted(data.fold.unique()):
        # Held-out fold vs. everything else
        is_heldout = data.fold == fold_id
        fit_part = data[data.fold != fold_id]
        eval_part = data[is_heldout]

        X_fit, y_fit = fit_part[feature_cols], fit_part[target_col]
        X_eval, y_eval = eval_part[feature_cols], eval_part[target_col]

        # Scaler and PCA are fitted inside the fold, on the training part only.
        model = Pipeline(steps=[
            ('scale', StandardScaler()),
            ('pca', PCA(n_components=0.95)),
            ('lr', LogisticRegression(**lr_kwargs)),
        ])
        model.fit(X_fit, y_fit)

        eval_labels = model.predict(X_eval)
        eval_scores = model.predict_proba(X_eval)[:, 1]
        fit_scores = model.predict_proba(X_fit)[:, 1]

        row = classification_metrics(y_eval, eval_labels)
        row.update({
            "roc_auc": roc_auc_from_probs(y_eval, eval_scores),
            "train_roc_auc": roc_auc_from_probs(y_fit, fit_scores),
            "fold": int(fold_id),
        })
        per_fold_rows.append(row)

    fold_table = pd.DataFrame(per_fold_rows)
    return fold_table.sort_values("fold").reset_index(drop=True)

## setup

In [4]:
# Stratified split: cross-validation data (with a pre-assigned `fold` column) and holdout data.
target_col = 'TARGET'
apps_cv_strat = pd.read_csv("data/apps_cv_strat.csv")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.csv")

# Model inputs = every column except the label, the row id, the fold assignment and
# `neighbors_target_mean_500`.
# NOTE(review): presumably `neighbors_target_mean_500` is derived from TARGET and is left
# out to avoid leakage -- confirm.
feature_cols = [
    col
    for col in apps_cv_strat.columns
    if col not in {target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500'}
]

In [5]:
# NOTE(review): DataFrame.drop returns a new frame and the result is only displayed here,
# not assigned. apps_cv_strat is unchanged and feature_cols (built in the previous cell)
# still contains these three columns -- confirm whether they were meant to be excluded
# from the model inputs.
apps_cv_strat.drop(labels=["CODE_GENDER_XNA", "CODE_GENDER_M", "AGE_INT"], axis="columns")

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,prev_last_decision_status_internal_Refused,prev_last_decision_status_internal_Unused offer,most_recent_loan_active_external_group_Closed,most_recent_loan_active_external_group_Problematic,most_recent_loan_type_external_group_Business/Other,most_recent_loan_type_external_group_Consumer/Personal,most_recent_loan_type_external_group_Mortgage/Real estate,prev_last_decision_status_internal_group_Canceled,prev_last_decision_status_internal_group_Refused,fold
0,384638,0,0,180000.0,2013840.0,53253.0,1800000.0,0.035792,-23202,-12384,...,False,False,True,False,False,False,False,False,False,5
1,384641,0,0,180000.0,490495.5,27387.0,454500.0,0.018029,-20168,-1161,...,False,False,False,False,False,True,False,False,False,3
2,384642,0,0,135000.0,508495.5,38146.5,454500.0,0.072508,-14048,-3569,...,False,False,True,False,False,True,False,False,False,4
3,384645,1,0,180000.0,225000.0,20637.0,225000.0,0.010006,-11025,-4194,...,True,False,False,False,False,True,False,False,True,1
4,384647,0,0,180000.0,405000.0,19611.0,405000.0,0.007330,-18097,-705,...,False,False,False,False,False,True,False,False,False,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245772,384117,0,0,270000.0,219042.0,21793.5,193500.0,0.046220,-17988,-2547,...,False,False,False,False,False,True,False,False,False,1
245773,383015,0,1,144000.0,900000.0,45850.5,900000.0,0.026392,-12468,-132,...,False,False,True,False,False,True,False,False,False,3
245774,383791,0,0,157500.0,202500.0,10125.0,202500.0,0.024610,-16301,-8166,...,False,False,False,False,False,True,False,False,False,2
245775,383885,0,0,157500.0,1350000.0,67500.0,1350000.0,0.031329,-22246,-7279,...,False,False,False,False,False,False,False,False,False,1


In [12]:
# Baseline CV run: `params` is omitted, so cv_lr builds LogisticRegression with no arguments.
results_lr = cv_lr(
    data=apps_cv_strat,
    feature_cols=feature_cols,
    target_col=target_col,
)

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

In [13]:
results_lr

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,68,45128,59,3901,0.91944,0.507914,0.535433,0.017133,0.998694,0.033203,0.757047,0.756912,1
1,49156,74,45117,70,3895,0.919338,0.508548,0.513889,0.018644,0.998451,0.035983,0.756781,0.75699,2
2,49156,80,45121,66,3889,0.919542,0.509348,0.547945,0.020156,0.998539,0.038882,0.756078,0.75711,3
3,49155,73,45117,69,3896,0.919337,0.508433,0.514085,0.018393,0.998473,0.035514,0.755285,0.757154,4
4,49154,76,45132,54,3892,0.919722,0.508979,0.584615,0.019153,0.998805,0.037091,0.752719,0.75822,5


## todo: grid search/ hyperparam tuning

In [32]:
from sklearn.model_selection import ParameterGrid
import pandas as pd
import numpy as np

def grid_search_lr(data, feature_cols, target_col, param_grid):
    """
    Exhaustive grid search over LogisticRegression settings, scored with `cv_lr`.

    Parameters
    ----------
    data, feature_cols, target_col
        Passed through unchanged to `cv_lr`.
    param_grid : dict or list of dict
        Grid specification expanded with `ParameterGrid`; each combination is passed
        to `cv_lr` as `params`.

    Returns
    -------
    (pd.DataFrame, dict)
        A table with one row per combination (the parameter values plus "mean_auc",
        "std_auc", "mean_recall", "mean_f1", "mean_acc", "mean_bal_acc"), sorted by
        "mean_auc" descending, and the combination with the highest mean ROC AUC.
        Ties keep the combination seen first.
    """
    records = []
    top_auc, top_params = -np.inf, None

    for combo_idx, combo in enumerate(ParameterGrid(param_grid), start=1):
        fold_table = cv_lr(data, feature_cols, target_col, params=combo)
        auc_mean = fold_table["roc_auc"].mean()
        auc_std = fold_table["roc_auc"].std()

        record = dict(combo)
        record.update({
            "mean_auc": auc_mean,
            "std_auc": auc_std,
            "mean_recall": fold_table["rec"].mean(),
            "mean_f1": fold_table["f1"].mean(),
            "mean_acc": fold_table["acc"].mean(),
            "mean_bal_acc": fold_table["bal_acc"].mean(),
        })
        records.append(record)

        # Strict '>' so an equal score does not displace the earlier combination.
        if auc_mean > top_auc:
            top_auc, top_params = auc_mean, combo

        # Progress line every fifth combination.
        if combo_idx % 5 == 0:
            print(f"{combo_idx} combos | best AUC={top_auc:.4f} | best={top_params}")

    ranked = pd.DataFrame(records)
    ranked = ranked.sort_values("mean_auc", ascending=False).reset_index(drop=True)
    return ranked, top_params

In [33]:
# Search space for grid_search_lr: only C and the solver vary; the remaining
# settings are single-valued and therefore fixed for every combination.
param_grid = dict(
    penalty=["l2"],
    C=[0.01, 0.1, 1, 10, 100],
    solver=["lbfgs", "saga"],
    class_weight=["balanced"],
    max_iter=[500],
)

In [None]:
# Run the full grid (one cv_lr call per combination) and show the top rows with the winner.
lr_grid_results, best_lr_params = grid_search_lr(
    data=apps_cv_strat,
    feature_cols=feature_cols,
    target_col=target_col,
    param_grid=param_grid,
)
(lr_grid_results.head(), best_lr_params)

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

5 combos | best AUC=0.7572 | best={'C': 1, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


In [None]:
lr_grid_results

## holdout set

In [6]:
# Final model: fit on the whole cross-validation set, evaluate once on the holdout set.
X_train = apps_cv_strat[feature_cols]
y_train = apps_cv_strat[target_col]
X_test = apps_holdout_strat[feature_cols]
y_test = apps_holdout_strat[target_col]

# LogisticRegression settings for the final fit (also reused by the leakage check below).
params = dict(
    penalty="l2",
    C=1.0,
    solver="lbfgs",
    class_weight="balanced",
    max_iter=500,
)

# Same scale -> PCA(95% variance) -> LR steps that cv_lr builds for each fold.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
    ("lr", LogisticRegression(**params)),
])
pipe.fit(X_train, y_train)

# Hard labels and positive-class probabilities for holdout; probabilities for train.
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]
y_train_prob = pipe.predict_proba(X_train)[:, 1]


In [7]:
# Holdout scorecard as a one-row frame: threshold metrics plus ROC AUC (holdout and
# train) and PR AUC (holdout).
metrics = pd.DataFrame([{
    **classification_metrics(y_test, y_pred),
    'roc_auc': roc_auc_from_probs(y_test, y_prob),
    'train_roc_auc': roc_auc_from_probs(y_train, y_train_prob),
    'pr_auc': pr_auc_from_probs(y_test, y_prob),
}])

In [8]:
metrics

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,pr_auc
0,61443,3459,39615,16868,1501,0.70104,0.69937,0.170168,0.697379,0.701361,0.273579,0.765293,0.758341,0.238345


## leakage check

In [15]:
# Label-permutation sanity check: with the labels shuffled, the features carry no
# information about the target, so the cross-validated AUC should be close to 0.5.
# A clearly higher value would point to leakage in the evaluation loop.
# The permutation is seeded so the cell gives the same number on every run, and the
# label column is addressed through `target_col` (the column cv_lr actually reads).
# `params` is the dict defined in the holdout-set cell above.
shuffled = apps_cv_strat.copy()
shuffled[target_col] = np.random.default_rng(42).permutation(shuffled[target_col].to_numpy())
fold_results_shuffled = cv_lr(shuffled, feature_cols, target_col, params)
print("Shuffled mean AUC:", fold_results_shuffled.roc_auc.mean())

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

Shuffled mean AUC: 0.5024842111752665


  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  ret = a @ b
  ret = a @ b
  ret = a @ b
