# Tuning an LR Model

copied from `xgb_fitting.ipynb`

The goal of this notebook is to train and evaluate a logistic regression (LR) model, comparing its performance on a holdout set against other types of models (e.g. LDA, XGBoost).

To ensure reproducibility and consistent evaluation across models, all datasets were **pre-split into cross-val data and holdout data** as below:

| Split type           | CV training file     | Holdout file              | Description                              |
| -------------------- | -------------------- | ------------------------- | ---------------------------------------- |
| **Random**           | `apps_cv_random.csv` | `apps_holdout_random.csv` | Simple random sampling                   |
| **Stratified**       | `apps_cv_strat.csv`  | `apps_holdout_strat.csv`  | Stratified by `TARGET`                   |
| **Multi-Stratified** | `apps_cv_multi.csv`  | `apps_holdout_multi.csv`  | Stratified by `TARGET` + `CODE_GENDER_M` |

Each dataset for cross-validation (`apps_cv_*.csv`) also contains a column, `fold`, with pre-assigned folds from 1-5, created using the corresponding splitting method to ensure consistent evaluation. Therefore, no additional splitting is needed inside this notebook: the code can simply loop through the pre-assigned folds for cross-validation.


## Evaluation Functions

#### Metric calculators:

Copied from `cross_val.ipynb`

In [1]:
# METRICS 

def classification_metrics(y_true, y_pred):
    """
    Summarise binary predictions as confusion-matrix counts and derived rates.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 is the positive class and 0 the negative class.
    y_pred : array-like
        Predicted labels, using the same 0/1 encoding.

    Returns
    -------
    dict
        Keys, in order: ``n``, ``tp``, ``tn``, ``fp``, ``fn``, ``acc``,
        ``bal_acc``, ``prec``, ``rec``, ``spec``, ``f1``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Boolean masks for each side of the confusion matrix
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)

    # Every denominator is floored at 1, so an empty class yields 0 rather than a division error
    acc = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec = tp / max((tp + fn), 1)
    spec = tn / max((tn + fp), 1)  # specificity = true negative rate

    # F1 is the harmonic mean of precision and recall; defined as 0 when both are 0
    if (prec + rec) > 0:
        f1 = 2 * prec * rec / max((prec + rec), 1e-12)
    else:
        f1 = 0.0

    # Balanced accuracy is the mean of recall and specificity
    bal_acc = 0.5 * (rec + spec)

    return {
        "n": len(y_true),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc": acc,
        "bal_acc": bal_acc,
        "prec": prec,
        "rec": rec,
        "spec": spec,
        "f1": f1,
    }

def roc_auc_from_probs(y_true, y_prob):
    """
    Compute the area under the ROC curve from true labels and predicted scores.

    The ROC curve is built by sweeping a threshold over the distinct score
    values (highest first) and integrating TPR against FPR with the
    trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Label 1 is the positive class; every other
        label is treated as negative (labels are expected to be 0/1).
    y_prob : array-like
        Predicted score / probability of the positive class, one per sample.
        Lists, NumPy arrays and pandas Series are all accepted.

    Returns
    -------
    float
        The ROC AUC, or ``nan`` when only one class is present in ``y_true``
        (the curve is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different lengths.
    """
    y_true = np.asarray(y_true).ravel()
    y_prob = np.asarray(y_prob, dtype=float).ravel()
    if y_true.shape[0] != y_prob.shape[0]:
        raise ValueError(
            f"y_true and y_prob must have the same length, "
            f"got {y_true.shape[0]} and {y_prob.shape[0]}"
        )

    is_pos = (y_true == 1)
    pos = int(is_pos.sum())
    neg = int(is_pos.size - pos)

    # With a single class either TPR or FPR has a zero denominator.
    if pos == 0 or neg == 0:
        return float("nan")

    # Stable sort so the result does not depend on how ties are ordered.
    order = np.argsort(-y_prob, kind="mergesort")
    sorted_prob = y_prob[order]
    sorted_pos = is_pos[order]

    # Samples with the same score share one threshold, so only the last
    # index of each run of equal scores becomes a point on the curve.
    # Stepping through tied samples one by one would make the AUC depend
    # on their (arbitrary) order.
    last_of_group = np.r_[np.flatnonzero(np.diff(sorted_prob)), sorted_prob.size - 1]

    tps = np.cumsum(sorted_pos)[last_of_group]
    fps = (last_of_group + 1) - tps

    # Prepend the (0, 0) origin of the ROC curve.
    tpr = np.r_[0.0, tps / pos]
    fpr = np.r_[0.0, fps / neg]

    # Trapezoidal rule written out explicitly: `np.trapezoid` only exists in
    # NumPy >= 2.0 and `np.trapz` is deprecated there.
    auc = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)
    return float(auc)

In [18]:
import pandas as pd 
import numpy as np 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [19]:
def cv_lr(data, feature_cols, target_col, params=None):
    """
    Cross-validate a scale -> PCA -> logistic-regression pipeline on pre-assigned folds.

    For every distinct value of ``data.fold`` the pipeline is fitted on the
    rows of all other folds and evaluated on the rows of that fold.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``fold`` column plus ``feature_cols`` and ``target_col``.
    feature_cols : list
        Column names used as model inputs.
    target_col : str
        Name of the label column.
    params : dict, optional
        Keyword arguments forwarded to ``LogisticRegression``. When omitted,
        an L2-penalised, class-balanced lbfgs model with C=1.0 and
        max_iter=500 is used.

    Returns
    -------
    pd.DataFrame
        One row per fold (sorted by ``fold``) holding the output of
        ``classification_metrics`` plus ``roc_auc`` (held-out fold),
        ``train_roc_auc`` (training folds) and ``fold``.
    """
    default_params = {
        "penalty": "l2",
        "C": 1.0,
        "solver": "lbfgs",
        "class_weight": "balanced",
        "max_iter": 500
    }
    lr_params = default_params if params is None else params

    records = []
    for fold_id in sorted(data.fold.unique()):
        # the current fold is held out; everything else is used for fitting
        held_out = data.fold == fold_id
        fit_rows = data[~held_out]
        eval_rows = data[held_out]

        X_fit = fit_rows[feature_cols]
        y_fit = fit_rows[target_col]
        X_eval = eval_rows[feature_cols]
        y_eval = eval_rows[target_col]

        # A fresh pipeline per fold so scaling and PCA are learned from the
        # training folds only. PCA keeps enough components for 95% of variance.
        steps = [
            ('scale', StandardScaler()),
            ('pca', PCA(n_components=0.95)),
            ('lr', LogisticRegression(**lr_params)),
        ]
        model = Pipeline(steps)
        model.fit(X_fit, y_fit)

        eval_labels = model.predict(X_eval)
        eval_scores = model.predict_proba(X_eval)[:, 1]
        fit_scores = model.predict_proba(X_fit)[:, 1]

        record = {
            **classification_metrics(y_eval, eval_labels),
            "roc_auc": roc_auc_from_probs(y_eval, eval_scores),
            # train AUC is kept alongside so over-fitting is visible per fold
            "train_roc_auc": roc_auc_from_probs(y_fit, fit_scores),
            "fold": int(fold_id),
        }
        records.append(record)

    per_fold = pd.DataFrame(records)
    return per_fold.sort_values("fold").reset_index(drop=True)

## setup

In [4]:
# Name of the label column
target_col = 'TARGET'

# Stratified split: cross-validation data (with a `fold` column) and the holdout set
apps_cv_strat = pd.read_csv("data/apps_cv_strat.csv")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.csv")

# Model inputs: every CV column except the label and the listed non-feature columns
feature_cols = [
    name
    for name in apps_cv_strat.columns
    if name not in (target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500')
]

In [5]:
# Baseline: cross-validate on the stratified CV data with cv_lr's default
# hyperparameters (no `params` argument is passed).
results_lr = cv_lr(apps_cv_strat,feature_cols,target_col)

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

In [6]:
results_lr

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,2709,31756,13431,1260,0.701135,0.692654,0.167844,0.68254,0.702768,0.269432,0.759086,0.759407,1
1,49156,2753,31377,13810,1216,0.69432,0.694003,0.166214,0.693626,0.694381,0.268167,0.758493,0.759274,2
2,49156,2696,31741,13446,1273,0.700566,0.69085,0.167018,0.679264,0.702437,0.268112,0.757978,0.759423,3
3,49155,2708,31549,13637,1261,0.696918,0.690245,0.165678,0.682288,0.698203,0.266614,0.755883,0.757803,4
4,49154,2683,31572,13614,1285,0.696891,0.687436,0.164632,0.676159,0.698712,0.264792,0.754353,0.760123,5


## TODO: grid search / hyperparameter tuning

In [9]:
from sklearn.model_selection import ParameterGrid
import pandas as pd
import numpy as np

def grid_search_lr(data, feature_cols, target_col, param_grid):
    """
    Exhaustively cross-validate every hyperparameter combination in ``param_grid``.

    Each combination is scored with ``cv_lr`` and summarised by the mean and
    standard deviation of the per-fold ROC AUC plus the mean recall, F1,
    accuracy and balanced accuracy. Progress is printed every 5 combinations.

    Parameters
    ----------
    data : pd.DataFrame
        Cross-validation data, passed straight through to ``cv_lr``.
    feature_cols : list
        Column names used as model inputs.
    target_col : str
        Name of the label column.
    param_grid : dict
        Mapping of parameter name -> list of candidate values, expanded with
        ``ParameterGrid``.

    Returns
    -------
    (pd.DataFrame, dict)
        The summary table sorted by ``mean_auc`` (best first), and the
        parameter combination with the highest mean AUC. On ties the first
        combination encountered wins.
    """
    summaries = []
    top_auc = -np.inf
    top_params = None

    for n_done, combo in enumerate(ParameterGrid(param_grid), start=1):
        fold_scores = cv_lr(data, feature_cols, target_col, params=combo)
        auc_per_fold = fold_scores["roc_auc"]
        mean_auc = auc_per_fold.mean()

        # hyperparameters first, then the aggregated scores
        summary = dict(combo)
        summary.update({
            "mean_auc": mean_auc,
            "std_auc": auc_per_fold.std(),
            "mean_recall": fold_scores["rec"].mean(),
            "mean_f1": fold_scores["f1"].mean(),
            "mean_acc": fold_scores["acc"].mean(),
            "mean_bal_acc": fold_scores["bal_acc"].mean(),
        })
        summaries.append(summary)

        # strict comparison: an equal score never replaces the current best
        if mean_auc > top_auc:
            top_auc, top_params = mean_auc, combo

        if n_done % 5 == 0:
            print(f"{n_done} combos | best AUC={top_auc:.4f} | best={top_params}")

    ranked = pd.DataFrame(summaries).sort_values("mean_auc", ascending=False)
    return ranked.reset_index(drop=True), top_params

In [10]:
# Search space for the logistic-regression grid search: only the
# regularisation strength C and the solver vary (5 x 2 = 10 combinations);
# the remaining entries are single fixed values.
param_grid = dict(
    penalty=["l2"],
    C=[0.01, 0.1, 1, 10, 100],
    solver=["lbfgs", "saga"],
    class_weight=["balanced"],
    max_iter=[500],
)

In [None]:
# Run the grid search over `param_grid` on the stratified CV data.
# `lr_grid_results` holds one row per combination, sorted by mean AUC;
# `best_lr_params` is the combination with the highest mean AUC.
lr_grid_results, best_lr_params = grid_search_lr(apps_cv_strat, feature_cols, target_col, param_grid)
# Show the top five combinations together with the winning parameters.
lr_grid_results.head(), best_lr_params

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

In [None]:
lr_grid_results

## holdout set

In [15]:
# Final fit: train on the full cross-validation set, evaluate on the untouched holdout set.
X_train = apps_cv_strat[feature_cols]
y_train = apps_cv_strat[target_col]
X_test = apps_holdout_strat[feature_cols]
y_test = apps_holdout_strat[target_col]

# NOTE(review): these hyperparameters are fixed here and are not taken from
# the grid-search result — confirm this is intended.
params = dict(
    penalty="l2",
    C=1.0,
    solver="lbfgs",
    class_weight="balanced",
    max_iter=500,
)

# Scale -> PCA (enough components for 95% of variance) -> logistic regression
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
    ("lr", LogisticRegression(**params)),
])
pipe.fit(X_train, y_train)

# Hard labels and positive-class probabilities for the holdout set,
# plus training-set probabilities for the train AUC.
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]
y_train_prob = pipe.predict_proba(X_train)[:, 1]


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

In [16]:
# Holdout scores as a one-row table: classification metrics, holdout AUC
# and the AUC on the training data for comparison.
metrics = pd.DataFrame([{
    **classification_metrics(y_test, y_pred),
    "roc_auc": roc_auc_from_probs(y_test, y_prob),
    "train_roc_auc": roc_auc_from_probs(y_train, y_train_prob),
}])

In [17]:
metrics

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc
0,61443,3419,39481,17002,1541,0.698208,0.694152,0.167426,0.689315,0.698989,0.269414,0.756646,0.75769
