# Tuning a LR Model

copied from `xgb_fitting.ipynb`

The goal of this notebook is to train and evaluate an LR model, comparing it's performance on a holdout set against other types of models (LR,LDA, XGBoost). 

To ensure reproducibility and consistent evaluation across models, all datasets were **pre-split into cross-val data and holdout data** as below:

| Split type           | CV training file     | Holdout file              | Description                              |
| -------------------- | -------------------- | ------------------------- | ---------------------------------------- |
| **Random**           | `apps_cv_random.csv` | `apps_holdout_random.csv` | Simple random sampling                   |
| **Stratified**       | `apps_cv_strat.csv`  | `apps_holdout_strat.csv`  | Stratified by `TARGET`                   |
| **Multi-Stratified** | `apps_cv_multi.csv`  | `apps_holdout_multi.csv`  | Stratified by `TARGET` + `CODE_GENDER_M` |

Each dataset for cross-validation (`apps_cv_*.csv`) also contains a column, `fold`, with pre-assigned folds from 1-5 using the corresponding splitting method to ensure consistent evaluation. Therefore, no additional splitting is needed inside this notebook -- can simply loop through assigned folds for cross-validation.


## Evaluation Functions

#### Metric calculators:

Copied from `cross_val.ipynb`

In [27]:
# METRICS 

def classification_metrics(y_true, y_pred):
    """
    Computes confusion matrix + accuracy, precision, recall, F1, and balanced accuracy.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Confusion matrix components
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Metrics
    acc  = (tp + tn) / max((tp + tn + fp + fn), 1)
    prec = tp / max((tp + fp), 1)
    rec  = tp / max((tp + fn), 1)
    f1   = (2 * prec * rec / max((prec + rec), 1e-12)) if (prec + rec) > 0 else 0.0

    # Specificity (True Negative Rate)
    spec = tn / max((tn + fp), 1)

    # Balanced accuracy
    bal_acc = 0.5 * (rec + spec)

    metrics = {
        "n": len(y_true),
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "acc": acc, "bal_acc": bal_acc, "prec": prec, "rec": rec, "spec": spec,
        "f1": f1
    }
    return metrics

def roc_auc_from_probs(y_true, y_prob):
    
    desc_sort_indices = np.argsort(-y_prob)
    y_true = np.array(y_true)[desc_sort_indices]
    y_prob = np.array(y_prob)[desc_sort_indices]
    pos = np.sum(y_true == 1)
    neg = np.sum(y_true == 0)

    # running totals for TPR/FPR
    tpr = [0.0]
    fpr = [0.0]
    tp = fp = 0
    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / pos)
        fpr.append(fp / neg)

    # get auc
    auc = np.trapezoid(tpr, fpr)
    return auc

In [28]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [66]:
def cv_lr(data, feature_cols, target_col, params=None, use_pca=True, pca_var_explained=0.95):
    if params is None:
        params = {
            "penalty": "l2",
            "C": 1.0,
            "solver": "lbfgs",
            "class_weight": "balanced",
            "max_iter": 500
        }
    
    fold_metrics = []
    for f in data.fold.unique():

        # split data into train and test based on folds
        train = data[data.fold != f]
        test  = data[data.fold == f]
        X_train, y_train = train[feature_cols].values, train[target_col].values
        X_test,  y_test  = test[feature_cols].values,  test[target_col].values

        # scale
        scaler = StandardScaler().fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s  = scaler.transform(X_test)

        # pca
        if use_pca:
            pca = PCA(n_components=0.95)
            pca.fit(X_train_s)
        
            X_train_t = pca.transform(X_train_s)
            X_test_t  = pca.transform(X_test_s)
        else:
            X_train_t, X_test_t = X_train_s, X_test_s

        lr_model = LogisticRegression()
        lr_model.fit(X_train_t, y_train)

        # predictions
        y_pred       = lr_model.predict(X_test_t)
        y_prob       = lr_model.predict_proba(X_test_t)[:, 1]
        y_train_prob = lr_model.predict_proba(X_train_t)[:, 1]

        metrics = classification_metrics(y_test, y_pred)
        metrics['roc_auc']       = roc_auc_from_probs(y_test, y_prob)
        metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
        metrics['fold'] = int(f)

        fold_metrics.append(metrics)
        
    return pd.DataFrame(fold_metrics).sort_values("fold").reset_index(drop=True)

## setup

In [67]:
apps_cv_strat = pd.read_csv("data/apps_cv_strat.csv")
apps_holdout_strat = pd.read_csv("data/apps_holdout_strat.csv")
target_col = 'TARGET'
feature_cols = [col for col in apps_cv_strat.columns if col not in 
                [target_col, 'SK_ID_CURR', 'fold', 'neighbors_target_mean_500']]

In [68]:
results_lr = cv_lr(apps_cv_strat,feature_cols,target_col, use_pca=False, pca_var_explained=0.95)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X 

In [69]:
results_lr

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc,fold
0,49156,97,45101,86,3872,0.919481,0.511268,0.530055,0.024439,0.998097,0.046724,0.763803,0.765237,1
1,49156,96,45099,88,3873,0.91942,0.51112,0.521739,0.024187,0.998053,0.046232,0.762838,0.765395,2
2,49156,89,45091,96,3880,0.919115,0.51015,0.481081,0.022424,0.997875,0.04285,0.764557,0.765152,3
3,49155,100,45095,91,3869,0.919439,0.511591,0.52356,0.025195,0.997986,0.048077,0.763779,0.765298,4
4,49154,108,45097,89,3860,0.919661,0.512624,0.548223,0.027218,0.99803,0.051861,0.75959,0.766491,5


## todo: grid search/ hyperparam tuning

In [104]:
from sklearn.model_selection import ParameterGrid
import pandas as pd
import numpy as np

def grid_search_lr(data, feature_cols, target_col, param_grid):
    rows = []
    best = {"mean_auc": -np.inf, "params": None}

    for i, params in enumerate(ParameterGrid(param_grid), start=1):
        cv = cv_lr(data, feature_cols, target_col, params=params)  # your CV function
        mean_auc = cv["roc_auc"].mean()
        std_auc  = cv["roc_auc"].std()

        rows.append({
            **params,
            "mean_auc": mean_auc,
            "std_auc": std_auc,
            "mean_recall": cv["rec"].mean(),
            "mean_f1": cv["f1"].mean(),
            "mean_acc": cv["acc"].mean(),
            "mean_bal_acc": cv["bal_acc"].mean(),
        })

        if mean_auc > best["mean_auc"]:
            best = {"mean_auc": mean_auc, "params": params}

        if i % 5 == 0:
            print(f"{i} combos | best AUC={best['mean_auc']:.4f} | best={best['params']}")

    results = pd.DataFrame(rows).sort_values("mean_auc", ascending=False).reset_index(drop=True)
    return results, best["params"]

In [105]:
param_grid = {
    "penalty": ["l2"],
    "C": [0.01, 0.1, 1, 10, 100],    
    "solver": ["lbfgs", "saga"],    
    "class_weight": ["balanced"],    
    "max_iter": [500]
}

In [106]:
lr_grid_results, best_lr_params = grid_search_lr(apps_cv_strat, feature_cols, target_col, param_grid)
lr_grid_results.head(), best_lr_params

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n

5 combos | best AUC=0.7556 | best={'C': 0.01, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n

10 combos | best AUC=0.7556 | best={'C': 0.01, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}


(      C class_weight  max_iter penalty solver  mean_auc  std_auc  mean_recall  \
 0  0.01     balanced       500      l2  lbfgs  0.755582  0.00174     0.018696   
 1  0.01     balanced       500      l2   saga  0.755582  0.00174     0.018696   
 2  0.10     balanced       500      l2  lbfgs  0.755582  0.00174     0.018696   
 3  0.10     balanced       500      l2   saga  0.755582  0.00174     0.018696   
 4  1.00     balanced       500      l2  lbfgs  0.755582  0.00174     0.018696   
 
     mean_f1  mean_acc  mean_bal_acc  
 0  0.036135  0.919476      0.508644  
 1  0.036135  0.919476      0.508644  
 2  0.036135  0.919476      0.508644  
 3  0.036135  0.919476      0.508644  
 4  0.036135  0.919476      0.508644  ,
 {'C': 0.01,
  'class_weight': 'balanced',
  'max_iter': 500,
  'penalty': 'l2',
  'solver': 'lbfgs'})

In [107]:
lr_grid_results

Unnamed: 0,C,class_weight,max_iter,penalty,solver,mean_auc,std_auc,mean_recall,mean_f1,mean_acc,mean_bal_acc
0,0.01,balanced,500,l2,lbfgs,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
1,0.01,balanced,500,l2,saga,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
2,0.1,balanced,500,l2,lbfgs,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
3,0.1,balanced,500,l2,saga,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
4,1.0,balanced,500,l2,lbfgs,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
5,1.0,balanced,500,l2,saga,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
6,10.0,balanced,500,l2,lbfgs,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
7,10.0,balanced,500,l2,saga,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
8,100.0,balanced,500,l2,lbfgs,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644
9,100.0,balanced,500,l2,saga,0.755582,0.00174,0.018696,0.036135,0.919476,0.508644


## holdout set

In [96]:
X_train, y_train = apps_cv_strat[feature_cols], apps_cv_strat[target_col]
X_test, y_test = apps_holdout_strat[feature_cols], apps_holdout_strat[target_col]
    
params = {
            "penalty": "l2",
            "C": 1.0,
            "solver": "lbfgs",
            "class_weight": "balanced",
            "max_iter": 500
        }

# pca = PCA(n_components = 0.95)
# pca.fit(X_train)

lr_model = LogisticRegression(**params)
lr_model.fit(X_train, y_train)

y_prob = lr_model.predict_proba(X_test)[:, 1]
y_train_prob = lr_model.predict_proba(X_train)[:, 1]
y_pred = lr_model.predict(X_test)

metrics = classification_metrics(y_test, y_pred)
metrics['roc_auc'] = roc_auc_from_probs(y_test, y_prob)
metrics['train_roc_auc'] = roc_auc_from_probs(y_train, y_train_prob)
metrics = pd.DataFrame([metrics])

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [97]:
metrics

Unnamed: 0,n,tp,tn,fp,fn,acc,bal_acc,prec,rec,spec,f1,roc_auc,train_roc_auc
0,61443,2860,35091,21392,2100,0.617662,0.59894,0.117928,0.576613,0.621267,0.19581,0.633409,0.631768
