## Appendix — Baselines & Experiments

This section collects early models, kept only for historical reference:

- Logistic Regression (AUC ~0.74, slow convergence).
- ElasticNet (AUC ~0.73, very slow to train, not adopted).


In [None]:
# Preprocessing
# Column groups are derived from the dtypes of X:
#   - numeric     : int64 / float64 columns
#   - categorical : object columns, split by number of distinct values
#                   (< 10 -> one-hot encoded, >= 10 -> target encoded)
# NOTE(review): columns of any other dtype (e.g. bool, category, int32) end up
# in none of the three lists. ColumnTransformer is built below without a
# `remainder` argument, so per scikit-learn's documented default they would be
# dropped -- confirm that is intended.
num_cols  = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols  = X.select_dtypes(include=["object"]).columns.tolist()
low_card  = [c for c in cat_cols if X[c].nunique() < 10]
high_card = [c for c in cat_cols if X[c].nunique() >= 10]

# Numeric branch: median imputation followed by RobustScaler.
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler",  RobustScaler())])

# Low-cardinality branch: mode imputation, then dense one-hot encoding.
# handle_unknown="ignore" keeps transform from failing on categories that were
# not present when the encoder was fitted.
cat_low_pipe  = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

# High-cardinality branch: mode imputation, then target encoding.
cat_high_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("tgt", TargetEncoder())])

# The transformer names ("num", "ohe", "tgt") are looked up again later via
# preprocessor.named_transformers_ when feature names are exported.
preprocessor = ColumnTransformer([
    ("num",  num_pipe,      num_cols),
    ("ohe",  cat_low_pipe,  low_card),
    ("tgt",  cat_high_pipe, high_card),
])

# Model
# L2-penalised logistic regression; class_weight="balanced" re-weights the
# classes instead of resampling the data.
clf = LogisticRegression(solver="lbfgs", penalty="l2", max_iter=2000, class_weight="balanced")

# Preprocessing lives inside the pipeline, so every pipe.fit() call re-fits
# the encoders/scalers on exactly the rows it is given.
pipe = Pipeline([("prep", preprocessor),
                 ("clf",  clf)])

# Helpers
def ks_statistic(y_true, y_score):
    """Return the Kolmogorov-Smirnov statistic of a binary scorer.

    The value is the largest gap between the true-positive rate and the
    false-positive rate along the ROC curve of `y_score` against `y_true`.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    separation = true_pos_rate - false_pos_rate
    return separation.max()

def precision_at_k(y_true, y_score, k=0.2):
    """Return the precision obtained when the top `k` fraction of scores is flagged.

    The ceil(k * n) rows with the highest `y_score` are labelled 1, all others
    0, and that labelling is scored against `y_true` with `precision_score`.
    `y_score` must support unary minus (e.g. a NumPy array).
    """
    total = len(y_true)
    cutoff = int(np.ceil(k * total))
    ranked = np.argsort(-y_score)  # positions ordered from highest to lowest score
    flagged = np.zeros(total, dtype=int)
    flagged[ranked[:cutoff]] = 1
    return precision_score(y_true, flagged)

def recall_at_k(y_true, y_score, k=0.2):
    """Return the recall obtained when the top `k` fraction of scores is flagged.

    The ceil(k * n) rows with the highest `y_score` are labelled 1, all others
    0, and that labelling is scored against `y_true` with `recall_score`.
    `y_score` must support unary minus (e.g. a NumPy array).
    """
    total = len(y_true)
    cutoff = int(np.ceil(k * total))
    ranked = np.argsort(-y_score)  # positions ordered from highest to lowest score
    flagged = np.zeros(total, dtype=int)
    flagged[ranked[:cutoff]] = 1
    return recall_score(y_true, flagged)

# CV
# Stratified 5-fold cross-validation of `pipe`. Each row receives exactly one
# prediction, made by the model that did not see it during fitting; those
# predictions are collected in `oof` (out-of-fold).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))
fold_rows = []

# Folds are numbered from 1 for display purposes.
for i, (tr, va) in enumerate(cv.split(X, y), 1):
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    # Re-fits preprocessing and classifier on the training rows of this fold.
    pipe.fit(X_tr, y_tr)
    proba = pipe.predict_proba(X_va)[:, 1]  # column 1 of predict_proba
    oof[va] = proba

    # Per-fold metrics on the held-out rows.
    auc = roc_auc_score(y_va, proba)
    ks  = ks_statistic(y_va, proba)
    p20 = precision_at_k(y_va, proba, k=0.2)
    r20 = recall_at_k(y_va, proba, k=0.2)
    fold_rows.append({"fold": i, "AUC": auc, "KS": ks, "P@20": p20, "R@20": r20})
    print(f"[Fold {i}] AUC={auc:.4f} | KS={ks:.4f} | P@20={p20:.4f} | R@20={r20:.4f}")

# Overall metrics are computed once on the pooled out-of-fold predictions
# (not as an average of the per-fold values).
df_folds = pd.DataFrame(fold_rows)
auc_all = roc_auc_score(y, oof)
ks_all  = ks_statistic(y, oof)
p20_all = precision_at_k(y, oof, k=0.2)
r20_all = recall_at_k(y, oof, k=0.2)

print("\n[OOF] AUC={:.4f} | KS={:.4f} | P@20={:.4f} | R@20={:.4f}".format(auc_all, ks_all, p20_all, r20_all))
display(df_folds)


c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 2000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 1] AUC=0.7389 | KS=0.3545 | P@20=0.1996 | R@20=0.4945
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 2000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 2] AUC=0.7504 | KS=0.3713 | P@20=0.2006 | R@20=0.4971
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 2000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 3] AUC=0.7445 | KS=0.3640 | P@20=0.2005 | R@20=0.4967
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 2000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 4] AUC=0.7488 | KS=0.3737 | P@20=0.2053 | R@20=0.5086
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 2000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 5] AUC=0.7379 | KS=0.3539 | P@20=0.1960 | R@20=0.4856

[OOF] AUC=0.7441 | KS=0.3613 | P@20=0.2003 | R@20=0.4962


fold	AUC	KS	P@20	R@20
0	1	0.738865	0.354508	0.199577	0.494461
1	2	0.750388	0.371348	0.200634	0.497080
2	3	0.744543	0.363961	0.200472	0.496677
3	4	0.748788	0.373713	0.205268	0.508560
4	5	0.737892	0.353851	0.196000	0.485599


### Baseline Logistic Regression — Results & Takeaways

My 5-fold OOF results are:

- **AUC:** 0.7441  
- **KS:** 0.3613  
- **Precision@20%:** 0.2003  
- **Recall@20%:** 0.4962  

This is a solid baseline. The lift at the top 20% is meaningful, and KS is in an acceptable range for a first pass.

I did observe a **ConvergenceWarning** from `lbfgs` (max_iter hit the limit). This typically happens with high-dimensional feature spaces (OHE + target encoders). I’ll fix it by increasing `max_iter` and slightly strengthening regularization (**lower C**). That keeps the solution stable and avoids overfitting.


### Baseline Logistic Regression — Results & Takeaways

My 5-fold OOF results are:

- **AUC:** 0.7441  
- **KS:** 0.3613  
- **Precision@20%:** 0.2003  
- **Recall@20%:** 0.4962  

This is a solid baseline. The lift at the top 20% is meaningful, and KS is in an acceptable range for a first pass.

I did observe a **ConvergenceWarning** from `lbfgs` (max_iter hit the limit). This typically happens with high-dimensional feature spaces (OHE + target encoders). I’ll fix it by increasing `max_iter` and slightly strengthening regularization (**lower C**). That keeps the solution stable and avoids overfitting.


In [None]:
# "Stability patch" for the baseline: same lbfgs / L2 logistic regression as
# before, but with an explicit C and a higher iteration cap.
clf = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=0.5,          # the first baseline did not pass C; smaller C = stronger L2 penalty
    max_iter=5000,  # raised from 2000 in the first baseline
    class_weight="balanced"
)

# Rebinds the global `pipe` to use the new classifier; the `preprocessor`
# object is the one defined earlier in the notebook.
pipe = Pipeline([
    ("prep", preprocessor),
    ("clf",  clf)
])

# Stratified 5-Fold CV — OOF metrics

I evaluate the baseline with Stratified K-Fold CV (no leakage).  
Metrics reported: **ROC-AUC**, **KS**, **Precision@20%**, **Recall@20%**.

def ks_statistic(y_true, y_score):
    """Return the KS statistic: the maximum of TPR - FPR over the ROC curve."""
    roc_fpr, roc_tpr, _ = roc_curve(y_true, y_score)
    gaps = roc_tpr - roc_fpr
    return gaps.max()

def precision_at_k(y_true, y_score, k=0.2):
    """Precision of the rule "flag the ceil(k * n) highest-scored rows".

    `y_score` is negated before sorting, so it must support unary minus
    (e.g. a NumPy array). The result comes from `precision_score`.
    """
    n_rows = len(y_true)
    n_flagged = int(np.ceil(k * n_rows))
    order = np.argsort(-y_score)
    selected = order[:n_flagged]
    decisions = np.zeros(n_rows, dtype=int)
    decisions[selected] = 1
    return precision_score(y_true, decisions)

def recall_at_k(y_true, y_score, k=0.2):
    """Recall of the rule "flag the ceil(k * n) highest-scored rows".

    `y_score` is negated before sorting, so it must support unary minus
    (e.g. a NumPy array). The result comes from `recall_score`.
    """
    n_rows = len(y_true)
    n_flagged = int(np.ceil(k * n_rows))
    order = np.argsort(-y_score)
    selected = order[:n_flagged]
    decisions = np.zeros(n_rows, dtype=int)
    decisions[selected] = 1
    return recall_score(y_true, decisions)

# Same splitter settings as the first baseline run, so folds are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold probabilities for the patched pipeline. This overwrites the
# global `oof` written by the first baseline cell.
oof = np.zeros(len(X))
rows = []

# Folds are numbered from 1 for display purposes.
for i, (tr, va) in enumerate(cv.split(X, y), 1):
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    # Preprocessing and classifier are both re-fitted on the training rows.
    pipe.fit(X_tr, y_tr)
    proba = pipe.predict_proba(X_va)[:, 1]
    oof[va] = proba

    # Per-fold metrics on the held-out rows.
    auc = roc_auc_score(y_va, proba)
    ks  = ks_statistic(y_va, proba)
    p20 = precision_at_k(y_va, proba, k=0.2)
    r20 = recall_at_k(y_va, proba, k=0.2)
    rows.append({"fold": i, "AUC": auc, "KS": ks, "P@20": p20, "R@20": r20})
    print(f"[Fold {i}] AUC={auc:.4f} | KS={ks:.4f} | P@20={p20:.4f} | R@20={r20:.4f}")

# Overall metrics on the pooled out-of-fold predictions.
df_folds = pd.DataFrame(rows)
auc_all = roc_auc_score(y, oof)
ks_all  = ks_statistic(y, oof)
p20_all = precision_at_k(y, oof, k=0.2)
r20_all = recall_at_k(y, oof, k=0.2)

print("\n[OOF] AUC={:.4f} | KS={:.4f} | P@20={:.4f} | R@20={:.4f}".format(auc_all, ks_all, p20_all, r20_all))
display(df_folds)

c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 5000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 1] AUC=0.7407 | KS=0.3603 | P@20=0.2007 | R@20=0.4973
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 5000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 2] AUC=0.7525 | KS=0.3746 | P@20=0.2027 | R@20=0.5023
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 5000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 3] AUC=0.7460 | KS=0.3645 | P@20=0.2021 | R@20=0.5007
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 5000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 4] AUC=0.7505 | KS=0.3775 | P@20=0.2057 | R@20=0.5096
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 5000 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Fold 5] AUC=0.7401 | KS=0.3569 | P@20=0.1971 | R@20=0.4884

[OOF] AUC=0.7459 | KS=0.3645 | P@20=0.2018 | R@20=0.5000


fold	AUC	KS	P@20	R@20
0	1	0.740652	0.360300	0.200715	0.497281
1	2	0.752507	0.374647	0.202748	0.502316
2	3	0.745985	0.364495	0.202097	0.500705
3	4	0.750539	0.377460	0.205674	0.509567
4	5	0.740120	0.356866	0.197138	0.488419


### Baseline Logistic Regression — OOF Results

My 5-fold OOF metrics with the stability patch are:

- **AUC:** 0.7459  
- **KS:** 0.3645  
- **Precision@20%:** 0.2018  
- **Recall@20%:** 0.5000  

This is a solid starting point. I still see a `ConvergenceWarning` from `lbfgs` (even at `max_iter=5000`), which is common in high-dimensional spaces after OHE/encoders. To keep the baseline fully stable (and sometimes a tad better), I’ll switch to the `saga` solver (supports large feature spaces and elastic-net regularization). I’ll keep L2 first; if needed, I’ll try a small L1 component (`elasticnet`) to shrink noisy coefficients.


# Elastic Net variant for high-dimensional OHE

As an alternative, I can try `solver="saga"` with `elasticnet`, which often behaves well in wide feature spaces by shrinking coefficients.


In [None]:
# === Logistic Regression (saga + elasticnet) — optimized CV block ===
import time
import numpy as np
import pandas as pd
from IPython.display import display
from joblib import Memory

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# --- Assumes X, y, and `preprocessor` are already defined above ---
# If not, ensure you run the preprocessing cell first.

# Cache (helps when re-running the same pipeline code during experimentation)
# Passed as `memory=` to the elastic-net Pipeline built further down.
# NOTE(review): every CV fold fits on a different subset of rows, so presumably
# the cache only pays off when the whole cell is re-run on unchanged data --
# confirm it is worth the hashing overhead.
cache = Memory(location="artifacts/sk_cache", verbose=0)

# Metrics helpers (idempotent: safe to re-run)
def ks_statistic(y_true, y_score):
    """Return the KS statistic (max TPR - FPR along the ROC curve) as a Python float."""
    false_pos, true_pos, _cutoffs = roc_curve(y_true, y_score)
    widest_gap = (true_pos - false_pos).max()
    return float(widest_gap)

def precision_at_k(y_true, y_score, k=0.2):
    """Return, as a Python float, the precision among the top `k` fraction of scores.

    The ceil(k * n) highest-scored rows are predicted positive and the rest
    negative; `precision_score` evaluates that labelling against `y_true`.
    `y_score` must support unary minus (e.g. a NumPy array).
    """
    size = len(y_true)
    top_count = int(np.ceil(k * size))
    descending = np.argsort(-y_score)
    predicted = np.zeros(size, dtype=int)
    predicted[descending[:top_count]] = 1
    score = precision_score(y_true, predicted)
    return float(score)

def recall_at_k(y_true, y_score, k=0.2):
    """Return, as a Python float, the recall captured by the top `k` fraction of scores.

    The ceil(k * n) highest-scored rows are predicted positive and the rest
    negative; `recall_score` evaluates that labelling against `y_true`.
    `y_score` must support unary minus (e.g. a NumPy array).
    """
    size = len(y_true)
    top_count = int(np.ceil(k * size))
    descending = np.argsort(-y_score)
    predicted = np.zeros(size, dtype=int)
    predicted[descending[:top_count]] = 1
    score = recall_score(y_true, predicted)
    return float(score)

# CV config (set to 3 for quick iteration; 5 for final)
N_SPLITS = 5
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# --- Model: saga + elasticnet (faster/more stable settings) ---
# Tip: if training is still slow, switch to penalty='l2' (same solver) and/or N_SPLITS=3
# NOTE(review): the recorded output of this cell shows per-fold times between
# roughly 211 s and 10314 s, so these settings did not make training fast or
# stable on this data.
clf_en = LogisticRegression(
    solver="saga",
    penalty="elasticnet",
    l1_ratio=0.15,      # smaller L1 -> faster convergence than 0.2
    C=0.6,              # a bit stronger regularization (speeds up, reduces overfit)
    max_iter=3000,      # 3000 + tol=2e-3 converges much faster than 5000 + tol=1e-4
    tol=2e-3,
    class_weight="balanced",
    random_state=42
)

# Same preprocessing as the L2 baseline; `memory=cache` lets the pipeline
# reuse cached fitted transformers.
pipe_en = Pipeline(
    steps=[("prep", preprocessor), ("clf", clf_en)],
    memory=cache
)

# --- OOF evaluation ---
# Separate names (`oof_en`, `rows_en`, ...) keep the L2 baseline's `oof`
# and `df_folds` globals untouched.
oof_en = np.zeros(len(X), dtype=float)
rows_en = []

t0_all = time.time()
for i, (tr, va) in enumerate(cv.split(X, y), 1):
    t0 = time.time()
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    pipe_en.fit(X_tr, y_tr)
    proba = pipe_en.predict_proba(X_va)[:, 1]
    oof_en[va] = proba

    auc = roc_auc_score(y_va, proba)
    ks  = ks_statistic(y_va, proba)
    p20 = precision_at_k(y_va, proba, k=0.2)
    r20 = recall_at_k(y_va, proba, k=0.2)

    # Wall-clock time of the fold: fit, predict and metric computation.
    dt = time.time() - t0
    rows_en.append({"fold": i, "AUC": auc, "KS": ks, "P@20": p20, "R@20": r20, "time_s": dt})
    print(f"[EN Fold {i}] AUC={auc:.4f} | KS={ks:.4f} | P@20={p20:.4f} | R@20={r20:.4f} | {dt:.1f}s")

# Overall OOF
# Computed on the pooled out-of-fold predictions, not averaged over folds.
auc_all_en = roc_auc_score(y, oof_en)
ks_all_en  = ks_statistic(y, oof_en)
p20_all_en = precision_at_k(y, oof_en, k=0.2)
r20_all_en = recall_at_k(y, oof_en, k=0.2)
dt_all = time.time() - t0_all

df_folds_en = pd.DataFrame(rows_en)
print("\n[EN OOF] AUC={:.4f} | KS={:.4f} | P@20={:.4f} | R@20={:.4f} | total={:.1f}s"
      .format(auc_all_en, ks_all_en, p20_all_en, r20_all_en, dt_all))
display(df_folds_en.round(4))


[EN Fold 1] AUC=0.7263 | KS=0.3393 | P@20=0.1928 | R@20=0.4777 | 277.8s
[EN Fold 2] AUC=0.7373 | KS=0.3500 | P@20=0.1946 | R@20=0.4822 | 5748.5s
[EN Fold 3] AUC=0.7307 | KS=0.3414 | P@20=0.1938 | R@20=0.4802 | 412.5s
[EN Fold 4] AUC=0.7368 | KS=0.3570 | P@20=0.1985 | R@20=0.4918 | 10313.8s
[EN Fold 5] AUC=0.7283 | KS=0.3460 | P@20=0.1885 | R@20=0.4671 | 210.8s

[EN OOF] AUC=0.7318 | KS=0.3456 | P@20=0.1937 | R@20=0.4799 | total=16963.9s


fold	AUC	KS	P@20	R@20	time_s
0	1	0.7263	0.3393	0.1928	0.4777	277.7931
1	2	0.7373	0.3500	0.1946	0.4822	5748.5468
2	3	0.7307	0.3414	0.1938	0.4802	412.5458
3	4	0.7368	0.3570	0.1985	0.4918	10313.8338
4	5	0.7283	0.3460	0.1885	0.4671	210.7878


### Elastic Net (Logistic, saga) — Cross-Validation Summary

The elastic-net variant underperformed the L2 baseline and was extremely slow/variable to train.

**Fold results (5-fold):**
- AUC (mean): **0.732**  
- KS (mean): **0.347**  
- Precision@20% (mean): **0.194**  
- Recall@20% (mean): **0.480**  
- Training time per fold was highly unstable (from ~3–170 minutes), indicating poor scalability for this setup on our dataset (307k rows + OHE + target encoding).

**Decision:** I will **drop the elastic-net logistic** as a baseline. The **L2 logistic (`lbfgs`, `C=0.5`, `max_iter=5000`)** remains my reference model (OOF AUC ≈ **0.746**, KS ≈ **0.365**), even though it still hits the iteration limit; no L2 run with `saga` is reported here. I will proceed to gradient boosting (LightGBM) for performance gains.


# Persist transformed feature names

For auditability and SHAP alignment later, I fit the preprocessor on the full data only to **extract feature names**.  
This is for documentation only (I do not use this fit for evaluation).


In [None]:
# Fit preprocessor once to extract feature names for documentation
import os

# NOTE: this fits the shared `preprocessor` object in place on the full data.
# The CV code elsewhere re-fits (or clones) it per fold, so this fit is only
# used for reading feature names here.
_ = preprocessor.fit(X, y)

# Extract names safely
# numeric: original names
num_names = num_cols

# one-hot names
ohe = preprocessor.named_transformers_["ohe"].named_steps["ohe"]
ohe_names = ohe.get_feature_names_out(low_card).tolist()

# target-encoded: keep original high_card names (they become numeric columns)
tgt_names = high_card

# Order mirrors the ColumnTransformer definition: num, then ohe, then tgt.
feature_names = num_names + ohe_names + tgt_names

# Make sure the output folder exists so this cell does not depend on another
# cell having created it first.
os.makedirs("artifacts", exist_ok=True)
pd.Series(feature_names, name="feature_name").to_csv("artifacts/feature_names_logreg.csv", index=False)
print(f"Saved {len(feature_names)} feature names to artifacts/feature_names_logreg.csv")


# Calibration diagnostics (Brier + reliability curve)

Because I will interpret scores as PD estimates, I check probabilistic calibration using the Brier score and a reliability curve (predicted vs. observed).


In [None]:
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

# Use OOF probabilities from the chosen baseline (e.g., lbfgs patch)
# NOTE(review): `oof` is whichever global array of that name was assigned
# last; both logistic-regression CV cells write to it, so the result depends
# on which of them ran most recently.
brier = brier_score_loss(y, oof)
print(f"Brier score (OOF): {brier:.4f}")

# Reliability curve
# strategy="quantile" bins by score quantiles rather than equal-width intervals.
prob_true, prob_pred = calibration_curve(y, oof, n_bins=10, strategy="quantile")

plt.figure(figsize=(6,5))
plt.plot([0,1], [0,1], linestyle="--")  # diagonal = perfect calibration
plt.plot(prob_pred, prob_true, marker="o")
plt.xlabel("Predicted probability")
plt.ylabel("Observed default rate")
plt.title("Reliability curve (OOF)")
plt.grid(True)
plt.show()

In [None]:
# ===== LightGBM + Optuna (CV) — sem Pipeline no fit, com eval_set transformado =====
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, roc_curve

# >>> assumes X, y, preprocessor, ks_statistic, precision_at_k, recall_at_k are already defined <<<

# Share of positives in y, and the negatives-to-positives ratio that is passed
# to LightGBM as `scale_pos_weight`.
pos_rate = y.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
# Same splitter settings as the logistic baselines.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial):
    """Optuna objective: out-of-fold ROC-AUC of LightGBM for one sampled configuration.

    Samples hyperparameters from `trial`, then for every split of the
    notebook-level `cv` fits a fresh clone of `preprocessor` on the training
    rows, trains an `LGBMClassifier` with early stopping monitored on the
    validation rows, and stores the validation probabilities. Returns the
    ROC-AUC of the pooled probabilities against `y`.

    Reads `X`, `y`, `cv`, `preprocessor` and `scale_pos_weight` from the
    enclosing notebook scope.

    NOTE(review): early stopping watches the same validation rows that are then
    scored, so the returned AUC is presumably somewhat optimistic.
    NOTE(review): the preprocessor is applied to the training rows with
    fit() + transform() rather than fit_transform(); if `TargetEncoder` is
    scikit-learn's, those two paths encode training rows differently -- confirm
    which behaviour is wanted.
    """
    # Values explored by the study; the suggest_* calls run in this order.
    searched = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 500),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
    }
    # Fixed settings wrapped around the searched ones. n_estimators is only an
    # upper bound because early stopping ends training sooner.
    params = {
        "n_estimators": 5000,
        **searched,
        "objective": "binary",
        "metric": "auc",
        "n_jobs": -1,
        "random_state": 42,
        "scale_pos_weight": scale_pos_weight,
    }

    fold_predictions = np.zeros(len(X))
    for train_idx, valid_idx in cv.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
        X_hold, y_hold = X.iloc[valid_idx], y.iloc[valid_idx]

        # 1) Fit the preprocessor on the fold's TRAINING rows, then transform
        #    both the training and the validation rows with it.
        fold_prep = clone(preprocessor)
        fold_prep.fit(X_fit, y_fit)
        fit_matrix = fold_prep.transform(X_fit)
        hold_matrix = fold_prep.transform(X_hold)

        # 2) One LightGBM model per fold.
        model = lgb.LGBMClassifier(**params)

        # 3) LightGBM 4.x: early stopping is configured through callbacks.
        stop_early = lgb.early_stopping(stopping_rounds=200, verbose=False)
        keep_quiet = lgb.log_evaluation(period=0)
        model.fit(
            fit_matrix, y_fit,
            eval_set=[(hold_matrix, y_hold)],
            eval_metric="auc",
            callbacks=[stop_early, keep_quiet],
        )

        fold_predictions[valid_idx] = model.predict_proba(hold_matrix)[:, 1]

    return roc_auc_score(y, fold_predictions)

# Seed the sampler so the hyperparameter search is repeatable, in line with the
# random_state=42 used for the splitter and the models. TPESampler is Optuna's
# default sampler, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25, show_progress_bar=False)

print("Best AUC:", study.best_value)
print("Best params:", study.best_trial.params)

# ===== Final OOF run with the best hyperparameters =====
# `best` holds only the searched parameters; the fixed settings are passed
# explicitly to LGBMClassifier below.
best = study.best_trial.params
rows, oof_lgbm = [], np.zeros(len(X))

# Folds are numbered from 1 for display purposes.
for i, (tr, va) in enumerate(cv.split(X, y), 1):
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    # Preprocessor is cloned and fitted on the training rows of the fold only.
    prep = clone(preprocessor)
    prep.fit(X_tr, y_tr)
    Xtr_t = prep.transform(X_tr)
    Xva_t = prep.transform(X_va)

    clf = lgb.LGBMClassifier(
        n_estimators=5000,
        objective="binary",
        metric="auc",
        n_jobs=-1,
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        **best
    )
    # NOTE(review): early stopping monitors the same validation rows that are
    # scored below, so the reported fold metrics are presumably slightly optimistic.
    clf.fit(
        Xtr_t, y_tr,
        eval_set=[(Xva_t, y_va)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    proba = clf.predict_proba(Xva_t)[:, 1]
    oof_lgbm[va] = proba

    # Per-fold metrics; KS is the maximum TPR - FPR gap on the ROC curve.
    auc = roc_auc_score(y_va, proba)
    fpr, tpr, _ = roc_curve(y_va, proba)
    ks  = np.max(tpr - fpr)
    p20 = precision_at_k(y_va, proba, 0.2)
    r20 = recall_at_k(y_va, proba, 0.2)
    rows.append({"fold": i, "AUC": auc, "KS": ks, "P@20": p20, "R@20": r20})
    print(f"[LGBM Fold {i}] AUC={auc:.4f} | KS={ks:.4f} | P@20={p20:.4f} | R@20={r20:.4f}")

# Overall metrics on the pooled out-of-fold predictions.
auc_all = roc_auc_score(y, oof_lgbm)
fpr, tpr, _ = roc_curve(y, oof_lgbm)
ks_all  = np.max(tpr - fpr)
p20_all = precision_at_k(y, oof_lgbm, 0.2)
r20_all = recall_at_k(y, oof_lgbm, 0.2)

print("\n[LGBM OOF] AUC={:.4f} | KS={:.4f} | P@20={:.4f} | R@20={:.4f}"
      .format(auc_all, ks_all, p20_all, r20_all))
# Last expression of the cell: the notebook renders the per-fold table.
pd.DataFrame(rows)


### LightGBM — Cross-Validation Results (OOF)

LightGBM outperforms my logistic baseline across all decision metrics:

- **AUC**: ~0.761 (vs 0.746)
- **KS**: ~0.391 (vs 0.365)
- **Precision@20%**: ~0.212 (vs 0.202)
- **Recall@20%**: ~0.526 (vs 0.500)

Takeaway: the boosted trees capture non-linearities and interactions that the linear logistic model cannot, while remaining robust out-of-sample. I will proceed with **probability calibration** (if needed) and **model explainability (SHAP)** to communicate risk drivers to non-technical stakeholders.


In [None]:
import time, warnings, json, os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, roc_curve, brier_score_loss
from sklearn.isotonic import IsotonicRegression

# --- utilities (reuse-safe) ---
def ks_statistic(y_true, y_score):
    """Return the KS statistic (largest TPR - FPR gap on the ROC curve) as a Python float."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    return float(np.max(true_pos_rate - false_pos_rate))

def reliability_points(y_true, y_score, n_bins=10):
    """Return per-bin predicted vs. observed rates for a reliability plot.

    Scores are split into `n_bins` groups at the quantiles of `y_score`
    (a score equal to a cut point falls into the lower bin).

    Parameters
    ----------
    y_true : array-like
        Observed outcomes, one per score.
    y_score : array-like
        Predicted scores, same length as `y_true`.
    n_bins : int, default 10
        Number of quantile bins.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty bin with columns `bin` (bin id), `predicted`
        (mean score), `observed` (mean of `y_true`) and `count` (rows in bin).
    """
    # Work on plain arrays so labels and scores are always paired by position;
    # pandas inputs carrying different indexes would otherwise be aligned by
    # label (or rejected) when the DataFrame is built.
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)

    # quantile bins -> predicted vs observed
    qs = np.linspace(0, 1, n_bins + 1)
    cuts = np.quantile(scores, qs)
    # Only the interior cut points are needed: values at or below the first one
    # get bin 0, values above the last one get the top bin.
    bin_id = np.digitize(scores, cuts[1:-1], right=True)

    df = pd.DataFrame({"bin": bin_id, "pred": scores, "y": labels})
    grp = df.groupby("bin", as_index=False).agg(predicted=("pred", "mean"),
                                                observed=("y", "mean"),
                                                count=("y", "size"))
    return grp

# --- ensure artifacts dir ---
os.makedirs("artifacts", exist_ok=True)

# --- grab best params from Optuna if present; else fallback ---
# Evaluating the bare name raises NameError when no `best` global exists
# (e.g. the tuning cell was not run); the fallback dict is used in that case.
try:
    best  # type: ignore
except NameError:
    best = {
        "learning_rate": 0.05,
        "num_leaves": 127,
        "min_data_in_leaf": 100,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 3,
        "max_depth": -1,
        "lambda_l1": 0.0,
        "lambda_l2": 2.0,
    }

# --- build final LGBM with best params ---
# scale_pos_weight is the negatives-to-positives ratio of y.
pos_rate = y.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
# n_estimators is an upper bound; the fits below use early stopping.
base_lgbm = lgb.LGBMClassifier(
    n_estimators=5000,
    objective="binary",
    metric="auc",
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    **best
)

# ------------ FAST MODE (single holdout) ------------
# Two alternative calibration procedures, selected by STRICT_CV:
#   False -> one 90/10 split; isotonic regression is learned on the 10% split.
#   True  -> 5-fold CV; inside each fold isotonic regression is learned on a
#            10% slice of the training rows and applied to the validation rows.
STRICT_CV = False   # << set to True for per-fold calibration (slow)

if not STRICT_CV:
    print(">> FAST calibration mode (single stratified holdout = 10%)")
    t0 = time.time()

    # 1) split for calibration (10%)
    X_tr, X_cal, y_tr, y_cal = train_test_split(
        X, y, test_size=0.10, stratify=y, random_state=42
    )

    # 2) fit preprocessor on training-only; transform train & cal
    prep = clone(preprocessor)
    prep.fit(X_tr, y_tr)
    Xtr_t = prep.transform(X_tr)
    Xcal_t = prep.transform(X_cal)

    # 3) train LightGBM with early stopping using the calibration split for validation
    # NOTE(review): the same split drives early stopping here and the isotonic
    # fit in step 4.
    base_lgbm.fit(
        Xtr_t, y_tr,
        eval_set=[(Xcal_t, y_cal)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False),
                   lgb.log_evaluation(period=0)]
    )

    # 4) raw probabilities on calibration split (to learn isotonic)
    proba_cal_raw = base_lgbm.predict_proba(Xcal_t)[:, 1]
    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(proba_cal_raw, y_cal)

    # 5) OOF-like evaluation by re-scoring full data (transform once)
    # NOTE(review): X contains the rows in X_tr (used to train the model) and
    # in X_cal (used for early stopping and to fit `iso`), so the metrics
    # computed below are in-sample, not out-of-fold. Treat them as optimistic.
    Xall_t = prep.transform(X)
    proba_raw_all = base_lgbm.predict_proba(Xall_t)[:, 1]
    proba_cal_all = iso.transform(proba_raw_all)

    # 6) metrics
    auc_raw  = roc_auc_score(y, proba_raw_all)
    auc_cal  = roc_auc_score(y, proba_cal_all)
    ks_raw   = ks_statistic(y, proba_raw_all)
    ks_cal   = ks_statistic(y, proba_cal_all)
    brier_raw = brier_score_loss(y, proba_raw_all)
    brier_cal = brier_score_loss(y, proba_cal_all)
    print(f"[FAST] Raw  AUC={auc_raw:.4f} | KS={ks_raw:.4f} | Brier={brier_raw:.4f}")
    print(f"[FAST] Cal  AUC={auc_cal:.4f} | KS={ks_cal:.4f} | Brier={brier_cal:.4f}")
    print(f"Done in {time.time()-t0:.1f}s")

    # 7) reliability points (for plot downstream if you want)
    # Only this branch defines rel_raw / rel_cal.
    rel_raw = reliability_points(y.values, proba_raw_all, n_bins=10)
    rel_cal = reliability_points(y.values, proba_cal_all, n_bins=10)

else:
    # ------------ STRICT CV MODE (slower, leakage-safe per fold) ------------
    print(">> STRICT CV calibration mode (isotonic per fold) — SLOW")
    t0 = time.time()
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_raw = np.zeros(len(X), dtype=float)
    oof_cal = np.zeros(len(X), dtype=float)

    for fold, (tr, va) in enumerate(cv.split(X, y), 1):
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]

        # inner split for isotonic train (10% of training fold)
        # random_state=fold gives every fold a different but repeatable split.
        inner = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=fold)
        fit_idx, cal_idx = next(inner.split(X_tr, y_tr))
        X_fit, y_fit = X_tr.iloc[fit_idx], y_tr.iloc[fit_idx]
        X_cal, y_cal = X_tr.iloc[cal_idx], y_tr.iloc[cal_idx]

        # fit preprocessor and transform
        # Fitted on the model-training slice only; calibration and validation
        # rows are transformed with it.
        prep = clone(preprocessor)
        prep.fit(X_fit, y_fit)
        Xfit_t = prep.transform(X_fit)
        Xcal_t = prep.transform(X_cal)
        Xva_t  = prep.transform(X_va)

        # train model
        # NOTE(review): early stopping monitors (Xva_t, y_va), the same rows
        # that are scored into oof_raw / oof_cal below, so the number of trees
        # is chosen with knowledge of the validation fold.
        clf = clone(base_lgbm)
        clf.fit(
            Xfit_t, y_fit,
            eval_set=[(Xva_t, y_va)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False),
                       lgb.log_evaluation(period=0)]
        )

        # raw proba on val fold
        proba_raw = clf.predict_proba(Xva_t)[:, 1]
        oof_raw[va] = proba_raw

        # isotonic fitted on calibration split of training fold
        proba_cal_train = clf.predict_proba(Xcal_t)[:, 1]
        iso = IsotonicRegression(out_of_bounds="clip")
        # (speed tip) subsample calibration if huge:
        if len(proba_cal_train) > 50000:
            idx = np.random.RandomState(42).choice(len(proba_cal_train), 50000, replace=False)
            iso.fit(proba_cal_train[idx], y_cal.iloc[idx])
        else:
            iso.fit(proba_cal_train, y_cal)

        # Calibrated out-of-fold scores for the validation rows.
        oof_cal[va] = iso.transform(proba_raw)

    # aggregate metrics
    auc_raw  = roc_auc_score(y, oof_raw)
    auc_cal  = roc_auc_score(y, oof_cal)
    ks_raw   = ks_statistic(y, oof_raw)
    ks_cal   = ks_statistic(y, oof_cal)
    brier_raw = brier_score_loss(y, oof_raw)
    brier_cal = brier_score_loss(y, oof_cal)
    print(f"[CV] Raw  AUC={auc_raw:.4f} | KS={ks_raw:.4f} | Brier={brier_raw:.4f}")
    print(f"[CV] Cal  AUC={auc_cal:.4f} | KS={ks_cal:.4f} | Brier={brier_cal:.4f}")
    print(f"Done in {time.time()-t0:.1f}s")


>> FAST calibration mode (single stratified holdout = 10%)
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[LightGBM] [Info] Number of positive: 22342, number of negative: 254417
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11392
[LightGBM] [Info] Number of data points in the train set: 276759, number of used features: 151
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[FAST] Raw  AUC=0.8355 | KS=0.5143 | Brier=0.1709
[FAST] Cal  AUC=0.8352 | KS=0.5141 | Brier=0.0639
Done in 49.1s

## B) SHAP — Global & Local (fast and production-friendly)

I refit the final model on a train/holdout split (10%) with early stopping, then:
- Use **TreeExplainer** and a **capped sample** (e.g., 8k) for global plots (fast).
- Show a **local** explanation (top-risk case from the holdout).


In [None]:
# === SHAP — Global & Local (robusto e rápido) ===
import numpy as np
import pandas as pd
import shap
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.base import clone

# Helper that guarantees a dense, 2-D array
def ensure_2d_dense(X):
    """Return ``X`` as a dense NumPy array, promoting 1-D input to one row.

    SciPy sparse inputs are densified with ``toarray()``; anything else is
    passed through ``np.asarray``. A 1-D result is reshaped to ``(1, n)``.

    Only a missing SciPy installation is tolerated. Errors raised while
    densifying propagate to the caller instead of being swallowed, because
    ``np.asarray`` on a still-sparse object would not produce a usable matrix.
    """
    try:
        from scipy import sparse
    except ImportError:
        # Without SciPy the input cannot be a SciPy sparse matrix.
        sparse = None

    if sparse is not None and sparse.issparse(X):
        X = X.toarray()

    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)
    return X

# 1) Stratified 90/10 split for the SHAP refit (fixed seed)
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=123,
)

# Fit a fresh clone of the preprocessor on the training part, then transform
# both parts and force each result to a dense 2-D array.
prep_full = clone(preprocessor).fit(X_tr, y_tr)
Xtr_t, Xva_t = (
    ensure_2d_dense(prep_full.transform(part)) for part in (X_tr, X_va)
)

# 2) Refit LightGBM with the tuned 'best' params; when 'best' is not defined
#    in the session, fall back to the defaults below.
try:
    best
except NameError:
    best = dict(
        learning_rate=0.05,
        num_leaves=127,
        min_data_in_leaf=100,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=3,
        max_depth=-1,
        lambda_l1=0.0,
        lambda_l2=2.0,
    )

pos_rate = y.mean()
_fixed_params = dict(
    n_estimators=5000,
    objective="binary",
    metric="auc",
    n_jobs=-1,
    random_state=123,
    # negatives-to-positives ratio
    scale_pos_weight=(1 - pos_rate) / pos_rate,
)
final_lgbm = lgb.LGBMClassifier(**_fixed_params, **best)

# Early stopping monitors AUC on the holdout; period=0 is passed to the
# evaluation logger.
_fit_callbacks = [
    lgb.early_stopping(stopping_rounds=200, verbose=False),
    lgb.log_evaluation(period=0),
]
final_lgbm.fit(
    Xtr_t,
    y_tr,
    eval_set=[(Xva_t, y_va)],
    eval_metric="auc",
    callbacks=_fit_callbacks,
)

# 3) Feature names aligned with the columns of the transformed matrix
import warnings

_n_transformed = Xtr_t.shape[1]

# Prefer the cached names, but only treat "file missing / unreadable / column
# missing" as a reason to rebuild; anything else is a real error.
try:
    feat_names = pd.read_csv("artifacts/feature_names_logreg.csv")["feature_name"].tolist()
except (OSError, KeyError, ValueError):
    feat_names = None

# A cached file written for a different preprocessor would mislabel the plots,
# so rebuild whenever its length disagrees with the transformed matrix.
if feat_names is None or len(feat_names) != _n_transformed:
    num_cols  = X.select_dtypes(include=["int64","float64"]).columns.tolist()
    cat_cols  = X.select_dtypes(include=["object"]).columns.tolist()
    low_card  = [c for c in cat_cols if X[c].nunique() < 10]
    high_card = [c for c in cat_cols if X[c].nunique() >= 10]
    ohe = prep_full.named_transformers_["ohe"].named_steps["ohe"]
    ohe_names = ohe.get_feature_names_out(low_card).tolist()
    feat_names = [f"num__{c}" for c in num_cols] + ohe_names + [f"tgt__{c}" for c in high_card]

# Last resort: keep the plots working, but say loudly that labels are unreliable.
if len(feat_names) != _n_transformed:
    warnings.warn(
        f"Feature-name count ({len(feat_names)}) does not match the transformed "
        f"matrix ({_n_transformed} columns); SHAP labels may be misaligned."
    )
    _padding = [f"feature_{j}" for j in range(len(feat_names), _n_transformed)]
    feat_names = (feat_names + _padding)[:_n_transformed]

# 4) Global SHAP on a capped random sample of training rows (positive class)
explainer = shap.TreeExplainer(final_lgbm)
_n_train_rows = Xtr_t.shape[0]
N_SHAP = min(8000, _n_train_rows)
rng = np.random.RandomState(42)
idx = rng.choice(_n_train_rows, N_SHAP, replace=False)
X_global = Xtr_t[idx]

# Some versions return a [class0, class1] list, others a single ndarray;
# keep the positive-class values either way.
shap_vals = explainer.shap_values(X_global, check_additivity=False)
shap_pos = shap_vals[1] if isinstance(shap_vals, list) else shap_vals

# Global plots: the bar variant first, then the default summary plot.
_global_names = feat_names[:X_global.shape[1]]
shap.summary_plot(
    shap_pos, X_global, feature_names=_global_names, plot_type="bar", show=True
)
shap.summary_plot(shap_pos, X_global, feature_names=_global_names, show=True)

# 5) Local SHAP: explain the highest-risk customer of the holdout
proba_holdout = final_lgbm.predict_proba(Xva_t)[:, 1]
# argmax finds the top score in one pass and is deterministic on ties
# (first occurrence), unlike taking element 0 of a full argsort.
i = int(np.argmax(proba_holdout))

x_row = Xva_t[i:i+1]  # keep the row 2-D for the explainer
shap_row = explainer.shap_values(x_row, check_additivity=False)
if isinstance(shap_row, list):
    shap_row = shap_row[1]   # positive class
shap_row = shap_row[0]       # (1, n_features) -> (n_features,)

# expected_value may be a scalar, a one-element array or a [class0, class1]
# pair; the last element is the positive-class base value in every case,
# whereas a fixed index 1 fails on one-element containers.
base_val = explainer.expected_value
if isinstance(base_val, (list, np.ndarray)):
    base_val = np.ravel(base_val)[-1]

ex_local = shap.Explanation(
    values=shap_row,
    base_values=base_val,
    data=x_row[0],
    feature_names=feat_names[:x_row.shape[1]],
)
shap.plots.waterfall(ex_local, max_display=20)


[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[LightGBM] [Info] Number of positive: 22342, number of negative: 254417
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11366
[LightGBM] [Info] Number of data points in the train set: 276759, number of used features: 151
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432506
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\shap\explainers\_tree.py:583: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  warnings.warn(

## C) Persist artifacts & provide `score_applications(df)`

I persist the **preprocessor**, **LightGBM model**, **isotonic calibrator** and **feature names**, and expose a `score_applications(df)` function that returns **calibrated PD** ready for dashboards/APIs.


In [None]:
import os, json
import joblib
import numpy as np
import pandas as pd

os.makedirs("artifacts", exist_ok=True)

# --- which fitted objects get persisted ---
# Expected from the FAST calibration block:
#   - 'prep'      : preprocessor fitted on full training (or its training part)
#   - 'base_lgbm' : fitted LGBM (trained with early stopping)
#   - 'iso'       : isotonic calibrator fitted on holdout
# After a STRICT_CV-only run those names are missing, so a single model is
# refit below before saving.
try:
    prep, base_lgbm, iso  # type: ignore
except NameError:
    # Quick single-holdout refit that produces the production artifacts.
    from sklearn.base import clone
    from sklearn.isotonic import IsotonicRegression
    from sklearn.model_selection import train_test_split
    import lightgbm as lgb

    X_tr, X_cal, y_tr, y_cal = train_test_split(
        X,
        y,
        test_size=0.10,
        stratify=y,
        random_state=42,
    )
    prep = clone(preprocessor).fit(X_tr, y_tr)
    Xtr_t, Xcal_t = prep.transform(X_tr), prep.transform(X_cal)

    _pos_share = y.mean()
    _fixed_params = dict(
        n_estimators=5000,
        objective="binary",
        metric="auc",
        n_jobs=-1,
        random_state=42,
        scale_pos_weight=(1 - _pos_share) / _pos_share,
    )
    _fit_callbacks = [
        lgb.early_stopping(stopping_rounds=200, verbose=False),
        lgb.log_evaluation(period=0),
    ]
    base_lgbm = lgb.LGBMClassifier(**_fixed_params, **best).fit(
        Xtr_t,
        y_tr,
        eval_set=[(Xcal_t, y_cal)],
        eval_metric="auc",
        callbacks=_fit_callbacks,
    )

    # Isotonic calibrator fitted on the calibration split's raw scores.
    _raw_cal = base_lgbm.predict_proba(Xcal_t)[:, 1]
    iso = IsotonicRegression(out_of_bounds="clip").fit(_raw_cal, y_cal)

# --- persist artifacts ---
for _fitted_obj, _target_path in (
    (prep, "artifacts/preprocessor.joblib"),
    (base_lgbm, "artifacts/lgbm_model.joblib"),
    (iso, "artifacts/isotonic.joblib"),
):
    joblib.dump(_fitted_obj, _target_path)

# Feature names: an existing, readable CSV is left untouched; otherwise the
# names are rebuilt from the raw columns and the fitted one-hot encoder.
try:
    pd.read_csv("artifacts/feature_names_logreg.csv")  # readable => keep it
except Exception:
    num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    _n_levels = {c: X[c].nunique() for c in cat_cols}
    low_card = [c for c in cat_cols if _n_levels[c] < 10]
    high_card = [c for c in cat_cols if _n_levels[c] >= 10]
    ohe = prep.named_transformers_["ohe"].named_steps["ohe"]
    ohe_names = ohe.get_feature_names_out(low_card).tolist()
    feat_names = [
        *(f"num__{c}" for c in num_cols),
        *ohe_names,
        *(f"tgt__{c}" for c in high_card),
    ]
    _names_series = pd.Series(feat_names, name="feature_name")
    _names_series.to_csv("artifacts/feature_names_logreg.csv", index=False)

# Tuned hyper-parameters, stored next to the model.
with open("artifacts/best_params.json", "w", encoding="utf-8") as f:
    json.dump(best, f, indent=2)

print("Saved: preprocessor.joblib, lgbm_model.joblib, isotonic.joblib, feature_names_logreg.csv, best_params.json")

# --- production scorer ---
def score_applications(df: pd.DataFrame, artifacts_dir: str = "artifacts") -> pd.DataFrame:
    """Return raw and isotonic-calibrated PD for a frame of new applications.

    Args:
        df: Applications in the raw training schema (including categoricals);
            the persisted preprocessor is applied to it as-is.
        artifacts_dir: Directory holding ``preprocessor.joblib``,
            ``lgbm_model.joblib`` and ``isotonic.joblib``.

    Returns:
        A DataFrame with the columns ``SK_ID_CURR``, ``pd_raw`` and
        ``pd_calibrated``. When ``df`` has no ``SK_ID_CURR`` column, a
        positional ``0..n-1`` counter is used in its place. An empty ``df``
        yields an empty result without loading any artifact.

    Note:
        Artifacts are read with ``joblib.load``, which unpickles arbitrary
        objects; only point ``artifacts_dir`` at files you trust.
    """
    ids = df.get("SK_ID_CURR", pd.Series(range(len(df))))

    # An empty batch is a valid request; answer it here rather than pushing
    # zero rows through the preprocessor and the model.
    if len(df) == 0:
        no_scores = pd.Series(index=ids.index, dtype="float64")
        return pd.DataFrame({
            "SK_ID_CURR": ids,
            "pd_raw": no_scores,
            "pd_calibrated": no_scores,
        })

    prep = joblib.load(os.path.join(artifacts_dir, "preprocessor.joblib"))
    model = joblib.load(os.path.join(artifacts_dir, "lgbm_model.joblib"))
    iso = joblib.load(os.path.join(artifacts_dir, "isotonic.joblib"))

    Xt = prep.transform(df)
    proba_raw = model.predict_proba(Xt)[:, 1]   # positive-class probability
    pd_cal = iso.transform(proba_raw)           # calibrated PD
    return pd.DataFrame({
        "SK_ID_CURR": ids,
        "pd_raw": proba_raw,
        "pd_calibrated": pd_cal,
    })

# --- smoke test: score the first five training rows and show the result ---
sample = X.iloc[:5].copy()
out = score_applications(sample)
display(out)


Saved: preprocessor.joblib, lgbm_model.joblib, isotonic.joblib, feature_names_logreg.csv, best_params.json
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(


SK_ID_CURR	pd_raw	pd_calibrated
0	0	0.910207	0.555556
1	1	0.272722	0.040662
2	2	0.235538	0.030796
3	3	0.335316	0.045103
4	4	0.546277	0.108856


### Probability Calibration — Reliability Curve (raw vs calibrated)

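I plot calibration curves on the full training set using the fitted preprocessor, model and isotonic calibrator.  
A lower Brier score and a curve closer to the diagonal indicate better-calibrated PDs; because these rows were also used to fit the artifacts, the figures are optimistic rather than out-of-fold.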


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
import joblib

# Load persisted artifacts
prep  = joblib.load("artifacts/preprocessor.joblib")
model = joblib.load("artifacts/lgbm_model.joblib")
iso   = joblib.load("artifacts/isotonic.joblib")

# Transform and score every row of the training data (not out-of-fold)
Xt = prep.transform(X)
pd_raw = model.predict_proba(Xt)[:, 1]
pd_cal = iso.transform(pd_raw)

# Brier score before and after calibration
print(f"Brier raw={brier_score_loss(y, pd_raw):.4f} | Brier cal={brier_score_loss(y, pd_cal):.4f}")

# Reliability (quantile bins); calibration_curve returns (observed, predicted)
pt_raw, pp_raw = calibration_curve(y, pd_raw, n_bins=10, strategy="quantile")
pt_cal, pp_cal = calibration_curve(y, pd_cal, n_bins=10, strategy="quantile")

# Plot: dashed diagonal = perfect calibration
plt.figure(figsize=(6,5))
plt.plot([0,1],[0,1],"--",linewidth=1)
plt.plot(pp_raw, pt_raw, marker="o", label="Raw")
plt.plot(pp_cal, pt_cal, marker="o", label="Calibrated (Isotonic)")
plt.xlabel("Predicted probability")
plt.ylabel("Observed default rate")
# The scores above come from the full training set, so the title says so
# instead of claiming out-of-fold predictions.
plt.title("Reliability Curve — full training set (raw vs calibrated)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


c:\Users\luizo\anaconda3\envs\credit-risk\lib\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(
[LightGBM] [Warning] min_data_in_leaf is set=466, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=466
[LightGBM] [Warning] feature_fraction is set=0.823409610342382, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.823409610342382
[LightGBM] [Warning] lambda_l1 is set=2.510486193166801, reg_alpha=0.0 will be ignored. Current value: lambda_l1=2.510486193166801
[LightGBM] [Warning] lambda_l2 is set=5.670894047918985, reg_lambda=0.0 will be ignored. Current value: lambda_l2=5.670894047918985
[LightGBM] [Warning] bagging_fraction is set=0.9377595784067674, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9377595784067674
[LightGBM] [Warning] bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6
Brier raw=0.1709 | Brier cal=0.0639