In [None]:
# ===========================================================
# Rozdz. 4.3 — Regresja logistyczna (pełny pipeline uczący)
# ===========================================================
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss, roc_curve
)

# ---------- settings ----------
# Seed handed to the logistic-regression solver.
RANDOM_STATE = 42

# Validation layout: number of chronological month chunks for the time-based
# CV, and the bin count passed to calibration_curve.
N_SPLITS_TIME = 6
N_BINS_CALIB = 10

# Payoff per accepted loan used by the profit curve: gain on a repaid loan,
# (negative) loss on a defaulted one.
PROFIT_GOOD = 1_000
LOSS_BAD = -5_000

# Every CSV/PNG artifact is written below this directory; create it up front.
ART = "artifacts_43"
os.makedirs(ART, exist_ok=True)

# ---------- 1) data ----------
# Prefer the absolute snapshot path; fall back to a copy in the working directory.
SNAP_PATH = Path("C:/Users/lukasz.wrobel/Desktop/PRACA MAGISTERSKA/pliki/artifacts/artifacts/engineered_snapshot.csv")
if not SNAP_PATH.exists():
    SNAP_PATH = Path("engineered_snapshot.csv")

df = pd.read_csv(SNAP_PATH)
if "issue_d" in df.columns:
    # Unparseable dates are coerced to NaT here; such rows are dropped below.
    df["issue_d"] = pd.to_datetime(df["issue_d"], errors="coerce")

assert "loan_status_bin" in df.columns, "Brak kolumny loan_status_bin w snapshotcie."
df["loan_status_bin"] = pd.to_numeric(df["loan_status_bin"], errors="coerce")

# Fallback: rebuild the label from the raw status text when no numeric label
# survived the conversion above.
if df["loan_status_bin"].isna().all() and "loan_status" in df.columns:
    map_rules = {
        "Fully Paid": 0,
        "Charged Off": 1, "Default": 1,
        "Late (31-120 days)": 1, "Late (16-30 days)": 1, "In Grace Period": 1,
        "Does not meet the credit policy. Status:Fully Paid": 0,
        "Does not meet the credit policy. Status:Charged Off": 1,
        "Current": np.nan, "Issued": np.nan
    }
    df["loan_status_bin"] = df["loan_status"].map(map_rules)

# Keep only rows with a resolved 0/1 label.
n0 = len(df)
df = df.loc[df["loan_status_bin"].isin([0, 1])].copy()
print(f"y-clean: start={n0}, po={len(df)}")

# Treat infinite values as missing so they are handled like any other NaN.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Helper time column (year*12 + month). Rows whose date could not be parsed
# cannot be placed on the time axis and would make the int32 cast below raise,
# so they are dropped first.
# NOTE(review): issue_d_ord is numeric and therefore ends up in num_cols as a
# model feature — confirm that is intended for an out-of-time evaluation.
if "issue_d" in df.columns and pd.api.types.is_datetime64_any_dtype(df["issue_d"]):
    undated = df["issue_d"].isna()
    if undated.any():
        print(f"issue_d: usunięto {int(undated.sum())} wierszy bez poprawnej daty")
        df = df.loc[~undated].copy()
    df["issue_d_ord"] = (df["issue_d"].dt.year * 12 + df["issue_d"].dt.month).astype("int32")

# y stays a Series so that it shares df's index; later code selects it with y.loc[...].
y = df["loan_status_bin"].astype("int8")

# Drop columns that are entirely empty or constant (the target is never dropped here).
empty_cols = [c for c in df.columns if df[c].isna().all()]
if empty_cols:
    df.drop(columns=empty_cols, inplace=True)
zero_var = [c for c in df.columns if df[c].nunique(dropna=True) <= 1 and c != "loan_status_bin"]
if zero_var:
    df.drop(columns=zero_var, inplace=True)

# Feature lists: no target, no raw datetime columns, and not the raw
# "loan_status" text — the label is a direct mapping of it (see map_rules),
# so one-hot encoding it would leak the target.
target_col  = "loan_status_bin"
non_feature_cols = {target_col, "loan_status"}
feature_cols = [
    c for c in df.columns
    if c not in non_feature_cols and not pd.api.types.is_datetime64_any_dtype(df[c])
]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype.
cat_cols = [
    c for c in feature_cols
    if pd.api.types.is_object_dtype(df[c]) or isinstance(df[c].dtype, pd.CategoricalDtype)
]
print(f"#kolumn num: {len(num_cols)}, kat: {len(cat_cols)}")

# ---------- 2) helpery ----------
def time_blocks(frame, date_col="issue_d", n_splits=N_SPLITS_TIME):
    """Return expanding-window (train_idx, valid_idx) pairs ordered by calendar month.

    The distinct months present in ``frame[date_col]`` are sorted and cut into
    ``n_splits`` consecutive chunks. Fold ``i`` trains on every month in
    chunks ``0..i-1`` and validates on the months of chunk ``i``, which gives
    at most ``n_splits - 1`` folds. A fold is skipped when either side has no
    rows.

    Rows whose date is missing are left out of every fold: their month would
    otherwise stringify to "NaT", sort after all real months and be treated
    as the most recent validation data.

    Parameters:
        frame: DataFrame with a datetime column ``date_col``.
        date_col: name of the datetime column used for ordering.
        n_splits: number of month chunks.

    Returns:
        list of ``(train_idx, valid_idx)`` tuples holding index labels of ``frame``.

    Raises:
        AssertionError: if ``date_col`` is not a column of ``frame``.
    """
    assert date_col in frame.columns, f"Brak kolumny {date_col} do CV czasowego."
    dated = frame[date_col].notna()
    # "YYYY-MM" strings sort lexicographically in chronological order.
    months = frame[date_col].dt.to_period("M").astype(str)
    uniq = months[dated].sort_values().unique()
    chunks = np.array_split(uniq, n_splits)
    pairs = []
    for i in range(1, len(chunks)):
        train_months = np.concatenate(chunks[:i])
        valid_months = chunks[i]
        tr_idx = frame.index[dated & months.isin(train_months)]
        va_idx = frame.index[dated & months.isin(valid_months)]
        if len(tr_idx) and len(va_idx):
            pairs.append((tr_idx, va_idx))
    return pairs

def ks_score(y_true, y_prob):
    """Return the KS statistic: the largest gap between TPR and FPR along the ROC curve."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)
    separation = true_pos_rate - false_pos_rate
    return float(separation.max())

def ece_score(y_true, y_prob, n_bins=20):
    """Expected calibration error over ``n_bins`` equal-width probability bins.

    For each bin the absolute gap between the mean predicted probability and
    the observed positive rate is weighted by the share of samples in the bin.

    Parameters:
        y_true: array-like of 0/1 labels (any index on a Series is ignored).
        y_prob: array-like of predicted probabilities, paired with ``y_true``
            by position.
        n_bins: number of equal-width bins on [0, 1].

    Returns:
        The ECE as a float; 0.0 for empty input.
    """
    labels = np.asarray(y_true, dtype=float).ravel()
    probs = np.asarray(y_prob, dtype=float).ravel()
    if probs.size == 0:
        return 0.0

    edges = np.linspace(0, 1, n_bins + 1)
    # np.digitize places p == 1.0 past the last edge (index n_bins); clipping
    # keeps such samples in the top bin instead of silently dropping them.
    bin_idx = np.clip(np.digitize(probs, edges) - 1, 0, n_bins - 1)

    # share_b * |mean_p_b - mean_y_b| == |sum_p_b - sum_y_b| / N, so per-bin
    # sums are sufficient and empty bins contribute exactly zero.
    prob_sums = np.bincount(bin_idx, weights=probs, minlength=n_bins)
    label_sums = np.bincount(bin_idx, weights=labels, minlength=n_bins)
    return float(np.abs(prob_sums - label_sums).sum() / probs.size)

def decile_table(y_true, y_prob, deciles=10):
    """Build a risk-ranked decile table with cumulative capture rates and KS.

    Observations are sorted by predicted probability (highest first) and cut
    into ``deciles`` equally sized rank groups; group 1 holds the highest
    predicted risk.

    Parameters:
        y_true: array-like of 0/1 labels.
        y_prob: array-like of predicted probabilities, paired with ``y_true``
            by position.
        deciles: number of rank groups.

    Returns:
        DataFrame with one row per group and columns ``decile``, ``n``,
        ``bad``, ``good``, ``prob_mean``, ``bad_rate``, ``cum_bad``,
        ``cum_good`` and ``ks``.
    """
    # Strip any pandas index first: a dict of two Series would be aligned on
    # index labels, mis-pairing labels and scores when the indexes differ.
    scored = pd.DataFrame({
        "y": np.asarray(y_true).ravel(),
        "p": np.asarray(y_prob).ravel(),
    })
    scored = scored.sort_values("p", ascending=False).reset_index(drop=True)
    # Cutting the 0..n-1 rank (not the score) gives equal-sized groups even with tied scores.
    scored["decile"] = pd.qcut(scored.index, q=deciles, labels=False) + 1  # 1 = top risk

    tab = scored.groupby("decile").agg(
        n=("y", "size"),
        bad=("y", "sum"),
        good=("y", lambda s: (1 - s).sum()),
        prob_mean=("p", "mean"),
    ).reset_index()
    tab["bad_rate"] = tab["bad"] / tab["n"]

    total_bad = tab["bad"].sum()
    total_good = tab["good"].sum()
    # max(..., 1) avoids division by zero when one class is absent.
    tab["cum_bad"] = tab["bad"].cumsum() / max(total_bad, 1)
    tab["cum_good"] = tab["good"].cumsum() / max(total_good, 1)
    tab["ks"] = (tab["cum_bad"] - tab["cum_good"]).abs()
    return tab

def profit_curve(y_true, y_prob, profit_good=PROFIT_GOOD, loss_bad=LOSS_BAD, steps=101):
    """Total payoff of the rule "accept when y_prob < tau" over a grid of thresholds.

    Parameters:
        y_true: 0/1 labels (1 = bad).
        y_prob: predicted probabilities paired with ``y_true``.
        profit_good: payoff added for every accepted observation with label 0.
        loss_bad: payoff added for every accepted observation with label 1.
        steps: number of evenly spaced thresholds on [0, 1].

    Returns:
        ``(taus, values)`` — the threshold grid and the payoff at each threshold.
    """
    taus = np.linspace(0, 1, steps)
    is_good = (y_true == 0)
    is_bad = (y_true == 1)

    def _payoff(cutoff):
        # Strict inequality: a score equal to the cutoff is rejected.
        accepted = y_prob < cutoff
        n_good = (is_good & accepted).sum()
        n_bad = (is_bad & accepted).sum()
        return n_good*profit_good + n_bad*loss_bad

    return taus, np.array([_payoff(cutoff) for cutoff in taus])

# ---------- 3) preprocesor + pipeline (bazowy) ----------
# Uwaga: w pętli CV będziemy podawać konkretne listy kolumn (valid_num/valid_cat) – patrz niżej.
def make_pipeline(num_list, cat_list):
    """Assemble preprocessing + logistic regression for the given column lists.

    Numeric columns: median imputation (with missing-value indicator columns)
    followed by standard scaling. Categorical columns: most-frequent
    imputation followed by dense one-hot encoding that ignores unseen levels.
    Columns not listed are dropped.

    Parameters:
        num_list: names of the numeric feature columns.
        cat_list: names of the categorical feature columns.

    Returns:
        ``(logit, pre)`` — the full Pipeline and the ColumnTransformer that is
        its "pre" step (the same object, not a copy).
    """
    # The dense-output keyword is `sparse_output` in newer scikit-learn and
    # `sparse` in older releases; try the new spelling first.
    encoder_options = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **encoder_options)

    numeric_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
    ])

    # NOTE(review): per the scikit-learn docs fill_value is only consulted for
    # strategy="constant", so it appears to have no effect here — confirm.
    categorical_branch = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent", fill_value="__MISSING__")),
        ("ohe", encoder),
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_branch, num_list),
            ("cat", categorical_branch, cat_list),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )

    classifier = LogisticRegression(
        penalty="l2",
        solver="saga",
        max_iter=1000,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )
    logit = Pipeline(steps=[("pre", pre), ("clf", classifier)])
    return logit, pre

# ---------- 4) time-based validation ----------
folds = time_blocks(df, "issue_d", n_splits=N_SPLITS_TIME)
if not folds:
    # Without this guard the loop is skipped, `last` stays empty and the
    # script only fails further down with an opaque KeyError.
    raise ValueError("Brak foldów CV czasowego — za mało różnych miesięcy w issue_d.")
metrics, last = [], {}

for tr_idx, va_idx in folds:
    # Use only columns with at least one non-NaN value in TRAIN.
    valid_num = [c for c in num_cols if df.loc[tr_idx, c].notna().any()]
    valid_cat = [c for c in cat_cols if df.loc[tr_idx, c].notna().any()]

    logit, _ = make_pipeline(valid_num, valid_cat)

    # y is a Series sharing df's index, so label-based .loc keeps X and y paired.
    Xtr, ytr = df.loc[tr_idx, valid_num+valid_cat], y.loc[tr_idx]
    Xva, yva = df.loc[va_idx, valid_num+valid_cat], y.loc[va_idx]

    logit.fit(Xtr, ytr)
    p = logit.predict_proba(Xva)[:,1]

    # NOTE(review): p comes straight from a class_weight="balanced" model with
    # no calibration step, so Brier/LogLoss/ECE describe those raw scores.
    metrics.append({
        "AUC":   roc_auc_score(yva, p),
        "PR_AUC": average_precision_score(yva, p),
        "KS":    ks_score(yva, p),
        "Brier": brier_score_loss(yva, p),
        "LogLoss": log_loss(yva, p, labels=[0,1]),
        "ECE":   ece_score(yva, p)
    })
    # Overwritten every iteration; after the loop it holds the most recent fold.
    last = {"Xtr":Xtr, "ytr":ytr, "Xva":Xva, "yva":yva, "pva":p, "valid_num":valid_num, "valid_cat":valid_cat}

cv_results = pd.DataFrame(metrics)
cv_mean = cv_results.mean()
cv_results.to_csv(f"{ART}/cv_fold_metrics.csv", index=False)
cv_mean.to_csv(f"{ART}/cv_metrics_mean.csv", header=False)
print("Średnie metryki CV (czasowej):")
print(cv_mean.round(4))

# ---------- 5) plots on the last fold ----------
# All three figures use the validation labels/scores stored in `last` by the
# CV loop; each figure is saved under ART and closed immediately.

# ROC curve with the AUC in the legend and the chance diagonal for reference.
fpr, tpr, _ = roc_curve(last["yva"], last["pva"])
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC={roc_auc_score(last['yva'],last['pva']):.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC — Logistic Regression (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_last_fold.png", dpi=160); plt.close()

# Reliability diagram: quantile bins, so every bin holds a similar number of rows.
frac_pos, mean_pred = calibration_curve(last["yva"], last["pva"], n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred, frac_pos, marker="o", label="Observed vs predicted")
plt.plot([0,1],[0,1],"--", label="Perfect")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja — Logistic Regression (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/calibration_last_fold.png", dpi=160); plt.close()

# Profit curve over 201 thresholds; best_tau is the threshold with the highest payoff.
# NOTE(review): last["pva"] comes straight from the fitted pipeline (no
# calibration step), so best_tau is expressed on the scale of those raw scores.
taus, ev = profit_curve(last["yva"], last["pva"], PROFIT_GOOD, LOSS_BAD, steps=201)
best_idx = int(ev.argmax())
best_tau = float(taus[best_idx])
best_ev  = float(ev.max())
pd.DataFrame({"tau":taus, "expected_profit":ev}).to_csv(f"{ART}/profit_curve_last_fold.csv", index=False)

# Plot the curve with a dashed vertical marker at the chosen threshold.
plt.figure(figsize=(6,4))
plt.plot(taus, ev)
plt.axvline(best_tau, ls="--", label=f"tau* = {best_tau:.3f}, EV={best_ev:,.0f}")
plt.xlabel("Próg akceptacji (p < tau)"); plt.ylabel("Oczekiwany zysk (jednostki)")
plt.title("Krzywa zysku — wybór progu (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/profit_curve_last_fold.png", dpi=160); plt.close()

# ---------- 6) final model + calibration + OOT evaluation ----------
# Three disjoint, chronologically ordered slices:
#   OOT   = the most recent issue month; never used for fitting or calibration,
#   calib = the last CV fold's validation rows minus the OOT month,
#   train = every remaining dated row.
# The last fold's validation block contains the most recent month, so fitting
# the calibrator on all of last["Xva"] would leak the OOT rows into it; and a
# prefit calibrator has to be fitted on data the model was not trained on.
dated       = df["issue_d"].notna()
oot_months  = df["issue_d"].dt.to_period("M").astype(str)
# Undated rows stringify to "NaT", which would sort last; keep them out.
uniq_months = np.array(sorted(oot_months[dated].unique()))
oot_mask    = dated & (oot_months == uniq_months[-1])
calib_mask  = dated & ~oot_mask & df.index.isin(last["Xva"].index)
train_mask  = dated & ~oot_mask & ~calib_mask
if not calib_mask.any():
    raise ValueError("Brak danych do kalibracji: ostatni fold zawiera wyłącznie miesiąc OOT.")

# Use exactly the same column lists as in the last fold (stability).
valid_num = last["valid_num"]; valid_cat = last["valid_cat"]
logit, pre = make_pipeline(valid_num, valid_cat)

X_train, y_train = df.loc[train_mask, valid_num+valid_cat], y.loc[train_mask]
X_cal,   y_cal   = df.loc[calib_mask, valid_num+valid_cat], y.loc[calib_mask]
X_oot,   y_oot   = df.loc[oot_mask,   valid_num+valid_cat], y.loc[oot_mask]

logit.fit(X_train, y_train)

# Isotonic calibration of the already-fitted model on the held-out calib slice.
# NOTE(review): recent scikit-learn releases deprecate cv="prefit" in favour of
# wrapping the model in FrozenEstimator — confirm against the installed version.
calibrated = CalibratedClassifierCV(logit, cv="prefit", method="isotonic")
calibrated.fit(X_cal, y_cal)

p_oot = calibrated.predict_proba(X_oot)[:,1]

oot_metrics = {
    "AUC":   roc_auc_score(y_oot, p_oot),
    "PR_AUC": average_precision_score(y_oot, p_oot),
    "KS":    ks_score(y_oot, p_oot),
    "Brier": brier_score_loss(y_oot, p_oot),
    "LogLoss": log_loss(y_oot, p_oot, labels=[0,1]),
    "ECE":   ece_score(y_oot, p_oot)
}
pd.Series(oot_metrics).to_csv(f"{ART}/oot_metrics.csv", header=False)
print("\nMetryki OOT (ostatni miesiąc):")
print(pd.Series(oot_metrics).round(4))

# ROC + calibration on OOT
fpr_o, tpr_o, _ = roc_curve(y_oot, p_oot)
plt.figure(figsize=(5,4))
plt.plot(fpr_o, tpr_o, label=f"AUC={roc_auc_score(y_oot,p_oot):.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC — Logistic Regression (OOT)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_oot.png", dpi=160); plt.close()

frac_pos_o, mean_pred_o = calibration_curve(y_oot, p_oot, n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred_o, frac_pos_o, marker="o", label="Observed vs predicted")
plt.plot([0,1],[0,1],"--", label="Perfect")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja — Logistic Regression (OOT)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/calibration_oot.png", dpi=160); plt.close()

# Decile table + KS
dec_tab = decile_table(y_oot, p_oot, deciles=10)
dec_tab.to_csv(f"{ART}/decile_table_oot.csv", index=False)

plt.figure(figsize=(6,4))
plt.plot(dec_tab["decile"], dec_tab["ks"], marker="o")
plt.xlabel("Decyl (1 = najwyższe ryzyko)"); plt.ylabel("KS (kumul. różnica bad-good)")
plt.title("KS po decylach — OOT")
plt.tight_layout(); plt.savefig(f"{ART}/ks_by_decile_oot.png", dpi=160); plt.close()

# Coefficients / odds ratios of the final model, on the preprocessed feature space.
# The "pre" step was already fitted by logit.fit above, so its output names
# are read directly instead of refitting the transformer on the training data.
pre_fitted = logit.named_steps["pre"]
feat_names = pre_fitted.get_feature_names_out()
clf = logit.named_steps["clf"]
coef = clf.coef_.ravel()
coef_df = pd.DataFrame({"feature": feat_names, "beta": coef, "OR": np.exp(coef)}).sort_values("beta")
coef_df.to_csv(f"{ART}/logit_coefficients_OR.csv", index=False)
print("\nNajsilniejsze ujemne wpływy (TOP 15):"); print(coef_df.head(15))
print("\nNajsilniejsze dodatnie wpływy (TOP 15):"); print(coef_df.tail(15))

# Acceptance threshold: chosen on the calibrated probabilities of the calib
# slice so that it lives on the same probability scale as p_oot. A threshold
# picked on raw, uncalibrated scores is not comparable to calibrated output.
# NOTE(review): the calib slice also fitted the isotonic map, so this choice
# is mildly optimistic; a separate threshold-selection slice would be stricter.
p_cal = calibrated.predict_proba(X_cal)[:,1]
taus_cal, ev_cal = profit_curve(y_cal, p_cal, PROFIT_GOOD, LOSS_BAD, steps=201)
tau_star = float(taus_cal[int(ev_cal.argmax())])

accept_oot = (p_oot < tau_star)
tg = int(((y_oot==0) & accept_oot).sum())
tb = int(((y_oot==1) & accept_oot).sum())
ev_oot = tg*PROFIT_GOOD + tb*LOSS_BAD
pd.Series({
    "best_tau_from_valid": tau_star,
    "accepted_cnt": int(accept_oot.sum()),
    "true_good_accepted": tg,
    "true_bad_accepted": tb,
    "expected_profit_OOT": ev_oot
}).to_csv(f"{ART}/decision_summary_oot.csv", header=False)

print(f"\nDecyzje OOT przy tau*={tau_star:.3f}: accepted={int(accept_oot.sum())}, TG={tg}, TB={tb}, EV={ev_oot:,.0f}")
print(f"\nArtefakty zapisano w: {os.path.abspath(ART)}")


y-clean: start=2029952, po=1569804


  cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]


#kolumn num: 12, kat: 3




Średnie metryki CV (czasowej):
AUC        0.6981
PR_AUC     0.3832
KS         0.2880
Brier      0.2510
LogLoss    0.6978
ECE        0.2940
dtype: float64





Metryki OOT (ostatni miesiąc):
AUC        0.6853
PR_AUC     0.4400
KS         0.2675
Brier      0.1863
LogLoss    0.5532
ECE        0.0139
dtype: float64

Najsilniejsze ujemne wpływy (TOP 15):
                                          feature      beta        OR
17         loan_to_income_qbin_(-0.000839, 0.075] -0.924569  0.396702
36   installment_to_income_qbin_(0.0118, 1189.24] -0.653092  0.520434
18             loan_to_income_qbin_(0.075, 0.109] -0.633439  0.530763
35   installment_to_income_qbin_(0.00956, 0.0118] -0.440708  0.643580
19              loan_to_income_qbin_(0.109, 0.14] -0.431890  0.649281
1                                    int_rate_num -0.328287  0.720157
34  installment_to_income_qbin_(0.00805, 0.00956] -0.268430  0.764579
20              loan_to_income_qbin_(0.14, 0.169] -0.251028  0.778001
33  installment_to_income_qbin_(0.00689, 0.00805] -0.137513  0.871523
21               loan_to_income_qbin_(0.169, 0.2] -0.089739  0.914170
46          revol_to_income_qbin_(0.