In [1]:
# ===========================================================
# Rozdz. 4.7 — Artificial Neural Network (MLP, Keras/TensorFlow)
# pełny pipeline: time-CV, metryki, kalibracja, OOT, artefakty
# ===========================================================
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from time import time

# --- TF/Keras ---
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, regularizers, callbacks
except Exception as e:
    raise ImportError(
        "Brak TensorFlow/Keras. Zainstaluj: pip install tensorflow"
    ) from e

# --- sklearn ---
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss, roc_curve
)
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve

# ---------- ścieżki / artefakty ----------
ART = "artifacts_47_ann"
os.makedirs(ART, exist_ok=True)

# Ekonomia decyzji (dostosuj do realiów)
PROFIT_GOOD = 1_000
LOSS_BAD   = -5_000

# Walidacja czasowa
N_SPLITS_TIME = 6
N_BINS_CALIB  = 10
RANDOM_STATE  = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# ---------- 1) dane ----------
SNAP_PATH = Path("C:/Users/lukasz.wrobel/Desktop/PRACA MAGISTERSKA/pliki/artifacts/artifacts/engineered_snapshot.csv")
if not SNAP_PATH.exists():
    SNAP_PATH = Path("engineered_snapshot.csv")

df = pd.read_csv(SNAP_PATH)
if "issue_d" in df.columns:
    df["issue_d"] = pd.to_datetime(df["issue_d"], errors="coerce")

assert "loan_status_bin" in df.columns, "Brak kolumny 'loan_status_bin' w snapshotcie."
df["loan_status_bin"] = pd.to_numeric(df["loan_status_bin"], errors="coerce")
df = df.loc[df["loan_status_bin"].isin([0,1])].copy()

# y jako Series (zachowuje index -> później .loc)
y = df["loan_status_bin"].astype("int8")

# sanity — NaN/Inf w cechach
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# listy cech (bez gołych datetime)
feature_cols = [c for c in df.columns if c != "loan_status_bin" and not pd.api.types.is_datetime64_any_dtype(df[c])]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]
print(f"#kolumn num: {len(num_cols)}, kat: {len(cat_cols)}")

# ---------- 2) helpery ----------
def time_blocks(frame: pd.DataFrame, date_col="issue_d", n_splits=N_SPLITS_TIME):
    """Zwraca listę (train_idx, valid_idx) rosnących bloków czasowych (po miesiącach)."""
    if date_col not in frame.columns or frame[date_col].isna().all():
        idx = frame.index.to_numpy()
        cut = int(len(idx)*0.8)
        return [(idx[:cut], idx[cut:])]
    months = frame[date_col].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    if len(uniq) < n_splits:
        n_splits = max(2, len(uniq))
    chunks = np.array_split(uniq, n_splits)
    pairs = []
    for i in range(1, len(chunks)):
        tr_m = np.concatenate(chunks[:i])
        va_m = chunks[i]
        tr_idx = frame.index[months.isin(tr_m)]
        va_idx = frame.index[months.isin(va_m)]
        if len(tr_idx) and len(va_idx):
            pairs.append((tr_idx, va_idx))
    return pairs

def ks_score(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return float(np.max(tpr - fpr))

def ece_score(y_true, y_prob, n_bins=20):
    bins = np.linspace(0,1,n_bins+1)
    idx = np.digitize(y_prob, bins) - 1
    ece = 0.0
    for b in range(n_bins):
        m = (idx == b)
        if m.sum()==0: 
            continue
        ece += m.mean() * abs(y_prob[m].mean() - y_true[m].mean())
    return float(ece)

def decile_table(y_true, y_prob, deciles=10):
    d = pd.DataFrame({"y": y_true, "p": y_prob}).sort_values("p", ascending=False).reset_index(drop=True)
    d["decile"] = pd.qcut(d.index, q=deciles, labels=False) + 1
    tab = d.groupby("decile").agg(
        n=("y","size"),
        bad=("y","sum"),
        good=("y", lambda s: (1-s).sum()),
        prob_mean=("p","mean")
    ).reset_index()
    tab["bad_rate"] = tab["bad"]/tab["n"]
    total_bad, total_good = tab["bad"].sum(), tab["good"].sum()
    tab["cum_bad"]  = tab["bad"].cumsum()/max(total_bad,1)
    tab["cum_good"] = tab["good"].cumsum()/max(total_good,1)
    tab["ks"] = (tab["cum_bad"] - tab["cum_good"]).abs()
    return tab

def profit_curve(y_true, y_prob, profit_good=PROFIT_GOOD, loss_bad=LOSS_BAD, steps=201):
    taus = np.linspace(0,1,steps)
    ev = []
    for t in taus:
        acc = y_prob < t
        tg = ((y_true==0) & acc).sum()
        tb = ((y_true==1) & acc).sum()
        ev.append(tg*profit_good + tb*loss_bad)
    return taus, np.array(ev)

# ---------- 3) preprocessing ----------
# ANN wymaga skalowania cech numerycznych -> StandardScaler
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])
pre = ColumnTransformer(
    [("num", num_pipe, num_cols),
     ("cat", cat_pipe, cat_cols)],
    remainder="drop",
    verbose_feature_names_out=False
)

# ---------- 4) Architektura MLP ----------
def build_mlp(input_dim: int, l2=1e-4, p1=0.30, p2=0.20, lr=1e-3):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,), dtype="float32"),
        layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2)),
        layers.Dropout(p1),
        layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(l2)),
        layers.Dropout(p2),
        layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(l2)),
        layers.Dense(1, activation="sigmoid")
    ])
    opt = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["AUC"])
    return model

def train_mlp(Xtr, ytr, Xva, yva, class_weight=None, lr=1e-3, epochs=100, batch=2048, patience=8):
    model = build_mlp(Xtr.shape[1], lr=lr)
    cb = [
        callbacks.EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=True, verbose=0),
        callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=max(3, patience//2), min_lr=1e-5, verbose=0)
    ]
    hist = model.fit(
        Xtr, ytr,
        validation_data=(Xva, yva),
        epochs=epochs,
        batch_size=batch,
        class_weight=class_weight,
        verbose=0,
        callbacks=cb
    )
    return model, hist

def plot_learning_curves(history, out_png):
    h = history.history
    plt.figure(figsize=(6,4))
    plt.plot(h["loss"], label="train loss")
    plt.plot(h["val_loss"], label="val loss")
    if "auc" in h and "val_auc" in h:
        plt.plot(h["auc"], label="train AUC")
        plt.plot(h["val_auc"], label="val AUC")
    plt.xlabel("Epoka"); plt.title("Krzywe uczenia — MLP")
    plt.legend(); plt.tight_layout(); plt.savefig(out_png, dpi=160); plt.close()

# ---------- 5) Walidacja czasowa ----------
folds = time_blocks(df, "issue_d", n_splits=N_SPLITS_TIME)
metrics, last = [], {}
t0 = time()

for tr_idx, va_idx in folds:
    # fit preprocessing na TRAIN i transformuj do macierzy wejściowej MLP
    pre_fitted = pre.fit(df.loc[tr_idx, :])
    Xtr = pre_fitted.transform(df.loc[tr_idx, :]).astype("float32")
    Xva = pre_fitted.transform(df.loc[va_idx, :]).astype("float32")
    feat_names = pre_fitted.get_feature_names_out()

    ytr = y.loc[tr_idx].to_numpy()
    yva = y.loc[va_idx].to_numpy()

    # class weights (imbalance): wagi ∝ #neg / #pos
    pos, neg = int((ytr==1).sum()), int((ytr==0).sum())
    cw = {0: 1.0, 1: (neg / max(pos,1))}

    model, hist = train_mlp(Xtr, ytr, Xva, yva, class_weight=cw, lr=1e-3, epochs=60, batch=2048, patience=8)
    p = model.predict(Xva, batch_size=4096, verbose=0).ravel()

    # metryki
    metrics.append({
        "AUC": roc_auc_score(yva, p),
        "PR_AUC": average_precision_score(yva, p),
        "KS": ks_score(yva, p),
        "Brier": brier_score_loss(yva, p),
        "LogLoss": log_loss(yva, p, labels=[0,1]),
        "ECE": ece_score(yva, p)
    })

    last = {"pre":pre_fitted, "Xva":Xva, "yva":yva, "pva":p, "feat_names":feat_names,
            "hist":hist, "model":model}

cv_results = pd.DataFrame(metrics)
cv_results.to_csv(f"{ART}/cv_fold_metrics_ann.csv", index=False)
cv_mean = cv_results.mean()
cv_mean.to_csv(f"{ART}/cv_metrics_mean_ann.csv", header=False)
print("Średnie metryki CV (ANN):\n", cv_mean.round(4))
print(f"Czas CV: {time()-t0:.1f}s")

# ---------- 6) ROC i kalibracja (ostatni fold, przed i po) ----------
# ROC
fpr, tpr, _ = roc_curve(last["yva"], last["pva"])
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC={roc_auc_score(last['yva'],last['pva']):.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC — ANN (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_last_fold_ann.png", dpi=160); plt.close()

# krzywe uczenia
plot_learning_curves(last["hist"], f"{ART}/learning_curves_last_fold_ann.png")

# kalibracja — przed
frac_pos, mean_pred = calibration_curve(last["yva"], last["pva"], n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred, frac_pos, marker="o", label="Observed")
plt.plot([0,1],[0,1],"--", label="Perfect")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja (przed) — ANN (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/calibration_before_last_fold_ann.png", dpi=160); plt.close()

# dopasuj isotonic na (p, y) z walidacji
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(last["pva"], last["yva"])
pva_cal = iso.transform(last["pva"])

# kalibracja — po
frac_pos_c, mean_pred_c = calibration_curve(last["yva"], pva_cal, n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred_c, frac_pos_c, marker="o", label="Observed (calibrated)")
plt.plot([0,1],[0,1],"--", label="Perfect")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja (po) — ANN (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/calibration_after_last_fold_ann.png", dpi=160); plt.close()

# ---------- 7) Krzywa zysku + próg (ostatni fold, po kalibracji) ----------
taus, ev = profit_curve(last["yva"], pva_cal, PROFIT_GOOD, LOSS_BAD, steps=201)
best_tau = float(taus[int(ev.argmax())])
pd.DataFrame({"tau":taus, "expected_profit":ev}).to_csv(f"{ART}/profit_curve_last_fold_ann.csv", index=False)
plt.figure(figsize=(6,4))
plt.plot(taus, ev); plt.axvline(best_tau, ls="--", label=f"tau*={best_tau:.3f}")
plt.xlabel("Próg akceptacji (p < tau)"); plt.ylabel("Oczekiwany zysk")
plt.title("Krzywa zysku — ANN (ostatni fold, po kalibracji)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/profit_curve_last_fold_ann.png", dpi=160); plt.close()

# ---------- 8) Test OOT (ostatni miesiąc) ----------
if "issue_d" in df.columns and df["issue_d"].notna().any():
    months = df["issue_d"].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    oot_mask = (months == uniq[-1])
    train_mask = ~oot_mask
else:
    idx = df.index.to_numpy()
    cut = int(len(idx)*0.8)
    train_mask = np.zeros(len(idx), dtype=bool); train_mask[:cut] = True
    oot_mask = ~train_mask

# fit pre na TRAIN i transformacje
pre_train = pre.fit(df.loc[train_mask, :])
X_train = pre_train.transform(df.loc[train_mask, :]).astype("float32")
X_oot   = pre_train.transform(df.loc[oot_mask,   :]).astype("float32")
y_train = y.loc[train_mask].to_numpy()
y_oot   = y.loc[oot_mask].to_numpy()

# class weights na TRAIN
pos_tr, neg_tr = int((y_train==1).sum()), int((y_train==0).sum())
cw_train = {0: 1.0, 1: (neg_tr / max(pos_tr,1))}

# ucz model finalny (early stopping na wew. walidacji 10%)
model_final, hist_final = train_mlp(
    X_train, y_train,
    X_train[int(0.9*len(X_train)):], y_train[int(0.9*len(y_train)):],  # prosty val-split na końcówce
    class_weight=cw_train, lr=8e-4, epochs=80, batch=4096, patience=8
)

# kalibracja isotonic — użyj tej z ostatniego folda (spójnie z §4.3–4.6)
# (iso już dopasowany na (pva, yva))
p_oot_raw = model_final.predict(X_oot, batch_size=4096, verbose=0).ravel()
p_oot = iso.transform(p_oot_raw)

# metryki OOT
oot_metrics = {
    "AUC": roc_auc_score(y_oot, p_oot),
    "PR_AUC": average_precision_score(y_oot, p_oot),
    "KS": ks_score(y_oot, p_oot),
    "Brier": brier_score_loss(y_oot, p_oot),
    "LogLoss": log_loss(y_oot, p_oot, labels=[0,1]),
    "ECE": ece_score(y_oot, p_oot)
}
pd.Series(oot_metrics).to_csv(f"{ART}/oot_metrics_ann.csv", header=False)
print("\nMetryki OOT (ANN):\n", pd.Series(oot_metrics).round(4))

# ROC/kalibracja OOT
fpr_o, tpr_o, _ = roc_curve(y_oot, p_oot)
plt.figure(figsize=(5,4))
plt.plot(fpr_o, tpr_o, label=f"AUC={roc_auc_score(y_oot,p_oot):.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC — ANN (OOT)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_oot_ann.png", dpi=160); plt.close()

frac_pos_o, mean_pred_o = calibration_curve(y_oot, p_oot, n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred_o, frac_pos_o, marker="o")
plt.plot([0,1],[0,1],"--")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja — ANN (OOT)")
plt.tight_layout(); plt.savefig(f"{ART}/calibration_oot_ann.png", dpi=160); plt.close()

# tabela decylowa i KS po decylach (OOT)
dec_tab = decile_table(y_oot, p_oot, deciles=10)
dec_tab.to_csv(f"{ART}/decile_table_oot_ann.csv", index=False)
plt.figure(figsize=(6,4))
plt.plot(dec_tab["decile"], dec_tab["ks"], marker="o")
plt.xlabel("Decyl (1 = najwyższe ryzyko)"); plt.ylabel("KS")
plt.title("KS po decylach — ANN (OOT)")
plt.tight_layout(); plt.savefig(f"{ART}/ks_by_decile_oot_ann.png", dpi=160); plt.close()

# ---------- 9) Permutation Importance (ostatni fold, po preprocesingu) ----------
# Prosta implementacja PI (spadek AUC po losowej permutacji kolumny)
def permutation_importance_ann(model, X_val, y_val, feat_names, n_repeats=3, batch=4096, seed=RANDOM_STATE):
    rng = np.random.RandomState(seed)
    base = roc_auc_score(y_val, model.predict(X_val, batch_size=batch, verbose=0).ravel())
    imps = []
    Xc = X_val.copy()
    for j in range(Xc.shape[1]):
        drops = []
        for _ in range(n_repeats):
            col = Xc[:, j].copy()
            rng.shuffle(col)
            Xc[:, j] = col
            p = model.predict(Xc, batch_size=batch, verbose=0).ravel()
            drops.append(base - roc_auc_score(y_val, p))
            Xc[:, j] = X_val[:, j]  # restore
        imps.append(np.mean(drops))
    imp_df = pd.DataFrame({"feature": feat_names, "importance_perm": imps}).sort_values("importance_perm", ascending=False)
    return imp_df

# licz PI na ostatnim foldzie i zapisz TOP-15
imp_df = permutation_importance_ann(last["model"], last["Xva"], last["yva"], last["feat_names"], n_repeats=3)
imp_df.to_csv(f"{ART}/ann_feature_importance_permutation.csv", index=False)
plt.figure(figsize=(8,6))
top = imp_df.head(15)[::-1]
plt.barh(top["feature"], top["importance_perm"])
plt.title("ANN — TOP 15 ważności (Permutation)")
plt.tight_layout(); plt.savefig(f"{ART}/ann_feature_importance_perm_top15.png", dpi=160); plt.close()

print(f"\nArtefakty zapisano w: {os.path.abspath(ART)}")





  cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]


#kolumn num: 11, kat: 3



Średnie metryki CV (ANN):
 AUC        0.7010
PR_AUC     0.3874
KS         0.2919
Brier      0.2237
LogLoss    0.6368
ECE        0.2470
dtype: float64
Czas CV: 1505.2s

Metryki OOT (ANN):
 AUC        0.7005
PR_AUC     0.4601
KS         0.2979
Brier      0.1833
LogLoss    0.5465
ECE        0.0164
dtype: float64

Artefakty zapisano w: c:\Users\lukasz.wrobel\Desktop\PRACA MAGISTERSKA\pliki\artifacts\artifacts_47_ann
