In [2]:
# ===========================================================
# Rozdz. 4.4 — Drzewo decyzyjne (CART) – pełny pipeline uczący
# ===========================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss, roc_curve
)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# ---------- ścieżki / artefakty ----------
ART = "artifacts_44_tree"
os.makedirs(ART, exist_ok=True)

# Wskaźniki zysku (dostosuj do swoich realiów)
PROFIT_GOOD = 1_000
LOSS_BAD   = -5_000

# liczba podziałów w walidacji czasowej
N_SPLITS_TIME = 6
N_BINS_CALIB  = 10
RANDOM_STATE  = 42

# ---------- 1) dane ----------
SNAP_PATH = Path("C:/Users/lukasz.wrobel/Desktop/PRACA MAGISTERSKA/pliki/artifacts/artifacts/engineered_snapshot.csv")
if not SNAP_PATH.exists():
    SNAP_PATH = Path("engineered_snapshot.csv")

df = pd.read_csv(SNAP_PATH)
if "issue_d" in df.columns:
    df["issue_d"] = pd.to_datetime(df["issue_d"], errors="coerce")

assert "loan_status_bin" in df.columns, "Brak kolumny 'loan_status_bin' w snapshotcie."

# docelowa y
df["loan_status_bin"] = pd.to_numeric(df["loan_status_bin"], errors="coerce")
df = df.loc[df["loan_status_bin"].isin([0,1])].copy()
y = df["loan_status_bin"].astype("int8").to_numpy()

# usuń ±inf -> NaN (imputer je obsłuży)
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# zdefiniuj listy cech dynamicznie (bez kolumn datetime)
feature_cols = [c for c in df.columns if c != "loan_status_bin" and not np.issubdtype(df[c].dtype, np.datetime64)]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]
print(f"#kolumn num: {len(num_cols)}, kat: {len(cat_cols)}")

# ---------- 2) helpery ----------
def time_blocks(frame: pd.DataFrame, date_col="issue_d", n_splits=N_SPLITS_TIME):
    """Zwraca listę (train_idx, valid_idx) wzrastających bloków czasowych po miesiącach."""
    if date_col not in frame.columns or frame[date_col].isna().all():
        # fallback: jeden fold 80/20 bez czasu
        n = len(frame)
        cut = int(n*0.8)
        idx = frame.index.to_numpy()
        return [(idx[:cut], idx[cut:])]
    months = frame[date_col].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    if len(uniq) < n_splits:
        n_splits = max(2, len(uniq))
    chunks = np.array_split(uniq, n_splits)
    pairs = []
    for i in range(1, len(chunks)):
        tr_m = np.concatenate(chunks[:i])
        va_m = chunks[i]
        tr_idx = frame.index[months.isin(tr_m)]
        va_idx = frame.index[months.isin(va_m)]
        if len(tr_idx) and len(va_idx):
            pairs.append((tr_idx, va_idx))
    return pairs

def ks_score(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return float(np.max(tpr - fpr))

def ece_score(y_true, y_prob, n_bins=20):
    bins = np.linspace(0,1,n_bins+1)
    idx = np.digitize(y_prob, bins) - 1
    ece = 0.0
    for b in range(n_bins):
        m = (idx == b)
        if m.sum()==0: 
            continue
        ece += m.mean() * abs(y_prob[m].mean() - y_true[m].mean())
    return float(ece)

def decile_table(y_true, y_prob, deciles=10):
    d = pd.DataFrame({"y": y_true, "p": y_prob}).sort_values("p", ascending=False).reset_index(drop=True)
    d["decile"] = pd.qcut(d.index, q=deciles, labels=False) + 1
    tab = d.groupby("decile").agg(
        n=("y","size"),
        bad=("y","sum"),
        good=("y", lambda s: (1-s).sum()),
        prob_mean=("p","mean")
    ).reset_index()
    tab["bad_rate"] = tab["bad"]/tab["n"]
    total_bad, total_good = tab["bad"].sum(), tab["good"].sum()
    tab["cum_bad"]  = tab["bad"].cumsum()/max(total_bad,1)
    tab["cum_good"] = tab["good"].cumsum()/max(total_good,1)
    tab["ks"] = (tab["cum_bad"] - tab["cum_good"]).abs()
    return tab

def profit_curve(y_true, y_prob, profit_good=PROFIT_GOOD, loss_bad=LOSS_BAD, steps=201):
    taus = np.linspace(0,1,steps)
    ev = []
    for t in taus:
        acc = y_prob < t
        tg = ((y_true==0) & acc).sum()
        tb = ((y_true==1) & acc).sum()
        ev.append(tg*profit_good + tb*loss_bad)
    return taus, np.array(ev)

# ---------- 3) preprocessing ----------
# Drzewo nie wymaga standaryzacji; ale robimy imputację i OHE
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median", add_indicator=True))  # dodaje flagi braków
    # brak skalowania – niepotrzebne dla drzewa
])

cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

pre = ColumnTransformer(
    [("num", num_pipe, num_cols),
     ("cat", cat_pipe, cat_cols)],
    remainder="drop",
    verbose_feature_names_out=False
)

# bazowy klasyfikator CART
tree_base = DecisionTreeClassifier(
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=100,   # sensowny próg dla dużych danych
    class_weight="balanced",
    random_state=RANDOM_STATE
)

tree_pipe = Pipeline([
    ("pre", pre),
    ("clf", tree_base)
])

# ---------- 4) walidacja czasowa + prosty tuning ----------
param_grid = {
    "clf__max_depth": [6, 8, 10, 12, None],
    "clf__min_samples_leaf": [50, 100, 200],
    "clf__ccp_alpha": [0.0, 1e-4, 5e-4, 1e-3]
}

folds = time_blocks(df, "issue_d", n_splits=N_SPLITS_TIME)
results, last = [], {}
best_auc, best_params = -np.inf, None

for max_depth in param_grid["clf__max_depth"]:
    for min_leaf in param_grid["clf__min_samples_leaf"]:
        for ccp in param_grid["clf__ccp_alpha"]:
            tree_pipe.set_params(clf__max_depth=max_depth,
                                 clf__min_samples_leaf=min_leaf,
                                 clf__ccp_alpha=ccp)
            fold_metrics = []
            y = y.astype("int64") if isinstance(y, pd.Series) else pd.Series(y, index=df.index)
            for tr_idx, va_idx in folds:
                Xtr, ytr = df.loc[tr_idx, :], y[tr_idx]
                Xva, yva = df.loc[va_idx, :], y[va_idx]
                tree_pipe.fit(Xtr, ytr)
                p = tree_pipe.predict_proba(Xva)[:,1]
                fold_metrics.append(roc_auc_score(yva, p))
                last = {"Xtr":Xtr, "ytr":ytr, "Xva":Xva, "yva":yva, "pva":p}
            mean_auc = float(np.mean(fold_metrics))
            results.append({"max_depth":max_depth, "min_leaf":min_leaf, "ccp_alpha":ccp, "AUC_mean":mean_auc})
            if mean_auc > best_auc:
                best_auc, best_params = mean_auc, (max_depth, min_leaf, ccp)

cv_grid = pd.DataFrame(results).sort_values("AUC_mean", ascending=False)
cv_grid.to_csv(f"{ART}/cv_grid_tree.csv", index=False)
print("Najlepsze parametry:", best_params, "AUC_mean=", round(best_auc,4))

# ustaw najlepsze i policz pełny zestaw metryk na foldach
tree_pipe.set_params(clf__max_depth=best_params[0],
                     clf__min_samples_leaf=best_params[1],
                     clf__ccp_alpha=best_params[2])

metrics = []
for tr_idx, va_idx in folds:
    Xtr, ytr = df.loc[tr_idx, :], y[tr_idx]
    Xva, yva = df.loc[va_idx, :], y[va_idx]
    tree_pipe.fit(Xtr, ytr)
    p = tree_pipe.predict_proba(Xva)[:,1]
    metrics.append({
        "AUC": roc_auc_score(yva, p),
        "PR_AUC": average_precision_score(yva, p),
        "KS": ks_score(yva, p),
        "Brier": brier_score_loss(yva, p),
        "LogLoss": log_loss(yva, p, labels=[0,1]),
        "ECE": ece_score(yva, p)
    })
    last = {"Xtr":Xtr, "ytr":ytr, "Xva":Xva, "yva":yva, "pva":p}

cv_results = pd.DataFrame(metrics)
cv_results.to_csv(f"{ART}/cv_fold_metrics_tree.csv", index=False)
cv_mean = cv_results.mean()
cv_mean.to_csv(f"{ART}/cv_metrics_mean_tree.csv", header=False)
print("Średnie metryki CV (drzewo):\n", cv_mean.round(4))

# ---------- 5) ROC i kalibracja (ostatni fold) ----------
fpr, tpr, _ = roc_curve(last["yva"], last["pva"])
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC={roc_auc_score(last['yva'],last['pva']):.3f}")
plt.plot([0,1],[0,1],"--")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC — Decision Tree (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_last_fold_tree.png", dpi=160); plt.close()

frac_pos, mean_pred = calibration_curve(last["yva"], last["pva"], n_bins=N_BINS_CALIB, strategy="quantile")
plt.figure(figsize=(5,4))
plt.plot(mean_pred, frac_pos, marker="o")
plt.plot([0,1],[0,1],"--")
plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
plt.title("Kalibracja — Decision Tree (ostatni fold)")
plt.tight_layout(); plt.savefig(f"{ART}/calibration_last_fold_tree.png", dpi=160); plt.close()

# ---------- 6) Krzywa zysku i próg ----------
taus, ev = profit_curve(last["yva"], last["pva"], PROFIT_GOOD, LOSS_BAD, steps=201)
best_tau = float(taus[int(ev.argmax())])
pd.DataFrame({"tau":taus, "expected_profit":ev}).to_csv(f"{ART}/profit_curve_last_fold_tree.csv", index=False)

plt.figure(figsize=(6,4))
plt.plot(taus, ev); plt.axvline(best_tau, ls="--", label=f"tau*={best_tau:.3f}")
plt.xlabel("Próg akceptacji (p < tau)"); plt.ylabel("Oczekiwany zysk")
plt.title("Krzywa zysku — Decision Tree (ostatni fold)")
plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/profit_curve_last_fold_tree.png", dpi=160); plt.close()

# ---------- 7) Test OOT (ostatni miesiąc) + kalibracja isotonic ----------
if "issue_d" in df.columns and df["issue_d"].notna().any():
    months = df["issue_d"].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    oot_mask = (months == uniq[-1])      # ostatni miesiąc jako OOT
    train_mask = ~oot_mask
else:
    # fallback 80/20
    idx = df.index.to_numpy()
    cut = int(len(idx)*0.8)
    train_mask = np.zeros(len(idx), dtype=bool); train_mask[:cut] = True
    oot_mask = ~train_mask

X_train, y_train = df.loc[train_mask, :], y[train_mask]
X_oot,   y_oot   = df.loc[oot_mask,   :], y[oot_mask]

tree_pipe.fit(X_train, y_train)
calibrated = CalibratedClassifierCV(tree_pipe, cv="prefit", method="isotonic")
calibrated.fit(last["Xva"], last["yva"])
p_oot = calibrated.predict_proba(X_oot)[:,1]

oot_metrics = {
    "AUC": roc_auc_score(y_oot, p_oot),
    "PR_AUC": average_precision_score(y_oot, p_oot),
    "KS": ks_score(y_oot, p_oot),
    "Brier": brier_score_loss(y_oot, p_oot),
    "LogLoss": log_loss(y_oot, p_oot, labels=[0,1]),
    "ECE": ece_score(y_oot, p_oot)
}
pd.Series(oot_metrics).to_csv(f"{ART}/oot_metrics_tree.csv", header=False)
print("\nMetryki OOT (drzewo):\n", pd.Series(oot_metrics).round(4))

# ---------- 8) Tabela decylowa i KS (OOT) ----------
dec_tab = decile_table(y_oot, p_oot, deciles=10)
dec_tab.to_csv(f"{ART}/decile_table_oot_tree.csv", index=False)

plt.figure(figsize=(6,4))
plt.plot(dec_tab["decile"], dec_tab["ks"], marker="o")
plt.xlabel("Decyl (1 = najwyższe ryzyko)"); plt.ylabel("KS")
plt.title("KS po decylach — Decision Tree (OOT)")
plt.tight_layout(); plt.savefig(f"{ART}/ks_by_decile_oot_tree.png", dpi=160); plt.close()

# ---------- 9) Ważność cech + szkic drzewa ----------
# dopasuj preprocessing na TRAIN, by uzyskać nazwy cech po transformacji
pre_fitted = pre.fit(X_train)
feat_names = pre_fitted.get_feature_names_out()

# pobierz ważności z wewnątrz pipeline
fitted_tree = tree_pipe.named_steps["clf"]
# uwaga: feature_importances_ odnosi się do wyjścia z preprocesora
imp = getattr(fitted_tree, "feature_importances_", None)
if imp is not None and len(imp) == len(feat_names):
    imp_df = pd.DataFrame({"feature": feat_names, "importance": imp}).sort_values("importance", ascending=False)
    imp_df.to_csv(f"{ART}/tree_feature_importance.csv", index=False)

    plt.figure(figsize=(8,6))
    top = imp_df.head(15)[::-1]
    plt.barh(top["feature"], top["importance"])
    plt.title("Decision Tree — TOP 15 ważności cech")
    plt.tight_layout(); plt.savefig(f"{ART}/tree_feature_importance_top15.png", dpi=160); plt.close()

# szkic drzewa: aby wizualizacja była czytelna, zbuduj płytszą kopię (np. max_depth=4)
small_tree = Pipeline([
    ("pre", pre),
    ("clf", DecisionTreeClassifier(
        criterion="gini", max_depth=4, min_samples_leaf=200,
        class_weight="balanced", random_state=RANDOM_STATE))
])
small_tree.fit(X_train, y_train)

plt.figure(figsize=(16,10))
plot_tree(
    small_tree.named_steps["clf"],
    filled=True, impurity=True, proportion=True,
    max_depth=4, fontsize=8
)
plt.title("Decision Tree — szkic (max_depth=4)")
plt.tight_layout(); plt.savefig(f"{ART}/tree_sketch_depth4.png", dpi=160); plt.close()

print(f"\nArtefakty zapisano w: {os.path.abspath(ART)}")


  cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]


#kolumn num: 11, kat: 3
Najlepsze parametry: (6, 200, 0.0) AUC_mean= 0.694
Średnie metryki CV (drzewo):
 AUC        0.6940
PR_AUC     0.3674
KS         0.2867
Brier      0.2262
LogLoss    0.6418
ECE        0.2510
dtype: float64

Metryki OOT (drzewo):
 AUC        0.6877
PR_AUC     0.4282
KS         0.2830
Brier      0.1858
LogLoss    0.5512
ECE        0.0087
dtype: float64

Artefakty zapisano w: c:\Users\lukasz.wrobel\Desktop\PRACA MAGISTERSKA\pliki\artifacts\artifacts_44_tree
