In [3]:
# ===========================================================
# Rozdz. 4.6 — Gradient Boosting: XGBoost & LightGBM
# pełny pipeline: walidacja czasowa, kalibracja, OOT, artefakty
# ===========================================================
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path

import lightgbm as lgb
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss, roc_curve
)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# ---------- ścieżki / artefakty ----------
ART = "artifacts_46_gbm"
os.makedirs(ART, exist_ok=True)

# Ekonomia decyzji (dostosuj do realiów)
PROFIT_GOOD = 1_000
LOSS_BAD   = -5_000

# Walidacja czasowa
N_SPLITS_TIME = 6
N_BINS_CALIB  = 10
RANDOM_STATE  = 42

# ---------- 0) zależności zewnętrzne: xgboost i lightgbm ----------
try:
    from xgboost import XGBClassifier
except Exception as e:
    raise ImportError("Brak pakietu 'xgboost'. Zainstaluj: pip install xgboost") from e

try:
    from lightgbm import LGBMClassifier
except Exception as e:
    raise ImportError("Brak pakietu 'lightgbm'. Zainstaluj: pip install lightgbm") from e

# ---------- 1) dane ----------
SNAP_PATH = Path("C:/Users/lukasz.wrobel/Desktop/PRACA MAGISTERSKA/pliki/artifacts/artifacts/engineered_snapshot.csv")
if not SNAP_PATH.exists():
    SNAP_PATH = Path("engineered_snapshot.csv")

df = pd.read_csv(SNAP_PATH)
if "issue_d" in df.columns:
    df["issue_d"] = pd.to_datetime(df["issue_d"], errors="coerce")

assert "loan_status_bin" in df.columns, "Brak kolumny 'loan_status_bin' w snapshotcie."
df["loan_status_bin"] = pd.to_numeric(df["loan_status_bin"], errors="coerce")
df = df.loc[df["loan_status_bin"].isin([0,1])].copy()

# y jako Series (zachowuje index -> później .loc)
y = df["loan_status_bin"].astype("int8")

# sanity — NaN/Inf w cechach
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# listy cech (bez gołych datetime)
feature_cols = [c for c in df.columns if c != "loan_status_bin" and not pd.api.types.is_datetime64_any_dtype(df[c])]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]
print(f"#kolumn num: {len(num_cols)}, kat: {len(cat_cols)}")

# ---------- 2) helpery ----------
def time_blocks(frame: pd.DataFrame, date_col="issue_d", n_splits=N_SPLITS_TIME):
    """Zwraca listę (train_idx, valid_idx) rosnących bloków czasowych (po miesiącach)."""
    if date_col not in frame.columns or frame[date_col].isna().all():
        # fallback 80/20 bez czasu
        idx = frame.index.to_numpy()
        cut = int(len(idx)*0.8)
        return [(idx[:cut], idx[cut:])]
    months = frame[date_col].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    if len(uniq) < n_splits:
        n_splits = max(2, len(uniq))
    chunks = np.array_split(uniq, n_splits)
    pairs = []
    for i in range(1, len(chunks)):
        tr_m = np.concatenate(chunks[:i])
        va_m = chunks[i]
        tr_idx = frame.index[months.isin(tr_m)]
        va_idx = frame.index[months.isin(va_m)]
        if len(tr_idx) and len(va_idx):
            pairs.append((tr_idx, va_idx))
    return pairs

def ks_score(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return float(np.max(tpr - fpr))

def ece_score(y_true, y_prob, n_bins=20):
    bins = np.linspace(0,1,n_bins+1)
    idx = np.digitize(y_prob, bins) - 1
    ece = 0.0
    for b in range(n_bins):
        m = (idx == b)
        if m.sum()==0: 
            continue
        ece += m.mean() * abs(y_prob[m].mean() - y_true[m].mean())
    return float(ece)

def decile_table(y_true, y_prob, deciles=10):
    d = pd.DataFrame({"y": y_true, "p": y_prob}).sort_values("p", ascending=False).reset_index(drop=True)
    d["decile"] = pd.qcut(d.index, q=deciles, labels=False) + 1
    tab = d.groupby("decile").agg(
        n=("y","size"),
        bad=("y","sum"),
        good=("y", lambda s: (1-s).sum()),
        prob_mean=("p","mean")
    ).reset_index()
    tab["bad_rate"] = tab["bad"]/tab["n"]
    total_bad, total_good = tab["bad"].sum(), tab["good"].sum()
    tab["cum_bad"]  = tab["bad"].cumsum()/max(total_bad,1)
    tab["cum_good"] = tab["good"].cumsum()/max(total_good,1)
    tab["ks"] = (tab["cum_bad"] - tab["cum_good"]).abs()
    return tab

def profit_curve(y_true, y_prob, profit_good=PROFIT_GOOD, loss_bad=LOSS_BAD, steps=201):
    taus = np.linspace(0,1,steps)
    ev = []
    for t in taus:
        acc = y_prob < t
        tg = ((y_true==0) & acc).sum()
        tb = ((y_true==1) & acc).sum()
        ev.append(tg*profit_good + tb*loss_bad)
    return taus, np.array(ev)

# ---------- 3) preprocessing (wspólny) ----------
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median", add_indicator=True))  # GBM nie wymaga skalowania
])
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])
pre = ColumnTransformer(
    [("num", num_pipe, num_cols),
     ("cat", cat_pipe, cat_cols)],
    remainder="drop",
    verbose_feature_names_out=False
)

# Uwaga: dla early_stopping musimy podać eval_set już po transformacji,
# więc w pętli fitujemy pre na TRAIN, transformujemy X i dopiero trenujemy booster.

# ---------- 4) XGBoost i LightGBM — siatki parametrów ----------
xgb_grid = [
    dict(
        n_estimators=800, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        min_child_weight=5, gamma=0.0, reg_lambda=1.0
    ),
    dict(
        n_estimators=600, max_depth=5, learning_rate=0.07,
        subsample=0.8, colsample_bytree=0.7,
        min_child_weight=3, gamma=0.0, reg_lambda=1.0
    )
]

lgb_grid = [
    dict(
        n_estimators=1200, learning_rate=0.04,
        num_leaves=63, max_depth=-1,
        min_data_in_leaf=200, feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1
    ),
    dict(
        n_estimators=900, learning_rate=0.06,
        num_leaves=63, max_depth=-1,
        min_data_in_leaf=150, feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1
    )
]

# ---------- 5) pętla walidacji czasowej ----------
folds = time_blocks(df, "issue_d", n_splits=N_SPLITS_TIME)

def run_model(model_name, param_list):
    results_grid = []
    best_auc, best_cfg, last = -np.inf, None, {}

    for params in param_list:
        fold_aucs = []
        for tr_idx, va_idx in folds:
            # 1) fit pre na TRAIN, transformuj
            pre_fitted = pre.fit(df.loc[tr_idx, :])
            Xtr_enc = pre_fitted.transform(df.loc[tr_idx, :])
            Xva_enc = pre_fitted.transform(df.loc[va_idx, :])
            feat_names = pre_fitted.get_feature_names_out()

            ytr = y.loc[tr_idx]
            yva = y.loc[va_idx]

            # 2) imbalance weight (neg/pos) liczymy na TRAIN
            pos = int((ytr == 1).sum()); neg = int((ytr == 0).sum())
            scale_pos_weight = (neg / max(pos, 1))

            # 3) zbuduj model
            if model_name == "xgb":
                clf = XGBClassifier(
                    objective="binary:logistic",
                    eval_metric="auc",
                    tree_method="hist",
                    random_state=RANDOM_STATE,
                    enable_categorical=False,
                    n_jobs=-1,
                    **params,
                    scale_pos_weight=scale_pos_weight
                )
                clf.fit(
                    Xtr_enc, ytr,
                    eval_set=[(Xva_enc, yva)],
                    verbose=False,
                    early_stopping_rounds=100
                )
                proba = clf.predict_proba(Xva_enc)[:, 1]

            elif model_name == "lgb":
                clf = LGBMClassifier(
                    objective="binary",
                    random_state=RANDOM_STATE,
                    n_jobs=-1,
                    **params
                )
                # dla LGBM: imbalance – można użyć is_unbalance lub scale_pos_weight
                # (nie oba naraz). Wybierzemy scale_pos_weight dla spójności:
                import lightgbm as lgb

                lgb_callbacks = [
                    lgb.early_stopping(stopping_rounds=200, verbose=False),  # zamiast early_stopping_rounds=
                    lgb.log_evaluation(period=0),                            # 0 = brak logów; np. 50 aby logować co 50 iteracji
                ]

                clf.fit(
                    Xtr_enc, ytr,
                    eval_set=[(Xva_enc, yva)],
                    eval_metric="auc",
                    callbacks=lgb_callbacks
                )
                proba = clf.predict_proba(Xva_enc)[:, 1]
            else:
                raise ValueError("model_name musi być 'xgb' lub 'lgb'.")

            auc = roc_auc_score(yva, proba)
            fold_aucs.append(auc)
            last = dict(pre=pre_fitted, Xva_enc=Xva_enc, yva=yva, proba=proba,
                        feat_names=feat_names, model=clf)

        mean_auc = float(np.mean(fold_aucs))
        res = {"model": model_name, **params, "AUC_mean": mean_auc}
        results_grid.append(res)
        if mean_auc > best_auc:
            best_auc, best_cfg = mean_auc, params

    grid_df = pd.DataFrame(results_grid).sort_values("AUC_mean", ascending=False)
    return grid_df, best_cfg, best_auc, last

# Uruchom XGB i LGB
xgb_grid_df, xgb_best, xgb_best_auc, xgb_last = run_model("xgb", xgb_grid)
lgb_grid_df, lgb_best, lgb_best_auc, lgb_last = run_model("lgb", lgb_grid)

xgb_grid_df.to_csv(f"{ART}/cv_grid_xgb.csv", index=False)
lgb_grid_df.to_csv(f"{ART}/cv_grid_lgb.csv", index=False)
print("XGB best:", xgb_best, "AUC_mean=", round(xgb_best_auc, 4))
print("LGB best:", lgb_best, "AUC_mean=", round(lgb_best_auc, 4))

# ---------- 6) Pełny zestaw metryk na foldach dla najlepszego wariantu ----------
def eval_best(model_name, best_params):
    metrics, last = [], {}
    for tr_idx, va_idx in folds:
        pre_fitted = pre.fit(df.loc[tr_idx, :])
        Xtr_enc = pre_fitted.transform(df.loc[tr_idx, :])
        Xva_enc = pre_fitted.transform(df.loc[va_idx, :])
        feat_names = pre_fitted.get_feature_names_out()
        ytr, yva = y.loc[tr_idx], y.loc[va_idx]
        pos, neg = int((ytr==1).sum()), int((ytr==0).sum())
        scale_pos_weight = (neg / max(pos,1))

        if model_name == "xgb":
            clf = XGBClassifier(
                objective="binary:logistic",
                eval_metric="auc",
                tree_method="hist",
                random_state=RANDOM_STATE,
                enable_categorical=False,
                n_jobs=-1,
                **best_params,
                scale_pos_weight=scale_pos_weight
            )
            clf.fit(Xtr_enc, ytr, eval_set=[(Xva_enc, yva)], verbose=False, early_stopping_rounds=100)
        else:
            clf = LGBMClassifier(
                objective="binary",
                random_state=RANDOM_STATE,
                n_jobs=-1,
                **best_params,
                scale_pos_weight=scale_pos_weight,
                verbosity=-1
            )
            clf.fit(Xtr_enc, ytr, eval_set=[(Xva_enc, yva)], eval_metric="auc", callbacks=[lgb.log_evaluation(period=0)])

        p = clf.predict_proba(Xva_enc)[:,1]
        metrics.append({
            "AUC": roc_auc_score(yva, p),
            "PR_AUC": average_precision_score(yva, p),
            "KS": ks_score(yva, p),
            "Brier": brier_score_loss(yva, p),
            "LogLoss": log_loss(yva, p, labels=[0,1]),
            "ECE": ece_score(yva, p)
        })
        last = {"pre":pre_fitted, "Xva_enc":Xva_enc, "yva":yva, "pva":p, "feat_names":feat_names, "model":clf}
    return pd.DataFrame(metrics), last

xgb_cv, xgb_last = eval_best("xgb", xgb_best)
lgb_cv, lgb_last = eval_best("lgb", lgb_best)

xgb_cv.to_csv(f"{ART}/cv_fold_metrics_xgb.csv", index=False)
lgb_cv.to_csv(f"{ART}/cv_fold_metrics_lgb.csv", index=False)
xgb_cv.mean().to_csv(f"{ART}/cv_metrics_mean_xgb.csv", header=False)
lgb_cv.mean().to_csv(f"{ART}/cv_metrics_mean_lgb.csv", header=False)
print("Średnie metryki CV XGB:\n", xgb_cv.mean().round(4))
print("Średnie metryki CV LGB:\n", lgb_cv.mean().round(4))

# ---------- 7) ROC i kalibracja (ostatni fold) ----------
def plot_roc_calib(last_obj, prefix):
    fpr, tpr, _ = roc_curve(last_obj["yva"], last_obj["pva"])
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label=f"AUC={roc_auc_score(last_obj['yva'], last_obj['pva']):.3f}")
    plt.plot([0,1],[0,1],"--")
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC — {prefix} (ostatni fold)")
    plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_last_fold_{prefix}.png", dpi=160); plt.close()

    frac_pos, mean_pred = calibration_curve(last_obj["yva"], last_obj["pva"], n_bins=N_BINS_CALIB, strategy="quantile")
    plt.figure(figsize=(5,4))
    plt.plot(mean_pred, frac_pos, marker="o")
    plt.plot([0,1],[0,1],"--")
    plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
    plt.title(f"Kalibracja — {prefix} (ostatni fold)")
    plt.tight_layout(); plt.savefig(f"{ART}/calibration_last_fold_{prefix}.png", dpi=160); plt.close()

plot_roc_calib(xgb_last, "xgb")
plot_roc_calib(lgb_last, "lgb")

# ---------- 8) Krzywa zysku + próg (ostatni fold) ----------
def save_profit(last_obj, prefix):
    taus, ev = profit_curve(last_obj["yva"], last_obj["pva"], PROFIT_GOOD, LOSS_BAD, steps=201)
    best_tau = float(taus[int(ev.argmax())])
    pd.DataFrame({"tau":taus, "expected_profit":ev}).to_csv(f"{ART}/profit_curve_last_fold_{prefix}.csv", index=False)
    plt.figure(figsize=(6,4))
    plt.plot(taus, ev); plt.axvline(best_tau, ls="--", label=f"tau*={best_tau:.3f}")
    plt.xlabel("Próg akceptacji (p < tau)"); plt.ylabel("Oczekiwany zysk")
    plt.title(f"Krzywa zysku — {prefix} (ostatni fold)")
    plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/profit_curve_last_fold_{prefix}.png", dpi=160); plt.close()
    return best_tau

xgb_tau = save_profit(xgb_last, "xgb")
lgb_tau = save_profit(lgb_last, "lgb")

# ---------- 9) Test OOT (ostatni miesiąc) + kalibracja isotonic ----------
if "issue_d" in df.columns and df["issue_d"].notna().any():
    months = df["issue_d"].dt.to_period("M").astype(str)
    uniq = np.array(sorted(months.dropna().unique()))
    oot_mask = (months == uniq[-1])
    train_mask = ~oot_mask
else:
    idx = df.index.to_numpy()
    cut = int(len(idx)*0.8)
    train_mask = np.zeros(len(idx), dtype=bool); train_mask[:cut] = True
    oot_mask = ~train_mask

def oot_eval(best_params, model_name, tau_from_valid, last_obj, tag):
    # fit pre na TRAIN i transformuj
    pre_fitted = pre.fit(df.loc[train_mask, :])
    X_train = pre_fitted.transform(df.loc[train_mask, :])
    X_oot   = pre_fitted.transform(df.loc[oot_mask,   :])
    y_train = y.loc[train_mask]
    y_oot_  = y.loc[oot_mask]
    feat_names = pre_fitted.get_feature_names_out()

    pos, neg = int((y_train==1).sum()), int((y_train==0).sum())
    scale_pos_weight = (neg / max(pos,1))

    if model_name == "xgb":
        clf = XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            tree_method="hist",
            random_state=RANDOM_STATE,
            enable_categorical=False,
            n_jobs=-1,
            **best_params,
            scale_pos_weight=scale_pos_weight
        )
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=False)
    else:
        clf = LGBMClassifier(
            objective="binary",
            random_state=RANDOM_STATE,
            n_jobs=-1,
            **best_params,
            scale_pos_weight=scale_pos_weight
        )
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric="auc", callbacks=[lgb.log_evaluation(period=0)])

    # kalibracja na walidacji z ostatniego foldu
    calibrated = CalibratedClassifierCV(clf, cv="prefit", method="isotonic")
    calibrated.fit(last_obj["Xva_enc"], last_obj["yva"])
    p_oot = calibrated.predict_proba(X_oot)[:,1]

    # metryki OOT
    oot_metrics = {
        "AUC": roc_auc_score(y_oot_, p_oot),
        "PR_AUC": average_precision_score(y_oot_, p_oot),
        "KS": ks_score(y_oot_, p_oot),
        "Brier": brier_score_loss(y_oot_, p_oot),
        "LogLoss": log_loss(y_oot_, p_oot, labels=[0,1]),
        "ECE": ece_score(y_oot_, p_oot)
    }
    pd.Series(oot_metrics).to_csv(f"{ART}/oot_metrics_{tag}.csv", header=False)
    print(f"\nMetryki OOT ({tag}):\n", pd.Series(oot_metrics).round(4))

    # ROC/kalibracja
    fpr, tpr, _ = roc_curve(y_oot_, p_oot)
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label=f"AUC={roc_auc_score(y_oot_,p_oot):.3f}")
    plt.plot([0,1],[0,1],"--")
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC — {tag} (OOT)")
    plt.legend(); plt.tight_layout(); plt.savefig(f"{ART}/roc_oot_{tag}.png", dpi=160); plt.close()

    frac_pos, mean_pred = calibration_curve(y_oot_, p_oot, n_bins=N_BINS_CALIB, strategy="quantile")
    plt.figure(figsize=(5,4))
    plt.plot(mean_pred, frac_pos, marker="o")
    plt.plot([0,1],[0,1],"--")
    plt.xlabel("Przewidziana PD"); plt.ylabel("Zaobserwowana stopa defaultu")
    plt.title(f"Kalibracja — {tag} (OOT)")
    plt.tight_layout(); plt.savefig(f"{ART}/calibration_oot_{tag}.png", dpi=160); plt.close()

    # decyle i KS
    dec_tab = decile_table(y_oot_, p_oot, deciles=10)
    dec_tab.to_csv(f"{ART}/decile_table_oot_{tag}.csv", index=False)
    plt.figure(figsize=(6,4))
    plt.plot(dec_tab["decile"], dec_tab["ks"], marker="o")
    plt.xlabel("Decyl (1 = najwyższe ryzyko)"); plt.ylabel("KS")
    plt.title(f"KS po decylach — {tag} (OOT)")
    plt.tight_layout(); plt.savefig(f"{ART}/ks_by_decile_oot_{tag}.png", dpi=160); plt.close()

    # decyzje wg progu z walidacji
    accept_oot = (p_oot < tau_from_valid)
    tg = int(((y_oot_==0) & accept_oot).sum())
    tb = int(((y_oot_==1) & accept_oot).sum())
    ev_oot = tg*PROFIT_GOOD + tb*LOSS_BAD
    pd.Series({
        "best_tau_from_valid": tau_from_valid,
        "accepted_cnt": int(accept_oot.sum()),
        "true_good_accepted": tg,
        "true_bad_accepted": tb,
        "expected_profit_OOT": ev_oot
    }).to_csv(f"{ART}/decision_summary_oot_{tag}.csv", header=False)

    # ważności (gain/split → zamieniamy na barplot TOP-15)
    try:
        importances = getattr(clf, "feature_importances_", None)
        if importances is not None and len(importances) == len(feat_names):
            imp_df = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False)
            imp_df.to_csv(f"{ART}/feature_importance_{tag}.csv", index=False)
            plt.figure(figsize=(8,6))
            top = imp_df.head(15)[::-1]
            plt.barh(top["feature"], top["importance"])
            plt.title(f"{tag} — TOP 15 ważności cech")
            plt.tight_layout(); plt.savefig(f"{ART}/feature_importance_top15_{tag}.png", dpi=160); plt.close()
    except Exception:
        pass

    return clf, pre_fitted, feat_names

# XGB – OOT
xgb_clf, xgb_pre, xgb_feats = oot_eval(xgb_best, "xgb", xgb_tau, xgb_last, "xgb")
# LGB – OOT
lgb_clf, lgb_pre, lgb_feats = oot_eval(lgb_best, "lgb", lgb_tau, lgb_last, "lgb")

print(f"\nArtefakty zapisano w: {os.path.abspath(ART)}")

  cat_cols = [c for c in feature_cols if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_categorical_dtype(df[c])]


#kolumn num: 11, kat: 3




[LightGBM] [Info] Number of positive: 35432, number of negative: 164836
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2493
[LightGBM] [Info] Number of data points in the train set: 200268, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176923 -> initscore=-1.537336
[LightGBM] [Info] Start training from score -1.537336
[LightGBM] [Info] Number of positive: 92001, number of negative: 398578
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2591
[LightGBM] [Info] Number of data points in the train set: 490579, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187536 -> initscore=-1.46610



Średnie metryki CV XGB:
 AUC        0.7031
PR_AUC     0.3910
KS         0.2962
Brier      0.2198
LogLoss    0.6274
ECE        0.2404
dtype: float64
Średnie metryki CV LGB:
 AUC        0.6980
PR_AUC     0.3857
KS         0.2880
Brier      0.2150
LogLoss    0.6161
ECE        0.2236
dtype: float64

Metryki OOT (xgb):
 AUC        0.7052
PR_AUC     0.4642
KS         0.2996
Brier      0.1823
LogLoss    0.5418
ECE        0.0156
dtype: float64
[LightGBM] [Info] Number of positive: 327164, number of negative: 1230163
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2851
[LightGBM] [Info] Number of data points in the train set: 1557327, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.210080 -> initscore=-1.324440
[LightGBM] [Info] Start training from score -1.324440

Metryki OOT (lgb):
 AUC        0.7057
PR_AUC     0.4659
KS  