In [1]:
# ======================================================
# TEST4 - BEST TACTIC (TUNED WEIGHTED LIGHTGBM)
# Objective: maximize class-1 metrics with precision-focused thresholding
# ======================================================

from __future__ import annotations

import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)


# ======================================================
# 1) CONFIG
# ======================================================

# Name of the binary label column in the split CSVs.
TARGET = "IS_SEVERE"

# Recall floor enforced during threshold selection (trade-off chosen from the benchmark).
MIN_RECALL = 0.58

# Candidate decision thresholds: 0.10 through 0.95 in steps of 0.01.
THRESHOLDS = np.round(np.arange(0.10, 0.951, 0.01), 2)

RANDOM_STATE = 42
SAVE_MODEL = True

# Tuned hyper-parameters that emerged from the benchmark.
# NOTE(review): LightGBM only applies row subsampling when `subsample_freq`
# (bagging_freq) is > 0; none is set here, so `subsample=0.9` may currently
# have no effect -- confirm against the benchmark setup before changing it.
LGBM_PARAMS = dict(
    n_estimators=650,
    learning_rate=0.03,
    num_leaves=95,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_samples=60,
    reg_alpha=0.1,
    reg_lambda=3.0,
)


def find_project_root() -> Path:
    """Locate the project root by walking up from the current directory.

    The first directory (starting at the resolved working directory and
    moving towards the filesystem root) that contains both a ``data`` and a
    ``models`` entry is returned. If no such directory exists, the resolved
    working directory itself is returned.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "data").exists() and (folder / "models").exists()
        ),
        start,
    )


ROOT = find_project_root()
print(f"Project root: {ROOT}")

# The three split files sit side by side in the same folder.
_SPLITS_DIR = ROOT.joinpath("data", "interim", "splits")
TRAIN_PATH = _SPLITS_DIR / "train_step6.csv"
VAL_PATH = _SPLITS_DIR / "val_step6.csv"
TEST_PATH = _SPLITS_DIR / "test_step6.csv"

# Metrics/report folder; created up front so later writes cannot fail on it.
REPORTS_DIR = ROOT.joinpath("reports", "metrics")
REPORTS_DIR.mkdir(parents=True, exist_ok=True)


# ======================================================
# 2) HELPERS
# ======================================================

def safe_div(a: float, b: float) -> float:
    """Return ``a / b`` as a float, or ``0.0`` when ``b`` is falsy (e.g. zero)."""
    if not b:
        return 0.0
    return float(a / b)


def threshold_scan(y_true: pd.Series, probs: np.ndarray, thresholds: np.ndarray) -> pd.DataFrame:
    """Score every candidate threshold against the true labels.

    Args:
        y_true: True labels; converted to an integer array.
        probs: Scores compared against each threshold (``probs >= t`` is
            predicted as class 1).
        thresholds: Candidate cut-offs to evaluate.

    Returns:
        A DataFrame sorted by ``threshold`` with one row per cut-off and the
        columns ``threshold``, ``precision``, ``recall``, ``f1``,
        ``false_negatives`` and ``false_positives``.
    """
    labels = y_true.to_numpy(dtype=int)
    # The label masks do not depend on the threshold, so build them once.
    is_positive = labels == 1
    is_negative = labels == 0

    def _score(cutoff) -> dict:
        flagged = (probs >= cutoff).astype(int)
        return {
            "threshold": float(cutoff),
            "precision": precision_score(labels, flagged, zero_division=0),
            "recall": recall_score(labels, flagged, zero_division=0),
            "f1": f1_score(labels, flagged, zero_division=0),
            "false_negatives": int((is_positive & (flagged == 0)).sum()),
            "false_positives": int((is_negative & (flagged == 1)).sum()),
        }

    table = pd.DataFrame([_score(cutoff) for cutoff in thresholds])
    return table.sort_values("threshold").reset_index(drop=True)


def pick_best_threshold(thr_df: pd.DataFrame, min_recall: float) -> tuple[float, str]:
    """Choose the operating threshold from a threshold-scan table.

    Main rule: among rows whose recall is at least ``min_recall``, take the
    one with the highest precision (ties broken by F1). If no row reaches the
    recall floor, fall back to the row with the highest F1 (ties broken by
    precision).

    Args:
        thr_df: Table with ``threshold``, ``precision``, ``recall`` and
            ``f1`` columns.
        min_recall: Minimum acceptable recall.

    Returns:
        ``(threshold, rule_label)`` where ``rule_label`` names the rule that
        produced the threshold.
    """
    meets_recall = thr_df.loc[thr_df["recall"] >= min_recall]

    if meets_recall.empty:
        # Fallback: no threshold reaches the recall floor, so use best F1.
        ranked = thr_df.sort_values(by=["f1", "precision"], ascending=False)
        return float(ranked.iloc[0]["threshold"]), "fallback_best_f1"

    ranked = meets_recall.sort_values(by=["precision", "f1"], ascending=False)
    rule = f"best_precision_with_recall>={min_recall:.2f}"
    return float(ranked.iloc[0]["threshold"]), rule


def evaluate_split(y_true: pd.Series, probs: np.ndarray, threshold: float) -> dict:
    """Compute class-1 metrics and confusion counts for one data split.

    Args:
        y_true: True labels; converted to an integer array.
        probs: Scores for the positive class. ``probs >= threshold`` is
            predicted as class 1.
        threshold: Decision cut-off applied to ``probs``.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` (computed on the
        thresholded predictions), ``pr_auc`` and ``roc_auc`` (computed on the
        raw scores), and the integer confusion counts ``tn``, ``fp``, ``fn``
        and ``tp``.
    """
    y = y_true.to_numpy(dtype=int)
    pred = (probs >= threshold).astype(int)
    # Pin the label set so the matrix is always 2x2. Without `labels`, a split
    # where labels and predictions contain a single class yields a 1x1 matrix
    # and the four-way unpack below fails with an unhelpful error.
    tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()
    return {
        "precision": precision_score(y, pred, zero_division=0),
        "recall": recall_score(y, pred, zero_division=0),
        "f1": f1_score(y, pred, zero_division=0),
        "pr_auc": average_precision_score(y, probs),
        "roc_auc": roc_auc_score(y, probs),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    }


# ======================================================
# 3) LOAD DATA
# ======================================================

# Fail fast, before any CSV parsing, if one of the split files is missing.
for p in (TRAIN_PATH, VAL_PATH, TEST_PATH):
    if not p.exists():
        raise FileNotFoundError(f"Missing required file: {p}")

train, val, test = (pd.read_csv(path) for path in (TRAIN_PATH, VAL_PATH, TEST_PATH))

# Separate features from the integer-cast target for each split. The frames
# themselves are left intact; `test` is reused later with its target column.
X_train, y_train = train.drop(columns=[TARGET]), train[TARGET].astype(int)
X_val, y_val = val.drop(columns=[TARGET]), val[TARGET].astype(int)
X_test, y_test = test.drop(columns=[TARGET]), test[TARGET].astype(int)

# All splits must expose the same feature columns in the same order.
features_order = X_train.columns.tolist()
if any(frame.columns.tolist() != features_order for frame in (X_val, X_test)):
    raise ValueError("Feature mismatch between train/val/test")

# Class-imbalance weight for the positive class: negatives per positive in train.
pos = int(y_train.eq(1).sum())
neg = int(y_train.eq(0).sum())
scale_pos_weight = safe_div(neg, pos)

for split_label, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Val  :", X_val, y_val),
    ("Test :", X_test, y_test),
):
    print(split_label, split_X.shape, "positive rate:", split_y.mean())
print("scale_pos_weight:", scale_pos_weight)


# ======================================================
# 4) TRAIN BEST MODEL
# ======================================================

# Weighted binary classifier: tuned hyper-parameters plus the class-imbalance
# weight computed from the training labels.
model = LGBMClassifier(
    **LGBM_PARAMS,
    objective="binary",
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1,
)

print("\nTraining tuned weighted LightGBM...")
model.fit(X_train, y_train)

# Keep only the positive-class column of the predicted probabilities.
val_probs, test_probs = (
    model.predict_proba(features)[:, 1] for features in (X_val, X_test)
)


# ======================================================
# 5) THRESHOLD OPTIMIZATION ON VALIDATION
# ======================================================

# The threshold is chosen on validation data only; the test split is scored
# with that same threshold and plays no part in the selection.
thr_df = threshold_scan(y_val, val_probs, THRESHOLDS)
best_thr, reason = pick_best_threshold(thr_df, MIN_RECALL)

val_metrics, test_metrics = (
    evaluate_split(labels, scores, best_thr)
    for labels, scores in ((y_val, val_probs), (y_test, test_probs))
)

for caption, value in (
    ("\nSelected threshold:", best_thr),
    ("Selection rule:", reason),
    ("\nValidation metrics:", val_metrics),
    ("\nTest metrics:", test_metrics),
):
    print(caption, value)


# ======================================================
# 6) FINAL TEST REPORT
# ======================================================

# Hard 0/1 predictions on the test split at the selected threshold.
is_flagged = test_probs >= best_thr
test_pred = is_flagged.astype(int)

print(
    "\n===== CLASSIFICATION REPORT (TEST) =====",
    classification_report(y_test, test_pred, digits=4),
    sep="\n",
)
print("Confusion matrix:", confusion_matrix(y_test, test_pred), sep="\n")


# ======================================================
# 7) SAVE OUTPUTS
# ======================================================

# Threshold scan table (one row per candidate threshold).
thr_path = REPORTS_DIR / "test4_lgbm_threshold_scan.csv"
thr_df.to_csv(thr_path, index=False)

# Summary report: selection rule, threshold, hyper-parameters and the
# validation/test metrics.
summary = {
    "tactic": "tuned_weighted_lgbm",
    "selection_rule": reason,
    "threshold": float(best_thr),
    "min_recall": MIN_RECALL,
    "lgbm_params": LGBM_PARAMS,
    "scale_pos_weight": scale_pos_weight,
    "val": val_metrics,
    "test": test_metrics,
}

summary_path = REPORTS_DIR / "test4_lgbm_best_report.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

# False negatives on the test split, saved for manual audit.
analysis = test.copy()
analysis["prob"] = test_probs
analysis["pred"] = test_pred
fn = analysis[(analysis[TARGET] == 1) & (analysis["pred"] == 0)]
fn_path = ROOT / "data" / "evaluation" / "false_negatives_test4_lgbm_best.csv"
# Create the target folder first: writing the CSV does not create missing
# directories, and failing here would waste the whole training run.
fn_path.parent.mkdir(parents=True, exist_ok=True)
fn.to_csv(fn_path, index=False)

# Production bundle: the fitted model together with everything needed to
# reproduce its decisions (threshold and feature order).
if SAVE_MODEL:
    model_bundle = {
        "type": "lgbm_weighted_tuned",
        "model": model,
        "threshold": float(best_thr),
        "features_order": features_order,
        "metadata": {
            "selection_rule": reason,
            "min_recall": MIN_RECALL,
            "scale_pos_weight": scale_pos_weight,
            "lgbm_params": LGBM_PARAMS,
        },
    }
    model_out = ROOT / "models" / "production" / "severe_model_test4_best_tactic.pkl"
    # Same reasoning as for the CSV above: make sure the folder exists.
    model_out.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model_bundle, model_out)
    print(f"\nSaved model: {model_out}")

print(f"Saved threshold table: {thr_path}")
print(f"Saved summary report : {summary_path}")
print(f"Saved FN file        : {fn_path}")


Project root: /Users/marcodonatiello/PycharmProjects/JupyterProject
Train: (536370, 78) positive rate: 0.12842254413930682
Val  : (134093, 78) positive rate: 0.12842579403846585
Test : (167616, 78) positive rate: 0.12842449408171058
scale_pos_weight: 6.786794808513109

Training tuned weighted LightGBM...

Selected threshold: 0.79
Selection rule: best_precision_with_recall>=0.58

Validation metrics: {'precision': 0.6616805411030177, 'recall': 0.5907903141513269, 'f1': 0.6242292235481793, 'pr_auc': 0.692871038576882, 'roc_auc': 0.9112179309750683, 'tn': 111670, 'fp': 5202, 'fn': 7047, 'tp': 10174}

Test metrics: {'precision': 0.5851196856020007, 'recall': 0.6086592957353898, 'f1': 0.5966574069857462, 'pr_auc': 0.6472700170887296, 'roc_auc': 0.8920629566003202, 'tn': 136800, 'fp': 9290, 'fn': 8424, 'tp': 13102}

===== CLASSIFICATION REPORT (TEST) =====
              precision    recall  f1-score   support

           0     0.9420    0.9364    0.9392    146090
           1     0.5851    0.