In [2]:
# ======================================================
# STAGE 2 ‚Äî STACKING + THRESHOLD TUNING + FN ANALYSIS
# ======================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ======================================================
# 1Ô∏è‚É£ PATH & PARAMS
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step6.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step6.csv"

TARGET = "IS_SEVERE"
N_SPLITS = 5

# soglie da testare
THRESHOLDS = np.arange(0.20, 0.61, 0.02)

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)

# ======================================================
# 3Ô∏è‚É£ CLASS IMBALANCE
# ======================================================

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("scale_pos_weight:", scale_pos_weight)

# ======================================================
# 4Ô∏è‚É£ BASE MODELS
# ======================================================

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=400,
    max_depth=-1,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# ======================================================
# 5Ô∏è‚É£ OOF META-FEATURES (TRAIN)
# ======================================================

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

train_meta = pd.DataFrame(
    np.zeros((len(train), 3)),
    columns=["rf_prob", "xgb_prob", "lgbm_prob"]
)

print("\nüöÄ Generating OOF meta-features...")
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"Fold {fold}")

    rf.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    xgb.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    lgbm.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])

    train_meta.iloc[val_idx, 0] = rf.predict_proba(X_train.iloc[val_idx])[:, 1]
    train_meta.iloc[val_idx, 1] = xgb.predict_proba(X_train.iloc[val_idx])[:, 1]
    train_meta.iloc[val_idx, 2] = lgbm.predict_proba(X_train.iloc[val_idx])[:, 1]

print("‚úÖ OOF meta-features generated")

# ======================================================
# 6Ô∏è‚É£ TRAIN BASE MODELS ON FULL TRAIN
# ======================================================

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train)

val_meta = pd.DataFrame({
    "rf_prob": rf.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb.predict_proba(X_val)[:, 1],
    "lgbm_prob": lgbm.predict_proba(X_val)[:, 1]
})

# ======================================================
# 7Ô∏è‚É£ META-MODEL
# ======================================================

meta_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42
)

print("\nüöÄ Training meta-model...")
meta_model.fit(train_meta, y_train)
print("‚úÖ Meta-model trained")

val_probs = meta_model.predict_proba(val_meta)[:, 1]

# ======================================================
# 8Ô∏è‚É£ THRESHOLD TUNING (VALIDATION)
# ======================================================

rows = []
for t in THRESHOLDS:
    pred = (val_probs >= t).astype(int)

    rows.append({
        "threshold": t,
        "recall": recall_score(y_val, pred),
        "precision": precision_score(y_val, pred),
        "f1": f1_score(y_val, pred),
        "false_negatives": ((y_val == 1) & (pred == 0)).sum(),
        "false_positives": ((y_val == 0) & (pred == 1)).sum()
    })

thr_df = pd.DataFrame(rows).sort_values("f1", ascending=False)

print("\nüìä THRESHOLD TUNING RESULTS (TOP 10)")
print(thr_df.head(10))

BEST_T = thr_df.iloc[0]["threshold"]
print("\nüèÜ BEST THRESHOLD:", BEST_T)

# ======================================================
# 9Ô∏è‚É£ FINAL EVALUATION (BEST THRESHOLD)
# ======================================================

val_pred = (val_probs >= BEST_T).astype(int)

print("\nüìä CLASSIFICATION REPORT (FINAL)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

# ======================================================
# üîü FN ANALYSIS POST-FEATURE
# ======================================================

val_analysis = val.copy()
val_analysis["prob"] = val_probs
val_analysis["pred"] = val_pred

fn = val_analysis[(y_val == 1) & (val_pred == 0)]
tp = val_analysis[(y_val == 1) & (val_pred == 1)]

print("\nüìâ FALSE NEGATIVES:", fn.shape[0])
print("‚úÖ TRUE POSITIVES :", tp.shape[0])

numeric_cols = val.select_dtypes(include=["int64", "float64"]).columns
numeric_cols = [c for c in numeric_cols if c not in [TARGET, "prob", "pred"]]

rows = []
for col in numeric_cols:
    rows.append({
        "feature": col,
        "FN_mean": fn[col].mean(),
        "TP_mean": tp[col].mean(),
        "delta_FN_minus_TP": fn[col].mean() - tp[col].mean()
    })

diff_df = (
    pd.DataFrame(rows)
    .sort_values(by="delta_FN_minus_TP", key=abs, ascending=False)
)

print("\nüîç TOP 15 DIFFERENZE FN vs TP")
print(diff_df.head(15))

fn["prob_bucket"] = pd.cut(
    fn["prob"],
    bins=[0, 0.2, 0.4, 0.6, 1.0],
    labels=["very_low", "low", "borderline", "high"]
)

print("\nüìä FN ‚Äî DISTRIBUZIONE PROBABILIT√Ä")
print(fn["prob_bucket"].value_counts(normalize=True))

# ======================================================
# 1Ô∏è‚É£1Ô∏è‚É£ SAVE FILES
# ======================================================

fn.to_csv("false_negatives_stage2.csv", index=False)
tp.sample(2000, random_state=42).to_csv("true_positives_stage2_sample.csv", index=False)

print("\nüìÅ FILE SALVATI")
print("- false_negatives_stage2.csv")
print("- true_positives_stage2_sample.csv")

print("\nüèÜ STAGE 2 COMPLETATO ‚Äî TEST SET NON UTILIZZATO")


Train shape: (536370, 78)
Val shape  : (134093, 78)
scale_pos_weight: 6.786794808513109

üöÄ Generating OOF meta-features...
Fold 1
[LightGBM] [Info] Number of positive: 55106, number of negative: 373990
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 429096, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Fold 2
[LightGBM] [Info] Number of positive: 55106, number of negative: 373990
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[Lig

NameError: name 'val_pred_proba' is not defined

In [1]:
# ======================================================
# STAGE 2 ‚Äî STACKING + THRESHOLD TUNING + FN ANALYSIS
# (Versione per Dataset SMOTE Bilanciato)
# ======================================================

import pandas as pd
import numpy as np
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ======================================================
# 1Ô∏è‚É£ PATH & PARAMS
# ======================================================

# ‚ö†Ô∏è MODIFICA QUI: Puntiamo al file SMOTE creato prima
# Assumiamo siano nella cartella corrente. Se sono altrove, rimetti il percorso completo.
TRAIN_PATH = "train_step6_SMOTE.csv"
VAL_PATH   = "val_step6.csv"          # Il validation deve restare quello ORIGINALE

TARGET = "IS_SEVERE"
N_SPLITS = 5

# Range di soglie (Thresholds)
THRESHOLDS = np.arange(0.20, 0.81, 0.05) # Ho esteso un po' il range verso l'alto

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

if not os.path.exists(TRAIN_PATH):
    print(f"‚ùå Errore: Non trovo {TRAIN_PATH}. Assicurati di aver eseguito lo script di salvataggio prima.")
else:
    print(f"üìÇ Caricamento Train (SMOTE): {TRAIN_PATH}")
    train = pd.read_csv(TRAIN_PATH)

print(f"üìÇ Caricamento Val (Originale): {VAL_PATH}")
val = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("Train shape (Balanced):", X_train.shape)
print("Val shape (Imbalanced):", X_val.shape)

# ======================================================
# 3Ô∏è‚É£ BILANCIAMENTO (Ora √® 1:1)
# ======================================================

# Dato che abbiamo usato SMOTE, il rapporto √® circa 1.
# Non forziamo pi√π scale_pos_weight.
scale_pos_weight = 1.0
print("scale_pos_weight impostato a 1.0 (Dati gi√† bilanciati da SMOTE)")

# ======================================================
# 4Ô∏è‚É£ BASE MODELS (Senza Class Weights)
# ======================================================

# ‚ö†Ô∏è NOTA: Ho rimosso 'class_weight="balanced"' perch√© SMOTE ha gi√† fatto il lavoro.

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42
    # class_weight="balanced"  <-- RIMOSSO
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,        # <-- IMPOSTATO A 1
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=400,
    max_depth=-1,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    # class_weight="balanced", <-- RIMOSSO
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# ======================================================
# 5Ô∏è‚É£ OOF META-FEATURES (TRAIN)
# ======================================================

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Matrice vuota per salvare le previsioni "Out Of Fold"
train_meta = pd.DataFrame(
    np.zeros((len(train), 3)),
    columns=["rf_prob", "xgb_prob", "lgbm_prob"]
)

print("\nüöÄ Generating OOF meta-features (Stacking)...")
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"  -> Fold {fold}/{N_SPLITS} processing...")

    # Dati del fold corrente
    X_tr_fold, y_tr_fold = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val_fold = X_train.iloc[val_idx]

    # Addestramento modelli base sul fold
    rf.fit(X_tr_fold, y_tr_fold)
    xgb.fit(X_tr_fold, y_tr_fold)
    lgbm.fit(X_tr_fold, y_tr_fold)

    # Predizione sulla parte "lasciata fuori" (Hold-out)
    train_meta.iloc[val_idx, 0] = rf.predict_proba(X_val_fold)[:, 1]
    train_meta.iloc[val_idx, 1] = xgb.predict_proba(X_val_fold)[:, 1]
    train_meta.iloc[val_idx, 2] = lgbm.predict_proba(X_val_fold)[:, 1]

print("‚úÖ OOF meta-features generated")

# ======================================================
# 6Ô∏è‚É£ TRAIN BASE MODELS ON FULL TRAIN
# ======================================================
print("\nüöÄ Re-training base models on FULL Train set...")

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train)

# Generiamo le feature per il validation set reale
val_meta = pd.DataFrame({
    "rf_prob": rf.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb.predict_proba(X_val)[:, 1],
    "lgbm_prob": lgbm.predict_proba(X_val)[:, 1]
})

# ======================================================
# 7Ô∏è‚É£ META-MODEL (Logistic Regression)
# ======================================================

meta_model = LogisticRegression(
    # Qui possiamo lasciare balanced o toglierlo, ma su meta-features
    # spesso √® meglio lasciarlo neutro se il train era SMOTE.
    # Proviamo neutro per vedere la pura probabilit√†.
    max_iter=1000,
    random_state=42
)

print("üöÄ Training Meta-Model (The Judge)...")
meta_model.fit(train_meta, y_train)
print("‚úÖ Meta-model trained")

# Predizioni finali (Probabilit√† combinate)
val_probs = meta_model.predict_proba(val_meta)[:, 1]

# ======================================================
# 8Ô∏è‚É£ THRESHOLD TUNING (VALIDATION)
# ======================================================

rows = []
print("\nüîé Testing Thresholds...")
for t in THRESHOLDS:
    pred = (val_probs >= t).astype(int)

    rows.append({
        "threshold": t,
        "recall": recall_score(y_val, pred),
        "precision": precision_score(y_val, pred),
        "f1": f1_score(y_val, pred),
        "false_negatives": ((y_val == 1) & (pred == 0)).sum(),
        "false_positives": ((y_val == 0) & (pred == 1)).sum()
    })

thr_df = pd.DataFrame(rows).sort_values("f1", ascending=False)

print("\nüìä THRESHOLD TUNING RESULTS (TOP 5 per F1)")
print(thr_df.head(5))

# Scegliamo la soglia migliore basata su F1 (puoi cambiare logica se preferisci Recall)
BEST_T = thr_df.iloc[0]["threshold"]
print(f"\nüèÜ BEST THRESHOLD: {BEST_T:.2f}")

# ======================================================
# 9Ô∏è‚É£ FINAL EVALUATION (BEST THRESHOLD)
# ======================================================

val_pred = (val_probs >= BEST_T).astype(int)

print("\nüìä CLASSIFICATION REPORT (FINAL STACKING)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

# ======================================================
# üîü FN ANALYSIS
# ======================================================

val_analysis = val.copy()
val_analysis["prob"] = val_probs
val_analysis["pred"] = val_pred

fn = val_analysis[(y_val == 1) & (val_pred == 0)] # Malati persi
tp = val_analysis[(y_val == 1) & (val_pred == 1)] # Malati presi

print(f"\nüìâ FALSE NEGATIVES: {fn.shape[0]}")
print(f"‚úÖ TRUE POSITIVES : {tp.shape[0]}")

# Salvataggio per analisi manuale
fn.to_csv("false_negatives_stacking.csv", index=False)
print("\nüìÅ Analisi salvata in 'false_negatives_stacking.csv'")

‚ùå Errore: Non trovo train_step6_SMOTE.csv. Assicurati di aver eseguito lo script di salvataggio prima.
üìÇ Caricamento Val (Originale): val_step6.csv


FileNotFoundError: [Errno 2] No such file or directory: 'val_step6.csv'