In [1]:
# ======================================================
# MODELLO 2 ‚Äî XGBOOST (SAME DATASET AS MODEL 1)
# TRAIN + VALIDATION ONLY ‚Äî NO TEST SET
# ======================================================

import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

# ======================================================
# 1Ô∏è‚É£ PATH (STESSI DEL MODELLO 1)
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step5.csv"
VAL_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step5.csv"

TARGET = "IS_SEVERE"

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val = pd.read_csv(VAL_PATH)

print("Train shape:", train.shape)
print("Val shape:", val.shape)

# ======================================================
# 3Ô∏è‚É£ SPLIT FEATURES / TARGET
# ======================================================

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("\nTarget distribution (train):")
print(y_train.value_counts(normalize=True))

print("\nTarget distribution (val):")
print(y_val.value_counts(normalize=True))

# ======================================================
# 4Ô∏è‚É£ SCALE POSITIVE CLASS (CLASS IMBALANCE)
# ======================================================

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("\nscale_pos_weight:", scale_pos_weight)

# ======================================================
# 5Ô∏è‚É£ MODELLO XGBOOST
# ======================================================

model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

print("\n‚úÖ Modello XGBoost addestrato")

# ======================================================
# 6Ô∏è‚É£ VALIDATION ‚Äî SOGLIA DECISIONALE
# ======================================================

THRESHOLD = 0.30  # üëà di solito XGB lavora bene con soglie pi√π basse

val_probs = model.predict_proba(X_val)[:, 1]
val_pred = (val_probs >= THRESHOLD).astype(int)

# ======================================================
# 7Ô∏è‚É£ METRICHE FINALI
# ======================================================

print("\nüìä CLASSIFICATION REPORT (VALIDATION)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

print("\nüéØ CLASSE SEVERA (1)")
print("Recall    :", recall_score(y_val, val_pred))
print("Precision :", precision_score(y_val, val_pred))
print("F1-score  :", f1_score(y_val, val_pred))

false_negatives = ((y_val == 1) & (val_pred == 0)).sum()
false_positives = ((y_val == 0) & (val_pred == 1)).sum()

print("\n‚ùó Errori critici")
print("False Negatives (severi persi):", false_negatives)
print("False Positives:", false_positives)

print("\nüîí MODELLO 2 COMPLETATO ‚Äî TEST SET NON UTILIZZATO")


Train shape: (536370, 78)
Val shape: (134093, 78)

Target distribution (train):
IS_SEVERE
0    0.871577
1    0.128423
Name: proportion, dtype: float64

Target distribution (val):
IS_SEVERE
0    0.871574
1    0.128426
Name: proportion, dtype: float64

scale_pos_weight: 6.786794808513109

‚úÖ Modello XGBoost addestrato

üìä CLASSIFICATION REPORT (VALIDATION)
              precision    recall  f1-score   support

           0     0.9817    0.7096    0.8238    116872
           1     0.3159    0.9101    0.4690     17221

    accuracy                         0.7353    134093
   macro avg     0.6488    0.8099    0.6464    134093
weighted avg     0.8962    0.7353    0.7782    134093

üìâ CONFUSION MATRIX
[[82932 33940]
 [ 1548 15673]]

üéØ CLASSE SEVERA (1)
Recall    : 0.910109749724174
Precision : 0.31590510551669926
F1-score  : 0.4690127779274022

‚ùó Errori critici
False Negatives (severi persi): 1548
False Positives: 33940

üîí MODELLO 2 COMPLETATO ‚Äî TEST SET NON UTILIZZATO


In [2]:
# ======================================================
# STAGE 2 ‚Äî STACKING ENSEMBLE (RF + XGB ‚Üí META MODEL)
# ======================================================

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier

# ======================================================
# 1Ô∏è‚É£ PATH
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step5.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step5.csv"

TARGET = "IS_SEVERE"

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)

# ======================================================
# 3Ô∏è‚É£ CLASS IMBALANCE
# ======================================================

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("scale_pos_weight:", scale_pos_weight)

# ======================================================
# 4Ô∏è‚É£ BASE MODELS (LEVEL 0)
# ======================================================

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

print("\nüöÄ Training base models...")
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

print("‚úÖ Base models trained")

# ======================================================
# 5Ô∏è‚É£ META-FEATURES
# ======================================================
# Usiamo SOLO le probabilit√† della classe severa (1)

train_meta = pd.DataFrame({
    "rf_prob":  rf.predict_proba(X_train)[:, 1],
    "xgb_prob": xgb.predict_proba(X_train)[:, 1]
})

val_meta = pd.DataFrame({
    "rf_prob":  rf.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb.predict_proba(X_val)[:, 1]
})

# ======================================================
# 6Ô∏è‚É£ META-MODEL (LEVEL 1)
# ======================================================

meta_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42
)

print("\nüöÄ Training meta-model...")
meta_model.fit(train_meta, y_train)

print("‚úÖ Meta-model trained")

# ======================================================
# 7Ô∏è‚É£ PREDICTION (STACKING)
# ======================================================

THRESHOLD = 0.35  # ‚Üê REGOLABILE

val_probs = meta_model.predict_proba(val_meta)[:, 1]
val_pred  = (val_probs >= THRESHOLD).astype(int)

# ======================================================
# 8Ô∏è‚É£ METRICHE FINALI
# ======================================================

print("\nüìä CLASSIFICATION REPORT (STACKING ‚Äî VALIDATION)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

print("\nüéØ CLASSE SEVERA (1)")
print("Recall    :", recall_score(y_val, val_pred))
print("Precision :", precision_score(y_val, val_pred))
print("F1-score  :", f1_score(y_val, val_pred))

false_negatives = ((y_val == 1) & (val_pred == 0)).sum()
false_positives = ((y_val == 0) & (val_pred == 1)).sum()

print("\n‚ùó Errori critici")
print("False Negatives:", false_negatives)
print("False Positives:", false_positives)

print("\nüèÜ STACKING COMPLETATO ‚Äî TEST SET NON UTILIZZATO")



Train shape: (536370, 77)
Val shape: (134093, 77)
scale_pos_weight: 6.786794808513109

üöÄ Training base models...
‚úÖ Base models trained

üöÄ Training meta-model...
‚úÖ Meta-model trained

üìä CLASSIFICATION REPORT (STACKING ‚Äî VALIDATION)
              precision    recall  f1-score   support

           0     0.9770    0.7686    0.8603    116872
           1     0.3584    0.8772    0.5088     17221

    accuracy                         0.7825    134093
   macro avg     0.6677    0.8229    0.6846    134093
weighted avg     0.8976    0.7825    0.8152    134093

üìâ CONFUSION MATRIX
[[89822 27050]
 [ 2114 15107]]

üéØ CLASSE SEVERA (1)
Recall    : 0.877242901109111
Precision : 0.3583509262993097
F1-score  : 0.508841658526727

‚ùó Errori critici
False Negatives: 2114
False Positives: 27050

üèÜ STACKING COMPLETATO ‚Äî TEST SET NON UTILIZZATO


In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix
)

# ======================================================
# 1Ô∏è‚É£ PROBABILIT√Ä STACKING SU VALIDATION
# ======================================================

val_probs = meta_model.predict_proba(X_val_meta)[:, 1]

# ======================================================
# 2Ô∏è‚É£ SWEEP SOGLIA
# ======================================================

thresholds = np.arange(0.20, 0.51, 0.05)

rows = []

for th in thresholds:
    val_pred = (val_probs >= th).astype(int)

    recall = recall_score(y_val, val_pred)
    precision = precision_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred)

    tn, fp, fn, tp = confusion_matrix(y_val, val_pred).ravel()

    rows.append({
        "threshold": th,
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "false_negatives": fn,
        "false_positives": fp
    })

results = pd.DataFrame(rows)

# ======================================================
# 3Ô∏è‚É£ RISULTATI
# ======================================================

print("\nüìä TUNING SOGLIA ‚Äî STACKING (VALIDATION ONLY)")
print(results)

# ======================================================
# 4Ô∏è‚É£ FILTRO CONSIGLIATO (recall minimo)
# ======================================================

print("\nüéØ Soglie con Recall ‚â• 0.88")
print(results[results["recall"] >= 0.88])


NameError: name 'X_val_meta' is not defined

In [3]:
# ======================================================
# STAGE 2 ‚Äî STACKING ENSEMBLE (OOF, NO LEAKAGE)
# RF + XGB ‚Üí META MODEL
# ======================================================

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier

# ======================================================
# 1Ô∏è‚É£ PATH
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step6.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step6.csv"

TARGET = "IS_SEVERE"

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)

# ======================================================
# 3Ô∏è‚É£ CLASS IMBALANCE
# ======================================================

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("scale_pos_weight:", scale_pos_weight)

# ======================================================
# 4Ô∏è‚É£ OOF META-FEATURES (TRAIN)
# ======================================================

skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

train_meta = pd.DataFrame(
    index=X_train.index,
    columns=["rf_prob", "xgb_prob"]
)

print("\nüöÄ Generating OOF meta-features...")

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"Fold {fold}")

    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr = y_train.iloc[tr_idx]

    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        min_samples_leaf=10,
        n_jobs=-1,
        random_state=42,
        class_weight="balanced"
    )

    xgb = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42
    )

    rf.fit(X_tr, y_tr)
    xgb.fit(X_tr, y_tr)

    train_meta.loc[X_va.index, "rf_prob"]  = rf.predict_proba(X_va)[:, 1]
    train_meta.loc[X_va.index, "xgb_prob"] = xgb.predict_proba(X_va)[:, 1]

train_meta = train_meta.astype(float)

print("‚úÖ OOF meta-features generated")

# ======================================================
# 5Ô∏è‚É£ FIT BASE MODELS SU TUTTO TRAIN (PER VALIDATION)
# ======================================================

rf_final = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

xgb_final = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

print("\nüöÄ Training base models on FULL TRAIN...")
rf_final.fit(X_train, y_train)
xgb_final.fit(X_train, y_train)
print("‚úÖ Base models trained")

# ======================================================
# 6Ô∏è‚É£ META-FEATURES (VALIDATION)
# ======================================================

val_meta = pd.DataFrame({
    "rf_prob":  rf_final.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb_final.predict_proba(X_val)[:, 1]
})

# ======================================================
# 7Ô∏è‚É£ META-MODEL
# ======================================================

meta_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42
)

print("\nüöÄ Training meta-model...")
meta_model.fit(train_meta, y_train)
print("‚úÖ Meta-model trained")

# ======================================================
# 8Ô∏è‚É£ PREDICTION (STACKING)
# ======================================================

THRESHOLD = 0.60   # ‚Üê REGOLABILE

val_probs = meta_model.predict_proba(val_meta)[:, 1]
val_pred  = (val_probs >= THRESHOLD).astype(int)

# ======================================================
# 9Ô∏è‚É£ METRICHE FINALI
# ======================================================

print("\nüìä CLASSIFICATION REPORT (STACKING ‚Äî VALIDATION)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

print("\nüéØ CLASSE SEVERA (1)")
print("Recall    :", recall_score(y_val, val_pred))
print("Precision :", precision_score(y_val, val_pred))
print("F1-score  :", f1_score(y_val, val_pred))

false_negatives = ((y_val == 1) & (val_pred == 0)).sum()
false_positives = ((y_val == 0) & (val_pred == 1)).sum()

print("\n‚ùó Errori critici")
print("False Negatives:", false_negatives)
print("False Positives:", false_positives)

print("\nüèÜ STACKING OOF COMPLETATO ‚Äî TEST SET NON UTILIZZATO")


Train shape: (536370, 78)
Val shape: (134093, 78)
scale_pos_weight: 6.786794808513109

üöÄ Generating OOF meta-features...
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
‚úÖ OOF meta-features generated

üöÄ Training base models on FULL TRAIN...
‚úÖ Base models trained

üöÄ Training meta-model...
‚úÖ Meta-model trained

üìä CLASSIFICATION REPORT (STACKING ‚Äî VALIDATION)
              precision    recall  f1-score   support

           0     0.9658    0.8609    0.9103    116872
           1     0.4565    0.7928    0.5794     17221

    accuracy                         0.8522    134093
   macro avg     0.7111    0.8269    0.7449    134093
weighted avg     0.9004    0.8522    0.8678    134093

üìâ CONFUSION MATRIX
[[100620  16252]
 [  3568  13653]]

üéØ CLASSE SEVERA (1)
Recall    : 0.792811102723419
Precision : 0.45654572813910715
F1-score  : 0.5794253702839197

‚ùó Errori critici
False Negatives: 3568
False Positives: 16252

üèÜ STACKING OOF COMPLETATO ‚Äî TEST SET NON UTILIZZATO


In [6]:
# ======================================================
# THRESHOLD TUNING ‚Äî STACKING META-MODEL
# ======================================================

import numpy as np
import pandas as pd

from sklearn.metrics import (
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# ======================================================
# 1Ô∏è‚É£ PROBABILIT√Ä DEL META-MODEL (VALIDATION)
# ======================================================

val_probs = meta_model.predict_proba(val_meta)[:, 1]

# ======================================================
# 2Ô∏è‚É£ SWEEP DELLE SOGLIE
# ======================================================

thresholds = np.arange(0.10, 0.61, 0.02)
results = []

for t in thresholds:
    val_pred = (val_probs >= t).astype(int)

    recall = recall_score(y_val, val_pred)
    precision = precision_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred)

    tn, fp, fn, tp = confusion_matrix(y_val, val_pred).ravel()

    results.append({
        "threshold": round(t, 2),
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "false_negatives": fn,
        "false_positives": fp
    })

# ======================================================
# 3Ô∏è‚É£ RISULTATI ORDINATI
# ======================================================

results_df = pd.DataFrame(results)

results_df = results_df.sort_values(
    by="f1", ascending=False
).reset_index(drop=True)

print("\nüìä THRESHOLD TUNING RESULTS (sorted by F1)\n")
print(results_df.head(15))

# ======================================================
# 4Ô∏è‚É£ SOGLIA MIGLIORE (AUTOMATICA)
# ======================================================

best = results_df.iloc[0]

BEST_THRESHOLD = best["threshold"]

print("\nüèÜ BEST THRESHOLD SELECTED")
print("Threshold       :", BEST_THRESHOLD)
print("Recall          :", round(best["recall"], 4))
print("Precision       :", round(best["precision"], 4))
print("F1-score        :", round(best["f1"], 4))
print("False Negatives :", int(best["false_negatives"]))
print("False Positives :", int(best["false_positives"]))

# ======================================================
# 5Ô∏è‚É£ METRICHE FINALI CON SOGLIA SCELTA
# ======================================================

final_pred = (val_probs >= BEST_THRESHOLD).astype(int)

print("\nüìâ CONFUSION MATRIX (FINAL)")
print(confusion_matrix(y_val, final_pred))

print("\nüìä CLASSIFICATION REPORT (FINAL)\n")
print(classification_report(y_val, final_pred, digits=4))

# ======================================================
# 6Ô∏è‚É£ ANALISI ALTERNATIVA (OPZIONALE)
# ======================================================

print("\nüîé TOP 5 SOGLIE ‚Äî MIN FALSE NEGATIVES\n")
print(
    results_df.sort_values(
        by=["false_negatives", "false_positives"]
    ).head(5)
)

print("\nüîé TOP 5 SOGLIE ‚Äî MIN FALSE POSITIVES\n")
print(
    results_df.sort_values(
        by=["false_positives", "false_negatives"]
    ).head(5)
)

print("\n‚úÖ THRESHOLD TUNING COMPLETATO")



üìä THRESHOLD TUNING RESULTS (sorted by F1)

    threshold    recall  precision        f1  false_negatives  false_positives
0        0.60  0.791650   0.459860  0.581774             3588            16013
1        0.58  0.800418   0.449635  0.575809             3437            16872
2        0.56  0.809012   0.440245  0.570201             3289            17714
3        0.54  0.817432   0.431003  0.564412             3144            18584
4        0.52  0.824981   0.422387  0.558715             3014            19428
5        0.50  0.832007   0.414715  0.553525             2893            20221
6        0.48  0.838395   0.407014  0.547994             2783            21035
7        0.46  0.843853   0.399308  0.542097             2689            21861
8        0.44  0.849486   0.390732  0.535263             2592            22811
9        0.42  0.856919   0.383010  0.529399             2464            23772
10       0.40  0.863887   0.376233  0.524179             2344            24665
11   

In [5]:
# ======================================================
# STACKING MODEL ‚Äî TRAINING & EVALUATION
# ======================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier

# ======================================================
# 1Ô∏è‚É£ PATH & PARAMS
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step5.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step5.csv"

TARGET = "IS_SEVERE"
THRESHOLD = 0.60   # soglia scelta dal tuning

# ======================================================
# 2Ô∏è‚É£ LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)

# ======================================================
# 3Ô∏è‚É£ CLASS IMBALANCE
# ======================================================

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("scale_pos_weight:", scale_pos_weight)

# ======================================================
# 4Ô∏è‚É£ BASE MODELS (LEVEL 0)
# ======================================================

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

# ======================================================
# 5Ô∏è‚É£ OOF META-FEATURES (TRAIN)
# ======================================================

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_meta = np.zeros((len(train), 2))

print("\nüöÄ Generating OOF meta-features...")
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"Fold {fold}")

    rf.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    xgb.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])

    train_meta[val_idx, 0] = rf.predict_proba(X_train.iloc[val_idx])[:, 1]
    train_meta[val_idx, 1] = xgb.predict_proba(X_train.iloc[val_idx])[:, 1]

print("‚úÖ OOF meta-features generated")

# ======================================================
# 6Ô∏è‚É£ TRAIN BASE MODELS ON FULL TRAIN
# ======================================================

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

val_meta = pd.DataFrame({
    "rf_prob":  rf.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb.predict_proba(X_val)[:, 1]
})

# ======================================================
# 7Ô∏è‚É£ META-MODEL (LEVEL 1)
# ======================================================

meta_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42
)

print("\nüöÄ Training meta-model...")
meta_model.fit(train_meta, y_train)
print("‚úÖ Meta-model trained")

# ======================================================
# 8Ô∏è‚É£ FINAL PREDICTIONS
# ======================================================

val_probs = meta_model.predict_proba(val_meta)[:, 1]
val_pred  = (val_probs >= THRESHOLD).astype(int)

# ======================================================
# 9Ô∏è‚É£ METRICHE DI VALUTAZIONE
# ======================================================

print("\nüìä CLASSIFICATION REPORT (STACKING ‚Äî VALIDATION)")
print(classification_report(y_val, val_pred, digits=4))

print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, val_pred))

print("\nüéØ CLASSE SEVERA (1)")
print("Recall    :", recall_score(y_val, val_pred))
print("Precision :", precision_score(y_val, val_pred))
print("F1-score  :", f1_score(y_val, val_pred))

false_negatives = ((y_val == 1) & (val_pred == 0)).sum()
false_positives = ((y_val == 0) & (val_pred == 1)).sum()

print("\n‚ùó Errori critici")
print("False Negatives:", false_negatives)
print("False Positives:", false_positives)

print("\nüèÜ STACKING MODEL EVALUATION COMPLETATA ‚Äî TEST SET NON UTILIZZATO")


Train shape: (536370, 77)
Val shape: (134093, 77)
scale_pos_weight: 6.786794808513109

üöÄ Generating OOF meta-features...
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
‚úÖ OOF meta-features generated

üìâ FALSE NEGATIVES: 3588
‚úÖ TRUE POSITIVES : 13633

üîç TOP 15 DIFFERENZE NUMERICHE (FN vs TP)
                      feature     FN_mean     TP_mean  delta_FN_minus_TP
74      fc_age_x_num_symptoms  233.008082  514.838040        -281.829958
1                     NUMDAYS   11.154682  152.600748        -141.446066
0                     AGE_YRS   51.372074   65.675933         -14.303860
75  fc_history_x_num_symptoms    4.664716    8.260471          -3.595755
3              NUMERO_SINTOMI    4.664716    8.260471          -3.595755
76   fc_age_x_history_cardiac    0.116778    0.830118          -0.713340
68       num_symp_respiratory    0.115385    0.497103          -0.381718
2             VAX_DOSE_SERIES    1.612040    1.849703          -0.237663
5                       SEX_M    0.307971    0.53495



In [4]:
import pandas as pd
import numpy as np

# ======================================================
# PATH
# ======================================================

TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step5.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step5.csv"

OUT_TRAIN = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step6.csv"
OUT_VAL   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step6.csv"

# ======================================================
# LOAD DATA
# ======================================================

train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

print("Train shape (before):", train.shape)
print("Val shape   (before):", val.shape)

# ======================================================
# FEATURE ENGINEERING
# ratio_symp_respiratory
# ======================================================

def add_ratio_feature(df):
    df["ratio_symp_respiratory"] = (
        df["num_symp_respiratory"] /
        df["num_symp_total"].replace(0, np.nan)
    )
    df["ratio_symp_respiratory"] = df["ratio_symp_respiratory"].fillna(0)
    return df

train = add_ratio_feature(train)
val   = add_ratio_feature(val)

print("Train shape (after):", train.shape)
print("Val shape   (after):", val.shape)

# ======================================================
# SAVE
# ======================================================

train.to_csv(OUT_TRAIN, index=False)
val.to_csv(OUT_VAL, index=False)

print("\n‚úÖ DATASET STEP 6 CREATO")
print("Train:", OUT_TRAIN)
print("Val  :", OUT_VAL)
print("‚ùå Test set NON toccato")


Train shape (before): (536370, 78)
Val shape   (before): (134093, 78)
Train shape (after): (536370, 79)
Val shape   (after): (134093, 79)

‚úÖ DATASET STEP 6 CREATO
Train: /Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step6.csv
Val  : /Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step6.csv
‚ùå Test set NON toccato


In [1]:
# ======================================================
# FULL PIPELINE: TUNING -> STACKING -> PRODUCTION EXPORT
# ======================================================

import pandas as pd
import numpy as np
import joblib
import time

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ======================================================
# 1Ô∏è‚É£ CONFIGURAZIONE & PATH
# ======================================================

# Sostituisci con i tuoi percorsi reali
TRAIN_PATH = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/train_step6.csv"
VAL_PATH   = "/Users/marcodonatiello/PycharmProjects/JupyterProject/data/interim/splits/val_step6.csv"
OUTPUT_MODEL_NAME = "severe_model_stacking_production.pkl"

TARGET = "IS_SEVERE"
N_SPLITS = 5
SEED = 42

# Quante combinazioni di parametri provare per ogni modello?
# Metti 10 per un test veloce, 50+ per il risultato definitivo.
N_ITER_SEARCH = 10

# Soglie da testare nel post-processing
THRESHOLDS = np.arange(0.20, 0.61, 0.02)

# ======================================================
# 2Ô∏è‚É£ CARICAMENTO DATI
# ======================================================

print("üìÇ Caricamento dataset...")
train = pd.read_csv(TRAIN_PATH)
val   = pd.read_csv(VAL_PATH)

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_val = val.drop(columns=[TARGET])
y_val = val[TARGET]

# Calcolo sbilanciamento per XGB/LGBM
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"‚úÖ Dati caricati. Scale Pos Weight calcolato: {scale_pos_weight:.2f}")

# ======================================================
# 3Ô∏è‚É£ HYPERPARAMETER TUNING (FASE NUOVA)
# ======================================================
# Questa fase cerca i parametri migliori invece di usarne di casuali.

print(f"\n‚öôÔ∏è AVVIO TUNING (n_iter={N_ITER_SEARCH})... attendere...")

# --- A. Tuning Random Forest ---
rf_params_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 15, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

print("   ...Tuning Random Forest...")
rf_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    param_distributions=rf_params_dist,
    n_iter=N_ITER_SEARCH,
    scoring='f1', # O 'roc_auc' a seconda di cosa preferisci
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=0
)
rf_search.fit(X_train, y_train)
best_rf_params = rf_search.best_params_
print(f"   ‚úÖ Best RF Params: {best_rf_params}")

# --- B. Tuning XGBoost ---
xgb_params_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight': [scale_pos_weight] # Fisso o varia leggermente
}

print("   ...Tuning XGBoost...")
xgb_search = RandomizedSearchCV(
    XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=SEED, n_jobs=-1),
    param_distributions=xgb_params_dist,
    n_iter=N_ITER_SEARCH,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=0
)
xgb_search.fit(X_train, y_train)
best_xgb_params = xgb_search.best_params_
print(f"   ‚úÖ Best XGB Params: {best_xgb_params}")

# --- C. Tuning LightGBM ---
lgbm_params_dist = {
    'n_estimators': [200, 400, 600],
    'num_leaves': [31, 50, 64, 80],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'class_weight': ['balanced']
}

print("   ...Tuning LightGBM...")
lgbm_search = RandomizedSearchCV(
    LGBMClassifier(random_state=SEED, n_jobs=-1, verbose=-1),
    param_distributions=lgbm_params_dist,
    n_iter=N_ITER_SEARCH,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=0
)
lgbm_search.fit(X_train, y_train)
best_lgbm_params = lgbm_search.best_params_
print(f"   ‚úÖ Best LGBM Params: {best_lgbm_params}")

# ======================================================
# 4Ô∏è‚É£ INIZIALIZZAZIONE MODELLI (CON PARAMETRI OTTIMIZZATI)
# ======================================================

print("\nüöÄ Inizializzazione modelli con i parametri migliori...")

rf = RandomForestClassifier(**best_rf_params, random_state=SEED, n_jobs=-1)
xgb = XGBClassifier(**best_xgb_params, eval_metric="logloss", random_state=SEED, n_jobs=-1)
lgbm = LGBMClassifier(**best_lgbm_params, random_state=SEED, n_jobs=-1, verbose=-1)

# ======================================================
# 5Ô∏è‚É£ GENERAZIONE META-FEATURES (STACKING)
# ======================================================

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

train_meta = pd.DataFrame(np.zeros((len(train), 3)), columns=["rf_prob", "xgb_prob", "lgbm_prob"])

print("\nüîÑ Generazione OOF meta-features (Training Livello 0)...")

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_tr_fold, y_tr_fold = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val_fold = X_train.iloc[val_idx]

    # Fit su fold corrente
    rf.fit(X_tr_fold, y_tr_fold)
    xgb.fit(X_tr_fold, y_tr_fold)
    lgbm.fit(X_tr_fold, y_tr_fold)

    # Previsioni out-of-fold
    train_meta.iloc[val_idx, 0] = rf.predict_proba(X_val_fold)[:, 1]
    train_meta.iloc[val_idx, 1] = xgb.predict_proba(X_val_fold)[:, 1]
    train_meta.iloc[val_idx, 2] = lgbm.predict_proba(X_val_fold)[:, 1]

    print(f"   -> Fold {fold}/{N_SPLITS} completato")

# ======================================================
# 6Ô∏è‚É£ ADDESTRAMENTO FINALE BASE MODELS (FULL TRAIN)
# ======================================================
# Ora che abbiamo le meta-features, ri-addestriamo i modelli base
# su TUTTO il train set per averli pronti per la validazione e la produzione.

print("\nüèãÔ∏è Addestramento modelli base su tutto il dataset...")
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train)

# Creiamo le meta-features per il validation set
val_meta = pd.DataFrame({
    "rf_prob": rf.predict_proba(X_val)[:, 1],
    "xgb_prob": xgb.predict_proba(X_val)[:, 1],
    "lgbm_prob": lgbm.predict_proba(X_val)[:, 1]
})

# ======================================================
# 7Ô∏è‚É£ ADDESTRAMENTO META-MODEL & TUNING SOGLIA
# ======================================================

meta_model = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=SEED)
meta_model.fit(train_meta, y_train)
print("‚úÖ Meta-model addestrato")

# Probabilit√† finali sul validation set
val_probs = meta_model.predict_proba(val_meta)[:, 1]

# Ricerca soglia ottimale
rows = []
for t in THRESHOLDS:
    pred = (val_probs >= t).astype(int)
    rows.append({
        "threshold": t,
        "f1": f1_score(y_val, pred),
        "recall": recall_score(y_val, pred),
        "precision": precision_score(y_val, pred)
    })

thr_df = pd.DataFrame(rows).sort_values("f1", ascending=False)
BEST_T = thr_df.iloc[0]["threshold"]
BEST_F1 = thr_df.iloc[0]["f1"]

print(f"\nüèÜ BEST THRESHOLD: {BEST_T:.2f} (F1-Score: {BEST_F1:.4f})")

# ======================================================
# 8Ô∏è‚É£ REPORT FINALE
# ======================================================

final_val_pred = (val_probs >= BEST_T).astype(int)

print("\nüìä CLASSIFICATION REPORT (VALIDATION SET)")
print(classification_report(y_val, final_val_pred, digits=4))
print("üìâ CONFUSION MATRIX")
print(confusion_matrix(y_val, final_val_pred))

# ======================================================
# 9Ô∏è‚É£ SALVATAGGIO PER PRODUZIONE
# ======================================================

print(f"\nüíæ Salvataggio modello per produzione in '{OUTPUT_MODEL_NAME}'...")

production_bundle = {
    "rf_model": rf,             # Modello fittato su tutto X_train
    "xgb_model": xgb,           # Modello fittato su tutto X_train
    "lgbm_model": lgbm,         # Modello fittato su tutto X_train
    "meta_model": meta_model,   # Modello fittato sulle meta-features
    "threshold": BEST_T,        # La soglia che abbiamo trovato
    "features_order": X_train.columns.tolist() # Ordine colonne per sicurezza
}

joblib.dump(production_bundle, OUTPUT_MODEL_NAME)

print("‚úÖ DONE! Il file .pkl √® pronto per essere usato in produzione.")

üìÇ Caricamento dataset...
‚úÖ Dati caricati. Scale Pos Weight calcolato: 6.79

‚öôÔ∏è AVVIO TUNING (n_iter=10)... attendere...
   ...Tuning Random Forest...
   ‚úÖ Best RF Params: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 30, 'class_weight': 'balanced_subsample'}
   ...Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


   ‚úÖ Best XGB Params: {'subsample': 0.6, 'scale_pos_weight': np.float64(6.786794808513109), 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
   ...Tuning LightGBM...
   ‚úÖ Best LGBM Params: {'subsample': 0.9, 'num_leaves': 64, 'n_estimators': 600, 'max_depth': 20, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'class_weight': 'balanced'}

üöÄ Inizializzazione modelli con i parametri migliori...

üîÑ Generazione OOF meta-features (Training Livello 0)...
   -> Fold 1/5 completato
   -> Fold 2/5 completato
   -> Fold 3/5 completato
   -> Fold 4/5 completato
   -> Fold 5/5 completato

üèãÔ∏è Addestramento modelli base su tutto il dataset...
‚úÖ Meta-model addestrato

üèÜ BEST THRESHOLD: 0.60 (F1-Score: 0.5886)

üìä CLASSIFICATION REPORT (VALIDATION SET)
              precision    recall  f1-score   support

           0     0.9644    0.8712    0.9154    116872
           1     0.4721    0.7814    0.5886     17221

    accuracy                     