In [None]:
# ==========================================================
# BLOCCO 1: Caricamento dataset e scaler per training modelli
# ==========================================================
import pandas as pd
import joblib
from tensorflow.keras.models import load_model

print("📂 Caricamento dataset e scaler...")

# 1️⃣ Load the pre-scaled feature matrices and the target vectors from disk.
X_train_scaled = pd.read_csv("model_data/X_train_scaled.csv")
X_test_scaled = pd.read_csv("model_data/X_test_scaled.csv")
# squeeze("columns") turns a single-column frame into a Series; unlike a bare
# squeeze() it never collapses a one-row file all the way down to a scalar.
y_train = pd.read_csv("model_data/y_train.csv").squeeze("columns")
y_test = pd.read_csv("model_data/y_test.csv").squeeze("columns")

print("✅ Dataset caricati:")
print(f"   X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}")

# 2️⃣ Load the fitted scaler.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts produced by a trusted pipeline.
scaler_latent = joblib.load("model_data/scaler_latent.pkl")
print("✅ Scaler caricato.")

# 3️⃣ (Optional) Load the encoder.
# Bind the name up front so later cells can test `encoder is None` instead of
# hitting a NameError when the optional model file is missing or unreadable.
encoder = None
try:
    encoder = load_model("model_data/encoder_best.keras")
    print("✅ Encoder caricato con successo.")
except Exception as e:
    # Deliberately best-effort: the encoder is optional for the cells below.
    print(f"⚠️ Encoder non trovato o non necessario: {e}")

print("\n🚀 Pronto per l'addestramento dei modelli!")


In [None]:
# ==========================================================
# BLOCCO A: Random Forest con RandomizedSearchCV + Metriche
# ==========================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import time

print("🌲 Addestramento Random Forest sullo spazio latente (RandomizedSearchCV)...")

# 1️⃣ Per-class weights to compensate for class imbalance.
class_labels = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)
weights_dict = dict(zip(class_labels, class_weights))
print("\n⚖️ Pesi di bilanciamento per classe:")
for k, v in weights_dict.items():
    print(f"  Classe {k}: {v:.3f}")

# 2️⃣ Candidate hyper-parameter values sampled by RandomizedSearchCV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf_clf = RandomForestClassifier(
    class_weight=weights_dict,
    n_jobs=-1,
    random_state=42
)

# 3️⃣ Stratified 3-fold cross-validation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 4️⃣ Randomized search configuration.
rand_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=20,  # sample 20 random combinations
    scoring='f1_weighted',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# 5️⃣ Fit the search and report wall-clock time.
start = time.time()
rand_search.fit(X_train_scaled, y_train)
elapsed = time.time() - start
print(f"\n✅ RandomizedSearchCV completato in {elapsed/60:.2f} minuti.")

print("\n🏆 Migliori iperparametri trovati:")
print(rand_search.best_params_)
print("✅ Best weighted F1:", rand_search.best_score_)

# 6️⃣ Best model (refit on the full training set by RandomizedSearchCV).
best_rf = rand_search.best_estimator_

# 7️⃣ Predictions on the test set.
y_pred = best_rf.predict(X_test_scaled)
y_prob = best_rf.predict_proba(X_test_scaled)

# 8️⃣ Main metrics.
print("\n📊 Report di classificazione (test set):")
print(classification_report(y_test, y_pred, digits=4))

# Specificity (true negative rate), one-vs-rest per class.
# The confusion-matrix diagonal holds the TRUE POSITIVES; using it as "tn"
# would yield precision, not specificity. True negatives for a class are all
# samples that are neither in that class's row nor in its column.
cm = confusion_matrix(y_test, y_pred)
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp   # predicted as the class, but actually another one
fn = cm.sum(axis=1) - tp   # actually the class, but predicted as another one
tn = cm.sum() - (tp + fp + fn)
specificity = tn / (tn + fp)
print("\n📈 Specificità media (macro): {:.4f}".format(np.mean(specificity)))


In [None]:
# ==========================================================
# BLOCCO B: Grafici - Confusion Matrix, ROC, PR, Feature Importance
# ==========================================================
import matplotlib.pyplot as plt
import seaborn as sns
# roc_curve was used below without ever being imported (NameError); auc and
# precision_recall_curve are re-imported so this cell lists what it needs.
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from sklearn.preprocessing import label_binarize

# 1️⃣ Confusion matrix (heatmap + table).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax[0])
ax[0].set_title("Matrice di Confusione - Grafico")
ax[0].set_xlabel("Predetto")
ax[0].set_ylabel("Reale")

cm_df = pd.DataFrame(cm, 
                     index=[f"Reale {c}" for c in best_rf.classes_],
                     columns=[f"Pred {c}" for c in best_rf.classes_])
ax[1].axis("off")
ax[1].table(cellText=cm_df.values, colLabels=cm_df.columns, rowLabels=cm_df.index, loc='center')
ax[1].set_title("Matrice di Confusione - Tabella")
plt.tight_layout()
plt.show()

# 2️⃣ / 3️⃣ One-vs-rest ROC and Precision-Recall curves (multiclass only).
# Both plots share the binarized targets, so they live under a single guard.
if len(np.unique(y_test)) > 2:
    class_labels = np.unique(y_train)
    # Column i of y_test_bin and of y_prob both refer to class_labels[i].
    y_test_bin = label_binarize(y_test, classes=class_labels)

    fpr, tpr, roc_auc = {}, {}, {}
    for i, cls in enumerate(class_labels):
        fpr[cls], tpr[cls], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        roc_auc[cls] = auc(fpr[cls], tpr[cls])

    plt.figure(figsize=(8, 6))
    for cls in roc_auc.keys():
        plt.plot(fpr[cls], tpr[cls], label=f"Classe {cls} (AUC = {roc_auc[cls]:.3f})")
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.title("Curva ROC per classe")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.figure(figsize=(8, 6))
    for i, cls in enumerate(class_labels):
        precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(recall, precision, label=f"Classe {cls}")
    plt.title("Curva Precision-Recall per classe")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid(True)
    plt.show()

# 4️⃣ Feature importances of the fitted forest, top 15.
importances = pd.Series(best_rf.feature_importances_, index=X_train_scaled.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=importances.values[:15], y=importances.index[:15], palette="viridis")
plt.title("Top 15 Feature Importances (Latent Features)")
plt.xlabel("Importanza")
plt.ylabel("Feature Latente")
plt.tight_layout()
plt.show()


In [None]:
# ==========================================================
# BLOCCO A: Gradient Boosting con RandomizedSearchCV + Metriche
# ==========================================================
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import time

print("🌟 Addestramento Gradient Boosting sullo spazio latente (RandomizedSearchCV)...")

# 1️⃣ Per-class weights to compensate for class imbalance.
class_labels = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)
weights_dict = dict(zip(class_labels, class_weights))
print("\n⚖️ Pesi di bilanciamento per classe:")
for k, v in weights_dict.items():
    print(f"  Classe {k}: {v:.3f}")

# 2️⃣ Candidate hyper-parameter values sampled by RandomizedSearchCV.
# HistGradientBoostingClassifier.max_features is a float proportion in (0, 1]
# (scikit-learn >= 1.4); the RandomForest-style 'sqrt' / 'log2' / None values
# are rejected by parameter validation and would make every fit fail.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [20, 50, 100],
    'max_features': [0.5, 0.75, 1.0]
}

# 'categorical_crossentropy' was removed from scikit-learn; 'log_loss' is the
# current name and covers both the binary and the multiclass case.
gb_clf = HistGradientBoostingClassifier(
    loss='log_loss',
    random_state=42,
    class_weight=weights_dict
)

# 3️⃣ Stratified 3-fold cross-validation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 4️⃣ Randomized search configuration.
rand_search = RandomizedSearchCV(
    estimator=gb_clf,
    param_distributions=param_grid,
    n_iter=20,  # sample 20 random combinations
    scoring='f1_weighted',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# 5️⃣ Fit the search and report wall-clock time.
start = time.time()
rand_search.fit(X_train_scaled, y_train)
elapsed = time.time() - start
print(f"\n✅ RandomizedSearchCV completato in {elapsed/60:.2f} minuti.")

print("\n🏆 Migliori iperparametri trovati:")
print(rand_search.best_params_)
print("✅ Best weighted F1:", rand_search.best_score_)

# 6️⃣ Best model (refit on the full training set by RandomizedSearchCV).
best_gb = rand_search.best_estimator_

# 7️⃣ Predictions on the test set.
y_pred = best_gb.predict(X_test_scaled)
y_prob = best_gb.predict_proba(X_test_scaled)

# 8️⃣ Main metrics.
print("\n📊 Report di classificazione (test set):")
print(classification_report(y_test, y_pred, digits=4))

# Specificity (true negative rate), one-vs-rest per class.
# The confusion-matrix diagonal holds the TRUE POSITIVES; using it as "tn"
# would yield precision, not specificity. True negatives for a class are all
# samples that are neither in that class's row nor in its column.
cm = confusion_matrix(y_test, y_pred)
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp   # predicted as the class, but actually another one
fn = cm.sum(axis=1) - tp   # actually the class, but predicted as another one
tn = cm.sum() - (tp + fp + fn)
specificity = tn / (tn + fp)
print("\n📈 Specificità media (macro): {:.4f}".format(np.mean(specificity)))


In [None]:
# ==========================================================
# BLOCCO B: Grafici - Confusion Matrix, ROC, PR, Feature Importance
# ==========================================================
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
# roc_curve was used below without ever being imported (NameError); auc and
# precision_recall_curve are re-imported so this cell lists what it needs.
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from sklearn.preprocessing import label_binarize

# 1️⃣ Confusion matrix (heatmap + table).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", ax=ax[0])
ax[0].set_title("Matrice di Confusione - Grafico")
ax[0].set_xlabel("Predetto")
ax[0].set_ylabel("Reale")

cm_df = pd.DataFrame(cm, 
                     index=[f"Reale {c}" for c in best_gb.classes_],
                     columns=[f"Pred {c}" for c in best_gb.classes_])
ax[1].axis("off")
ax[1].table(cellText=cm_df.values, colLabels=cm_df.columns, rowLabels=cm_df.index, loc='center')
ax[1].set_title("Matrice di Confusione - Tabella")
plt.tight_layout()
plt.show()

# 2️⃣ / 3️⃣ One-vs-rest ROC and Precision-Recall curves (multiclass only).
# Both plots share the binarized targets, so they live under a single guard.
if len(np.unique(y_test)) > 2:
    class_labels = np.unique(y_train)
    # Column i of y_test_bin and of y_prob both refer to class_labels[i].
    y_test_bin = label_binarize(y_test, classes=class_labels)

    fpr, tpr, roc_auc = {}, {}, {}
    for i, cls in enumerate(class_labels):
        fpr[cls], tpr[cls], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        roc_auc[cls] = auc(fpr[cls], tpr[cls])

    plt.figure(figsize=(8, 6))
    for cls in roc_auc.keys():
        plt.plot(fpr[cls], tpr[cls], label=f"Classe {cls} (AUC = {roc_auc[cls]:.3f})")
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.title("Curva ROC per classe")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.figure(figsize=(8, 6))
    for i, cls in enumerate(class_labels):
        precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(recall, precision, label=f"Classe {cls}")
    plt.title("Curva Precision-Recall per classe")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid(True)
    plt.show()

# 4️⃣ Feature importances, top 15.
# HistGradientBoostingClassifier exposes no `feature_importances_` attribute,
# so importances are estimated by permutation on the test set, scored with the
# same metric used for model selection (weighted F1).
perm_result = permutation_importance(
    best_gb,
    X_test_scaled,
    y_test,
    scoring='f1_weighted',
    n_repeats=5,
    random_state=42,
    n_jobs=-1
)
importances = pd.Series(perm_result.importances_mean, index=X_train_scaled.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=importances.values[:15], y=importances.index[:15], palette="magma")
plt.title("Top 15 Permutation Importances (Latent Features)")
plt.xlabel("Importanza")
plt.ylabel("Feature Latente")
plt.tight_layout()
plt.show()
