In [None]:
# ==========================================================
# BLOCCO 1: Lettura dataset salvati
# ==========================================================
import pandas as pd
import joblib
import os

# Folder that holds the saved datasets and fitted artifacts
data_folder = "model_data"


def _in_data_folder(filename):
    """Return the path of `filename` inside `data_folder`."""
    return os.path.join(data_folder, filename)


# 1️⃣ Train/test sets: features are kept as DataFrames, targets are squeezed
#    (presumably single-column CSVs, so squeeze() yields a Series — verify)
X_train = pd.read_csv(_in_data_folder("X_train_balanced.csv"))
X_test = pd.read_csv(_in_data_folder("X_test_balanced.csv"))
y_train = pd.read_csv(_in_data_folder("y_train_balanced.csv")).squeeze()
y_test = pd.read_csv(_in_data_folder("y_test_balanced.csv")).squeeze()

# 2️⃣ Scaler (optional, only needed to transform new data)
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
scaler_latent = joblib.load(_in_data_folder("scaler_latent.pkl"))

# 3️⃣ Sanity check: shapes and class distribution of both splits
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts()}")
print(f"y_test distribution:\n{y_test.value_counts()}")


In [None]:
# ==========================================================
# BLOCCO 2: Random Forest con cross-validation 5-fold e GridSearch
# ==========================================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import joblib
# Class weights saved next to the datasets. The path is built from `data_folder`
# (defined in block 1) so that every artifact is resolved from the same place
# instead of repeating the folder name as a literal.
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
class_weights_dict = joblib.load(os.path.join(data_folder, "class_weights_dict.pkl"))
print("⚖️ Class weights caricati:", class_weights_dict)


print("🏗️ Addestramento Random Forest con 5-fold CV e GridSearch...")

# 1️⃣ Base estimator handed to the grid search (fixed seed, loaded class weights)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights_dict)

# 2️⃣ Hyper-parameter grid over the main tree-shape parameters (kept small for speed)
param_grid = dict(
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Stratified, shuffled 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by macro F1; n_jobs=-1 runs the search in parallel
grid_search = GridSearchCV(rf, param_grid, scoring='f1_macro', cv=cv, n_jobs=-1, verbose=2)

# 3️⃣ Fit on the training set
grid_search.fit(X_train, y_train)

# 4️⃣ Best model found by the search
best_rf = grid_search.best_estimator_
print(f"\n🏆 Miglior combinazione iperparametri: {grid_search.best_params_}")

# 5️⃣ Predictions on the test set
y_pred = best_rf.predict(X_test)

# 6️⃣ Macro-averaged metrics
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Per-class specificity, one-vs-rest: TN / (TN + FP).
# labels= pins the row/column order of the matrix to best_rf.classes_, so index i
# in the loop below always refers to the class being printed (even if a class
# does not occur in y_test / y_pred).
cm = confusion_matrix(y_test, y_pred, labels=best_rf.classes_)
true_pos = cm.diagonal()
false_pos = cm.sum(axis=0) - true_pos   # predicted as the class, actually another one
false_neg = cm.sum(axis=1) - true_pos   # actually the class, predicted as another one
true_neg = cm.sum() - true_pos - false_pos - false_neg
# The previous formula (diagonal / row sum) is per-class recall, not specificity.
# A class with no negative samples in the test set yields NaN here.
specificity = true_neg / (true_neg + false_pos)

print("\n📊 Metriche su test set:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
for i, cls in enumerate(best_rf.classes_):
    print(f"Specificity classe '{cls}': {specificity[i]:.4f}")


In [None]:
# ==========================================================
# BLOCCO 2b: Grafici di valutazione modello
# ==========================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, precision_recall_curve, auc, RocCurveDisplay

print("📊 Visualizzazione performance Random Forest...")

# 1️⃣ Row-normalised confusion matrix.
# from_estimator creates a new figure when no Axes is given, which would leave
# the sized figure empty and draw the matrix at the default size; the Axes is
# therefore created first and passed in explicitly.
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay.from_estimator(
    best_rf, X_test, y_test, cmap='Blues', normalize='true', ax=ax_cm
)
ax_cm.set_title("📊 Confusion Matrix (normalizzata)")
plt.show()

# 2️⃣ Precision-recall curve for each class (one-vs-rest)
y_prob = best_rf.predict_proba(X_test)
_, n_classes = y_prob.shape

plt.figure(figsize=(8,6))
# Columns of y_prob follow best_rf.classes_, so pair each class with its column
for cls, cls_scores in zip(best_rf.classes_, y_prob.T):
    is_cls = (y_test==cls).astype(int)  # binary target: this class vs all others
    precision_i, recall_i, _ = precision_recall_curve(is_cls, cls_scores)
    plt.plot(
        recall_i,
        precision_i,
        label=f'{cls} (AUC={auc(recall_i, precision_i):.2f})',
    )
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("📊 Precision-Recall Curve per classe")
plt.legend()
plt.grid(True)
plt.show()

# 3️⃣ Feature importances of the best forest, largest first
importances_by_feature = pd.Series(best_rf.feature_importances_, index=X_train.columns)
feat_importance = importances_by_feature.sort_values(ascending=False)

plt.figure(figsize=(10,6))
# Horizontal bars: importance on x, feature name on y
sns.barplot(
    x=feat_importance.values,
    y=feat_importance.index,
    palette='viridis',
)
plt.title("📊 Feature Importance Random Forest")
plt.xlabel("Importanza")
plt.ylabel("Feature")
plt.show()

# 4️⃣ One-vs-rest ROC curves, all classes on the same Axes.
# from_predictions creates a new figure per call when no Axes is given, which
# would leave the sized figure empty and scatter the curves over separate
# figures; passing the shared Axes keeps them in one titled plot.
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
for i, cls in enumerate(best_rf.classes_):
    RocCurveDisplay.from_predictions(
        (y_test==cls).astype(int),  # binary target: this class vs all others
        y_prob[:,i],
        name=str(cls),
        ax=ax_roc,
    )
ax_roc.set_title("📊 ROC Curve per classe")
plt.show()
