In [6]:
# ==========================================================
# BLOCK 1: Load the previously saved datasets
# ==========================================================
import os

import joblib
import pandas as pd

# Directory that holds the saved artefacts
data_folder = "model_data"


def _saved_path(filename):
    """Return the path of `filename` inside `data_folder`."""
    return os.path.join(data_folder, filename)


# 1) Feature matrices for the train and test splits
X_train = pd.read_csv(_saved_path("X_train_balanced.csv"))
X_test = pd.read_csv(_saved_path("X_test_balanced.csv"))

# Labels are read from CSV as well; squeeze() collapses the resulting
# frame so that Series methods such as value_counts() can be used below.
y_train = pd.read_csv(_saved_path("y_train_balanced.csv")).squeeze()
y_test = pd.read_csv(_saved_path("y_test_balanced.csv")).squeeze()

# 2) Scaler (optional; only needed when transforming new data).
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
scaler_latent = joblib.load(_saved_path("scaler_latent.pkl"))

# 3) Sanity check: split shapes and per-class label counts
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts()}")
print(f"y_test distribution:\n{y_test.value_counts()}")


X_train: (38460, 16), X_test: (9615, 16)
y_train distribution:
label_tactic_reduced
Resource Development    12820
Discovery               12820
Reconnaissance          12820
Name: count, dtype: int64
y_test distribution:
label_tactic_reduced
Reconnaissance          3205
Resource Development    3205
Discovery               3205
Name: count, dtype: int64


In [None]:
# ==========================================================
# QUICK BLOCK: fast Random Forest with 5-fold CV grid search
# ==========================================================
# Tunes a RandomForestClassifier on the training split with a small grid
# search (stratified 5-fold CV, macro-F1 scoring), then evaluates the best
# estimator on the held-out test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

print("🏗️ Addestramento Random Forest veloce con 5-fold CV e GridSearch...")

# 1) Base model
rf = RandomForestClassifier(
    n_estimators=100,          # moderate number of trees
    random_state=42,
    class_weight='balanced'
)

# 2) Grid over the main hyper-parameters: 3 * 2 * 2 * 2 * 2 = 48 candidates
param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],   # added for variability
    'bootstrap': [True, False]          # also try without bootstrap
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',   # macro average for the multiclass problem
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# 3) Fit on the training set (48 candidates x 5 folds = 240 fits)
grid_search.fit(X_train, y_train)

# 4) Best model (refit on the full training set by GridSearchCV)
best_rf = grid_search.best_estimator_
print(f"\n🏆 Miglior combinazione iperparametri: {grid_search.best_params_}")

# 5) Predictions on the test set
y_pred = best_rf.predict(X_test)

# 6) Test-set metrics (macro-averaged over classes)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Per-class specificity, one-vs-rest: TN / (TN + FP).
# The previous formula, diag / (row_sum - diag + diag), reduces to
# diag / row_sum, which is per-class recall rather than specificity.
# `labels=best_rf.classes_` pins the row/column order of the matrix to the
# class order used when printing below.
cm = confusion_matrix(y_test, y_pred, labels=best_rf.classes_)
tp = cm.diagonal()
fp = cm.sum(axis=0) - tp          # predicted as the class, but belongs to another
fn = cm.sum(axis=1) - tp          # belongs to the class, but predicted as another
tn = cm.sum() - tp - fp - fn      # everything not involving the class
specificity = tn / (tn + fp)

print("\n📊 Metriche su test set:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
for i, cls in enumerate(best_rf.classes_):
    print(f"Specificity classe '{cls}': {specificity[i]:.4f}")


🏗️ Addestramento Random Forest veloce con 5-fold CV e GridSearch...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [None]:
# ==========================================================
# BLOCK 3: Plots and visual metrics for the Random Forest
# ==========================================================
# Draws, for the tuned `best_rf` on the test split: a row-normalised
# confusion matrix, one-vs-rest precision-recall and ROC curves per class,
# and the feature-importance ranking.
#
# Fixes relative to the previous version of this cell:
#   * `label_binarize` is imported from sklearn.preprocessing (it is not
#     exported by sklearn.metrics, so the old import raised ImportError);
#   * the cell body was pasted twice; only one copy is kept;
#   * the PR-curve loop no longer overwrites the scalar `precision` /
#     `recall` test metrics computed in the previous cell;
#   * the confusion matrix is drawn on an explicit Axes, so no empty
#     extra figure is produced.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    average_precision_score,
    auc,
    precision_recall_curve,
    roc_curve,
)
from sklearn.preprocessing import label_binarize

print("📊 Generazione grafici e visualizzazioni...")

# 1) Confusion matrix, normalised over the true labels (rows)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
cm_display = ConfusionMatrixDisplay.from_estimator(
    best_rf, X_test, y_test, cmap='Blues', normalize='true', ax=ax_cm
)
ax_cm.set_title("📊 Confusion Matrix Normalizzata")
plt.show()

# One-vs-rest targets and class-probability scores shared by the PR and ROC plots;
# columns follow the order of best_rf.classes_.
y_test_bin = label_binarize(y_test, classes=best_rf.classes_)
y_score = best_rf.predict_proba(X_test)

# 2) Precision-recall curve, one per class
fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
for i, cls in enumerate(best_rf.classes_):
    # Curve-local names: `precision` / `recall` hold scalar metrics elsewhere.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
    ap = average_precision_score(y_test_bin[:, i], y_score[:, i])
    ax_pr.plot(pr_recall, pr_precision, lw=2, label=f"{cls} (AP={ap:.2f})")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("📈 Precision-Recall Curve Multiclass")
ax_pr.legend()
ax_pr.grid(True)
plt.show()

# 3) ROC curve, one per class, with the chance diagonal for reference
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
for i, cls in enumerate(best_rf.classes_):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    ax_roc.plot(fpr, tpr, lw=2, label=f"{cls} (AUC={roc_auc:.2f})")
ax_roc.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("📈 ROC Curve Multiclass")
ax_roc.legend()
ax_roc.grid(True)
plt.show()

# 4) Feature importance, sorted from most to least important
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances[indices], y=X_train.columns[indices], palette="viridis", ax=ax_imp)
ax_imp.set_title("🌟 Feature Importance Random Forest")
ax_imp.set_xlabel("Importanza")
ax_imp.set_ylabel("Feature")
plt.show()

print("✅ Grafici generati con successo.")
