Modello 1

In [20]:
# ==============================================
# - BLOCCO 1 — Definizione dei modelli
# ==============================================
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

print("[INFO] Inizializzazione modelli in corso...")

# 🔹 Modelli base
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight='balanced', random_state=42),
    "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "SVM": make_pipeline(StandardScaler(), SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)),
    "Naive Bayes": GaussianNB()
}

# 🔹 Modello ibrido (Stacking)
# Combina Random Forest e Logistic Regression come meta-classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
    ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42))
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=300, class_weight='balanced', random_state=42),
    n_jobs=-1
)

models["Hybrid Stacking (RF + DT → LR)"] = stacking_model

# 🔹 Stampa dei modelli inizializzati
print("[INFO] ✅ Modelli inizializzati correttamente:")
for name in models.keys():
    print(f" - {name}")


[INFO] Inizializzazione modelli in corso...
[INFO] ✅ Modelli inizializzati correttamente:
 - Logistic Regression
 - KNN
 - Decision Tree
 - Random Forest
 - SVM
 - Naive Bayes
 - Hybrid Stacking (RF + DT → LR)


In [21]:
# ==============================================
# - BLOCCO 10 — Naive Bayes Multiclasse (5-Fold CV + Test Finale)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento Naive Bayes con 5-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset binned
X_train_binned = np.load(os.path.join(SAVE_DIR, "X_train_binned.npy"))
X_test_binned = np.load(os.path.join(SAVE_DIR, "X_test_binned.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_binned.shape}, X_test={X_test_binned.shape}")

# 🔹 Recupero mapping LabelEncoder per etichette leggibili
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# 🔹 Preparazione cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Liste per raccogliere metriche fold per fold
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

# ===========================================================
# 1️⃣ Loop sui fold (Cross-Validation)
# ===========================================================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_binned, y_train), 1):
    X_tr, X_val = X_train_binned[train_idx], X_train_binned[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    nb = GaussianNB()
    nb.fit(X_tr, y_tr)
    y_pred = nb.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche (Cross-Validation)
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "naive_bayes_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'naive_bayes_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Matrice di confusione media
# ===========================================================
avg_cm = np.mean(cm_list, axis=0).round().astype(int)
labels_sorted = sorted(label_mapping.keys())
cm_df = pd.DataFrame(avg_cm,
                     index=[label_mapping.get(i, "unknown") for i in labels_sorted],
                     columns=[label_mapping.get(i, "unknown") for i in labels_sorted])
print("\n🔢 Matrice di Confusione Media (CV):\n")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(cm_df)

cm_df.to_csv(os.path.join(RESULTS_DIR, "naive_bayes_cv_confusion_matrix.csv"))
print(f"[INFO] Matrice di confusione media salvata in: {os.path.join(RESULTS_DIR, 'naive_bayes_cv_confusion_matrix.csv')}")

# ===========================================================
# 4️⃣ Addestramento finale su tutto il training set
# ===========================================================
print("\n[INFO] Addestramento modello finale su tutto il training set...")
final_nb = GaussianNB()
final_nb.fit(X_train_binned, y_train)

# ===========================================================
# 5️⃣ Valutazione sul test set (risultati “definitivi”)
# ===========================================================
y_pred_test = final_nb.predict(X_test_binned)

# Metriche complessive
acc = accuracy_score(y_test, y_pred_test)
prec = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

# Specificità
cm_test = confusion_matrix(y_test, y_pred_test)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "naive_bayes_test_metrics.csv"), index=False)
print(f"[INFO] Metriche test salvate in: {os.path.join(RESULTS_DIR, 'naive_bayes_test_metrics.csv')}")

# ===========================================================
# 6️⃣ Matrice di confusione e classification report (Test)
# ===========================================================
cm_test_df = pd.DataFrame(cm_test,
                          index=[label_mapping.get(i, "unknown") for i in labels_sorted],
                          columns=[label_mapping.get(i, "unknown") for i in labels_sorted])
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "naive_bayes_test_confusion_matrix.csv"))

report = classification_report(y_test, y_pred_test, target_names=[label_mapping[i] for i in labels_sorted], output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "naive_bayes_test_classification_report.csv"))

print("\n✅ [COMPLETATO] Tutte le metriche e confusion matrix salvate in:")
print(RESULTS_DIR)


[INFO] Inizio addestramento Naive Bayes con 5-Fold Cross-Validation...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato
[INFO] Fold 2 completato
[INFO] Fold 3 completato
[INFO] Fold 4 completato
[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\naive_bayes_cv_metrics.csv

🔢 Matrice di Confusione Media (CV):

                      Credential Access  Defense Evasion  Discovery  \
Credential Access                     1                0          0   
Defense Evasion                       0              462          0   
Discovery                             8                0       2569   
Exfiltration                          0                0          0   
Initial Access                        0                0          0   
Lateral Movement                      0                0          0   
Persistence                           0                0          0   
Privilege Escalation                  0                0          0   
Reconnaissance                      654                0       1282   
Resource Development     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# ==============================================
# - BLOCCO 11 — Logistic Regression Multiclasse (5-Fold CV + Test Finale)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento Logistic Regression con 5-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset scaled
X_train_scaled = np.load(os.path.join(SAVE_DIR, "X_train_scaled.npy"))
X_test_scaled  = np.load(os.path.join(SAVE_DIR, "X_test_scaled.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test  = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_scaled.shape}, X_test={X_test_scaled.shape}")

# 🔹 Recupero mapping LabelEncoder
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# 🔹 Cross-validation 5-fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Liste metriche
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

# ===========================================================
# 1️⃣ Loop sui fold (Cross-Validation)
# ===========================================================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train), 1):
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    lr = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
        n_jobs=-1,
        class_weight='balanced',
        random_state=42
    )

    lr.fit(X_tr, y_tr)
    y_pred = lr.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche (Cross-Validation)
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "logistic_regression_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'logistic_regression_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Matrice di confusione media (versione robusta)
# ===========================================================
print("[INFO] Calcolo matrice di confusione media (versione robusta)...")

# Determina il numero di classi dal label encoder
n_classes = len(label_mapping)

# Inizializza una matrice vuota
sum_cm = np.zeros((n_classes, n_classes), dtype=int)

# Somma solo le confusion matrix con la stessa dimensione
for cm in cm_list:
    if cm.shape == (n_classes, n_classes):
        sum_cm += cm

# Calcola la media
avg_cm = (sum_cm / len(cm_list)).round().astype(int)

labels_sorted = sorted(label_mapping.keys())
cm_df = pd.DataFrame(
    avg_cm,
    index=[label_mapping.get(i, "unknown") for i in labels_sorted],
    columns=[label_mapping.get(i, "unknown") for i in labels_sorted]
)
cm_df.to_csv(os.path.join(RESULTS_DIR, "logistic_regression_cv_confusion_matrix.csv"))
print(f"[INFO] Matrice di confusione media salvata in: logistic_regression_cv_confusion_matrix.csv")


# ===========================================================
# 4️⃣ Addestramento finale su tutto il training set
# ===========================================================
print("[INFO] Addestramento modello finale su tutto il training set...")
final_lr = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)
final_lr.fit(X_train_scaled, y_train)

# ===========================================================
# 5️⃣ Valutazione sul test set
# ===========================================================
y_pred_test = final_lr.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred_test)
prec = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

# Specificità
cm_test = confusion_matrix(y_test, y_pred_test)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "logistic_regression_test_metrics.csv"), index=False)

# Matrice di confusione e classification report
cm_test_df = pd.DataFrame(cm_test,
                          index=[label_mapping.get(i, "unknown") for i in labels_sorted],
                          columns=[label_mapping.get(i, "unknown") for i in labels_sorted])
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "logistic_regression_test_confusion_matrix.csv"))

report = classification_report(y_test, y_pred_test, target_names=[label_mapping[i] for i in labels_sorted], output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "logistic_regression_test_classification_report.csv"))

print("\n✅ [COMPLETATO] Logistic Regression addestrata e metriche salvate.")


[INFO] Inizio addestramento Logistic Regression con 5-Fold Cross-Validation...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato




[INFO] Fold 2 completato




[INFO] Fold 3 completato




[INFO] Fold 4 completato




[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\logistic_regression_cv_metrics.csv
[INFO] Calcolo matrice di confusione media (versione robusta)...
[INFO] Matrice di confusione media salvata in: logistic_regression_cv_confusion_matrix.csv
[INFO] Addestramento modello finale su tutto il training set...





✅ [COMPLETATO] Logistic Regression addestrata e metriche salvate.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# ==============================================
# - BLOCCO 12 — KNN Multiclasse (5-Fold CV + Test Finale)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento KNN con 5-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset scaled
X_train_scaled = np.load(os.path.join(SAVE_DIR, "X_train_scaled.npy"))
X_test_scaled  = np.load(os.path.join(SAVE_DIR, "X_test_scaled.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test  = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_scaled.shape}, X_test={X_test_scaled.shape}")

# 🔹 Recupero mapping LabelEncoder
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))
labels_sorted = sorted(label_mapping.keys())  # Tutte le classi ordinate

# 🔹 Cross-validation 5-fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Liste metriche
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

# ===========================================================
# 1️⃣ Loop sui fold (Cross-Validation)
# ===========================================================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train), 1):
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    knn.fit(X_tr, y_tr)
    y_pred = knn.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred, labels=labels_sorted)  # ⚡ gestione classi rare
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche (Cross-Validation)
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "knn_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'knn_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Matrice di confusione media
# ===========================================================
avg_cm = np.mean(cm_list, axis=0).round().astype(int)
cm_df = pd.DataFrame(avg_cm,
                     index=[label_mapping[i] for i in labels_sorted],
                     columns=[label_mapping[i] for i in labels_sorted])
cm_df.to_csv(os.path.join(RESULTS_DIR, "knn_cv_confusion_matrix.csv"))
print(f"[INFO] Matrice di confusione CV salvata in: {os.path.join(RESULTS_DIR, 'knn_cv_confusion_matrix.csv')}")

# ===========================================================
# 4️⃣ Addestramento finale su tutto il training set
# ===========================================================
print("[INFO] Addestramento modello finale su tutto il training set...")
final_knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
final_knn.fit(X_train_scaled, y_train)

# ===========================================================
# 5️⃣ Valutazione sul test set
# ===========================================================
y_pred_test = final_knn.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred_test)
prec = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

# Specificità
cm_test = confusion_matrix(y_test, y_pred_test, labels=labels_sorted)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "knn_test_metrics.csv"), index=False)

# Matrice di confusione e classification report
cm_test_df = pd.DataFrame(cm_test,
                          index=[label_mapping[i] for i in labels_sorted],
                          columns=[label_mapping[i] for i in labels_sorted])
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "knn_test_confusion_matrix.csv"))

report = classification_report(y_test, y_pred_test,
                               target_names=[label_mapping[i] for i in labels_sorted],
                               output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "knn_test_classification_report.csv"))

print("\n✅ [COMPLETATO] KNN addestrato e metriche salvate.")


[INFO] Inizio addestramento KNN con 5-Fold Cross-Validation...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato
[INFO] Fold 2 completato
[INFO] Fold 3 completato
[INFO] Fold 4 completato
[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\knn_cv_metrics.csv
[INFO] Matrice di confusione CV salvata in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\knn_cv_confusion_matrix.csv
[INFO] Addestramento modello finale su tutto il training set...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



✅ [COMPLETATO] KNN addestrato e metriche salvate.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# ==============================================
# - BLOCCO 12 — Decision Tree (3-Fold Cross-Validation + Test)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento Decision Tree con 3-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset binned
X_train_binned = np.load(os.path.join(SAVE_DIR, "X_train_binned.npy"))
X_test_binned = np.load(os.path.join(SAVE_DIR, "X_test_binned.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_binned.shape}, X_test={X_test_binned.shape}")

# 🔹 Recupero mapping LabelEncoder per etichette leggibili
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# ===========================================================
# 1️⃣ Cross-Validation
# ===========================================================
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_binned, y_train), 1):
    X_tr, X_val = X_train_binned[train_idx], X_train_binned[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    dt = DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    )
    dt.fit(X_tr, y_tr)
    y_pred = dt.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche CV
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "decision_tree_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'decision_tree_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Addestramento modello finale + Test Set
# ===========================================================
print("[INFO] Addestramento modello finale su tutto il training set...")
dt_final = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)
dt_final.fit(X_train_binned, y_train)
y_pred_test = dt_final.predict(X_test_binned)

# Metriche test
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
test_recall = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
test_f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

# Specificità test
cm_test = confusion_matrix(y_test, y_pred_test)
specificity_test = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity_test.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_test_mean = np.mean(specificity_test)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [test_accuracy],
    "Precision_macro": [test_precision],
    "Recall_macro": [test_recall],
    "F1_macro": [test_f1],
    "Specificity_macro": [specificity_test_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "decision_tree_test_metrics.csv"), index=False)

# Matrice di confusione (test)
labels_sorted = sorted(label_mapping.keys())
cm_df = pd.DataFrame(
    cm_test,
    index=[label_mapping[i] for i in labels_sorted],
    columns=[label_mapping[i] for i in labels_sorted]
)
cm_df.to_csv(os.path.join(RESULTS_DIR, "decision_tree_confusion_matrix.csv"))

# Classification report (test)
report = classification_report(y_test, y_pred_test, target_names=[label_mapping[i] for i in labels_sorted], output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "decision_tree_classification_report.csv"))

print("\n✅ [COMPLETATO] Decision Tree addestrato e metriche salvate.")


[INFO] Inizio addestramento Decision Tree con 3-Fold Cross-Validation...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato
[INFO] Fold 2 completato
[INFO] Fold 3 completato
[INFO] Fold 4 completato
[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\decision_tree_cv_metrics.csv
[INFO] Addestramento modello finale su tutto il training set...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



✅ [COMPLETATO] Decision Tree addestrato e metriche salvate.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# ==============================================
# - BLOCCO 12 — Decision Tree Multiclasse (3-Fold CV + Test Finale)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento Decision Tree con 3-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset binned
X_train_binned = np.load(os.path.join(SAVE_DIR, "X_train_binned.npy"))
X_test_binned = np.load(os.path.join(SAVE_DIR, "X_test_binned.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_binned.shape}, X_test={X_test_binned.shape}")

# 🔹 Recupero mapping LabelEncoder
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# 🔹 Cross-validation 5-fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Liste metriche
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

# ===========================================================
# 1️⃣ Loop sui fold (Cross-Validation)
# ===========================================================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_binned, y_train), 1):
    X_tr, X_val = X_train_binned[train_idx], X_train_binned[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    dt = DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    )
    dt.fit(X_tr, y_tr)
    y_pred = dt.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche (Cross-Validation)
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "decision_tree_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'decision_tree_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Addestramento finale su tutto il training set
# ===========================================================
print("[INFO] Addestramento modello finale su tutto il training set...")
dt_final = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)
dt_final.fit(X_train_binned, y_train)

# ===========================================================
# 4️⃣ Valutazione sul test set
# ===========================================================
y_pred_test = dt_final.predict(X_test_binned)

acc = accuracy_score(y_test, y_pred_test)
prec = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

# Specificità
cm_test = confusion_matrix(y_test, y_pred_test)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "decision_tree_test_metrics.csv"), index=False)

# Matrice di confusione e classification report
labels_sorted = sorted(label_mapping.keys())
cm_test_df = pd.DataFrame(cm_test,
                          index=[label_mapping.get(i, "unknown") for i in labels_sorted],
                          columns=[label_mapping.get(i, "unknown") for i in labels_sorted])
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "decision_tree_test_confusion_matrix.csv"))

report = classification_report(y_test, y_pred_test, target_names=[label_mapping[i] for i in labels_sorted], output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "decision_tree_test_classification_report.csv"))

print("\n✅ [COMPLETATO] Decision Tree addestrato e metriche salvate.")


[INFO] Inizio addestramento Decision Tree con 3-Fold Cross-Validation...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato
[INFO] Fold 2 completato
[INFO] Fold 3 completato
[INFO] Fold 4 completato
[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\decision_tree_cv_metrics.csv
[INFO] Addestramento modello finale su tutto il training set...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



✅ [COMPLETATO] Decision Tree addestrato e metriche salvate.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# ==============================================
# - BLOCCO 11 — SVM Multiclasse (5-Fold CV + Test Finale) con class_weight='balanced'
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento SVM con 5-Fold Cross-Validation (class_weight='balanced')...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset scaled
X_train_scaled = np.load(os.path.join(SAVE_DIR, "X_train_scaled.npy"))
X_test_scaled  = np.load(os.path.join(SAVE_DIR, "X_test_scaled.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv")).values.ravel()
y_test  = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv")).values.ravel()

print(f"[INFO] Dataset caricati: X_train={X_train_scaled.shape}, X_test={X_test_scaled.shape}")

# 🔹 Recupero mapping LabelEncoder
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))
labels_sorted = sorted(label_mapping.keys())

# 🔹 Preparazione cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Liste metriche
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

# ===========================================================
# 1️⃣ Loop sui fold (Cross-Validation)
# ===========================================================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train), 1):
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    svm = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr', class_weight='balanced')
    svm.fit(X_tr, y_tr)
    y_pred = svm.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità per classe
    cm = confusion_matrix(y_val, y_pred, labels=labels_sorted)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato")

# ===========================================================
# 2️⃣ Media e deviazione standard metriche (Cross-Validation)
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "svm_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'svm_cv_metrics.csv')}")

# ===========================================================
# 3️⃣ Matrice di confusione media
# ===========================================================
avg_cm = np.mean(cm_list, axis=0).round().astype(int)
cm_df = pd.DataFrame(avg_cm,
                     index=[label_mapping[i] for i in labels_sorted],
                     columns=[label_mapping[i] for i in labels_sorted])
cm_df.to_csv(os.path.join(RESULTS_DIR, "svm_cv_confusion_matrix.csv"))
print(f"[INFO] Matrice di confusione CV salvata in: {os.path.join(RESULTS_DIR, 'svm_cv_confusion_matrix.csv')}")

# ===========================================================
# 4️⃣ Addestramento finale su tutto il training set
# ===========================================================
print("[INFO] Addestramento modello finale su tutto il training set...")
final_svm = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr', class_weight='balanced')
final_svm.fit(X_train_scaled, y_train)

# 🔹 Salvataggio modello finale
import pickle
with open(os.path.join(RESULTS_DIR, "svm_final_model.pkl"), "wb") as f:
    pickle.dump(final_svm, f)
print(f"[INFO] Modello finale SVM salvato in: {os.path.join(RESULTS_DIR, 'svm_final_model.pkl')}")

# ===========================================================
# 5️⃣ Valutazione sul test set
# ===========================================================
y_pred_test = final_svm.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred_test)
prec = precision_score(y_test, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_test, average='macro', zero_division=0)

cm_test = confusion_matrix(y_test, y_pred_test, labels=labels_sorted)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "svm_test_metrics.csv"), index=False)

# Matrice di confusione e classification report
cm_test_df = pd.DataFrame(cm_test,
                          index=[label_mapping[i] for i in labels_sorted],
                          columns=[label_mapping[i] for i in labels_sorted])
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "svm_test_confusion_matrix.csv"))

report = classification_report(y_test, y_pred_test,
                               target_names=[label_mapping[i] for i in labels_sorted],
                               output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "svm_test_classification_report.csv"))

print("\n✅ [COMPLETATO] SVM addestrato, modello salvato e metriche disponibili.")


[INFO] Inizio addestramento SVM con 5-Fold Cross-Validation (class_weight='balanced')...
[INFO] Dataset caricati: X_train=(278716, 15), X_test=(69680, 15)




[INFO] Fold 1 completato
[INFO] Fold 2 completato
[INFO] Fold 3 completato
[INFO] Fold 4 completato
[INFO] Fold 5 completato
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\svm_cv_metrics.csv
[INFO] Matrice di confusione CV salvata in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\svm_cv_confusion_matrix.csv
[INFO] Addestramento modello finale su tutto il training set...
[INFO] Modello finale SVM salvato in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\svm_final_model.pkl

✅ [COMPLETATO] SVM addestrato, modello salvato e metriche disponibili.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# ==============================================
# - BLOCCO 11 — SVM (Classi Minori - 5-Fold CV + Test Finale)
# ==============================================
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
import pickle

print("[INFO] Inizio addestramento SVM (classi minori) con 5-Fold Cross-Validation...")

# 🔹 Directory
SAVE_DIR = r"C:\Users\maria\Desktop\Zeek_ML\TrainTestSplit"
RESULTS_DIR = r"C:\Users\maria\Desktop\Zeek_ML\ModelResults"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔹 Caricamento dataset scalati
X_train_scaled = np.load(os.path.join(SAVE_DIR, "X_train_scaled.npy"))
X_test_scaled = np.load(os.path.join(SAVE_DIR, "X_test_scaled.npy"))
y_train = pd.read_csv(os.path.join(SAVE_DIR, "y_train.csv"))
y_test = pd.read_csv(os.path.join(SAVE_DIR, "y_test.csv"))

# 🔹 Se y_train e y_test hanno una sola colonna, convertili in Series
y_train_series = y_train.iloc[:, 0]
y_test_series = y_test.iloc[:, 0]

# 🔹 Recupero mapping LabelEncoder per etichette leggibili
with open(os.path.join(SAVE_DIR, "label_encoder_tactic.pkl"), "rb") as f:
    le = pickle.load(f)
label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

# ===========================================================
# 1️⃣ Selezione classi minori
# ===========================================================
class_counts = y_train_series.value_counts()
threshold = 1000  # puoi modificare a seconda della distribuzione
minor_classes = class_counts[class_counts < threshold].index.tolist()
print(f"[INFO] Classi minori selezionate: {minor_classes}")

# Maschere per filtrare solo le classi minori
mask_train = y_train_series.isin(minor_classes)
mask_test = y_test_series.isin(minor_classes)

X_train_small = X_train_scaled[mask_train]
y_train_small = y_train_series[mask_train].values
X_test_small = X_test_scaled[mask_test]
y_test_small = y_test_series[mask_test].values

print(f"[INFO] Dataset ridotto: X_train_small={X_train_small.shape}, X_test_small={X_test_small.shape}")

# ===========================================================
# 2️⃣ Cross-Validation (5-Fold)
# ===========================================================
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list, specificity_list, cm_list = [], [], [], [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_small, y_train_small), 1):
    X_tr, X_val = X_train_small[train_idx], X_train_small[val_idx]
    y_tr, y_val = y_train_small[train_idx], y_train_small[val_idx]

    svm = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', probability=False)
    svm.fit(X_tr, y_tr)
    y_pred = svm.predict(X_val)

    # Metriche classiche
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recall_list.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
    f1_list.append(f1_score(y_val, y_pred, average='macro', zero_division=0))

    # Specificità
    cm = confusion_matrix(y_val, y_pred, labels=minor_classes)
    cm_list.append(cm)
    specificity_fold = []
    for i in range(len(cm)):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity_fold.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
    specificity_list.append(np.mean(specificity_fold))

    print(f"[INFO] Fold {fold} completato.")

# ===========================================================
# 3️⃣ Media e deviazione standard metriche
# ===========================================================
metrics_summary = {
    "Accuracy": [np.mean(accuracy_list), np.std(accuracy_list)],
    "Precision_macro": [np.mean(precision_list), np.std(precision_list)],
    "Recall_macro": [np.mean(recall_list), np.std(recall_list)],
    "F1_macro": [np.mean(f1_list), np.std(f1_list)],
    "Specificity_macro": [np.mean(specificity_list), np.std(specificity_list)]
}
metrics_df = pd.DataFrame(metrics_summary, index=["mean", "std"])
metrics_df.to_csv(os.path.join(RESULTS_DIR, "svm_minor_cv_metrics.csv"))
print(f"[INFO] Metriche CV salvate in: {os.path.join(RESULTS_DIR, 'svm_minor_cv_metrics.csv')}")

# ===========================================================
# 4️⃣ Matrice di confusione media (CV)
# ===========================================================
# Gestione eventuali shape diverse
if all(cm.shape == cm_list[0].shape for cm in cm_list):
    avg_cm = np.mean(cm_list, axis=0).round().astype(int)
else:
    print("[WARN] Matrici di confusione CV hanno shape diversa; salto calcolo media")
    avg_cm = cm_list[0]

minor_classes_sorted = sorted(minor_classes)
cm_df = pd.DataFrame(avg_cm,
                     index=[label_mapping.get(i, str(i)) for i in minor_classes_sorted],
                     columns=[label_mapping.get(i, str(i)) for i in minor_classes_sorted])
cm_df.to_csv(os.path.join(RESULTS_DIR, "svm_minor_cv_confusion_matrix.csv"))
print("\n🔢 Matrice di Confusione Media (CV):\n", cm_df)

# ===========================================================
# 5️⃣ Addestramento finale su tutto il subset di training
# ===========================================================
print("\n[INFO] Addestramento modello finale SVM su tutte le classi minori...")
final_svm = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced')
final_svm.fit(X_train_small, y_train_small)

# ===========================================================
# 6️⃣ Valutazione su test set
# ===========================================================
y_pred_test = final_svm.predict(X_test_small)

acc = accuracy_score(y_test_small, y_pred_test)
prec = precision_score(y_test_small, y_pred_test, average='macro', zero_division=0)
rec = recall_score(y_test_small, y_pred_test, average='macro', zero_division=0)
f1 = f1_score(y_test_small, y_pred_test, average='macro', zero_division=0)

# Specificità
cm_test = confusion_matrix(y_test_small, y_pred_test, labels=minor_classes_sorted)
specificity = []
for i in range(len(cm_test)):
    tn = np.sum(np.delete(np.delete(cm_test, i, axis=0), i, axis=1))
    fp = np.sum(cm_test[:, i]) - cm_test[i, i]
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0)
specificity_mean = np.mean(specificity)

# Salvataggio metriche test
test_metrics = pd.DataFrame({
    "Accuracy": [acc],
    "Precision_macro": [prec],
    "Recall_macro": [rec],
    "F1_macro": [f1],
    "Specificity_macro": [specificity_mean]
})
test_metrics.to_csv(os.path.join(RESULTS_DIR, "svm_minor_test_metrics.csv"), index=False)
print(f"[INFO] Metriche test salvate in: {os.path.join(RESULTS_DIR, 'svm_minor_test_metrics.csv')}")

# ===========================================================
# 7️⃣ Matrice di confusione e classification report (Test)
# ===========================================================
cm_test_df = pd.DataFrame(
    cm_test,
    index=[label_mapping.get(i, str(i)) for i in minor_classes_sorted],
    columns=[label_mapping.get(i, str(i)) for i in minor_classes_sorted]
)
cm_test_df.to_csv(os.path.join(RESULTS_DIR, "svm_minor_test_confusion_matrix.csv"))

report = classification_report(
    y_test_small,
    y_pred_test,
    target_names=[label_mapping.get(i, str(i)) for i in minor_classes_sorted],
    output_dict=True
)
pd.DataFrame(report).transpose().to_csv(
    os.path.join(RESULTS_DIR, "svm_minor_test_classification_report.csv")
)

print("\n✅ [COMPLETATO] Tutte le metriche e confusion matrix salvate in:", RESULTS_DIR)
 

[INFO] Inizio addestramento SVM (classi minori) con 5-Fold Cross-Validation...
[INFO] Classi minori selezionate: [-9.0, -5.0, -4.0, -6.0, -2.0, -3.0]
[INFO] Dataset ridotto: X_train_small=(60, 15), X_test_small=(15, 15)
[INFO] Fold 1 completato.
[INFO] Fold 2 completato.
[INFO] Fold 3 completato.
[INFO] Fold 4 completato.
[INFO] Fold 5 completato.
[INFO] Metriche CV salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\svm_minor_cv_metrics.csv

🔢 Matrice di Confusione Media (CV):
       -9.0  -6.0  -5.0  -4.0  -3.0  -2.0
-9.0     4     0     0     0     0     0
-6.0     0     1     1     0     0     0
-5.0     0     0     2     0     0     0
-4.0     0     0     1     0     0     0
-3.0     0     0     1     0     0     0
-2.0     0     0     1     0     0     0

[INFO] Addestramento modello finale SVM su tutte le classi minori...
[INFO] Metriche test salvate in: C:\Users\maria\Desktop\Zeek_ML\ModelResults\svm_minor_test_metrics.csv

✅ [COMPLETATO] Tutte le metriche e confusion matri

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
