In [3]:
# ==========================================================
# BLOCCO 1️⃣ — Caricamento Dataset e Class Weights (Imbalanced)
# ==========================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections

print("📂 Caricamento dataset e class weights per training (dataset sbilanciato)...")

# ----------------------------------------------------------
# 1️⃣ Paths of the saved artifacts
# ----------------------------------------------------------
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
base_path = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"

train_test_path = fr"{base_path}\train_test_unbal.npz"
weights_path = fr"{base_path}\class_weights_unbal.npy"

# ----------------------------------------------------------
# 2️⃣ Load data
# ----------------------------------------------------------
# SECURITY: allow_pickle=True lets numpy unpickle arbitrary Python objects.
# It is needed here because the weights file stores a pickled object that is
# unwrapped with .item(); only load files produced by a trusted pipeline.
#
# The .npz archive is opened in a context manager so its file handle is
# released as soon as the four arrays have been read.
with np.load(train_test_path, allow_pickle=True) as data_npz:
    X_train = data_npz["X_train"]
    X_test = data_npz["X_test"]
    y_train = data_npz["y_train"]
    y_test = data_npz["y_test"]

class_weights = np.load(weights_path, allow_pickle=True).item()

# Cheap sanity checks: fail right after loading rather than deep inside the
# training loop if features and labels are misaligned.
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"

print("✅ Dati caricati correttamente:")
print(" - X_train:", X_train.shape, "| X_test:", X_test.shape)
print(" - Distribuzione y_train:", dict(collections.Counter(y_train)))
print(" - Distribuzione y_test :", dict(collections.Counter(y_test)))

print("\n✅ BLOCCO 1 COMPLETATO: Dataset pronto per il training.")


📂 Caricamento dataset e class weights per training (dataset sbilanciato)...
✅ Dati caricati correttamente:
 - X_train: (248027, 16) | X_test: (62007, 16)
 - Distribuzione y_train: {'Resource Development': 209927, 'Discovery': 12816, 'Other': 2355, 'Reconnaissance': 22929}
 - Distribuzione y_test : {'Resource Development': 52482, 'Discovery': 3204, 'Other': 589, 'Reconnaissance': 5732}

✅ BLOCCO 1 COMPLETATO: Dataset pronto per il training.


In [None]:
# ==========================================================
# BLOCCO 2️⃣ ✅ Random Forest — TRAINING SU DATASET SBILANCIATO (3-fold + parallel)
# ==========================================================
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import collections

# Number of cross-validation folds. Single source of truth for the splitter,
# the fit-count estimate and the progress messages (previously the splitter
# used 5 folds while every message and comment said 3).
N_SPLITS = 3

print(f"🏗️ Addestramento Random Forest (dataset sbilanciato, class_weight dinamico, {N_SPLITS}-fold)...\n")

# ==========================================================
# 1️⃣ Hyperparameter grid
# ==========================================================
# NOTE(review): `rf` is not used anywhere in this cell; kept only in case a
# later cell references it.
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# n_estimators is held fixed and is NOT part of the search: only the first
# value listed in the grid is used, for both the CV fits and the final model.
n_estimators = param_grid['n_estimators'][0]

param_combinations = list(product(
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf'],
    param_grid['max_features'],
    param_grid['bootstrap']
))

print(f"🔍 GridSearch manuale su {len(param_combinations)} combinazioni (~{len(param_combinations) * N_SPLITS} fit fold per fold)...\n")

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# With a fixed random_state the splits are identical on every call, so they
# are computed once and reused for every hyperparameter combination.
cv_folds = list(cv.split(X_train, y_train))

best_score = -np.inf
best_params = None
results = []  # list of (params dict, mean macro-F1 across folds)

# ==========================================================
# 2️⃣ Loop over combinations + fold-by-fold validation
# ==========================================================
for i, (max_depth, min_split, min_leaf, max_feat, boot) in enumerate(param_combinations, 1):
    # A fresh dict per iteration, so storing it in `best_params` / `results`
    # never aliases a later combination.
    params = {
        'max_depth': max_depth,
        'min_samples_split': min_split,
        'min_samples_leaf': min_leaf,
        'max_features': max_feat,
        'bootstrap': boot
    }
    print(f"⚙️ Combinazione {i}/{len(param_combinations)} → {params}")
    start = time.perf_counter()  # monotonic clock for elapsed-time measurement

    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(cv_folds, 1):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=42,
            class_weight=class_weights,
            n_jobs=-1,  # use every available core
            **params
        )
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        # Macro average: every class counts equally regardless of its support.
        score = f1_score(y_val, y_pred, average='macro')
        fold_scores.append(score)
        print(f"   📈 Fold {fold}/{N_SPLITS} → F1 = {score:.4f}")

    mean_score = np.mean(fold_scores)
    elapsed = time.perf_counter() - start
    print(f"✅ Combinazione {i} completata in {elapsed:.2f}s | F1 medio = {mean_score:.4f}\n")

    results.append((params, mean_score))
    # Strict '>' keeps the earliest combination when mean scores tie.
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print(f"🏁 GridSearch completata. Miglior F1 = {best_score:.4f}")
print(f"🏆 Migliori iperparametri trovati: {best_params}\n")

# ==========================================================
# 3️⃣ Final training with the best parameters
# ==========================================================
best_rf = RandomForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
    class_weight=class_weights,
    n_jobs=-1,  # parallel fit
    **best_params
)
best_rf.fit(X_train, y_train)

# ==========================================================
# 4️⃣ Metriche dettagliate
# ==========================================================
def print_metrics(y_true, y_pred, name, classes):
    """Print summary metrics, show a confusion-matrix heatmap, return macro F1.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    name : label of the evaluated split, interpolated into the printed header
        and the plot title.
    classes : label ordering passed to ``confusion_matrix`` and used for the
        heatmap tick labels.

    Returns
    -------
    The macro-averaged F1 score of ``y_pred`` against ``y_true``.
    """
    macro = {"average": 'macro'}
    # (printed prefix, value) pairs, computed up front in display order.
    rows = (
        ("Accuracy:  ", accuracy_score(y_true, y_pred)),
        ("Precision: ", precision_score(y_true, y_pred, **macro)),
        ("Recall:    ", recall_score(y_true, y_pred, **macro)),
        ("F1-score:  ", f1_score(y_true, y_pred, **macro)),
    )

    print(f"\n📊 Metriche sul {name} set:")
    for prefix, value in rows:
        print(f"{prefix}{value:.4f}")

    # Rows are the true classes, columns the predicted ones.
    heat_ax = sns.heatmap(
        confusion_matrix(y_true, y_pred, labels=classes),
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
    )
    heat_ax.set_title(f"Matrice di Confusione — {name}")
    heat_ax.set_xlabel("Predetto")
    heat_ax.set_ylabel("Reale")
    plt.show()

    # The last row holds the macro F1 score.
    return rows[-1][1]

# Predictions of the final model on both splits (train first, then test).
y_train_pred, y_test_pred = best_rf.predict(X_train), best_rf.predict(X_test)

# Sorted union of the labels seen in either split, so both confusion matrices
# share the same axes.
classes_sorted = np.unique(np.concatenate((y_train, y_test)))
train_f1 = print_metrics(y_true=y_train, y_pred=y_train_pred, name="TRAIN", classes=classes_sorted)
test_f1 = print_metrics(y_true=y_test, y_pred=y_test_pred, name="TEST", classes=classes_sorted)

# ==========================================================
# 5️⃣ Over/underfitting analysis
# ==========================================================
# Heuristic thresholds: a train-test macro-F1 gap above 0.05 is reported as
# overfitting; otherwise a test macro-F1 below 0.7 is reported as underfitting.
gap_f1 = train_f1 - test_f1
if gap_f1 > 0.05:
    verdict = f"\n⚠️ Possibile OVERFITTING: gap F1 train-test = {gap_f1:.4f}"
elif test_f1 < 0.7:
    verdict = f"\n⚠️ Possibile UNDERFITTING: F1 test = {test_f1:.4f}"
else:
    verdict = "\n✅ Modello bilanciato, nessun evidente overfitting/underfitting"
print(verdict)

print("\n✅ BLOCCO 2 COMPLETATO: Random Forest addestrato con class weights (dataset sbilanciato, 3-fold).")


🏗️ Addestramento Random Forest (dataset sbilanciato, class_weight dinamico, 3-fold)...

🔍 GridSearch manuale su 32 combinazioni (~96 fit fold per fold)...

⚙️ Combinazione 1/32 → {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True}
   📈 Fold 1/3 → F1 = 0.9991
   📈 Fold 2/3 → F1 = 0.9993
   📈 Fold 3/3 → F1 = 0.9995
✅ Combinazione 1 completata in 21.86s | F1 medio = 0.9993

⚙️ Combinazione 2/32 → {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}
   📈 Fold 1/3 → F1 = 0.9988
   📈 Fold 2/3 → F1 = 0.9993
   📈 Fold 3/3 → F1 = 0.9996
✅ Combinazione 2 completata in 31.46s | F1 medio = 0.9992

⚙️ Combinazione 3/32 → {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}
   📈 Fold 1/3 → F1 = 0.9991
   📈 Fold 2/3 → F1 = 0.9993
   📈 Fold 3/3 → F1 = 0.9995
✅ Combinazione 3 completata in 23.47s | F1 medio = 0.9993

⚙️ Combinazione 4/32 

KeyboardInterrupt: 