In [8]:
# ==========================================================
# BLOCCO 1 ✅ DEFINITIVO
# Lettura dataset compressi + Preprocessing etichette
# ==========================================================
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

print("📂 Caricamento dataset compressi...")

# === Lettura file .npz generato nel BLOCCO 11 ===
data_npz = np.load(r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\train_test_bal.npz", allow_pickle=True)

# === Estrazione dei dataset ===
X_train = data_npz["X_train"]
X_test = data_npz["X_test"]
y_train = data_npz["y_train"]
y_test = data_npz["y_test"]

# === Dataset bilanciato opzionale ===
X_train_bal = data_npz["X_train_bal"]
y_train_bal = data_npz["y_train_bal"]

print("✅ Dataset caricati con successo dal file .npz\n")

# === Conversione etichette (da stringhe a numeri) ===
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
y_train_bal_enc = le.transform(y_train_bal)

# === Info riepilogative ===
print("📊 Dimensioni:")
print(f"X_train: {X_train.shape} | y_train: {y_train.shape}")
print(f"X_test : {X_test.shape}  | y_test : {y_test.shape}")
print(f"X_train_bal: {X_train_bal.shape} | y_train_bal: {y_train_bal.shape}")

print("\n🔢 Classi trovate:", list(le.classes_))
print("Numero classi:", len(le.classes_))

# === Verifica che tutto sia numerico ===
X_train_used = X_train_bal if X_train_bal is not None else X_train
print("\nTipi delle colonne in X_train_used:")
print(pd.DataFrame(X_train_used).dtypes.value_counts())

non_numeric = [
    col for col in pd.DataFrame(X_train_used).columns
    if not np.issubdtype(pd.DataFrame(X_train_used)[col].dtype, np.number)
]
print("\nColonne non numeriche:", non_numeric)

📂 Caricamento dataset compressi...
✅ Dataset caricati con successo dal file .npz

📊 Dimensioni:
X_train: (245672, 16) | y_train: (245672,)
X_test : (61418, 16)  | y_test : (61418,)
X_train_bal: (38448, 16) | y_train_bal: (38448,)

🔢 Classi trovate: ['Discovery', 'Reconnaissance', 'Resource Development']
Numero classi: 3

Tipi delle colonne in X_train_used:
float32    16
Name: count, dtype: int64

Colonne non numeriche: []


In [None]:
# ==========================================================
# BLOCCO 2 ✅ Random Forest (train bilanciato) con monitoraggio fold per fold e griglia estesa
# ==========================================================
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, 
    ConfusionMatrixDisplay, precision_recall_curve, roc_curve, auc, average_precision_score
)
from sklearn.preprocessing import label_binarize

print("🏗️ Addestramento Random Forest (TRAIN BILANCIATO) con griglia estesa e monitoraggio fold per fold...\n")

# ==========================================================
# 1️⃣ Impostazione base del modello e della griglia iperparametri
# ==========================================================
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # più alberi per stabilità

param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

param_combinations = list(product(
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf'],
    param_grid['max_features'],
    param_grid['bootstrap']
))

print(f"🔍 Inizio GridSearch manuale su {len(param_combinations)} combinazioni (~{len(param_combinations)*5} fit fold per fold)...\n")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_score = -np.inf
best_params = None
results = []

# ==========================================================
# 2️⃣ Ciclo combinazioni e monitoraggio training fold per fold
# ==========================================================
for i, (max_depth, min_split, min_leaf, max_feat, boot) in enumerate(param_combinations, 1):
    params = {
        'max_depth': max_depth,
        'min_samples_split': min_split,
        'min_samples_leaf': min_leaf,
        'max_features': max_feat,
        'bootstrap': boot
    }
    print(f"⚙️ Combinazione {i}/{len(param_combinations)} → {params}")
    start = time.time()
    
    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_bal, y_train_bal), 1):
        X_tr, X_val = X_train_bal[train_idx], X_train_bal[val_idx]
        y_tr, y_val = y_train_bal[train_idx], y_train_bal[val_idx]

        model = RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            **params
        )
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred, average='macro')
        fold_scores.append(score)
        print(f"   📈 Fold {fold}/5 → F1 = {score:.4f}")

    mean_score = np.mean(fold_scores)
    elapsed = time.time() - start
    print(f"✅ Combinazione {i} completata in {elapsed:.2f}s | F1 medio = {mean_score:.4f}\n")

    results.append((params, mean_score))
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print(f"🏁 GridSearch completata. Miglior F1 = {best_score:.4f}")
print(f"🏆 Migliori iperparametri trovati: {best_params}\n")

# ==========================================================
# 3️⃣ Training finale con i migliori parametri
# ==========================================================
best_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    **best_params
)
best_rf.fit(X_train_bal, y_train_bal)

# ==========================================================
# 4️⃣ Metriche dettagliate (train e test)
# ==========================================================
def print_metrics(y_true, y_pred, name, classes):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f"\n📊 Metriche sul {name} set:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")

    cm = confusion_matrix(y_true, y_pred, labels=classes)
    for i, cls in enumerate(classes):
        tn = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
        fp = cm[:, i].sum() - cm[i, i]
        spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        print(f"Specificità classe '{cls}': {spec:.4f}")
    return f1

y_train_pred = best_rf.predict(X_train_bal)
y_test_pred = best_rf.predict(X_test)

# Convertiamo tutte le etichette in stringhe per evitare il ValueError
y_train_bal_str = y_train_bal.astype(str)
y_test_str = y_test.astype(str)
classes_str = np.unique(np.concatenate([y_train_bal_str, y_test_str]))

train_f1 = print_metrics(y_train_bal_str, y_train_pred.astype(str), "TRAIN", classes_str)
test_f1 = print_metrics(y_test_str, y_test_pred.astype(str), "TEST", classes_str)

# ==========================================================
# 5️⃣ Analisi over/underfitting
# ==========================================================
gap_f1 = train_f1 - test_f1
if gap_f1 > 0.05:
    print(f"\n⚠️ Possibile OVERFITTING: gap F1 train-test = {gap_f1:.4f}")
elif test_f1 < 0.7:
    print(f"\n⚠️ Possibile UNDERFITTING: F1 test = {test_f1:.4f}")
else:
    print("\n✅ Modello bilanciato, nessun evidente overfitting/underfitting")


🏗️ Addestramento Random Forest (TRAIN BILANCIATO) con griglia estesa e monitoraggio fold per fold...

🔍 Inizio GridSearch manuale su 216 combinazioni (~1080 fit fold per fold)...

⚙️ Combinazione 1/216 → {'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True}
   📈 Fold 1/5 → F1 = 0.9997
   📈 Fold 2/5 → F1 = 1.0000
   📈 Fold 3/5 → F1 = 1.0000
   📈 Fold 4/5 → F1 = 1.0000
   📈 Fold 5/5 → F1 = 0.9999
✅ Combinazione 1 completata in 34.66s | F1 medio = 0.9999

⚙️ Combinazione 2/216 → {'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}
   📈 Fold 1/5 → F1 = 0.9997
   📈 Fold 2/5 → F1 = 1.0000
   📈 Fold 3/5 → F1 = 1.0000
   📈 Fold 4/5 → F1 = 1.0000
   📈 Fold 5/5 → F1 = 0.9999
✅ Combinazione 2 completata in 34.24s | F1 medio = 0.9999

⚙️ Combinazione 3/216 → {'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True}
   📈 Fold 1/5 → F1 = 0.9997


KeyboardInterrupt: 

In [None]:
# ==========================================================
# BLOCCO LightGBM con CV, metriche train vs test, learning curve
# ==========================================================
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, 
    ConfusionMatrixDisplay, precision_recall_curve, roc_curve, auc, average_precision_score
)
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

print("🏗️ Addestramento LightGBM con 5-fold CV e GridSearch (parametri moderati)...")

# 1️⃣ Modello base
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

# 2️⃣ GridSearchCV per iperparametri conservativi
param_grid = {
    'num_leaves': [31, 63],
    'max_depth': [10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# 3️⃣ Fit su train set
grid_search.fit(X_train, y_train)

# 4️⃣ Miglior modello
best_lgb = grid_search.best_estimator_
print(f"\n🏆 Miglior combinazione iperparametri: {grid_search.best_params_}")

# 5️⃣ Predizioni su train e test set
y_train_pred = best_lgb.predict(X_train)
y_test_pred = best_lgb.predict(X_test)

# 6️⃣ Metriche sul train set
train_acc = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')

print("\n📊 Metriche sul TRAIN set:")
print(f"Accuracy:  {train_acc:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall:    {train_recall:.4f}")
print(f"F1-score:  {train_f1:.4f}")

# 7️⃣ Metriche sul test set
test_acc = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

cm = confusion_matrix(y_test, y_test_pred)
specificity = cm.diagonal() / (cm.sum(axis=1) - cm.diagonal() + cm.diagonal())

print("\n📊 Metriche sul TEST set:")
print(f"Accuracy:  {test_acc:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-score:  {test_f1:.4f}")
for i, cls in enumerate(best_lgb.classes_):
    print(f"Specificity classe '{cls}': {specificity[i]:.4f}")

# 8️⃣ Valutazione over/underfitting
gap_f1 = train_f1 - test_f1
if gap_f1 > 0.05:
    print(f"\n⚠️ Possibile OVERFITTING: gap F1 train-test = {gap_f1:.4f}")
elif test_f1 < 0.7:
    print(f"\n⚠️ Possibile UNDERFITTING: F1 test = {test_f1:.4f}")
else:
    print("\n✅ Modello bilanciato, nessun evidente overfitting/underfitting")

# ==========================================================
# Learning Curve train vs test
# ==========================================================
train_sizes, train_scores, test_scores = learning_curve(
    best_lgb, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5)
)

train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(8,6))
plt.plot(train_sizes, train_mean, label="Train score", marker='o')
plt.plot(train_sizes, test_mean, label="Test score", marker='s')
plt.xlabel("Numero di campioni di training")
plt.ylabel("F1-score macro")
plt.title("📈 Learning Curve LightGBM")
plt.legend()
plt.grid(True)
plt.show()

# ==========================================================
# Confusion Matrix
# ==========================================================
plt.figure(figsize=(8,6))
ConfusionMatrixDisplay.from_estimator(best_lgb, X_test, y_test, cmap='Blues', normalize='true')
plt.title("📊 Confusion Matrix Normalizzata")
plt.show()

# ==========================================================
# Precision-Recall Curve Multiclass
# ==========================================================
y_test_bin = label_binarize(y_test, classes=best_lgb.classes_)
y_score = best_lgb.predict_proba(X_test)

plt.figure(figsize=(8,6))
for i, cls in enumerate(best_lgb.classes_):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
    ap = average_precision_score(y_test_bin[:, i], y_score[:, i])
    plt.plot(recall, precision, lw=2, label=f"{cls} (AP={ap:.2f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("📈 Precision-Recall Curve Multiclass")
plt.legend()
plt.grid(True)
plt.show()

# ==========================================================
# ROC Curve Multiclass
# ==========================================================
plt.figure(figsize=(8,6))
for i, cls in enumerate(best_lgb.classes_):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f"{cls} (AUC={roc_auc:.2f})")
plt.plot([0,1], [0,1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("📈 ROC Curve Multiclass")
plt.legend()
plt.grid(True)
plt.show()

# ==========================================================
# Feature Importance
# ==========================================================
importances = best_lgb.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10,6))
sns.barplot(x=importances[indices], y=X_train.columns[indices], palette="viridis")
plt.title("🌟 Feature Importance LightGBM")
plt.xlabel("Importanza")
plt.ylabel("Feature")
plt.show()
