In [None]:
# ==========================================================
# BLOCCO: Data Cleaning e Aggregazione ottimizzato (ZeekDataFall22)
# ==========================================================

import os
import glob
import pandas as pd
from tqdm import tqdm

# Folder that holds the raw parquet files of the dataset.
folder_path = r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekDataFall22"
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the concatenated frame identical from run to run.
parquet_files = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
if not parquet_files:
    # pd.concat([]) would otherwise fail with the unhelpful
    # "No objects to concatenate"; name the folder that was searched instead.
    raise FileNotFoundError(f"Nessun file .parquet trovato in: {folder_path}")

# -----------------------
# 1) Incremental loading: read every file, then concatenate once
# -----------------------
dfs = []
for file in tqdm(parquet_files, desc="Caricamento file parquet"):
    dfs.append(pd.read_parquet(file))
data = pd.concat(dfs, ignore_index=True)
print(f"Totale righe iniziali: {len(data)}")
print(f"Totale colonne iniziali: {data.shape[1]}")

# -----------------------
# 2) Cast object columns to category to reduce memory usage
# -----------------------
object_columns = data.select_dtypes(include=['object']).columns
for name in object_columns:
    data[name] = data[name].astype('category')

# -----------------------
# 3) Missing-value analysis: dtype, NaN count and NaN percentage per column,
#    most incomplete columns first
# -----------------------
col_summary = pd.DataFrame({'dtype': data.dtypes, 'num_missing': data.isna().sum()})
col_summary['perc_missing'] = data.isna().mean() * 100
col_summary = col_summary.sort_values('perc_missing', ascending=False)
display(col_summary)

# -----------------------
# 4) Drop columns whose NaN percentage exceeds the threshold (>50%)
# -----------------------
threshold = 50
too_sparse = col_summary['perc_missing'] > threshold
cols_to_drop = col_summary.loc[too_sparse].index.tolist()
if not cols_to_drop:
    print("Nessuna colonna da eliminare per troppi NaN.")
else:
    data.drop(columns=cols_to_drop, inplace=True)
    print(f"Colonne eliminate per troppi NaN (> {threshold}%): {cols_to_drop}")

# -----------------------
# 5) Duplicate removal (key columns only, to save RAM)
# -----------------------
# Keep only the key columns that are still present: any of them may be absent
# from the source files or may have been removed by the NaN filter above, and
# duplicated() raises KeyError for a missing subset column.
# Without 'uid' the comparison falls back to all columns (subset=None).
key_candidates = ['uid', 'ts', 'orig_bytes', 'resp_bytes']
subset_cols = (
    [c for c in key_candidates if c in data.columns]
    if 'uid' in data.columns else None
)
# Build the duplicate mask once and reuse it for both the count and the filter
# (drop_duplicates would recompute the same mask on the full frame).
duplicate_mask = data.duplicated(subset=subset_cols)
duplicates_before = duplicate_mask.sum()
data = data[~duplicate_mask]
print(f"Duplicati rimossi: {duplicates_before}")

# -----------------------
# 6) Missing-value imputation: mean for numeric columns, mode for categoricals
# -----------------------
num_cols = data.select_dtypes(include=['int64','float64']).columns
cat_cols = data.select_dtypes(include=['category']).columns

# The result is assigned back to the frame instead of calling
# data[col].fillna(..., inplace=True): the chained in-place form operates on a
# temporary Series and does not update `data` when pandas Copy-on-Write is on.
for col in num_cols:
    if data[col].isna().any():
        data[col] = data[col].fillna(data[col].mean())

for col in cat_cols:
    if data[col].isna().any():
        mode_val = data[col].mode()
        fill_value = mode_val[0] if not mode_val.empty else 'unknown'
        # A categorical can only be filled with a declared category; the
        # 'unknown' fallback is not one, so register it before filling.
        if fill_value not in data[col].cat.categories:
            data[col] = data[col].cat.add_categories([fill_value])
        data[col] = data[col].fillna(fill_value)

# -----------------------
# 7) Session-level aggregated features (one row per uid, merged back on data)
# -----------------------
# Without a uid column every row is treated as its own session.
if 'uid' not in data.columns:
    data['uid'] = range(len(data))

# output column -> (source column, aggregation function)
session_agg_spec = {
    'total_orig_bytes': ('orig_bytes', 'sum'),
    'total_resp_bytes': ('resp_bytes', 'sum'),
    'total_orig_pkts': ('orig_pkts', 'sum'),
    'total_resp_pkts': ('resp_pkts', 'sum'),
    'mean_duration': ('duration', 'mean'),
}
# A source column may have been dropped by the NaN filter earlier in this
# cell; aggregating a missing column raises KeyError, so skip it explicitly.
available_aggs = {
    out_name: (src_col, func)
    for out_name, (src_col, func) in session_agg_spec.items()
    if src_col in data.columns
}
missing_sources = sorted({src for src, _ in session_agg_spec.values()} - set(data.columns))
if missing_sources:
    print(f"⚠️ Colonne non disponibili, aggregazioni saltate: {missing_sources}")

if available_aggs:
    # observed=True: 'uid' is categorical when it was loaded as object, and a
    # categorical group key would otherwise emit one group per declared
    # category, including categories that have no rows.
    session_features = (
        data.groupby('uid', observed=True)
            .agg(**available_aggs)
            .reset_index()
    )
    data = pd.merge(data, session_features, on='uid', how='left')
else:
    # Nothing to aggregate: still expose one row per session for the report.
    session_features = data[['uid']].drop_duplicates().reset_index(drop=True)
print(f"Totale sessioni aggregate: {session_features.shape[0]}")

# -----------------------
# 8) Final preview
# -----------------------
display(data.head())
print("✅ Data Cleaning e Aggregazioni completati.")

In [None]:
# ==========================================================
# BLOCCO 2c: Rimozione traffico benigno ("none") per multiclasse
# ==========================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mandatory column check: every step below keys off 'label_technique'.
if 'label_technique' not in data.columns:
    raise KeyError("⚠️ Manca la colonna 'label_technique' nel dataset caricato.")

# ----- Before removal -----
total_count = len(data)
none_count = (data['label_technique'] == 'none').sum()
print(f"⚠️ Campioni benigni rilevati: {none_count} / {total_count} ({none_count/total_count*100:.2f}%)")

# Bar chart of the label distribution before removal
attack_counts_before = data['label_technique'].value_counts().sort_values(ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(y=attack_counts_before.index, x=attack_counts_before.values, palette='viridis')
plt.title("📊 Distribuzione categorie di attacco PRIMA della rimozione dei benigni")
plt.xlabel("Numero campioni")
plt.ylabel("Categoria di attacco")
plt.show()

# ----- Benign removal -----
data = data[data['label_technique'] != 'none'].reset_index(drop=True)
# Filtering rows does not shrink a categorical's category list: 'none' would
# stay declared and reappear with count 0 in every later value_counts()/plot.
if isinstance(data['label_technique'].dtype, pd.CategoricalDtype):
    data['label_technique'] = data['label_technique'].cat.remove_unused_categories()
print(f"✅ Dopo rimozione benigni: {len(data)} righe rimanenti.")

# ----- After removal -----
def _observed_counts(series):
    """Return value counts in descending order, skipping categories with no rows.

    For a categorical column value_counts() also lists categories that no
    longer occur (count 0), e.g. 'none' once the benign rows have been
    filtered out; those would show up as empty table rows and empty bars.
    """
    counts = series.value_counts().sort_values(ascending=False)
    counts = counts[counts > 0]
    if isinstance(counts.index, pd.CategoricalIndex):
        # The plotting axis follows the declared categories, so the stale
        # ones have to be removed from the index too.
        counts.index = counts.index.remove_unused_categories()
    return counts


attack_counts_after = _observed_counts(data['label_technique'])
attack_percent_after = (attack_counts_after / len(data) * 100).round(2)
attack_df_after = pd.DataFrame({
    'Conteggio': attack_counts_after,
    'Percentuale (%)': attack_percent_after
})
print("\n📊 Distribuzione aggiornata per categorie di attacco (solo attacchi, benigni rimossi):")
display(attack_df_after)

plt.figure(figsize=(10,6))
sns.barplot(y=attack_counts_after.index, x=attack_counts_after.values, palette='magma')
plt.title("📊 Distribuzione categorie di attacco DOPO la rimozione dei benigni")
plt.xlabel("Numero campioni")
plt.ylabel("Categoria di attacco")
plt.show()

# ----- Tactic distribution -----
# The other cells of this notebook name the tactic column 'label_tactic';
# 'tactic' is still honoured first so an existing column keeps its behaviour.
tactic_col = next((c for c in ('tactic', 'label_tactic') if c in data.columns), None)
if tactic_col is not None:
    tactic_counts_after = _observed_counts(data[tactic_col])
    tactic_percent_after = (tactic_counts_after / len(data) * 100).round(2)
    tactic_df_after = pd.DataFrame({
        'Conteggio': tactic_counts_after,
        'Percentuale (%)': tactic_percent_after
    })
    print("\n📊 Distribuzione aggiornata per tactic (benigni rimossi):")
    display(tactic_df_after)

In [None]:
# ==========================================================
# BLOCCO 3a: Controllo valori nulli e riepilogo colonne/feature
# ==========================================================

# Residual null check: keep only the columns that still contain NaN.
null_counts = data.isna().sum()
null_cols = null_counts.loc[null_counts.gt(0)]

if null_cols.empty:
    print("✅ Non ci sono valori nulli residui.")
else:
    print("⚠️ Colonne con valori nulli residui:")
    display(null_cols)

# Numbered list of the columns left after cleaning and aggregation.
print("\n📊 Colonne e feature disponibili per l'analisi:")
for position, column_name in enumerate(data.columns, start=1):
    print(f"{position}. {column_name}")

# Split the columns by dtype family for the following stages.
num_features = list(data.select_dtypes(include=['int64','float64']).columns)
cat_features = list(data.select_dtypes(include=['object','category']).columns)

print("\n🔹 Feature numeriche:", num_features, sep="\n")
print("\n🔹 Feature categoriali:", cat_features, sep="\n")
# --------------------------
# Persist the categorical features
# --------------------------
categorical_out_path = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\categorical_features.parquet"
# to_parquet does not create parent folders; make sure the target exists so a
# first run on a clean machine does not fail at the very end of the cell.
os.makedirs(os.path.dirname(categorical_out_path), exist_ok=True)
cat_features_df = data[cat_features].copy()
cat_features_df.to_parquet(categorical_out_path, index=False)
print(f"💾 Feature categoriali salvate: {cat_features_df.shape[1]} colonne, {cat_features_df.shape[0]} righe")

In [None]:
# ==========================================================
# BLOCCO 3b + 3c: Analisi feature numeriche, gestione outlier e trasformazione robusta
# ==========================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

# ================================
# Step 0: Safe normalisation of label_binary to 0/1 integers
# ================================
# Accepts booleans, their string spellings and 0/1. In Python 1 == True and
# 0 == False, so the numeric keys collapse onto the boolean ones (harmless).
binary_lookup = {True:1, False:0, 'True':1, 'False':0, 1:1, 0:0}
data['label_binary_clean'] = data['label_binary'].map(binary_lookup)
# Values outside the lookup map to NaN; those rows are discarded.
data = data.dropna(subset=['label_binary_clean'])
data['label_binary'] = data['label_binary_clean'].astype(int)
data = data.drop(columns=['label_binary_clean'])

# ================================
# Step 1: Numeric feature selection (label columns excluded)
# ================================
label_columns = ('label_binary', 'label_technique', 'label_tactic')
num_features = [
    name
    for name in data.select_dtypes(include=['int64','float64']).columns
    if name not in label_columns
]

# ================================
# Step 2: Variance filter (keep features whose variance exceeds 0.01)
# ================================
variance = data[num_features].var().sort_values(ascending=False)
selected_features = variance.loc[variance > 0.01].index.tolist()
print(f"✅ Feature con varianza significativa: {selected_features}")

# ================================
# Step 3: Outlier count per feature (1.5 * IQR fences)
# ================================
outlier_summary = {}
for feature in selected_features:
    values = data[feature]
    q_low, q_high = values.quantile([0.25, 0.75])
    fence = 1.5 * (q_high - q_low)
    outside_fences = (values < q_low - fence) | (values > q_high + fence)
    outlier_summary[feature] = outside_fences.sum()
print("⚠️ Numero di outlier trovati per feature:")
display(pd.Series(outlier_summary))

# ================================
# Step 4: Outlier handling (winsorization at 1%/99% + log transform)
# ================================
print("🏗️ Applicazione trasformazioni per gestire outlier...\n")
for col in selected_features:
    lower = data[col].quantile(0.01)
    upper = data[col].quantile(0.99)
    clipped = np.clip(data[col], lower, upper)
    # Shift non-positive columns so the smallest value becomes slightly
    # positive before taking the logarithm.
    min_val = clipped.min()
    offset = abs(min_val)+1e-6 if min_val <= 0 else 0
    transformed = np.log1p(clipped + offset)
    # Assign the cleaned Series back to the frame. The previous
    # data[col].replace(..., inplace=True) acted on a temporary Series and
    # leaves `data` untouched when pandas Copy-on-Write is active, so infinite
    # values would have survived the dropna below.
    data[col] = transformed.replace([np.inf, -np.inf], np.nan)

# Drop rows that still have NaN in any transformed feature.
data = data.dropna(subset=selected_features)

# ================================
# Step 5: Post-transform report
# ================================
print("\n📄 REPORT FINALE - Dataset post Winsorization + log\n")
print("🔹 Statistiche descrittive:")
display(data[selected_features].describe().transpose())

transform_rationale = (
    "- Alcune feature avevano outlier estremi che potevano distorcere le distribuzioni.\n"
    "- Winsorization: limita i valori ai percentili 1% e 99%, riducendo l'impatto degli outlier.\n"
    "- Log-transform: riduce l'asimmetria e migliora la stabilità numerica.\n"
    "- Il dataset risultante ha distribuzioni più compatte e valori pronti per scaling/normalizzazione."
)
print("\n📝 Motivazione trasformazione:")
print(transform_rationale)

# ================================
# Step 6: Histogram grid of the transformed features (3 panels per row)
# ================================
cols = 3
rows = math.ceil(len(selected_features)/cols)
grid_fig = plt.figure(figsize=(max(10, cols*5), max(5, rows*3)))
for panel_no, feature in enumerate(selected_features, 1):
    panel = grid_fig.add_subplot(rows, cols, panel_no)
    sns.histplot(data[feature].dropna(), bins=50, kde=True, color='skyblue', ax=panel)
    panel.set_title(feature)
    # Axis labels are blanked: the panel title already names the feature.
    panel.set_xlabel('')
    panel.set_ylabel('')
grid_fig.tight_layout()
plt.show()

In [None]:
# ==========================================================
# BLOCCO 3d: Analisi bilanciamento delle classi
# ==========================================================
import matplotlib.pyplot as plt

print("📊 Analisi bilanciamento classi (binary e multiclass)...\n")

# Binary target
binary_counts = data['label_binary'].value_counts()
binary_perc = binary_counts / binary_counts.sum() * 100
print("Distribuzione label_binary:")
# Build the table with explicit column names. The earlier
# concat(...).rename(columns={0: 'count', 1: '%'}) never matched: the
# concatenated columns carry the Series names, not the positions 0 and 1,
# so both columns ended up with the same (or a misleading) header.
print(pd.DataFrame({'count': binary_counts, '%': binary_perc.round(2)}))

# Multiclass target
multiclass_counts = data['label_tactic'].value_counts()
# A categorical column also reports categories whose rows were removed in
# earlier steps (count 0); leave them out of the table and the bar chart.
multiclass_counts = multiclass_counts[multiclass_counts > 0]
multiclass_perc = multiclass_counts / multiclass_counts.sum() * 100
print("\nDistribuzione label_tactic:")
print(pd.DataFrame({'count': multiclass_counts, '%': multiclass_perc.round(2)}))

# Side-by-side bar charts
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
binary_counts.plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title("Distribuzione Binary")
axes[0].set_xlabel("Label")
axes[0].set_ylabel("Conteggio")

multiclass_counts.plot(kind='bar', ax=axes[1], color='salmon')
axes[1].set_title("Distribuzione Multiclass (Tattiche)")
axes[1].set_xlabel("Tattica")
axes[1].set_ylabel("Conteggio")

plt.tight_layout()
plt.show()

In [None]:
# ==========================================================
# BLOCCO 3e: Consolidamento classi multiclass rare + class weights
# ==========================================================
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import joblib

# Tactics kept as their own class; every other value is folded into 'Other'.
main_classes = ['Resource Development', 'Reconnaissance', 'Discovery']


def _to_reduced_tactic(tactic):
    """Map a tactic label to itself when it is a main class, else to 'Other'."""
    if tactic in main_classes:
        return tactic
    return 'Other'


# New reduced multiclass target column
data['label_tactic_reduced'] = data['label_tactic'].apply(_to_reduced_tactic)

# Distribution of the reduced classes (absolute count and percentage)
reduced_counts = data['label_tactic_reduced'].value_counts()
reduced_perc = reduced_counts.div(reduced_counts.sum()).mul(100).round(2)
reduced_df = pd.DataFrame({'Count': reduced_counts, 'Percent (%)': reduced_perc})
print("📊 Distribuzione classi multiclass ridotte:")
display(reduced_df)

# Bar chart of the reduced distribution
reduced_fig, reduced_ax = plt.subplots(figsize=(8,5))
sns.barplot(x=reduced_counts.index, y=reduced_counts.values, palette='pastel', ax=reduced_ax)
reduced_ax.set_title("📊 Distribuzione classi multiclass ridotte")
reduced_ax.set_xlabel("Classe")
reduced_ax.set_ylabel("Conteggio")
plt.show()

# ================================
# Class weights ('balanced' heuristic), stored for use at training time
# ================================
reduced_target = data['label_tactic_reduced']
classes = reduced_target.unique()
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(classes),
    y=reduced_target
)
class_weights_dict = {label: weight for label, weight in zip(classes, class_weights)}
print("⚖️ Class weights per le classi ridotte:")
for label, weight in class_weights_dict.items():
    print(f"{label}: {weight:.2f}")

# Persist the mapping so later cells can reload it from disk.
os.makedirs("model_data", exist_ok=True)
joblib.dump(class_weights_dict, "model_data/class_weights_dict.pkl")
print("✅ Class weights salvati in 'model_data/class_weights_dict.pkl'")

In [None]:
# ==========================================================
# BLOCCO 4 Imbalanced: Preparazione dataset (classi sbilanciate con class weights)
# ==========================================================
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
import os

print("🏗️ Inizio preparazione dataset per autoencoder e classificazione (dataset sbilanciato con class weights)...\n")

# ================================
# 1) Targets and required columns
# ================================
target_multiclass_imb = 'label_tactic_reduced'
required_cols = ['label_binary', 'label_technique', 'label_tactic', target_multiclass_imb]
# Fail early, listing every label column that is absent.
missing_cols = list(filter(lambda name: name not in data.columns, required_cols))
if missing_cols:
    raise KeyError(f"❌ Colonne mancanti nel dataset originale: {missing_cols}")

# Targets: reduced multiclass label and binary label (independent copies)
y_binary_imb = data['label_binary'].copy()
y_multiclass_imb = data[target_multiclass_imb].copy()

# ================================
# 2) Feature set: every column except the four label columns
# ================================
feature_data_imb = data.drop(columns=required_cols)

# ================================
# 3) Datetime -> numeric (seconds since the Unix epoch)
# ================================
datetime_cols = feature_data_imb.select_dtypes(include=['datetime64']).columns.tolist()
unix_epoch = pd.Timestamp("1970-01-01")
for col in datetime_cols:
    # Dividing a timedelta by one second yields seconds regardless of the
    # column's stored resolution. astype('int64') / 1e9 is only correct for
    # nanosecond data and silently mis-scales columns stored in us or ms.
    feature_data_imb[col] = (feature_data_imb[col] - unix_epoch) / pd.Timedelta(seconds=1)

# ================================
# 4) Numeric features: the high-variance ones that are still present
# ================================
numerical_high_var_imb = [col for col in selected_features if col in feature_data_imb.columns]

# ================================
# 5) Frequency encoding of categorical features
# ================================
cat_features_imb = feature_data_imb.select_dtypes(include=['object','category']).columns.tolist()
encoded_data_imb = feature_data_imb.copy()

for col in cat_features_imb:
    # Relative frequency of each value, computed on the full column.
    freq = encoded_data_imb[col].value_counts(normalize=True)
    # Mapping a categorical column can return another categorical whose
    # categories are the frequencies; cast so the column is plain float64
    # before it reaches the scaler.
    encoded_data_imb[col] = encoded_data_imb[col].map(freq).astype('float64')

# Identity name mapping of the model input features (kept for export).
feature_mapping_imb = {col: col for col in numerical_high_var_imb + cat_features_imb}

# ================================
# 6) MinMax scaling of the model input columns
# ================================
model_input_columns = numerical_high_var_imb + cat_features_imb
encoded_data_imb = encoded_data_imb[model_input_columns]

scaler_auto_imb = MinMaxScaler()
scaled_values_imb = scaler_auto_imb.fit_transform(encoded_data_imb)
X_autoencoder_imb = pd.DataFrame(scaled_values_imb, columns=encoded_data_imb.columns)

# ================================
# 7) Persist the scaler and the feature mapping
# ================================
os.makedirs("model_data", exist_ok=True)
for artifact, artifact_path in (
    (scaler_auto_imb, "model_data/scaler_auto_imbalanced.pkl"),
    (feature_mapping_imb, "model_data/feature_mapping_imbalanced.pkl"),
):
    joblib.dump(artifact, artifact_path)

print(f"✅ Dataset sbilanciato pronto: {X_autoencoder_imb.shape}")
print(f"🔹 Numero di feature totali: {X_autoencoder_imb.shape[1]}")
print(f"🔹 Classi multiclass (sbilanciate): {y_multiclass_imb.unique().tolist()}")

# Reload the class weights written to disk by the class-weights cell.
class_weights_dict = joblib.load("model_data/class_weights_dict.pkl")
print(f"🔹 Class weights caricati da 'class_weights_dict.pkl'")

print("\n📊 Distribuzione classi (originale, sbilanciata):")
display(y_multiclass_imb.value_counts())

print("✅ Fine preparazione: X_autoencoder_imb, y_multiclass_imb, y_binary_imb pronti per modelli con class weights.")

In [None]:
# ==========================================================
# BLOCCO 5 IMBALANCED: Addestramento Autoencoder con logging avanzato
# ==========================================================
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # riduce warning TF/CUDA

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, EarlyStopping
import pandas as pd
import matplotlib.pyplot as plt
import time
import joblib

# Reset the Keras global state before building a new model.
tf.keras.backend.clear_session()

print("🏗️ Inizio costruzione e training dell'Autoencoder - Sbilanciato..")

# 1) Dimensions: one input unit per feature column, 16-unit bottleneck
input_dim_imb = X_autoencoder_imb.shape[1]
latent_dim_imb = 16

# 2) Symmetric dense autoencoder: input -> 64 -> 32 -> latent -> 32 -> 64 -> input
input_layer_imb = Input(shape=(input_dim_imb,))

encoded_imb = input_layer_imb
for hidden_units in (64, 32):
    encoded_imb = Dense(hidden_units, activation='relu')(encoded_imb)
# The bottleneck layer is named so the encoder can be extracted by name later.
encoded_imb = Dense(latent_dim_imb, activation='relu', name='latent_vector_imb')(encoded_imb)

decoded_imb = encoded_imb
for hidden_units in (32, 64):
    decoded_imb = Dense(hidden_units, activation='relu')(decoded_imb)
decoded_imb = Dense(input_dim_imb, activation='sigmoid')(decoded_imb)

autoencoder_imb = Model(inputs=input_layer_imb, outputs=decoded_imb)
autoencoder_imb.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# 3) Early stopping on validation loss, restoring the best weights
early_stop_options = {
    'monitor': 'val_loss',
    'patience': 10,
    'min_delta': 1e-4,
    'restore_best_weights': True,
    'verbose': 1,
}
early_stop_imb = EarlyStopping(**early_stop_options)

# 4️⃣ Callback custom con logging avanzato e tempo per epoca
class ProgressLoggerImb(Callback):
    """Keras callback that prints one progress line per epoch and records it.

    After training, ``epoch_logs`` holds one dict per completed epoch with the
    keys ``epoch`` (1-based), ``loss``, ``val_loss`` and ``time_s`` (seconds
    spent in the epoch).
    """

    def on_train_begin(self, logs=None):
        # Start from an empty history on every fit() call.
        self.epoch_logs = []

    def on_epoch_begin(self, epoch, logs=None):
        self.start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None and carries no 'val_loss' when fit() runs
        # without validation data; fall back to NaN instead of raising
        # TypeError/KeyError in the middle of training.
        logs = logs or {}
        loss = logs.get('loss', float('nan'))
        val_loss = logs.get('val_loss', float('nan'))

        elapsed = time.time() - self.start_time
        self.epoch_logs.append({
            'epoch': epoch+1,
            'loss': loss,
            'val_loss': val_loss,
            'time_s': elapsed
        })

        # Text progress bar proportional to the configured number of epochs.
        total_epochs = self.params['epochs']
        bar_len = 30
        progress = int(bar_len * (epoch+1)/total_epochs)
        bar = '━' * progress + '-' * (bar_len - progress)
        print(f"\rEpoch {epoch+1}/{total_epochs} [{bar}] "
              f"loss: {loss:.6f} | val_loss: {val_loss:.6f} | tempo: {elapsed:.2f}s", end='\n')

# 5️⃣ Training
logger_imb = ProgressLoggerImb()
# The autoencoder reconstructs its own input, so x and y are the same frame.
# NOTE(review): per the Keras fit() documentation validation_split takes the
# last fraction of the rows before shuffling - confirm that the tail of
# X_autoencoder_imb is representative of the whole dataset.
fit_options = dict(
    epochs=40,
    batch_size=64,
    validation_split=0.1,
    shuffle=True,
    verbose=0,
    callbacks=[early_stop_imb, logger_imb],
)
history_imb = autoencoder_imb.fit(X_autoencoder_imb, X_autoencoder_imb, **fit_options)

print("\n✅ Autoencoder IMBALANCED addestrato con successo.")

# 6) Best epoch = first epoch that reached the lowest validation loss
val_loss_history = history_imb.history['val_loss']
best_epoch_idx_imb = val_loss_history.index(min(val_loss_history))
best_val_loss_imb = val_loss_history[best_epoch_idx_imb]
best_train_loss_imb = history_imb.history['loss'][best_epoch_idx_imb]
best_time_imb = logger_imb.epoch_logs[best_epoch_idx_imb]['time_s']

print(f"🏆 Best epoch: {best_epoch_idx_imb+1}")
print(f"    Train loss: {best_train_loss_imb:.6f}")
print(f"    Validation loss: {best_val_loss_imb:.6f}")
print(f"    Tempo per epoca: {best_time_imb:.2f}s")

# 7️⃣ Estrazione encoder ottimale e generazione embeddings
# Encoder = sub-model from the input layer up to the named bottleneck layer.
latent_layer_imb = autoencoder_imb.get_layer('latent_vector_imb')
encoder_imb = Model(inputs=input_layer_imb, outputs=latent_layer_imb.output)

# Latent representation of every sample, one column per latent unit.
X_latent_imb = encoder_imb.predict(X_autoencoder_imb)
latent_column_names = [f'latent_imb_{i}' for i in range(latent_dim_imb)]
X_classifier_imb = pd.DataFrame(X_latent_imb, columns=latent_column_names)
# Positional index so the target lines up with the freshly built feature frame.
y_classifier_imb = y_multiclass_imb.reset_index(drop=True)

# NaN check on the target
assert y_classifier_imb.notna().all(), "Errore: y_classifier_imb contiene NaN"

print(f"✅ Embeddings generati: {X_classifier_imb.shape}")

# 8️⃣ Grafico Train vs Validation Loss
# Plot both curves against 1-based epoch numbers so the axis agrees with the
# "Best Epoch" label and with the per-epoch log lines. Plotting against the
# implicit 0-based index drew the marker one tick left of the labelled epoch.
epoch_numbers_imb = range(1, len(history_imb.history['loss']) + 1)
plt.figure(figsize=(10,6))
plt.plot(epoch_numbers_imb, history_imb.history['loss'], label='Train Loss', marker='o')
plt.plot(epoch_numbers_imb, history_imb.history['val_loss'], label='Validation Loss', marker='o')
plt.axvline(x=best_epoch_idx_imb+1, color='r', linestyle='--', label=f'Best Epoch ({best_epoch_idx_imb+1})')
plt.title("Autoencoder IMBALANCED - Andamento Train/Validation Loss", fontsize=14)
plt.xlabel("Epoca")
plt.ylabel("Loss (MSE)")
plt.legend()
plt.grid(True)
plt.show()

# 9️⃣ Salvataggio modelli e dataset latenti
os.makedirs("model_data", exist_ok=True)

# Keras models first, then the latent dataset (features and target).
for keras_model, model_path in (
    (autoencoder_imb, "model_data/autoencoder_imbalanced.h5"),
    (encoder_imb, "model_data/encoder_imbalanced.h5"),
):
    keras_model.save(model_path)

for latent_object, dump_path in (
    (X_classifier_imb, "model_data/X_classifier_imbalanced.pkl"),
    (y_classifier_imb, "model_data/y_classifier_imbalanced.pkl"),
):
    joblib.dump(latent_object, dump_path)

print("💾 Modelli e dataset latenti salvati in 'model_data/'")

In [None]:
# ==========================================================
# BLOCCO 6 (Imbalanced): Train/Test Split + Scaling + Analisi distribuzioni
# ==========================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

print("🏗️ Suddivisione IMBALANCED dataset in train/test e analisi bilanciamento classi...")

# 1) Stratified 80/20 split with a fixed seed
split_options = dict(test_size=0.2, stratify=y_classifier_imb, random_state=42)
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_classifier_imb, y_classifier_imb, **split_options
)

print("✅ Split completato (Imbalanced).")
print(f"Train: {X_train_imb.shape}, Test: {X_test_imb.shape}")

# 2) Class share (in percent) within each partition
train_dist_imb = y_train_imb.value_counts(normalize=True).mul(100)
test_dist_imb = y_test_imb.value_counts(normalize=True).mul(100)

print("\n📊 Distribuzione classi nel TRAIN (Imbalanced):")
print(train_dist_imb.round(2))
print("\n📊 Distribuzione classi nel TEST (Imbalanced):")
print(test_dist_imb.round(2))

# 3) One bar chart per partition, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
partition_panels = (
    (axes[0], train_dist_imb, "Distribuzione classi - Train (Imbalanced)"),
    (axes[1], test_dist_imb, "Distribuzione classi - Test (Imbalanced)"),
)
for panel, distribution, panel_title in partition_panels:
    sns.barplot(x=distribution.index, y=distribution.values, ax=panel)
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()

# 4️⃣ Scaling
print("\n⚙️ Applicazione StandardScaler sugli embeddings latenti (Imbalanced)...")
# The scaler is fitted on the training partition only and then applied to the
# test partition.
scaler_latent_imb = StandardScaler()
# Carry over the original row index. Without it the scaled frames get a fresh
# 0..n-1 index while y_train_imb / y_test_imb keep the shuffled index of the
# split, so any label-aligned operation between X and y would pair the wrong
# rows or fail.
X_train_imb_scaled = pd.DataFrame(
    scaler_latent_imb.fit_transform(X_train_imb),
    columns=X_train_imb.columns,
    index=X_train_imb.index,
)
X_test_imb_scaled = pd.DataFrame(
    scaler_latent_imb.transform(X_test_imb),
    columns=X_test_imb.columns,
    index=X_test_imb.index,
)
print("✅ Scaling completato (Imbalanced).")

# 5) Before/after histograms for the first five latent features
sample_features_imb = X_train_imb.columns[:5]
# squeeze=False keeps `axes` two-dimensional even with a single feature row,
# so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(len(sample_features_imb), 2, figsize=(10, 12), squeeze=False)
for i, feat in enumerate(sample_features_imb):
    sns.histplot(X_train_imb[feat], ax=axes[i, 0], kde=True)
    axes[i, 0].set_title(f"{feat} - Originale (Imbalanced)")
    sns.histplot(X_train_imb_scaled[feat], ax=axes[i, 1], kde=True)
    axes[i, 1].set_title(f"{feat} - Scaled (Imbalanced)")
plt.tight_layout()
plt.show()

print(f"\n✅ Dataset IMBALANCED pronto per classificazione multiclasse.")
print(f"Train: {X_train_imb_scaled.shape}, Test: {X_test_imb_scaled.shape}")

In [None]:
# ==========================================================
# BLOCCO 7 (Imbalanced): Salvataggio dataset, scaler e encoder
# ==========================================================
import joblib
import os

print("💾 Salvataggio dati e modelli IMBALANCED...")

# 1) Output folder
os.makedirs("model_data_imbalanced", exist_ok=True)

# 2) Train/test partitions as CSV (features are the scaled embeddings)
csv_outputs = (
    (X_train_imb_scaled, "model_data_imbalanced/X_train_imb.csv"),
    (X_test_imb_scaled, "model_data_imbalanced/X_test_imb.csv"),
    (y_train_imb, "model_data_imbalanced/y_train_imb.csv"),
    (y_test_imb, "model_data_imbalanced/y_test_imb.csv"),
)
for table, csv_path in csv_outputs:
    table.to_csv(csv_path, index=False)
print("✅ Dataset IMBALANCED salvati in 'model_data_imbalanced/'")

# 3) Scaler fitted on the latent training features
joblib.dump(scaler_latent_imb, "model_data_imbalanced/scaler_latent_imb.pkl")
print("✅ Scaler IMBALANCED salvato come 'scaler_latent_imb.pkl'")

# 4) Encoder extracted from the imbalanced autoencoder
encoder_imb.save("model_data_imbalanced/encoder_imb_best.keras")
print("✅ Encoder IMBALANCED salvato come 'encoder_imb_best.keras'")

print("\n🎯 Tutti i dati e modelli IMBALANCED pronti per il training!")