In [5]:
# ==========================================================
# BLOCCO 1: Elaborazione file per file di ZeekData22 
# ==========================================================
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import seaborn as sns
import matplotlib.pyplot as plt

# ----------------------------
# Cartelle
# ----------------------------
input_dir = r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekData22"
output_dir = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"
os.makedirs(output_dir, exist_ok=True)

# ----------------------------
# Lista file CSV/Parquet
# ----------------------------
all_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) 
             if f.endswith('.csv') or f.endswith('.parquet')]

# ----------------------------
# Funzione di pulizia e trasformazione
# ----------------------------
def clean_and_transform(df):
    df = df.copy()
    
    # Pulizia valori numerici
    for col in df.select_dtypes(include=['float64','int64']).columns:
        df.loc[:, col] = df[col].fillna(df[col].mean())
    
    # Pulizia valori categorici
    for col in df.select_dtypes(include=['object','category']).columns:
        mode_val = df[col].mode()[0] if not df[col].mode().empty else 'unknown'
        df.loc[:, col] = df[col].fillna(mode_val)
    
    # Pulizia label_binary
    if 'label_binary' in df.columns:
        df['label_binary'] = df['label_binary'].map({True:1, False:0, 'True':1, 'False':0, 1:1, 0:0})
        df = df.dropna(subset=['label_binary'])
        df['label_binary'] = df['label_binary'].astype(int)
    
    # Selezione feature numeriche (escludendo label)
    num_features = df.select_dtypes(include=['int64','float64']).columns.tolist()
    for col in ['label_binary','label_technique','label_tactic']:
        if col in num_features:
            num_features.remove(col)
    
    # Rimuovi feature a bassa varianza
    variance = df[num_features].var()
    selected_features = variance[variance > 0.01].index.tolist()
    
    # Winsorization + log-transform
    for col in selected_features:
        lower = df[col].quantile(0.01)
        upper = df[col].quantile(0.99)
        df[col] = np.clip(df[col], lower, upper)
        min_val = df[col].min()
        offset = abs(min_val)+1e-6 if min_val <= 0 else 0
        df[col] = np.log1p(df[col] + offset)
        df[col].replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Rimozione eventuali NaN residui
    df = df.dropna(subset=selected_features)
    
    # A questo punto il file è pulito e trasformato, **senza bilanciamento**
    return df

# ----------------------------
# Elaborazione file per file
# ----------------------------
for file_path in tqdm(all_files, desc="Processing ZeekData22"):
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_parquet(file_path)
        
        df_clean = clean_and_transform(df)
        
        base_name = os.path.basename(file_path)
        output_file = os.path.join(output_dir, f"processed_{base_name}.parquet")
        df_clean.to_parquet(output_file, index=False)
        
    except Exception as e:
        print(f"⚠️ Errore con file {file_path}: {e}")

print(f"\n✅ Tutti i file elaborati e salvati in: {output_dir}")


Processing ZeekData22: 100%|██████████| 14/14 [02:35<00:00, 11.11s/it]


✅ Tutti i file elaborati e salvati in: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22





In [6]:
# ==========================================================
# BLOCCO 2: Visualizzazione colonne ZeekData22 
# ==========================================================
import os
import pandas as pd

# Cartella dei file già processati
processed_dir = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"
processed_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir) if f.endswith('.parquet')]

# Caricamento in un unico DataFrame (attenzione, può essere grande!)
dfs = []
for file in processed_files:
    dfs.append(pd.read_parquet(file))

data = pd.concat(dfs, ignore_index=True)
print(f"📊 Dataset caricato: {len(data)} righe, {data.shape[1]} colonne\n")

# ==============================
# 1️⃣ Elenco colonne presenti
# ==============================
columns_df = pd.DataFrame({'Colonne disponibili': data.columns.tolist()})
display(columns_df)

# ==============================
# 2️⃣ Distribuzione delle tattiche
# ==============================
# Controlliamo che la colonna esista
if 'label_tactic' in data.columns:
    tactic_counts = data['label_tactic'].value_counts().reset_index()
    tactic_counts.columns = ['Tattica', 'Conteggio']
    tactic_counts['Percentuale (%)'] = (tactic_counts['Conteggio'] / len(data) * 100).round(2)
    display(tactic_counts)
else:
    print("⚠️ Colonna 'label_tactic' non presente nel dataset")


📊 Dataset caricato: 20607218 righe, 37 colonne



Unnamed: 0,Colonne disponibili
0,resp_pkts
1,service
2,orig_ip_bytes
3,local_resp
4,missed_bytes
5,protocol
6,duration
7,conn_state
8,dest_ip
9,orig_pkts


Unnamed: 0,Tattica,Conteggio,Percentuale (%)
0,none,9281600,45.04
1,Reconnaissance,9278723,45.03
2,Discovery,2087,0.01
3,Credential Access,32,0.0
4,Privilege Escalation,14,0.0
5,Exfiltration,8,0.0
6,Lateral Movement,5,0.0
7,Resource Development,4,0.0
8,Defense Evasion,2,0.0
9,Initial Access,2,0.0


In [7]:
# ==========================================================
# BLOCCO 3: Confronto colonne ZeekData22 vs ZeekDataFall22
# ==========================================================
import os
import pandas as pd
from tqdm import tqdm

# === Percorso dataset Fall22 ===
folder_fall22 = r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekDataFall22"

# === Caricamento (come già fatto per ZeekData22) ===
all_files_fall22 = [os.path.join(folder_fall22, f) 
                    for f in os.listdir(folder_fall22) 
                    if f.endswith('.csv') or f.endswith('.parquet')]

dfs_fall22 = []
for f in tqdm(all_files_fall22, desc="Loading ZeekDataFall22"):
    if f.endswith('.csv'):
        dfs_fall22.append(pd.read_csv(f))
    else:
        dfs_fall22.append(pd.read_parquet(f))

data_fall22 = pd.concat(dfs_fall22, ignore_index=True)
print(f"\n📊 ZeekDataFall22 caricato: {len(data_fall22)} righe, {data_fall22.shape[1]} colonne\n")

# === 1️⃣ Confronto diretto nomi colonne ===
cols_22 = set(data.columns)
cols_fall22 = set(data_fall22.columns)

common_cols = sorted(list(cols_22.intersection(cols_fall22)))
only_in_22 = sorted(list(cols_22 - cols_fall22))
only_in_fall22 = sorted(list(cols_fall22 - cols_22))

print("✅ Colonne comuni:", len(common_cols))
print("❌ Solo in ZeekData22:", len(only_in_22))
print("❌ Solo in ZeekDataFall22:", len(only_in_fall22))

# Tabella riepilogativa
diff_table = pd.DataFrame({
    "Solo in ZeekData22": pd.Series(only_in_22),
    "Solo in ZeekDataFall22": pd.Series(only_in_fall22)
})
display(diff_table)

# === 2️⃣ Analisi somiglianza colonne con nome diverso ===
# Per evitare rallentamenti, usa solo un campione casuale
sample_22 = data.sample(n=min(10000, len(data)), random_state=42)
sample_fall22 = data_fall22.sample(n=min(10000, len(data_fall22)), random_state=42)

similarity_report = []

for col_22 in only_in_22:
    for col_fall in only_in_fall22:
        try:
            # se entrambe sono numeriche
            if pd.api.types.is_numeric_dtype(sample_22[col_22]) and pd.api.types.is_numeric_dtype(sample_fall22[col_fall]):
                corr = sample_22[col_22].corr(sample_fall22[col_fall])
                if corr > 0.95:
                    similarity_report.append((col_22, col_fall, f"Numerica - Corr: {corr:.3f}"))
            # se entrambe sono stringhe
            elif pd.api.types.is_object_dtype(sample_22[col_22]) and pd.api.types.is_object_dtype(sample_fall22[col_fall]):
                overlap = (sample_22[col_22].isin(sample_fall22[col_fall])).mean()
                if overlap > 0.9:
                    similarity_report.append((col_22, col_fall, f"Categorica - Overlap: {overlap:.2%}"))
        except Exception:
            continue

# === 3️⃣ Output somiglianze trovate ===
if similarity_report:
    print("\n🔍 Colonne con contenuto simile ma nome diverso:")
    sim_df = pd.DataFrame(similarity_report, columns=["Colonna ZeekData22", "Colonna ZeekDataFall22", "Somiglianza"])
    display(sim_df)
else:
    print("\nℹ️ Nessuna colonna con contenuto simile trovata tra i nomi diversi.")


Loading ZeekDataFall22: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]



📊 ZeekDataFall22 caricato: 700395 righe, 38 colonne

✅ Colonne comuni: 27
❌ Solo in ZeekData22: 10
❌ Solo in ZeekDataFall22: 11


Unnamed: 0,Solo in ZeekData22,Solo in ZeekDataFall22
0,2022-01-09 - 2022-01-16,2022-08-28 - 2022-09-04
1,2022-01-16 - 2022-01-23,2022-09-04 - 2022-09-11
2,2022-02-06 - 2022-02-13,2022-09-11 - 2022-09-18
3,2022-02-13 - 2022-02-20,2022-09-18 - 2022-09-25
4,dest_ip,2022-09-25 - 2022-10-02
5,dest_port,2022-10-02 - 2022-10-09
6,mitre_attack_tactics,2022-10-09 - 2022-10-16
7,protocol,2022-10-16 - 2022-10-23
8,src_ip,2022-10-23 - 2022-10-30
9,src_port,label_binary



🔍 Colonne con contenuto simile ma nome diverso:


Unnamed: 0,Colonna ZeekData22,Colonna ZeekDataFall22,Somiglianza
0,2022-01-09 - 2022-01-16,2022-08-28 - 2022-09-04,Categorica - Overlap: 100.00%
1,2022-01-09 - 2022-01-16,2022-09-04 - 2022-09-11,Categorica - Overlap: 100.00%
2,2022-01-09 - 2022-01-16,2022-09-11 - 2022-09-18,Categorica - Overlap: 100.00%
3,2022-01-09 - 2022-01-16,2022-09-18 - 2022-09-25,Categorica - Overlap: 100.00%
4,2022-01-09 - 2022-01-16,2022-09-25 - 2022-10-02,Categorica - Overlap: 100.00%
...,...,...,...
75,src_ip,2022-10-02 - 2022-10-09,Categorica - Overlap: 90.19%
76,src_ip,2022-10-09 - 2022-10-16,Categorica - Overlap: 90.19%
77,src_ip,2022-10-16 - 2022-10-23,Categorica - Overlap: 90.19%
78,src_ip,2022-10-23 - 2022-10-30,Categorica - Overlap: 90.19%


In [None]:
# ==========================================================
# BLOCCO 4: Analisi ZeekData22 (pulizia, varianza e confronto con ZeekDataFall22)
# ==========================================================
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from tqdm import tqdm
import gc

# ================================
# Step 1️⃣ + 2️⃣ - Caricamento, filtraggio e salvataggio progressivo (no concat in RAM)
# ================================
from tqdm import tqdm
import gc

processed_dir = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"
temp_dir = os.path.join(processed_dir, "filtered_chunks")
os.makedirs(temp_dir, exist_ok=True)

processed_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir) if f.endswith('.parquet')]

total_rows, benign_rows, kept_rows = 0, 0, 0

for i, file in enumerate(tqdm(processed_files, desc="Filtering ZeekData22")):
    df = pd.read_parquet(file)
    total_rows += len(df)

    if 'label_tactic' in df.columns:
        benign_count = (df['label_tactic'] == 'none').sum()
        benign_rows += benign_count
        df = df[df['label_tactic'] != 'none']
    
    kept_rows += len(df)
    # Salva ogni blocco filtrato come Parquet temporaneo
    chunk_path = os.path.join(temp_dir, f"filtered_chunk_{i}.parquet")
    df.to_parquet(chunk_path, index=False)
    del df
    gc.collect()

print(f"\n📊 Totale righe iniziali: {total_rows}")
print(f"⚠️ Benigni rimossi: {benign_rows} ({benign_rows/total_rows*100:.2f}%)")
print(f"✅ Righe rimanenti dopo filtraggio: {kept_rows}")
print(f"💾 Blocchi filtrati salvati in: {temp_dir}")

# Ora carica tutti i chunk filtrati (molto più leggeri) in un unico DataFrame
dfs = [pd.read_parquet(os.path.join(temp_dir, f)) for f in os.listdir(temp_dir) if f.endswith('.parquet')]
data = pd.concat(dfs, ignore_index=True)
del dfs
gc.collect()

print(f"\n📊 Dataset finale caricato: {len(data)} righe totali (post-filtraggio).")


# ================================
# Step 3️⃣ - Controllo valori nulli
# ================================
null_counts = data.isna().sum()
null_cols = null_counts[null_counts > 0]

if len(null_cols) == 0:
    print("\n✅ Nessun valore nullo rilevato.")
else:
    print("\n⚠️ Colonne con valori nulli:")
    display(null_cols)

# ================================
# Step 4️⃣ - Selezione feature numeriche e categoriali
# ================================
num_features = data.select_dtypes(include=['int64','float64']).columns.tolist()
cat_features = data.select_dtypes(include=['object','category']).columns.tolist()

print("\n🔹 Feature numeriche:", num_features)
print("\n🔹 Feature categoriali:", cat_features)

# ================================
# Step 5️⃣ - Calcolo varianza per feature numeriche
# ================================
variance = data[num_features].var().sort_values(ascending=False)
var_table = pd.DataFrame({
    'Feature': variance.index,
    'Varianza': variance.values,
    'Significativa (>0.01)': ['✅' if v > 0.01 else '❌' for v in variance.values]
})

print("\n📊 Tabella varianza (ZeekData22):")
display(var_table)

# Selezione feature significative
selected_features = variance[variance > 0.01].index.tolist()
print(f"\n✅ Feature con varianza significativa ({len(selected_features)}): {selected_features}")

# Salvataggio
variance22_path = os.path.join(processed_dir, "feature_variance_zeekdata22.csv")
var_table.to_csv(variance22_path, index=False)
print(f"💾 Tabella varianza ZeekData22 salvata in: {variance22_path}")

# ================================
# Step 6️⃣ - Analisi outlier e trasformazione
# ================================
outlier_summary = {}
for col in tqdm(selected_features, desc="Analisi outlier"):
    Q1, Q3 = data[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    outlier_summary[col] = ((data[col]<lower) | (data[col]>upper)).sum()

print("\n⚠️ Numero di outlier per feature:")
display(pd.Series(outlier_summary))

# Trasformazioni
print("\n🏗️ Applicazione Winsorization + Log Transform...")
for col in tqdm(selected_features, desc="Trasformazione"):
    lower = data[col].quantile(0.01)
    upper = data[col].quantile(0.99)
    data[col] = np.clip(data[col], lower, upper)
    min_val = data[col].min()
    offset = abs(min_val)+1e-6 if min_val <= 0 else 0
    data[col] = np.log1p(data[col] + offset)
    data[col].replace([np.inf, -np.inf], np.nan, inplace=True)

data = data.dropna(subset=selected_features)
print("✅ Dataset pulito e trasformato.")

# ================================
# Step 7️⃣ - Confronto con varianza ZeekDataFall22
# ================================
fall22_var_path = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\feature_variance_fall22.csv"
if os.path.exists(fall22_var_path):
    var_fall22 = pd.read_csv(fall22_var_path)
    var22 = pd.read_csv(variance22_path)

    merged_var = pd.merge(var22, var_fall22, on='Feature', how='outer', suffixes=('_ZeekData22','_ZeekDataFall22'))
    merged_var.fillna(0, inplace=True)

    print("\n📊 Confronto varianza tra ZeekData22 e ZeekDataFall22:")
    display(merged_var.head(20))

    plt.figure(figsize=(10,6))
    sns.scatterplot(data=merged_var, x='Varianza_ZeekDataFall22', y='Varianza_ZeekData22')
    plt.title("📈 Confronto varianza feature tra ZeekDataFall22 e ZeekData22")
    plt.xlabel("Varianza ZeekDataFall22")
    plt.ylabel("Varianza ZeekData22")
    plt.grid(True)
    plt.show()
else:
    print("\n⚠️ File feature_variance_fall22.csv non trovato, confronto non eseguito.")

# ================================
# Step 8️⃣ - Statistiche descrittive post trasformazione
# ================================
print("\n📄 Statistiche descrittive (post trasformazione):")
display(data[selected_features].describe().T)


Filtering ZeekData22:   0%|          | 0/18 [00:00<?, ?it/s]

Filtering ZeekData22: 100%|██████████| 18/18 [01:33<00:00,  5.21s/it]



📊 Totale righe iniziali: 41548181
⚠️ Benigni rimossi: 9281600 (22.34%)
✅ Righe rimanenti dopo filtraggio: 32266581
💾 Blocchi filtrati salvati in: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\filtered_chunks


MemoryError: Unable to allocate 246. MiB for an array with shape (1, 32266581) and data type object

In [11]:
# ==========================================================
# BLOCCO 5: Preprocessing robusto ZeekData22 per test encoder Fall22
# ==========================================================
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc

# --------------------------
# Percorsi
# --------------------------
processed_dir = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"
temp_dir = os.path.join(processed_dir, "filtered_chunks")
fall22_var_path = os.path.join(processed_dir, "feature_variance_fall22.csv")
os.makedirs(temp_dir, exist_ok=True)

# --------------------------
# Step 1: Caricamento e filtraggio benigni in chunk
# --------------------------
processed_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir) if f.endswith('.parquet')]
kept_rows, total_rows, benign_rows = 0, 0, 0

for i, file in enumerate(tqdm(processed_files, desc="Filtering ZeekData22")):
    df = pd.read_parquet(file)
    total_rows += len(df)

    if 'label_tactic' in df.columns:
        benign_count = (df['label_tactic'] == 'none').sum()
        benign_rows += benign_count
        df = df[df['label_tactic'] != 'none']
    
    kept_rows += len(df)
    # Salva chunk filtrato
    chunk_path = os.path.join(temp_dir, f"filtered_chunk_{i}.parquet")
    df.to_parquet(chunk_path, index=False)
    del df
    gc.collect()

print(f"\nTotale righe iniziali: {total_rows}")
print(f"Benigni rimossi: {benign_rows} ({benign_rows/total_rows*100:.2f}%)")
print(f"Righe rimanenti dopo filtraggio: {kept_rows}")

# --------------------------
# Step 2: Caricamento chunk filtrati in DataFrame unico (più leggero)
# --------------------------
dfs = [pd.read_parquet(os.path.join(temp_dir, f)) for f in os.listdir(temp_dir) if f.endswith('.parquet')]
data = pd.concat(dfs, ignore_index=True)
del dfs
gc.collect()
print(f"\nDataset finale caricato: {len(data)} righe")

# --------------------------
# Step 3: Selezione feature numeriche e allineamento con Fall22
# --------------------------
num_features = data.select_dtypes(include=['int64','float64']).columns.tolist()

if os.path.exists(fall22_var_path):
    var_fall22 = pd.read_csv(fall22_var_path)
    fall22_features = var_fall22['Feature'].tolist()
    # Conserva solo le feature numeriche usate dall'encoder Fall22
    selected_features = [f for f in fall22_features if f in num_features]
else:
    selected_features = num_features  # fallback
print(f"\nFeature numeriche selezionate ({len(selected_features)}): {selected_features}")

# --------------------------
# Step 4: Gestione outlier e trasformazione robusta
# --------------------------
for col in tqdm(selected_features, desc="Winsorization + LogTransform"):
    # Winsorization: clip tra 1° e 99° percentile
    lower = data[col].quantile(0.01)
    upper = data[col].quantile(0.99)
    data[col] = np.clip(data[col], lower, upper)
    
    # Log-transform robusto
    data[col] = np.log1p(data[col].clip(lower=0))

# --------------------------
# Step 5: Gestione NaN senza droppare righe
# --------------------------
data[selected_features] = data[selected_features].fillna(0)

# --------------------------
# Step 6: Salvataggio dataset pronto per test
# --------------------------
test_path = os.path.join(processed_dir, "ZeekData22_test_ready.parquet")
data[selected_features].to_parquet(test_path, index=False)
print(f"\n✅ Dataset ZeekData22 pronto per test salvato in: {test_path}")


Filtering ZeekData22: 100%|██████████| 14/14 [00:51<00:00,  3.69s/it]



Totale righe iniziali: 20607218
Benigni rimossi: 9281600 (45.04%)
Righe rimanenti dopo filtraggio: 11325618

Dataset finale caricato: 11325618 righe

Feature numeriche selezionate (11): ['ts', 'dest_port_zeek', 'src_port_zeek', 'resp_ip_bytes', 'orig_ip_bytes', 'resp_bytes', 'orig_bytes', 'resp_pkts', 'orig_pkts', 'missed_bytes', 'duration']


Winsorization + LogTransform: 100%|██████████| 11/11 [00:10<00:00,  1.07it/s]



✅ Dataset ZeekData22 pronto per test salvato in: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_test_ready.parquet


In [20]:
# ==========================================================
# BLOCCO 6b: Confronto avanzato feature categoriali ZeekData22 vs ZeekDataFall22
# ==========================================================
import pandas as pd
import os

processed_dir = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22"
output_dir = os.path.join(processed_dir, "reports")
os.makedirs(output_dir, exist_ok=True)

# Percorsi
cat22_path = os.path.join(processed_dir, "categorical_features_zeekdata22.parquet")
catfall_path = os.path.join(processed_dir, "categorical_features_zeekdatafall22.parquet")

# Caricamento feature categoriali
cat22 = pd.read_parquet(cat22_path)
catfall = pd.read_parquet(catfall_path)

# -----------------------------
# Analisi colonne
# -----------------------------
common_cols = [c for c in cat22.columns if c in catfall.columns]
only_22 = [c for c in cat22.columns if c not in catfall.columns]
only_fall = [c for c in catfall.columns if c not in cat22.columns]

print(f"🔹 Colonne comuni: {len(common_cols)}")
print(f"🔹 Colonne solo ZeekData22: {len(only_22)} -> {only_22}")
print(f"🔹 Colonne solo ZeekDataFall22: {len(only_fall)} -> {only_fall}")

# -----------------------------
# Report valori unici colonne comuni
# -----------------------------
report_rows = []

for col in common_cols:
    vals_22 = set(cat22[col].dropna().unique())
    vals_fall = set(catfall[col].dropna().unique())
    only_in_22 = vals_22 - vals_fall
    only_in_fall = vals_fall - vals_22
    report_rows.append({
        'Feature': col,
        'Unique_in_ZeekData22': len(vals_22),
        'Unique_in_ZeekDataFall22': len(vals_fall),
        'Only_in_22': list(only_in_22),
        'Only_in_Fall': list(only_in_fall)
    })

report_df = pd.DataFrame(report_rows)
report_csv_path = os.path.join(output_dir, "categorical_features_comparison_report.csv")
report_df.to_csv(report_csv_path, index=False)

print(f"\n💾 Report valori unici salvato in: {report_csv_path}")
print("✅ Confronto completato.")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\maria\\Desktop\\Zeek_ML\\processed_zeekdata22\\categorical_features_zeekdata22.parquet'

In [None]:
# ================================
# BLOCCO 6: Preprocessing numerico + filtraggio classi per test
# ================================

from tqdm import tqdm
import numpy as np

# --------------------------
# Classe target da mantenere
# --------------------------
target_classes = ['Resource Development', 'Reconnaissance', 'Discovery']

# Filtra solo le righe con classi target
if 'label_tactic' in data.columns:
    # Manteniamo anche la categoria "Other" per eventuali righe residue
    data['label_tactic_reduced'] = data['label_tactic'].where(
        data['label_tactic'].isin(target_classes), other='Other'
    )
    
    print(f"\n✅ Righe dopo filtraggio classi target ({len(target_classes)}): {len(data)}")

    # --------------------------
    # Tabella distribuzione classi
    # --------------------------
    class_counts = data['label_tactic_reduced'].value_counts()
    class_percent = (class_counts / len(data) * 100).round(2)
    class_table = pd.DataFrame({
        'Count': class_counts,
        'Percent (%)': class_percent
    })
    print("\n📊 Distribuzione classi target:")
    display(class_table)

# --------------------------
# Feature numeriche da usare
# --------------------------
num_features = data.select_dtypes(include=['int64','float64']).columns.tolist()
if os.path.exists(fall22_var_path):
    var_fall22 = pd.read_csv(fall22_var_path)
    fall22_features = var_fall22['Feature'].tolist()
    selected_features = [f for f in fall22_features if f in num_features]
else:
    selected_features = num_features
print(f"\n🔹 Feature numeriche selezionate ({len(selected_features)}): {selected_features}")

# --------------------------
# Winsorization + LogTransform robusto
# --------------------------
for col in tqdm(selected_features, desc="Winsorization + LogTransform"):
    lower = data[col].quantile(0.01)
    upper = data[col].quantile(0.99)
    data[col] = np.clip(data[col], lower, upper)
    data[col] = np.log1p(data[col].clip(lower=0))

# --------------------------
# Gestione NaN senza droppare righe
# --------------------------
data[selected_features] = data[selected_features].fillna(0)

# --------------------------
# Salvataggio dataset pronto per test
# --------------------------
test_path = os.path.join(processed_dir, "ZeekData22_test_ready_classes.parquet")
data[selected_features + ['label_tactic_reduced']].to_parquet(test_path, index=False)
print(f"\n✅ Dataset ZeekData22 pronto per test con classi target salvato in: {test_path}")



✅ Righe dopo filtraggio classi target (3): 9280814

📊 Distribuzione classi target:


Unnamed: 0_level_0,Count,Percent (%)
label_tactic_reduced,Unnamed: 1_level_1,Unnamed: 2_level_1
Reconnaissance,9278723,99.98
Discovery,2087,0.02
Resource Development,4,0.0



🔹 Feature numeriche selezionate (11): ['ts', 'dest_port_zeek', 'src_port_zeek', 'resp_ip_bytes', 'orig_ip_bytes', 'resp_bytes', 'orig_bytes', 'resp_pkts', 'orig_pkts', 'missed_bytes', 'duration']


Winsorization + LogTransform: 100%|██████████| 11/11 [00:09<00:00,  1.16it/s]



✅ Dataset ZeekData22 pronto per test con classi target salvato in: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_test_ready_classes.parquet


In [14]:
# ================================
# BLOCCO 7: Bilanciamento classi Discovery e Reconnaissance
# ================================
from sklearn.utils import resample
import pandas as pd
import os

# --------------------------
# Filtra solo le due classi principali
# --------------------------
data_bal = data[data['label_tactic'].isin(['Reconnaissance', 'Discovery'])].copy()
print(f"\n✅ Righe dopo filtraggio Discovery + Reconnaissance: {len(data_bal)}")

# --------------------------
# Controllo distribuzione originale
# --------------------------
class_counts = data_bal['label_tactic'].value_counts()
class_percent = data_bal['label_tactic'].value_counts(normalize=True) * 100
dist_table = pd.DataFrame({'Count': class_counts, 'Percent (%)': class_percent})
print("\n📊 Distribuzione originale delle classi:")
display(dist_table)

# --------------------------
# Downsample della classe maggiore
# --------------------------
minority_class_size = class_counts.min()
dfs_balanced = []

for cls in ['Reconnaissance', 'Discovery']:
    cls_df = data_bal[data_bal['label_tactic'] == cls]
    if len(cls_df) > minority_class_size:
        cls_df = resample(cls_df,
                          replace=False,
                          n_samples=minority_class_size,
                          random_state=42)
    dfs_balanced.append(cls_df)

# --------------------------
# Combina classi bilanciate
# --------------------------
data_balanced = pd.concat(dfs_balanced, ignore_index=True)
data_balanced = data_balanced.sample(frac=1, random_state=42)  # shuffle
print(f"\n✅ Dataset bilanciato pronto: {len(data_balanced)} righe")

# --------------------------
# Feature numeriche da usare
# --------------------------
selected_features = [f for f in selected_features if f in data_balanced.columns]
print(f"\n🔹 Feature numeriche selezionate ({len(selected_features)}): {selected_features}")

# --------------------------
# Salvataggio dataset bilanciato pronto per test
# --------------------------
balanced_path = os.path.join(processed_dir, "ZeekData22_test_ready_balanced.parquet")
data_balanced[selected_features + ['label_tactic']].to_parquet(balanced_path, index=False)
print(f"\n💾 Dataset bilanciato salvato in: {balanced_path}")

# --------------------------
# Distribuzione finale
# --------------------------
final_counts = data_balanced['label_tactic'].value_counts()
final_percent = data_balanced['label_tactic'].value_counts(normalize=True) * 100
final_table = pd.DataFrame({'Count': final_counts, 'Percent (%)': final_percent})
print("\n📊 Distribuzione finale delle classi bilanciate:")
display(final_table)



✅ Righe dopo filtraggio Discovery + Reconnaissance: 9280810

📊 Distribuzione originale delle classi:


Unnamed: 0_level_0,Count,Percent (%)
label_tactic,Unnamed: 1_level_1,Unnamed: 2_level_1
Reconnaissance,9278723,99.977513
Discovery,2087,0.022487



✅ Dataset bilanciato pronto: 4174 righe

🔹 Feature numeriche selezionate (11): ['ts', 'dest_port_zeek', 'src_port_zeek', 'resp_ip_bytes', 'orig_ip_bytes', 'resp_bytes', 'orig_bytes', 'resp_pkts', 'orig_pkts', 'missed_bytes', 'duration']

💾 Dataset bilanciato salvato in: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_test_ready_balanced.parquet

📊 Distribuzione finale delle classi bilanciate:


Unnamed: 0_level_0,Count,Percent (%)
label_tactic,Unnamed: 1_level_1,Unnamed: 2_level_1
Reconnaissance,2087,50.0
Discovery,2087,50.0


In [19]:
# ==========================================================
# BLOCCO 7️⃣ - Generazione embeddings Test Set ZeekData22 (allineato Fall22)
# ==========================================================
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import joblib
import os

# --------------------------
# Percorsi
# --------------------------
encoder_path = "model_data/encoder_best.keras"
scaler_path = "model_data/scaler_latent.pkl"
fall22_var_path = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\feature_variance_fall22.csv"
test_path = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_test_ready_balanced.parquet"
output_dir = "model_data"
os.makedirs(output_dir, exist_ok=True)

# --------------------------
# Caricamento encoder, scaler e test set
# --------------------------
encoder = load_model(encoder_path)
scaler_latent = joblib.load(scaler_path)
data_test = pd.read_parquet(test_path)

# Separazione feature/label
X_test = data_test.drop(columns=['label_tactic'])
y_test = data_test['label_tactic'].reset_index(drop=True)
print(f"✅ Test set caricato: {X_test.shape[0]} righe, {X_test.shape[1]} feature")

# --------------------------
# Allineamento feature con Fall22
# --------------------------
fall22_features = pd.read_csv(fall22_var_path)['Feature'].tolist()

# Rimuovi eventuali feature extra
X_test = X_test[[f for f in fall22_features if f in X_test.columns]]

# Aggiungi feature mancanti con 0
for f in fall22_features:
    if f not in X_test.columns:
        X_test[f] = 0

# Ordina colonne come nell'encoder Fall22
X_test = X_test[fall22_features]
print(f"✅ Feature test set allineate correttamente: {X_test.shape}")

# --------------------------
# Winsorization + LogTransform robusto (come train)
# --------------------------
for col in X_test.columns:
    lower = X_test[col].quantile(0.01)
    upper = X_test[col].quantile(0.99)
    X_test[col] = np.clip(X_test[col], lower, upper)
    X_test[col] = np.log1p(X_test[col].clip(lower=0))

# Riempimento NaN con 0
X_test = X_test.fillna(0)

# --------------------------
# Generazione embeddings latenti
# --------------------------
X_test_latent = encoder.predict(X_test, verbose=1)
latent_cols = [f'latent_{i}' for i in range(X_test_latent.shape[1])]
X_test_latent = pd.DataFrame(X_test_latent, columns=latent_cols)

# --------------------------
# Applicazione scaler del train set
# --------------------------
X_test_scaled = pd.DataFrame(
    scaler_latent.transform(X_test_latent),
    columns=latent_cols
)

# --------------------------
# Salvataggio embeddings e label
# --------------------------
X_test_scaled.to_csv(os.path.join(output_dir, "X_test_embeddings.csv"), index=False)
y_test.to_csv(os.path.join(output_dir, "y_test.csv"), index=False)

print(f"\n💾 Test set embeddings scalati salvati in '{output_dir}/X_test_embeddings.csv'")
print(f"💾 Etichette test salvate in '{output_dir}/y_test.csv'")
print(f"✅ Tutto pronto per valutazione sul modello multiclasse!")


✅ Test set caricato: 4174 righe, 11 feature
✅ Feature test set allineate correttamente: (4174, 16)


ValueError: Input 0 of layer "functional_1" is incompatible with the layer: expected shape=(None, 28), found shape=(32, 16)