In [1]:
import os
import pandas as pd
from tqdm import tqdm

folder_data22 = r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekData22"
output_folder = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_chunks"
os.makedirs(output_folder, exist_ok=True)

all_files = [f for f in os.listdir(folder_data22) if f.endswith(".parquet") or f.endswith(".csv")]

for i, f in enumerate(tqdm(all_files, desc="Processing files")):
    file_path = os.path.join(folder_data22, f)
    
    # Caricamento file
    if f.endswith(".parquet"):
        df = pd.read_parquet(file_path)
    else:
        df = pd.read_csv(file_path)
    
    # =================
    # Rimozione duplicati
    # =================
    df = df.drop_duplicates()
    
    # =================
    # Imputazione valori mancanti
    # =================
    num_cols = df.select_dtypes(include=['int64','float64']).columns
    cat_cols = df.select_dtypes(include=['object','category']).columns
    
    for col in num_cols:
        if df[col].isna().any():
            df[col].fillna(df[col].mean(), inplace=True)
    for col in cat_cols:
        if df[col].isna().any():
            df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'unknown', inplace=True)
    
    # =================
    # Aggregazioni session-level se 'uid' presente
    # =================
    if 'uid' in df.columns:
        session_features = df.groupby('uid').agg(
            total_orig_bytes=('orig_bytes','sum'),
            total_resp_bytes=('resp_bytes','sum'),
            total_orig_pkts=('orig_pkts','sum'),
            total_resp_pkts=('resp_pkts','sum'),
            mean_duration=('duration','mean')
        ).reset_index()
        df = pd.merge(df, session_features, on='uid', how='left')
    else:
        df['uid'] = range(len(df))
    
    # =================
    # Salvataggio file pulito in chunk
    # =================
    df.to_parquet(os.path.join(output_folder, f"cleaned_{f.replace('.csv','.parquet')}"), index=False)


Processing files: 100%|██████████| 14/14 [01:16<00:00,  5.45s/it]


In [2]:
import glob
import pandas as pd
from tqdm import tqdm
import os

# -----------------------------
# Percorsi
# -----------------------------
cleaned_folder = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_chunks"
output_file = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_final.parquet"

# Trova tutti i file puliti
cleaned_files = glob.glob(os.path.join(cleaned_folder, "*.parquet"))

# -----------------------------
# Concatenazione memory-safe
# -----------------------------
dfs = []
for f in tqdm(cleaned_files, desc="Concatenating cleaned chunks"):
    dfs.append(pd.read_parquet(f))
    if len(dfs) == 5:  # concatenazione a blocchi di 5 file
        temp = pd.concat(dfs, ignore_index=True)
        dfs = [temp]

final_df = pd.concat(dfs, ignore_index=True)
print(f"✅ Totale righe dataset concatenato: {len(final_df)}")
print(f"✅ Totale colonne: {final_df.shape[1]}")

# -----------------------------
# Analisi valori nulli
# -----------------------------
null_counts = final_df.isna().sum()
null_perc = (null_counts / len(final_df) * 100).round(2)
null_df = pd.DataFrame({'Null Count': null_counts, 'Null %': null_perc})
null_df = null_df[null_df['Null Count']>0].sort_values('Null %', ascending=False)
if not null_df.empty:
    print("📊 Colonne con valori nulli residui:")
    display(null_df)
else:
    print("✅ Nessun valore null residuo.")

# -----------------------------
# Rimozione duplicati residui
# -----------------------------
duplicates = final_df.duplicated().sum()
if duplicates > 0:
    final_df = final_df.drop_duplicates()
    print(f"⚠️ Rimosso duplicati residui: {duplicates}")
else:
    print("✅ Nessun duplicato residuo trovato.")

# -----------------------------
# Aggregazioni session-level finali
# -----------------------------
if 'uid' in final_df.columns:
    print("🔹 Esecuzione aggregazioni session-level finali...")
    session_features = final_df.groupby('uid').agg(
        total_orig_bytes=('orig_bytes','sum'),
        total_resp_bytes=('resp_bytes','sum'),
        total_orig_pkts=('orig_pkts','sum'),
        total_resp_pkts=('resp_pkts','sum'),
        mean_duration=('duration','mean')
    ).reset_index()
    final_df = pd.merge(final_df, session_features, on='uid', how='left')
    print(f"✅ Aggregazioni completate per {final_df['uid'].nunique()} sessioni.")
else:
    print("⚠️ Colonna 'uid' mancante, impossibile fare aggregazioni session-level.")

# -----------------------------
# Imputazione residua valori nulli (opzionale)
# -----------------------------
num_cols = final_df.select_dtypes(include=['int64','float64']).columns
cat_cols = final_df.select_dtypes(include=['object','category']).columns

for col in num_cols:
    if final_df[col].isna().any():
        final_df[col].fillna(final_df[col].mean(), inplace=True)
for col in cat_cols:
    if final_df[col].isna().any():
        final_df[col].fillna(final_df[col].mode()[0] if not final_df[col].mode().empty else 'unknown', inplace=True)

print("✅ Pulizia finale completata. Dataset pronto per analisi o test dei modelli.")

# -----------------------------
# Conversione sicura tipi datetime e object
# -----------------------------
for col in final_df.columns:
    # Se il nome della colonna suggerisce un timestamp o datetime
    if 'time' in col.lower() or 'date' in col.lower():
        try:
            final_df[col] = pd.to_datetime(final_df[col], errors='coerce')
        except Exception:
            pass

# Alcune colonne potrebbero rimanere di tipo misto (object con numeri o liste)
# Convertiamo tutto ciò che non è numerico o datetime in stringa
for col in final_df.columns:
    if final_df[col].dtype == 'object':
        try:
            final_df[col] = final_df[col].astype(str)
        except Exception:
            final_df[col] = final_df[col].apply(lambda x: str(x) if not pd.isna(x) else "")

# -----------------------------
# Salvataggio dataset finale
# -----------------------------
try:
    final_df.to_parquet(output_file, index=False)
    print(f"💾 Dataset finale salvato con successo: {output_file}")
except Exception as e:
    print(f"❌ Errore durante il salvataggio in parquet: {e}")
    print("👉 Provo a salvare in formato CSV come fallback...")
    csv_output = output_file.replace('.parquet', '.csv')
    final_df.to_csv(csv_output, index=False)
    print(f"💾 Dataset salvato in formato CSV: {csv_output}")



Concatenating cleaned chunks: 100%|██████████| 14/14 [00:14<00:00,  1.02s/it]


✅ Totale righe dataset concatenato: 6182348
✅ Totale colonne: 42
📊 Colonne con valori nulli residui:


Unnamed: 0,Null Count,Null %
2022-02-13 - 2022-02-20,6182332,100.0
2022-02-06 - 2022-02-13,6182332,100.0
2022-01-16 - 2022-01-23,6182332,100.0
2022-01-09 - 2022-01-16,6182332,100.0
2022-01-02 - 2022-01-09,6182332,100.0
2021-12-26 - 2022-01-02,6182332,100.0
2021-12-19 - 2021-12-26,6182332,100.0
2021-12-12 - 2021-12-19,6182332,100.0
dest_port,4851845,78.48
mitre_attack_tactics,4851845,78.48


⚠️ Rimosso duplicati residui: 221319
🔹 Esecuzione aggregazioni session-level finali...
✅ Aggregazioni completate per 4851835 sessioni.
✅ Pulizia finale completata. Dataset pronto per analisi o test dei modelli.


  final_df[col] = pd.to_datetime(final_df[col], errors='coerce')


💾 Dataset finale salvato con successo: C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_final.parquet


In [4]:
import pandas as pd

# -----------------------------
# Caricamento dataset finale
# -----------------------------
final_df = pd.read_parquet(r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_final.parquet")
print(f"✅ Dataset caricato: {len(final_df)} righe, {final_df.shape[1]} colonne")

# -----------------------------
# Rimozione traffico benigno
# -----------------------------
benign_mask = final_df['label_tactic'].str.contains('Benign|Normal', case=False, na=False) | \
              final_df['mitre_attack_tactics'].str.contains('Benign|Normal', case=False, na=False)

df_attacks = final_df[~benign_mask].copy()
print(f"⚔️ Campioni malevoli: {len(df_attacks)} su {len(final_df)} totali ({len(df_attacks)/len(final_df)*100:.2f}%)")

# -----------------------------
# Distribuzione per tipo di attacco (label_tactic)
# -----------------------------
label_counts = (
    df_attacks['label_tactic']
    .value_counts(dropna=False)
    .rename_axis('Attack_Type')
    .reset_index(name='Count')
)
label_counts['Percentage'] = (label_counts['Count'] / len(df_attacks) * 100).round(2)

print("\n📊 Distribuzione per tipo di attacco (label_tactic):")
display(label_counts)

# -----------------------------
# Distribuzione per tattica MITRE (mitre_attack_tactics)
# -----------------------------
tactic_counts = (
    df_attacks['mitre_attack_tactics']
    .value_counts(dropna=False)
    .rename_axis('MITRE_Tactic')
    .reset_index(name='Count')
)
tactic_counts['Percentage'] = (tactic_counts['Count'] / len(df_attacks) * 100).round(2)

print("\n🧠 Distribuzione per tattica MITRE (mitre_attack_tactics):")
display(tactic_counts)


✅ Dataset caricato: 5961029 righe, 47 colonne
⚔️ Campioni malevoli: 5961029 su 5961029 totali (100.00%)

📊 Distribuzione per tipo di attacco (label_tactic):


Unnamed: 0,Attack_Type,Count,Percentage
0,none,5922625,99.36
1,Reconnaissance,36248,0.61
2,Discovery,2087,0.04
3,Credential Access,28,0.0
4,Privilege Escalation,14,0.0
5,Exfiltration,8,0.0
6,Lateral Movement,4,0.0
7,Resource Development,4,0.0
8,Defense Evasion,2,0.0
9,Initial Access,2,0.0



🧠 Distribuzione per tattica MITRE (mitre_attack_tactics):


Unnamed: 0,MITRE_Tactic,Count,Percentage
0,none,5956972,99.93
1,Discovery,2086,0.03
2,Reconnaissance,1971,0.03


In [7]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

# ======================
# PATHS
# ======================
input_folder = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_chunks"
output_file = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_merged_final.parquet"

# ======================
# CONCATENAZIONE CHUNKS
# ======================
all_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".parquet")]

dfs = []
for f in tqdm(all_files, desc="Merging cleaned chunks"):
    chunk = pd.read_parquet(f)
    dfs.append(chunk)

df = pd.concat(dfs, ignore_index=True)
del dfs  # libera memoria
print(f"✅ Dataset concatenato: {df.shape[0]:,} righe, {df.shape[1]} colonne")

# ======================
# RIMOZIONE DUPLICATI
# ======================
before_dups = len(df)
if "uid" in df.columns:
    df.drop_duplicates(subset="uid", inplace=True)
else:
    df.drop_duplicates(inplace=True)
after_dups = len(df)
print(f"🧹 Rimossi {before_dups - after_dups:,} duplicati ({round((before_dups - after_dups)/before_dups*100, 2)}%)")

# ======================
# PULIZIA COLONNE CON TROPPI NULL
# ======================
null_threshold = 0.6  # 60% null → colonna eliminata
null_ratios = df.isna().mean()
cols_to_drop = null_ratios[null_ratios > null_threshold].index.tolist()

if cols_to_drop:
    df.drop(columns=cols_to_drop, inplace=True)
    print(f"🧾 Eliminate {len(cols_to_drop)} colonne con >{null_threshold*100:.0f}% valori null: {cols_to_drop}")

# ======================
# IMPUTAZIONE VALORI MANCANTI
# ======================
for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:
        df[col].fillna(df[col].median(), inplace=True)
    elif df[col].dtype == 'object':
        df[col].fillna('unknown', inplace=True)

print("🧩 Valori mancanti imputati (median/unknown)")

# ======================
# FUSIONE ETICHETTE TATTICHE
# ======================
if 'label_tactic' in df.columns and 'mitre_attack_tactic' in df.columns:
    df['attack_tactic_temp'] = df['label_tactic'].fillna('none')
    df.loc[df['attack_tactic_temp'].isin(['none', '', 'unknown']), 'attack_tactic_temp'] = df['mitre_attack_tactic']
elif 'mitre_attack_tactic' in df.columns:
    df['attack_tactic_temp'] = df['mitre_attack_tactic']
elif 'label_tactic' in df.columns:
    df['attack_tactic_temp'] = df['label_tactic']
else:
    raise ValueError("❌ Nessuna colonna tattica trovata nel dataset.")

df['attack_tactic_temp'] = (
    df['attack_tactic_temp']
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({'nan': 'none', '': 'none', 'unknown': 'none'})
    .str.title()
)

df.drop(columns=['label_tactic', 'mitre_attack_tactic'], errors='ignore', inplace=True)
df.rename(columns={'attack_tactic_temp': 'label_tactic'}, inplace=True)

# ======================
# AGGREGAZIONI SESSION-LEVEL
# ======================
if 'uid' in df.columns:
    session_features = df.groupby('uid').agg(
        total_orig_bytes=('orig_bytes', 'sum'),
        total_resp_bytes=('resp_bytes', 'sum'),
        total_orig_pkts=('orig_pkts', 'sum'),
        total_resp_pkts=('resp_pkts', 'sum'),
        mean_duration=('duration', 'mean')
    ).reset_index()
    df = pd.merge(df, session_features, on='uid', how='left')
    print("📊 Aggregazioni session-level completate")

# ======================
# TABELLONE LABELS
# ======================
label_counts = df['label_tactic'].value_counts().reset_index()
label_counts.columns = ['Attack_Type', 'Count']
label_counts['Percentage'] = (label_counts['Count'] / len(df) * 100).round(2)
print("\n📊 Distribuzione finale per label_tactic:\n")
print(label_counts.to_string(index=False))

# ======================
# CONVERSIONE DATETIME
# ======================
datetime_cols = [col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()]
for col in datetime_cols:
    try:
        df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)
    except Exception:
        df[col] = df[col].astype(str)
print(f"\n🕒 Colonne convertite in datetime: {datetime_cols}")

# ======================
# TIPOLOGIE COMPATIBILI PARQUET
# ======================
df['uid'] = df['uid'].astype(str)
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str)

# ======================
# SALVATAGGIO FINALE
# ======================
df.to_parquet(output_file, index=False)
print(f"\n💾 Dataset finale salvato in: {output_file}")


Merging cleaned chunks:   0%|          | 0/14 [00:00<?, ?it/s]

Merging cleaned chunks: 100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


✅ Dataset concatenato: 6,182,348 righe, 42 colonne
🧹 Rimossi 1,330,513 duplicati (21.52%)
🧾 Eliminate 14 colonne con >60% valori null: ['protocol', 'dest_ip', 'dest_port', 'src_port', 'src_ip', 'mitre_attack_tactics', '2021-12-12 - 2021-12-19', '2021-12-19 - 2021-12-26', '2021-12-26 - 2022-01-02', '2022-01-02 - 2022-01-09', '2022-01-09 - 2022-01-16', '2022-01-16 - 2022-01-23', '2022-02-06 - 2022-02-13', '2022-02-13 - 2022-02-20']
🧩 Valori mancanti imputati (median/unknown)
📊 Aggregazioni session-level completate

📊 Distribuzione finale per label_tactic:

         Attack_Type   Count  Percentage
                None 4817498       99.29
      Reconnaissance   34277        0.71
   Credential Access      28        0.00
        Exfiltration       8        0.00
Privilege Escalation       6        0.00
    Lateral Movement       4        0.00
Resource Development       4        0.00
     Defense Evasion       2        0.00
           Discovery       1        0.00
      Initial Access       1 

In [9]:
import pandas as pd
import os

# ==========================================
# 0️⃣ RECUPERO FEATURE NUMERICHE E CATEGORICAL
# ==========================================

# Cartella dove salvare i parquet
os.makedirs("model_data", exist_ok=True)

# Caricamento di un chunk di riferimento per identificare le feature
sample_file = [f for f in os.listdir(input_folder) if f.endswith(".parquet")][0]
df_sample = pd.read_parquet(os.path.join(input_folder, sample_file))

# --------------------------
# Feature numeriche
# --------------------------
num_features = df_sample.select_dtypes(include=['int64','float64']).columns.tolist()
# Rimuovo colonne target o non informative
for col in ['label_binary','label_technique','label_tactic','label_tactic_reduced','uid']:
    if col in num_features:
        num_features.remove(col)

# Salvataggio parquet numeriche
pd.DataFrame(columns=num_features).to_parquet("model_data/numerical_features.parquet", index=False)
print(f"💾 Feature numeriche salvate ({len(num_features)} colonne): {num_features}")

# --------------------------
# Feature categoriali
# --------------------------
cat_features = df_sample.select_dtypes(include=['object','category']).columns.tolist()
# Salvataggio parquet categoriali
pd.DataFrame(columns=cat_features).to_parquet("model_data/categorical_features.parquet", index=False)
print(f"💾 Feature categoriali salvate ({len(cat_features)} colonne): {cat_features}")


💾 Feature numeriche salvate (16 colonne): ['resp_pkts', 'orig_ip_bytes', 'missed_bytes', 'duration', 'orig_pkts', 'resp_ip_bytes', 'dest_port', 'orig_bytes', 'resp_bytes', 'src_port', 'ts', 'total_orig_bytes', 'total_resp_bytes', 'total_orig_pkts', 'total_resp_pkts', 'mean_duration']
💾 Feature categoriali salvate (10 colonne): ['service', 'protocol', 'conn_state', 'dest_ip', 'community_id', 'datetime', 'history', 'uid', 'src_ip', 'mitre_attack_tactics']


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler

# ==============================
# 0️⃣ Caricamento dataset ZeekData22 già pulito
# ==============================
zeek22_file = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_final.parquet"
df_22 = pd.read_parquet(zeek22_file)
print(f"✅ ZeekData22 caricato: {df_22.shape[0]} righe, {df_22.shape[1]} colonne")

# ==============================
# 1️⃣ Caricamento feature list usate in ZeekDataFall22
# ==============================
num_features_fall = pd.read_parquet("model_data/numerical_features.parquet").columns.tolist()
cat_features_fall = pd.read_parquet("model_data/categorical_features.parquet").columns.tolist()

print(f"🔹 Numeriche Fall22: {len(num_features_fall)}")
print(f"🔹 Categoriali Fall22: {len(cat_features_fall)}")

# ==============================
# 2️⃣ Allineamento colonne numeriche
# ==============================
# Manteniamo solo le colonne numeriche comuni
num_common = [col for col in num_features_fall if col in df_22.columns]
# Se mancano colonne numeriche richieste → imputiamo a 0
num_missing = [col for col in num_features_fall if col not in df_22.columns]
for col in num_missing:
    df_22[col] = 0.0

# ==============================
# 3️⃣ Allineamento colonne categoriali
# ==============================
cat_common = [col for col in cat_features_fall if col in df_22.columns]
cat_missing = [col for col in cat_features_fall if col not in df_22.columns]
for col in cat_missing:
    df_22[col] = "unknown"

# ==============================
# 4️⃣ Frequency encoding su feature categoriali (stesso approccio Fall22)
# ==============================
for col in cat_features_fall:
    freq = df_22[col].value_counts(normalize=True)
    df_22[col] = df_22[col].map(freq).fillna(0.0)

# ==============================
# 5️⃣ Creazione dataset finale allineato
# ==============================
# Ordine esatto: numeriche ad alta varianza + categoriali encoded
feature_order = num_features_fall + cat_features_fall
X_22_aligned = df_22[feature_order].copy()

# ==============================
# 6️⃣ MinMax scaling (usando scaler dell'encoder Fall22 se salvato)
# ==============================
scaler_file = "model_data/scaler_autoencoder.pkl"  # se hai salvato lo scaler originale
try:
    scaler_auto = joblib.load(scaler_file)
    X_22_scaled = pd.DataFrame(
        scaler_auto.transform(X_22_aligned),
        columns=X_22_aligned.columns
    )
    print("✅ Scaling MinMax applicato con scaler Fall22 salvato")
except Exception:
    # fallback: scala tra 0 e 1 sul dataset Zeek22
    from sklearn.preprocessing import MinMaxScaler
    scaler_auto = MinMaxScaler()
    X_22_scaled = pd.DataFrame(
        scaler_auto.fit_transform(X_22_aligned),
        columns=X_22_aligned.columns
    )
    print("⚠️ Fallback: scaling MinMax calcolato su ZeekData22")

# ==============================
# 7️⃣ Caricamento encoder Fall22 e generazione embeddings latenti
# ==============================
import tensorflow as tf
from tensorflow.keras.models import load_model

encoder_file = "model_data/encoder_best.keras"
encoder = load_model(encoder_file, compile=False)

X_22_latent = encoder.predict(X_22_scaled)
latent_dim = X_22_latent.shape[1]
X_22_latent_df = pd.DataFrame(X_22_latent, columns=[f"latent_{i}" for i in range(latent_dim)])
print(f"✅ ZeekData22 trasformato in latent embeddings: {X_22_latent_df.shape}")

# ==============================
# 8️⃣ Salvataggio dataset pronto come test set
# ==============================
X_22_latent_df.to_parquet("model_data/ZeekData22_test_embeddings.parquet", index=False)
print("💾 ZeekData22 pronto come test set per autoencoder Fall22 salvato")


FileNotFoundError: [Errno 2] No such file or directory: 'model_data/scaler_auto.pkl'