In [None]:
# ==========================================================
# BLOCCO 1: Setup e caricamento dataset ZeekData22
# ==========================================================
import os
import pandas as pd
import glob

# Folder containing the ZeekData22 files.
# The ZEEKDATA22_DIR environment variable, when set, overrides the default so the
# notebook can run on machines other than the original author's workstation.
folder_data22 = os.environ.get(
    "ZEEKDATA22_DIR",
    r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekData22",
)

# Helper that loads every parquet/csv file of a folder into one DataFrame
def load_dataset(folder_path):
    """Read all ``.parquet`` and ``.csv`` entries of ``folder_path`` and concatenate them.

    Entries are processed in sorted name order so that the row order of the
    result is reproducible across runs and platforms (``os.listdir`` gives no
    ordering guarantee). Entries with any other extension are ignored.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise with a fresh 0..n-1 index. Files with
        different columns are combined on the union of their columns, with
        NaN where a file lacks a column. An empty DataFrame is returned when
        no matching file exists.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when ``folder_path`` does not exist.
    """
    dfs = []
    for f in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, f)
        if f.endswith(".parquet"):
            dfs.append(pd.read_parquet(path))
        elif f.endswith(".csv"):
            dfs.append(pd.read_csv(path))
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

# Read the whole ZeekData22 folder into a single DataFrame and report its size
df_22 = load_dataset(folder_path=folder_data22)
print(f"✅ Dataset ZeekData22 caricato: {df_22.shape}")



✅ Dataset ZeekData22 caricato: (18585503, 37)
Prime righe:


Unnamed: 0,resp_pkts,service,orig_ip_bytes,local_resp,missed_bytes,protocol,duration,conn_state,dest_ip,orig_pkts,...,src_ip_zeek,label_tactic,2021-12-12 - 2021-12-19,2021-12-19 - 2021-12-26,2021-12-26 - 2022-01-02,2022-01-02 - 2022-01-09,2022-01-09 - 2022-01-16,2022-01-16 - 2022-01-23,2022-02-06 - 2022-02-13,2022-02-13 - 2022-02-20
0,2.0,dns,186.0,False,0.0,udp,0.00228,SF,143.88.5.1,2.0,...,,,,,,,,,,
1,2.0,dns,186.0,False,0.0,udp,0.00228,SF,143.88.5.1,2.0,...,,,,,,,,,,
2,2.0,dns,186.0,False,0.0,udp,0.00228,SF,143.88.5.1,2.0,...,,,,,,,,,,
3,2.0,dns,186.0,False,0.0,udp,0.00228,SF,143.88.5.1,2.0,...,,,,,,,,,,
4,2.0,dns,186.0,False,0.0,udp,0.00228,SF,143.88.5.1,2.0,...,,,,,,,,,,


In [None]:
# ==========================================================
# BLOCCO 2: Controllo e mapping colonne
# ==========================================================
# Show the columns as loaded
print("Colonne originali ZeekData22:")
print(df_22.columns.tolist())

# Column mapping used to align ZeekData22 with the ZeekDataFall22 naming
rename_dict = {
    'src_ip':'src_ip_zeek',
    'dest_ip':'dest_ip_zeek',
    'src_port':'src_port_zeek',
    'dest_port':'dest_port_zeek',
    'proto':'protocol',
    'mitre_attack_tactics':'label_tactic'
}

# The loaded frame can contain both the old and the new name of a field (the
# files in the folder do not all share one schema). A plain rename would then
# produce two columns with the same label, and every later `df_22[col]` on that
# label would return a DataFrame instead of a Series. So:
#   - target already present -> fill its gaps from the old column, drop the old one
#   - target absent          -> ordinary rename
plain_renames = {}
merged_sources = []
for old_name, new_name in rename_dict.items():
    if old_name not in df_22.columns:
        continue
    if new_name in df_22.columns:
        # Existing values in the target win; the old column only fills NaN.
        df_22[new_name] = df_22[new_name].combine_first(df_22[old_name])
        merged_sources.append(old_name)
    else:
        plain_renames[old_name] = new_name

df_22 = df_22.drop(columns=merged_sources).rename(columns=plain_renames)

# Create a 'uid' column when missing (one distinct value per row)
if 'uid' not in df_22.columns:
    df_22['uid'] = range(len(df_22))

print("Colonne dopo mapping:")
print(df_22.columns.tolist())



⚠️ Mapping colonne creato ex-novo e salvato.


In [None]:
# ==========================================================
# BLOCCO 3 – Pulizia e aggregazione
# ==========================================================

# 1) Remove exact duplicate rows
# NOTE(review): if the frame carries a per-row counter (a synthesised 'uid', or
# an index column such as 'Unnamed: 0' seen in the sample output) no two rows
# are fully identical and this step removes nothing -- confirm this is intended.
df_22 = df_22.drop_duplicates()

# 2) Impute missing values
# Label columns are excluded: filling a missing label with the mean/mode would
# fabricate ground truth and would defeat the later dropna on the labels.
label_cols = [c for c in df_22.columns if str(c).startswith('label_')]
num_cols = [
    c for c in df_22.select_dtypes(include=['int64','float64']).columns
    if c not in label_cols
]
cat_cols = [
    c for c in df_22.select_dtypes(include=['object','category']).columns
    if c not in label_cols
]

# Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
# the in-place call on a column selection is chained assignment, which is
# deprecated and leaves the frame untouched under pandas Copy-on-Write.
for col in num_cols:
    df_22[col] = df_22[col].fillna(df_22[col].mean())

for col in cat_cols:
    col_mode = df_22[col].mode()  # computed once per column
    fill_value = col_mode[0] if not col_mode.empty else 'unknown'
    df_22[col] = df_22[col].fillna(fill_value)

# 3) Session-level aggregation keyed on uid
session_features = df_22.groupby('uid').agg(
    total_orig_bytes=('orig_bytes', 'sum'),
    total_resp_bytes=('resp_bytes', 'sum'),
    total_orig_pkts=('orig_pkts', 'sum'),
    total_resp_pkts=('resp_pkts', 'sum'),
    mean_duration=('duration', 'mean')
).reset_index()

# Attach the per-uid aggregates to every row of that uid.
# NOTE: running this cell twice merges the aggregates again and yields
# suffixed (_x/_y) duplicate columns; re-run from the loading cell instead.
df_22 = pd.merge(df_22, session_features, on='uid', how='left')
print(f"✅ Dataset aggregato: {df_22.shape}")


FileNotFoundError: [Errno 2] No such file or directory: 'model_data/imputer_stats_fall22.pkl'

In [None]:
# ==========================================================
# BLOCCO 4 – Preparazione label e feature
# ==========================================================

# Binary label: normalise bool / string / integer encodings to 0/1.
# Values that match none of the keys become NaN and their rows are dropped.
df_22['label_binary_clean'] = df_22['label_binary'].map({True:1, False:0, 'True':1, 'False':0, 1:1, 0:0})
df_22 = df_22.dropna(subset=['label_binary_clean'])
df_22['label_binary'] = df_22['label_binary_clean'].astype(int)
df_22 = df_22.drop(columns=['label_binary_clean'])

# Reduced multiclass label (same criterion as Fall22): every tactic outside
# main_classes, including a missing one, is mapped to 'Other'.
main_classes = ['Resource Development', 'Reconnaissance', 'Discovery']
df_22['label_tactic_reduced'] = df_22['label_tactic'].apply(lambda x: x if x in main_classes else 'Other')

# Feature dataset: remove every label column.
# 'label_tactic_reduced' must be removed as well: it was just added to df_22
# and would otherwise be frequency-encoded and fed to the autoencoder as an
# input feature, leaking the target into the embeddings.
# NOTE(review): this assumes the saved scaler/encoder were fitted without that
# column -- confirm against the Fall22 training notebook.
feature_data_22 = df_22.drop(
    columns=['label_binary', 'label_technique', 'label_tactic', 'label_tactic_reduced']
)



In [None]:
# ==========================================================
# BLOCCO 5 – Frequency encoding e scaling
# ==========================================================

from sklearn.preprocessing import MinMaxScaler
import joblib

# Frequency encoding: each category is replaced by its relative frequency
# computed on this dataset.
cat_features = feature_data_22.select_dtypes(include=['object', 'category']).columns.tolist()
encoded_data_22 = feature_data_22.copy()
for col in cat_features:
    freq = encoded_data_22[col].value_counts(normalize=True)
    # Cast to float so a 'category' column does not stay categorical after map.
    encoded_data_22[col] = encoded_data_22[col].map(freq).astype('float64')

# Scale with the scaler saved for the autoencoder (fitted on Fall22).
# NOTE: joblib.load unpickles arbitrary objects -- only load files you produced.
scaler_auto = joblib.load("model_data/scaler_auto.pkl")

# Make sure the scaler receives exactly the columns it was fitted on, in the
# same order. Without this, a differing column set raises an opaque error, and
# a scaler fitted on a bare array would silently scale the wrong columns.
expected_cols = getattr(scaler_auto, "feature_names_in_", None)
if expected_cols is not None:
    expected_cols = list(expected_cols)
    missing_cols = [c for c in expected_cols if c not in encoded_data_22.columns]
    if missing_cols:
        raise ValueError(f"Columns required by the scaler are missing: {missing_cols}")
    extra_cols = [c for c in encoded_data_22.columns if c not in expected_cols]
    if extra_cols:
        print(f"Columns not known to the scaler were dropped: {extra_cols}")
    encoded_data_22 = encoded_data_22[expected_cols]
else:
    # No stored feature names: the best available check is the feature count.
    n_expected = getattr(scaler_auto, "n_features_in_", None)
    if n_expected is not None and n_expected != encoded_data_22.shape[1]:
        raise ValueError(
            f"Scaler expects {n_expected} features, got {encoded_data_22.shape[1]}"
        )

X_autoencoder_22 = pd.DataFrame(
    scaler_auto.transform(encoded_data_22),
    columns=encoded_data_22.columns
)

print(f"✅ Dataset pronto per Autoencoder: {X_autoencoder_22.shape}")



In [None]:
# ==========================================================
# BLOCCO 6 – Generazione embeddings con encoder salvato
# ==========================================================

from tensorflow.keras.models import load_model
import pandas as pd

# Evaluation labels, re-indexed from 0 so they line up by position with the
# embeddings frame built below (which gets a default 0..n-1 index).
y_classifier_22 = df_22['label_tactic_reduced'].reset_index(drop=True)

# Restore the saved encoder and project the scaled features into latent space
encoder = load_model("model_data/encoder_best.keras")
X_latent_22 = encoder.predict(X_autoencoder_22)

# Wrap the latent array in a DataFrame with one 'latent_<k>' column per dimension
X_classifier_22 = pd.DataFrame(
    X_latent_22,
    columns=[f'latent_{i}' for i in range(X_latent_22.shape[1])],
)

print(f"✅ Embeddings generati: {X_classifier_22.shape}")
print("Classi presenti:", y_classifier_22.unique())
