In [10]:
import os
import pandas as pd
from tqdm import tqdm

# Source folder with the raw UWF-ZeekData22 files, and the destination folder
# for the cleaned per-file outputs (created on first run if it is missing).
folder_data22 = r"C:\Users\maria\Desktop\Zeek_ML\UWF-ZeekData22"
output_folder = r"C:\Users\maria\Desktop\Zeek_ML\processed_zeekdata22\ZeekData22_chunks"
os.makedirs(output_folder, exist_ok=True)

# Only Parquet and CSV files are picked up; anything else in the folder is ignored.
all_files = [
    name
    for name in os.listdir(folder_data22)
    if name.endswith((".parquet", ".csv"))
]

# Session-level aggregates computed per 'uid':
# output column name -> (source column, aggregation function).
SESSION_AGGREGATIONS = {
    'total_orig_bytes': ('orig_bytes', 'sum'),
    'total_resp_bytes': ('resp_bytes', 'sum'),
    'total_orig_pkts': ('orig_pkts', 'sum'),
    'total_resp_pkts': ('resp_pkts', 'sum'),
    'mean_duration': ('duration', 'mean'),
}

# Clean every input file independently and write one cleaned Parquet file per input.
for i, f in enumerate(tqdm(all_files, desc="Processing files")):
    file_path = os.path.join(folder_data22, f)

    # Load the file (all_files only contains .parquet and .csv names).
    if f.endswith(".parquet"):
        df = pd.read_parquet(file_path)
    else:
        df = pd.read_csv(file_path)

    # =================
    # Remove duplicate rows
    # =================
    df = df.drop_duplicates()

    # =================
    # Impute missing values
    # =================
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary selection, which pandas 2.x warns about
    # and which leaves `df` untouched once Copy-on-Write is active.
    for col in num_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    for col in cat_cols:
        if df[col].isna().any():
            modes = df[col].mode()  # computed once; empty when the column is all-NaN
            if modes.empty:
                fill_value = 'unknown'
                # A categorical column only accepts declared categories, so the
                # placeholder has to be registered before it can be used as a fill.
                if (isinstance(df[col].dtype, pd.CategoricalDtype)
                        and fill_value not in df[col].cat.categories):
                    df[col] = df[col].cat.add_categories([fill_value])
            else:
                fill_value = modes[0]
            df[col] = df[col].fillna(fill_value)

    # =================
    # Session-level aggregations when 'uid' is present
    # =================
    if 'uid' in df.columns:
        # Restrict to aggregates whose source column exists in this file, so one
        # file with a different schema does not abort the whole batch with KeyError.
        available = {
            out_col: spec
            for out_col, spec in SESSION_AGGREGATIONS.items()
            if spec[0] in df.columns
        }
        skipped = sorted(set(SESSION_AGGREGATIONS) - set(available))
        if skipped:
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"{f}: skipping session aggregates (source column missing): {skipped}")
        if available:
            session_features = df.groupby('uid').agg(**available).reset_index()
            df = pd.merge(df, session_features, on='uid', how='left')
    else:
        # No session identifier in this file: use the row position as a synthetic uid.
        df['uid'] = range(len(df))

    # =================
    # Save the cleaned file
    # =================
    # Swap only the real extension; str.replace('.csv', ...) would also rewrite a
    # '.csv' that happens to appear in the middle of the file name.
    stem, _ = os.path.splitext(f)
    df.to_parquet(os.path.join(output_folder, f"cleaned_{stem}.parquet"), index=False)


Processing files: 100%|██████████| 14/14 [02:02<00:00,  8.75s/it]
