Preparazione pacchetti

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# === 1. Dataset folder ===
# Defaults to the original local path. Set the ZEEK_DATASET_DIR environment
# variable to point the notebook at another location without editing this cell.
DATASET_DIR = os.environ.get(
    "ZEEK_DATASET_DIR", r"C:\Users\maria\Desktop\zeek-ml\dataset"
)

# === 2. Load every CSV and Parquet file found directly in the folder ===
all_dfs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame identical from run to run and across platforms.
for filename in sorted(os.listdir(DATASET_DIR)):
    file_path = os.path.join(DATASET_DIR, filename)
    # Match extensions case-insensitively so files such as "X.CSV" are not skipped.
    lowered_name = filename.lower()

    if lowered_name.endswith(".csv"):
        print(f"[INFO] Caricamento CSV: {filename}")
        df = pd.read_csv(file_path)
        all_dfs.append(df)

    elif lowered_name.endswith(".parquet"):
        print(f"[INFO] Caricamento PARQUET: {filename}")
        df = pd.read_parquet(file_path)
        all_dfs.append(df)

# Fail early with an explicit message instead of letting pd.concat raise on an empty list.
if not all_dfs:
    raise FileNotFoundError(f"Nessun file CSV o Parquet trovato in {DATASET_DIR}")

# === 3. Concatenate everything into a single DataFrame ===
# Columns missing from a given file are filled with NaN by pd.concat.
data = pd.concat(all_dfs, ignore_index=True)
print(f"[INFO] Dataset totale: {data.shape[0]} righe, {data.shape[1]} colonne")
print("[INFO] Colonne disponibili:", data.columns.tolist())


[INFO] Caricamento PARQUET: part-00000-1da06990-329c-4e38-913a-0f0aa39b388d-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-23fdcfa3-9dd3-4c72-886c-e945bfcf92e1-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-26e9208e-7819-451b-b23f-2e47f6d1e834-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-2b76f9cc-0710-45e4-9e33-98ad5808ee79-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-36240b61-b84f-4164-a873-d7973e652780-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-3f86626a-1225-47f9-a5a2-0170b737e404-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-745e350a-da9e-4619-bd52-8cc23bb41ad5-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-7c2e9adb-5430-4792-a42b-10ff5bbd46e8-c000.snappy.parquet
[INFO] Caricamento CSV: part-00000-8c53ceaf-1fd1-4711-aa7d-26d0c5323dab-c000.csv
[INFO] Caricamento PARQUET: part-00000-94d13437-ae00-4a8c-9f38-edd0196cfdee-c000.snappy.parquet
[INFO] Caricamento PARQUET: part-00000-9a46dd05-4b06-4a

In [14]:
import pandas as pd

# Display settings: never truncate the column list and allow wide console output.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

# === Null count and null percentage for every column ===
null_counts = data.isna().sum()
null_percent = null_counts.div(len(data)).mul(100)
null_table = pd.DataFrame({'null_count': null_counts, 'null_percentage': null_percent})
# Columns with the most missing values come first.
null_table = null_table.sort_values(by='null_count', ascending=False)

print("\n[INFO] Numero e percentuale di valori nulli per colonna:")
print(null_table)

# === One-hot encode label_tactic and label_technique ===
tactic_dummies = pd.get_dummies(data['label_tactic'], prefix='tactic')
technique_dummies = pd.get_dummies(data['label_technique'], prefix='technique')

# Append the indicator columns to the right of the original columns.
data_expanded = pd.concat([data, tactic_dummies, technique_dummies], axis=1)

# Preview only the newly created indicator columns.
expanded_cols = [*tactic_dummies.columns, *technique_dummies.columns]
print("\n[INFO] Prime righe con etichette espanse:")
print(data_expanded.loc[:, expanded_cols].head(5))

# === Export the first 30 rows to an Excel file ===
data_expanded.iloc[:30].to_excel("dataset_preview.xlsx", index=False)
print("\n[INFO] File Excel 'dataset_preview.xlsx' creato con i primi 30 elementi (etichette espanse incluse).")



[INFO] Numero e percentuale di valori nulli per colonna:
                         null_count  null_percentage
2022-01-09 - 2022-01-16    19262873        99.999969
2022-01-16 - 2022-01-23    19262873        99.999969
2022-02-06 - 2022-02-13    19262872        99.999964
2022-02-13 - 2022-02-20    19262865        99.999927
2022-10-09 - 2022-10-16    19262859        99.999896
2022-09-04 - 2022-09-11    19262859        99.999896
2022-09-11 - 2022-09-18    19262859        99.999896
2022-09-18 - 2022-09-25    19262856        99.999881
2022-10-02 - 2022-10-09    19262856        99.999881
2021-12-12 - 2021-12-19    19262855        99.999875
2021-12-26 - 2022-01-02    19262855        99.999875
2022-10-23 - 2022-10-30    19262855        99.999875
2021-12-19 - 2021-12-26    19262855        99.999875
2022-01-02 - 2022-01-09    19262855        99.999875
2022-08-28 - 2022-09-04    19262854        99.999870
2022-10-16 - 2022-10-23    19262848        99.999839
2022-09-25 - 2022-10-02    19262843      

Preprocessing

In [15]:
import pandas as pd

# === Show every column without truncation ===
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

# Columns whose share of nulls (in percent) exceeds this value are dropped.
NULL_DROP_THRESHOLD_PCT = 99
# Label columns are read by the later steps of this cell, so they must survive
# the high-null column drop even when they are themselves mostly empty.
LABEL_COLS = ['label_tactic', 'label_technique']

# === Null counts and percentages per column ===
null_counts = data.isna().sum()
null_percentages = (null_counts / len(data)) * 100
null_summary = pd.DataFrame({"null_count": null_counts, "null_percentage": null_percentages})
print("\n[INFO] Numero e percentuale di valori nulli per colonna:")
print(null_summary.sort_values(by="null_percentage", ascending=False))

# === 1. Drop columns that are almost entirely NaN ===
high_null_cols = null_summary[null_summary["null_percentage"] > NULL_DROP_THRESHOLD_PCT].index
# Never drop the label columns: step 2 and step 4 would raise KeyError without them.
high_null_cols = high_null_cols[~high_null_cols.isin(LABEL_COLS)]
data = data.drop(columns=high_null_cols)
print(f"\n[INFO] Colonne rimosse (>99% NaN): {list(high_null_cols)}")
print(f"[INFO] Dataset dopo rimozione colonne: {data.shape}")

# === 2. Keep only rows that carry a MITRE tactic label ===
# .copy() detaches the filtered frame from the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
data = data[data['label_tactic'].notna()].copy()
print(f"[INFO] Dataset filtrato: {data.shape[0]} righe rimaste dopo rimozione NaN in label_tactic")

# === 3. Fill missing values ===
# Categorical columns: use an explicit 'unknown' category.
for categorical_col in ('service', 'history'):
    if categorical_col in data.columns:
        data[categorical_col] = data[categorical_col].fillna('unknown')

# Numeric columns: select every numeric dtype (not only float64/int64) so that
# float32 or nullable numeric columns are filled too. Timedelta columns are
# excluded explicitly because NumPy classifies timedelta64 as a number subtype.
numeric_cols = data.select_dtypes(include='number', exclude='timedelta').columns
data[numeric_cols] = data[numeric_cols].fillna(0)

remaining_nulls = data.isna().sum()
print(f"[INFO] Riempimento completato. Nessun NaN residuo: {remaining_nulls.sum() == 0}")
if remaining_nulls.any():
    # Name the columns that still contain NaN so the flag above is actionable.
    print("[INFO] Colonne con NaN residui:", remaining_nulls[remaining_nulls > 0].to_dict())

# === 4. One-hot encode tactics and techniques ===
# Rows whose label_technique is NaN get all-zero technique_* indicators.
tactic_dummies = pd.get_dummies(data['label_tactic'], prefix='tactic')
technique_dummies = pd.get_dummies(data['label_technique'], prefix='technique')

# The dummy frames share data's index, so the column-wise concat aligns row by row.
data_expanded = pd.concat([data, tactic_dummies, technique_dummies], axis=1)

print(f"[INFO] Dataset finale pronto per analisi e training. Shape: {data_expanded.shape}")

# === 5. Save the first 30 cleaned records to Excel ===
data_expanded.head(30).to_excel("dataset_pulito_preview.xlsx", index=False)
print("\n[INFO] File Excel 'dataset_pulito_preview.xlsx' creato con i primi 30 elementi (etichette espanse incluse).")



[INFO] Numero e percentuale di valori nulli per colonna:
                         null_count  null_percentage
2022-01-09 - 2022-01-16    19262873        99.999969
2022-01-16 - 2022-01-23    19262873        99.999969
2022-02-06 - 2022-02-13    19262872        99.999964
2022-02-13 - 2022-02-20    19262865        99.999927
2022-10-09 - 2022-10-16    19262859        99.999896
2022-09-04 - 2022-09-11    19262859        99.999896
2022-09-11 - 2022-09-18    19262859        99.999896
2022-09-18 - 2022-09-25    19262856        99.999881
2022-10-02 - 2022-10-09    19262856        99.999881
2021-12-12 - 2021-12-19    19262855        99.999875
2021-12-26 - 2022-01-02    19262855        99.999875
2022-10-23 - 2022-10-30    19262855        99.999875
2021-12-19 - 2021-12-26    19262855        99.999875
2022-01-02 - 2022-01-09    19262855        99.999875
2022-08-28 - 2022-09-04    19262854        99.999870
2022-10-16 - 2022-10-23    19262848        99.999839
2022-09-25 - 2022-10-02    19262843      

Suddivisione del dataset in train e test

In [None]:
# === 7. Train/test split ===
# NOTE(review): X and y are not defined in any cell visible in this notebook;
# presumably an earlier feature/target preparation step creates them — confirm
# before running on a fresh kernel.
# 80/20 split, stratified on y, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"[INFO] Train set: {X_train.shape}, Test set: {X_test.shape}")

# Optional: persist the pre-processed splits so another script can reuse them.
for _split, _csv_path in (
    (X_train, "X_train.csv"),
    (X_test, "X_test.csv"),
    (y_train, "y_train.csv"),
    (y_test, "y_test.csv"),
):
    _split.to_csv(_csv_path, index=False)

print("[INFO] Preprocessing completato ✅")