In [1]:
from pathlib import Path
import json
import hashlib

import numpy as np
import pandas as pd
import yaml

from src.utils.paths import load_paths
from src.utils.logging import setup_logger

from src.splits.make_split import make_vnat_capture_split, write_split_files
from src.splits.validate import validate_split_files
from src.splits.io import load_splits

from src.features.extract import load_feature_config, extract_features_from_flows, feature_config_hash_text
from src.pipeline.feature_pipeline import FeaturePipeline
from src.pipeline.artifacts import default_feature_artifacts

# Resolve the project path registry, run its directory setup, and configure the shared logger.
paths = load_paths()
paths.ensure_dirs()
logger = setup_logger(level="INFO")

# Log each key location once so the cell output records where this run reads and writes.
# Attributes are looked up lazily, one per iteration, in the same order as the log lines.
for _label, _attr in (
    ("Repo root", "repo_root"),
    ("Processed dir", "data_processed"),
    ("Splits dir", "data_splits"),
    ("Configs dir", "configs_dir"),
):
    logger.info(f"{_label}: {getattr(paths, _attr)}")

2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Repo root: C:\Users\scoti\PycharmProjects\ai-vpn-firewall
2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Processed dir: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed
2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Splits dir: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\splits
2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Configs dir: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\configs


In [2]:
# Everything VNAT-specific (the input flow table and this notebook's outputs) lives in one folder.
vnat_dir = paths.data_processed / "vnat"

# Inputs: the flow table plus the two YAML configs.
flows_parquet = vnat_dir / "flows.parquet"
splits_yaml = paths.configs_dir / "splits.yaml"
features_yaml = paths.configs_dir / "features.yaml"

# Capture-level split lists (one text file per split).
train_list = paths.data_splits / "vnat_train_captures.txt"
val_list = paths.data_splits / "vnat_val_captures.txt"
test_list = paths.data_splits / "vnat_test_captures.txt"

# Outputs: the feature table and its manifest are written next to the flow table.
out_dir = vnat_dir
out_dir.mkdir(parents=True, exist_ok=True)

features_out = out_dir / "features.parquet"
features_manifest_out = out_dir / "features_manifest.json"

# Stop immediately if a core input is absent; the assertion message is the offending path.
for _required in (flows_parquet, splits_yaml, features_yaml):
    assert _required.exists(), _required

logger.info("Core inputs OK: flows.parquet, splits.yaml, features.yaml")

2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Core inputs OK: flows.parquet, splits.yaml, features.yaml


In [3]:
def _exists_all_split_lists() -> bool:
    """Return True only when the train, val and test capture-list files all exist on disk."""
    return all(split_file.exists() for split_file in (train_list, val_list, test_list))

# Rebuild the capture lists only when at least one of the three files is absent.
if not _exists_all_split_lists():
    logger.warning("Split lists are missing. Recreating them now from flows.parquet + splits.yaml ...")

    # Both helpers take the same three locations, so spell them out once.
    split_inputs = {
        "flows_parquet": flows_parquet,
        "splits_yaml": splits_yaml,
        "repo_root": paths.repo_root,
    }
    splits = make_vnat_capture_split(**split_inputs)
    manifest = write_split_files(splits=splits, **split_inputs)

    logger.info("Recreated split lists and wrote split manifest.")
    split_stats_text = json.dumps(manifest["split_stats"], indent=2)
    logger.info(split_stats_text)

# Whether the files pre-existed or were just regenerated, they must be present from here on.
assert _exists_all_split_lists(), "Split lists still missing after regeneration."
logger.info("Split list files exist.")

2026-02-12 19:50:53 | INFO | ai-vpn-firewall | Split list files exist.


In [4]:
# Check the three split lists against the flow table and splits.yaml.
# The returned statistics are printed below as JSON.
split_stats = validate_split_files(
    flows_parquet=flows_parquet,
    splits_yaml=splits_yaml,
    train_list=train_list,
    val_list=val_list,
    test_list=test_list,
)

logger.info("Split validation passed.")
print(json.dumps(split_stats, indent=2))

# Keep each split's capture ids as a set of strings for membership tests in later cells.
splits = load_splits(train_list, val_list, test_list)
s_train = {str(capture) for capture in splits["train"]}
s_val = {str(capture) for capture in splits["val"]}
s_test = {str(capture) for capture in splits["test"]}

2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Split validation passed.
{
  "train": {
    "n_captures": 115,
    "n_flows": 25994,
    "captures_by_label": {
      "0": 59,
      "1": 56
    },
    "flows_by_label": {
      "0": 25825,
      "1": 169
    }
  },
  "val": {
    "n_captures": 24,
    "n_flows": 3581,
    "captures_by_label": {
      "0": 11,
      "1": 13
    },
    "flows_by_label": {
      "0": 3442,
      "1": 139
    }
  },
  "test": {
    "n_captures": 26,
    "n_flows": 4136,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "flows_by_label": {
      "0": 4065,
      "1": 71
    }
  }
}


In [5]:
# Load the feature-extraction settings and log the values read from features.yaml.
cfg = load_feature_config(features_yaml)
logger.info(f"Loaded feature config: N={cfg.N}, min_packets={cfg.min_packets}, eps={cfg.eps}")

# The bin settings are reported by their number of edges.
n_size_edges = len(cfg.size_bins)
n_iat_edges = len(cfg.iat_bins)
logger.info(f"Histogram bins: size_bins={n_size_edges} edges, iat_bins={n_iat_edges} edges")

2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Loaded feature config: N=100, min_packets=10, eps=1e-06
2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Histogram bins: size_bins=8 edges, iat_bins=13 edges


In [6]:
flows = pd.read_parquet(flows_parquet)

# Columns flows.parquet must provide: anything missing is an error, extras only produce a warning.
EXPECTED_COLS = {
    # identifiers
    "capture_id",
    "capture_name",
    "row_id",
    "flow_id",
    "flow_key",
    "connection_str",
    # per-packet sequences
    "timestamps",
    "sizes",
    "directions",
    # provenance and labels
    "file_names",
    "app",
    "label",
    # packet-count / quality flags
    "packet_count",
    "packet_count_full",
    "window_complete",
    "min_packets_ok",
}

present_cols = set(flows.columns)
missing = EXPECTED_COLS.difference(present_cols)
extra = present_cols.difference(EXPECTED_COLS)

if missing:
    raise ValueError(f"flows.parquet missing required columns: {sorted(missing)}")
if extra:
    logger.warning(f"flows.parquet has extra columns (ok): {sorted(extra)}")

# Integrity checks: unique flow ids, no missing capture ids, binary labels.
assert flows["flow_id"].is_unique, "flow_id must be unique."
assert not flows["capture_id"].isna().any(), "capture_id contains NaN."
assert flows["label"].isin([0, 1]).all(), "label must be 0/1."

# Cast to str only after the NaN check above, so a missing id cannot be hidden as the text "nan".
flows["capture_id"] = flows["capture_id"].astype(str)

def _split_of(cid: str) -> str:
    """Return the name of the split whose capture-id set contains `cid`.

    The sets are checked in train, val, test order and the first match wins.
    If no set contains `cid`, the sentinel "unknown" is returned.
    """
    for split_name, members in (("train", s_train), ("val", s_val), ("test", s_test)):
        if cid in members:
            return split_name
    return "unknown"

# Tag every flow with the split of its capture.
flows["split"] = flows["capture_id"].map(_split_of)

# A flow whose capture appears in no split list is an error.
n_unknown = int(flows["split"].eq("unknown").sum())
if n_unknown > 0:
    raise ValueError(f"{n_unknown} flows belong to capture_ids not found in split lists.")

logger.info(f"Loaded flows: shape={flows.shape}")

label_counts = flows["label"].value_counts()
logger.info(f"Label counts:\n{label_counts}")

# Mean of the min_packets_ok flag, reported as a percentage.
min_packets_ok_pct = 100 * flows["min_packets_ok"].mean()
logger.info(f"min_packets_ok rate: {min_packets_ok_pct:.2f}%")

2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Loaded flows: shape=(33711, 17)
2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Label counts:
label
0    33332
1      379
Name: count, dtype: int64
2026-02-12 19:50:54 | INFO | ai-vpn-firewall | min_packets_ok rate: 10.75%


In [7]:
# Announce the step first: the captured log timestamps show extraction taking noticeably
# longer than the other cells.
logger.info("Extracting features from flows... (this can take a bit)")

features = extract_features_from_flows(cfg=cfg, flows=flows)

logger.info(f"Extracted features: shape={features.shape}")

# Preview the first rows as the cell's displayed output.
features.head()

2026-02-12 19:50:54 | INFO | ai-vpn-firewall | Extracting features from flows... (this can take a bit)
2026-02-12 19:51:23 | INFO | ai-vpn-firewall | Extracted features: shape=(33711, 91)


Unnamed: 0,flow_id,capture_id,label,f_duration_s,f_total_pkts,f_up_pkts,f_down_pkts,f_total_bytes,f_up_bytes,f_down_bytes,...,h_iat_all_05,h_iat_all_06,h_iat_all_07,h_iat_all_08,h_iat_all_09,h_iat_all_10,h_iat_all_11,q_packet_count,q_window_complete,q_min_packets_ok
0,vpn_youtube_capture2.pcap::0,vpn_youtube_capture2.pcap,1,10.798472,100.0,39.0,61.0,67536.0,7592.0,59944.0,...,0.010101,0.020202,0.020202,0.0,0.020202,0.010101,0.010101,100.0,1.0,1.0
1,nonvpn_sftp_newcapture1.pcap::1,nonvpn_sftp_newcapture1.pcap,0,0.000383,2.0,1.0,1.0,142.0,63.0,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
2,nonvpn_sftp_newcapture1.pcap::2,nonvpn_sftp_newcapture1.pcap,0,0.000353,2.0,1.0,1.0,126.0,63.0,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,nonvpn_sftp_newcapture1.pcap::3,nonvpn_sftp_newcapture1.pcap,0,0.328656,100.0,57.0,43.0,51166.0,44933.0,6233.0,...,0.020202,0.030303,0.0,0.010101,0.0,0.0,0.0,100.0,1.0,1.0
4,nonvpn_sftp_newcapture1.pcap::4,nonvpn_sftp_newcapture1.pcap,0,0.000414,2.0,1.0,1.0,153.0,51.0,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [8]:
cfg_hash = feature_config_hash_text(features_yaml)

# Rows used for fitting: TRAIN flows that pass min_packets_ok, linked to the feature table by flow_id.
fit_mask = flows["split"].eq("train") & flows["min_packets_ok"].eq(True)
train_flow_ids = set(flows.loc[fit_mask, "flow_id"].astype(str))

in_fit_set = features["flow_id"].astype(str).isin(train_flow_ids)
feats_train = features.loc[in_fit_set].copy()

if not len(feats_train):
    raise ValueError("No TRAIN flows with min_packets_ok==True. Check features.yaml min_packets and your data.")

logger.info(f"Fitting scaler on TRAIN(min_packets_ok=True): {len(feats_train)} flows")
logger.info(f"Label counts (train scaler fit):\n{feats_train['label'].value_counts()}")

# Fit on the training subset only, then transform every row (all splits).
pipe = FeaturePipeline().fit(feats_train)
features_scaled = pipe.transform(features)
logger.info(f"Scaled features: shape={features_scaled.shape}")

# Persist the transformed table without the DataFrame index.
features_scaled.to_parquet(features_out, index=False)
logger.info(f"Saved SCALED features: {features_out}")

# Save the fitted pipeline's artifacts together with the feature-config hash.
art = default_feature_artifacts(paths.artifacts_features)
pipe.save(art, feature_config_hash=cfg_hash)
logger.info(f"Saved feature artifacts under: {paths.artifacts_features}")

2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Fitting scaler on TRAIN(min_packets_ok=True): 1705 flows
2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Label counts (train scaler fit):
label
0    1649
1      56
Name: count, dtype: int64
2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Scaled features: shape=(33711, 91)
2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Saved SCALED features: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\features.parquet
2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Saved feature artifacts under: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\artifacts\features


In [9]:
def sha256_file(p: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at `p`.

    The file is read in 1 MiB blocks so it is never held in memory as a whole.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with p.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Paths and content hashes of the three capture-list files.
split_lists_section = {
    "train_list": str(train_list.resolve()),
    "val_list": str(val_list.resolve()),
    "test_list": str(test_list.resolve()),
    "train_list_sha256": sha256_file(train_list),
    "val_list_sha256": sha256_file(val_list),
    "test_list_sha256": sha256_file(test_list),
}

# The feature table written by this notebook.
output_section = {
    "features_parquet": str(features_out.resolve()),
    "features_sha256": sha256_file(features_out),
    "scaled": True,
}

# Files saved alongside the fitted feature pipeline.
artifacts_section = {
    "dir": str(paths.artifacts_features.resolve()),
    "feature_columns_json": str(art.feature_columns_json.resolve()),
    "scaler_pkl": str(art.scaler_pkl.resolve()),
    "feature_config_hash_txt": str(art.feature_config_hash_txt.resolve()),
    "feature_columns_sha256": sha256_file(art.feature_columns_json),
    "scaler_sha256": sha256_file(art.scaler_pkl),
    "feature_config_hash_txt_sha256": sha256_file(art.feature_config_hash_txt),
}

# Rows whose q_min_packets_ok flag equals 1.0; used for both the row count and the label counts.
trainable_mask = features_scaled["q_min_packets_ok"] == 1.0

rows_section = {
    "flows": int(len(flows)),
    "features": int(len(features_scaled)),
    "trainable_min_packets_ok": int(trainable_mask.sum()),
}

label_counts_section = {
    "all": features_scaled["label"].value_counts().to_dict(),
    "trainable": features_scaled.loc[trainable_mask, "label"].value_counts().to_dict(),
}

schema_section = {
    "n_columns": int(features_scaled.shape[1]),
    "columns": list(features_scaled.columns),
    "dtypes": {column: str(dtype) for column, dtype in features_scaled.dtypes.items()},
}

# Assemble the manifest; key order here is the key order of the JSON file.
manifest = {
    "dataset": "vnat",
    "flows_parquet": str(flows_parquet.resolve()),
    "flows_sha256": sha256_file(flows_parquet),
    "features_yaml": str(features_yaml.resolve()),
    "features_yaml_sha256": sha256_file(features_yaml),
    "feature_config_hash": cfg_hash,
    "splits_yaml": str(splits_yaml.resolve()),
    "splits_yaml_sha256": sha256_file(splits_yaml),
    "split_lists": split_lists_section,
    "output": output_section,
    "artifacts": artifacts_section,
    "rows": rows_section,
    "label_counts": label_counts_section,
    "schema": schema_section,
}

manifest_json = json.dumps(manifest, indent=2)
features_manifest_out.write_text(manifest_json, encoding="utf-8")
logger.info(f"Saved features manifest: {features_manifest_out}")

# Short summary shown as cell output.
manifest_summary = {
    "features_rows": manifest["rows"]["features"],
    "trainable_rows": manifest["rows"]["trainable_min_packets_ok"],
    "features_sha256": manifest["output"]["features_sha256"],
}
print(json.dumps(manifest_summary, indent=2))

2026-02-12 19:51:24 | INFO | ai-vpn-firewall | Saved features manifest: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\features_manifest.json
{
  "features_rows": 33711,
  "trainable_rows": 3624,
  "features_sha256": "adc626fc8f32af949c74cca003c2d3ad76c8e92a7415f135dc3cbc7661bb5bdd"
}


In [10]:
# Round-trip check: re-read the parquet written earlier and compare it with the in-memory frame.
reloaded = pd.read_parquet(features_out)

assert len(reloaded) == len(features_scaled), (
    f"Row count changed on reload: {len(reloaded)} != {len(features_scaled)}"
)

# Compare columns as ordered lists: a set comparison would not notice reordered columns.
assert list(reloaded.columns) == list(features_scaled.columns), (
    "Column names or column order changed on reload."
)

# Content spot-check: the flow_id column must come back with the same values in the same order.
# Values are compared as plain strings so a storage-level dtype difference cannot cause a false failure.
assert (
    reloaded["flow_id"].astype(str).tolist() == features_scaled["flow_id"].astype(str).tolist()
), "flow_id values or row order changed on reload."

logger.info("Reload check passed. features.parquet (scaled) is consistent.")

# Preview the first reloaded rows as the cell's displayed output.
reloaded.head()

2026-02-12 19:51:25 | INFO | ai-vpn-firewall | Reload check passed. features.parquet (scaled) is consistent.


Unnamed: 0,flow_id,capture_id,label,f_duration_s,f_total_pkts,f_up_pkts,f_down_pkts,f_total_bytes,f_up_bytes,f_down_bytes,...,h_iat_all_05,h_iat_all_06,h_iat_all_07,h_iat_all_08,h_iat_all_09,h_iat_all_10,h_iat_all_11,q_packet_count,q_window_complete,q_min_packets_ok
0,vpn_youtube_capture2.pcap::0,vpn_youtube_capture2.pcap,1,-0.213618,1.039777,0.351779,1.542283,1.708268,-0.173571,2.200641,...,-0.560608,-0.852035,0.262198,-0.927833,0.876797,0.234349,-0.761668,1.039777,1.0,1.0
1,nonvpn_sftp_newcapture1.pcap::1,nonvpn_sftp_newcapture1.pcap,0,-0.226556,-1.785572,-1.77036,-1.566472,-0.931634,-0.626785,-0.632885,...,-0.991914,-1.267974,-0.293283,-0.927833,-0.318378,-0.437921,-0.889393,-1.785572,0.0,0.0
2,nonvpn_sftp_newcapture1.pcap::2,nonvpn_sftp_newcapture1.pcap,0,-0.226556,-1.785572,-1.77036,-1.566472,-0.932261,-0.626785,-0.633643,...,-0.991914,-1.267974,-0.293283,-0.927833,-0.318378,-0.437921,-0.889393,-1.785572,0.0,0.0
3,nonvpn_sftp_newcapture1.pcap::3,nonvpn_sftp_newcapture1.pcap,0,-0.226162,1.039777,1.357003,0.609656,1.067036,2.074196,-0.341604,...,-0.129303,-0.644066,-0.293283,0.215202,-0.318378,-0.437921,-0.889393,1.039777,1.0,1.0
4,nonvpn_sftp_newcapture1.pcap::4,nonvpn_sftp_newcapture1.pcap,0,-0.226556,-1.785572,-1.77036,-1.566472,-0.931203,-0.627507,-0.631797,...,-0.991914,-1.267974,-0.293283,-0.927833,-0.318378,-0.437921,-0.889393,-1.785572,0.0,0.0
