In [1]:
from pathlib import Path
import json
import pandas as pd
import yaml

from src.utils.paths import load_paths
from src.utils.logging import setup_logger
from src.splits.make_split import make_vnat_capture_split, write_split_files
from src.splits.validate import validate_split_files

# Project path registry and logger shared by every later cell.
# NOTE(review): ensure_dirs() presumably creates the project directories - confirm in src.utils.paths.
paths = load_paths()
paths.ensure_dirs()
logger = setup_logger(level="INFO")

# The two inputs this notebook needs: the processed flow table and the split config.
flows_parquet = paths.data_processed / "vnat" / "flows.parquet"
splits_yaml = paths.configs_dir / "splits.yaml"

required_inputs = {"flows.parquet": flows_parquet, "splits.yaml": splits_yaml}

# Log every resolved location first, then fail fast on the first missing input.
for input_name, input_path in required_inputs.items():
    logger.info(f"{input_name}: {input_path}")

for input_path in required_inputs.values():
    assert input_path.exists(), f"Missing: {input_path}"

2026-02-12 21:18:58 | INFO | ai-vpn-firewall | flows.parquet: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\flows.parquet
2026-02-12 21:18:58 | INFO | ai-vpn-firewall | splits.yaml: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\configs\splits.yaml


In [2]:
# Read only the two columns needed to summarise captures.
df = pd.read_parquet(flows_parquet, columns=["capture_id", "label"])

# The per-capture label below is taken from the first flow of each capture, which is
# only meaningful if every flow of a capture carries the same label. Check that
# invariant explicitly instead of silently mislabeling a mixed capture.
labels_per_capture = df.groupby("capture_id")["label"].nunique()
mixed_label_captures = labels_per_capture[labels_per_capture > 1]
assert mixed_label_captures.empty, (
    f"Captures with more than one label: {mixed_label_captures.index.tolist()[:10]}"
)

# One row per capture: its label and how many flows it contributes.
cap = df.groupby("capture_id").agg(label=("label","first"), n_flows=("label","size")).reset_index()

logger.info(f"Unique captures: {cap['capture_id'].nunique()}")
logger.info("Captures by label:\n" + str(cap["label"].value_counts()))

# Largest captures first; kept as the last expression so the notebook renders the table.
cap.sort_values("n_flows", ascending=False).head(15)

2026-02-12 21:18:58 | INFO | ai-vpn-firewall | Unique captures: 165
2026-02-12 21:18:58 | INFO | ai-vpn-firewall | Captures by label:
label
0    83
1    82
Name: count, dtype: int64


Unnamed: 0,capture_id,label,n_flows
74,nonvpn_ssh_capture5.pcap,0,11368
10,nonvpn_scp_long_capture1.pcap,0,10555
72,nonvpn_ssh_capture3.pcap,0,1600
75,nonvpn_vimeo_capture1.pcap,0,1217
11,nonvpn_scp_newcapture1.pcap,0,1214
9,nonvpn_scp_capture1.pcap,0,1074
8,nonvpn_rsync_newcapture1.pcap,0,1013
7,nonvpn_rsync_capture1.pcap,0,898
15,nonvpn_sftp_newcapture1.pcap,0,709
16,nonvpn_sftp_newcapture2.pcap,0,648


In [3]:
# Build the split; the result is indexed by "train" / "val" / "test" below.
# NOTE(review): presumably each entry is a list of capture ids - confirm in src.splits.make_split.
splits = make_vnat_capture_split(flows_parquet, splits_yaml, repo_root=paths.repo_root)

# Report how many captures landed in each split.
for split_name in ("train", "val", "test"):
    logger.info(f"{split_name.capitalize()} captures: {len(splits[split_name])}")

# Write the split files and show the per-split statistics from the returned manifest.
manifest = write_split_files(splits, flows_parquet, splits_yaml, repo_root=paths.repo_root)
split_stats_json = json.dumps(manifest["split_stats"], indent=2)
print(split_stats_json)

2026-02-12 21:18:58 | INFO | ai-vpn-firewall | Train captures: 115
2026-02-12 21:18:58 | INFO | ai-vpn-firewall | Val captures: 24
2026-02-12 21:18:58 | INFO | ai-vpn-firewall | Test captures: 26
{
  "train": {
    "n_captures": 115,
    "n_flows": 25994,
    "captures_by_label": {
      "0": 59,
      "1": 56
    },
    "flows_by_label": {
      "0": 25825,
      "1": 169
    }
  },
  "val": {
    "n_captures": 24,
    "n_flows": 3581,
    "captures_by_label": {
      "0": 11,
      "1": 13
    },
    "flows_by_label": {
      "0": 3442,
      "1": 139
    }
  },
  "test": {
    "n_captures": 26,
    "n_flows": 4136,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "flows_by_label": {
      "0": 4065,
      "1": 71
    }
  }
}


In [4]:
# Flow counts stored under label key "1" (reported as VPN flows), one value per split.
vpn_flow_counts = [
    manifest["split_stats"][split_name]["flows_by_label"]["1"]
    for split_name in ("train", "val", "test")
]
print("VPN flows (train/val/test):", *vpn_flow_counts)

VPN flows (train/val/test): 169 139 71


In [5]:
# Flow count of each split, in train/val/test order.
flows_per_split = {
    split_name: manifest["split_stats"][split_name]["n_flows"]
    for split_name in ("train", "val", "test")
}
total = sum(flows_per_split.values())

# Share of all flows that each split received.
for k, n in flows_per_split.items():
    print(k, n, f"ratio={n/total:.3f}")

train 25994 ratio=0.771
val 3581 ratio=0.106
test 4136 ratio=0.123


In [6]:
# Capture-list files, one per split, all following the same naming pattern.
# NOTE(review): these names presumably match what write_split_files produced - confirm.
train_list, val_list, test_list = (
    paths.data_splits / f"vnat_{split_name}_captures.txt"
    for split_name in ("train", "val", "test")
)

# Validate the on-disk split files against the flow table and print the resulting stats.
stats = validate_split_files(
    flows_parquet, train_list, val_list, test_list, splits_yaml=splits_yaml
)
stats_json = json.dumps(stats, indent=2)
print(stats_json)

{
  "train": {
    "n_captures": 115,
    "n_flows": 25994,
    "captures_by_label": {
      "0": 59,
      "1": 56
    },
    "flows_by_label": {
      "0": 25825,
      "1": 169
    }
  },
  "val": {
    "n_captures": 24,
    "n_flows": 3581,
    "captures_by_label": {
      "0": 11,
      "1": 13
    },
    "flows_by_label": {
      "0": 3442,
      "1": 139
    }
  },
  "test": {
    "n_captures": 26,
    "n_flows": 4136,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "flows_by_label": {
      "0": 4065,
      "1": 71
    }
  }
}


In [7]:
# Lookup table: capture id -> number of flows in that capture.
flows_by_capture = cap.set_index("capture_id")["n_flows"]
cap_map = flows_by_capture.to_dict()

def top_caps(caps, k=10, flow_counts=None):
    """Return the ``k`` captures with the most flows, largest first.

    Args:
        caps: Iterable of capture ids to rank.
        k: Maximum number of ``(capture_id, n_flows)`` pairs to return.
        flow_counts: Optional mapping of capture id -> flow count. When omitted,
            the notebook-level ``cap_map`` is used, so existing calls behave
            exactly as before.

    Returns:
        A list of ``(capture_id, n_flows)`` tuples sorted by flow count in
        descending order. Captures absent from the mapping count as 0 flows,
        and captures with equal counts keep their input order.
    """
    counts = cap_map if flow_counts is None else flow_counts
    # sorted() is stable even with reverse=True, matching list.sort() semantics.
    ranked = sorted(
        ((capture, counts.get(capture, 0)) for capture in caps),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]

# Show the ten largest captures of every split.
for split_name in ("train", "val", "test"):
    print(f"Top {split_name} captures:", top_caps(splits[split_name], 10))

Top train captures: [('nonvpn_ssh_capture5.pcap', 11368), ('nonvpn_scp_long_capture1.pcap', 10555), ('nonvpn_ssh_capture3.pcap', 1600), ('nonvpn_sftp_newcapture2.pcap', 648), ('nonvpn_ssh_capture1.pcap', 298), ('nonvpn_ssh_capture2.pcap', 175), ('nonvpn_netflix_capture1.pcap', 140), ('nonvpn_ssh_capture4.pcap', 117), ('vpn_voip_capture3.pcap', 114), ('nonvpn_sftp_capture2.pcap', 31)]
Top val captures: [('nonvpn_scp_newcapture1.pcap', 1214), ('nonvpn_rsync_newcapture1.pcap', 1013), ('nonvpn_sftp_newcapture1.pcap', 709), ('vpn_voip_capture1.pcap', 126), ('nonvpn_youtube_capture3.pcap', 124), ('nonvpn_voip_capture2.pcap', 108), ('nonvpn_netflix_capture2.pcap', 64), ('nonvpn_sftp_capture1.pcap', 62), ('nonvpn_skype-chat_capture47.pcap', 48), ('nonvpn_skype-chat_capture12.pcap', 38)]
Top test captures: [('nonvpn_vimeo_capture1.pcap', 1217), ('nonvpn_scp_capture1.pcap', 1074), ('nonvpn_rsync_capture1.pcap', 898), ('nonvpn_sftp_capture3.pcap', 216), ('nonvpn_youtube_capture1.pcap', 136), ('no