In [1]:
from pathlib import Path
import json
import pandas as pd
import yaml

from src.utils.paths import load_paths
from src.utils.logging import setup_logger
from src.splits.make_split import make_vnat_capture_split, write_split_files
from src.splits.validate import validate_split_files

# Resolve the project path object, create its directories, and set up logging.
paths = load_paths()
paths.ensure_dirs()
logger = setup_logger(level="INFO")

# The two inputs this notebook needs: the VNAT flows table and the split config.
flows_parquet = paths.data_processed / "vnat" / "flows.parquet"
splits_yaml = paths.configs_dir / "splits.yaml"

required_inputs = {
    "flows.parquet": flows_parquet,
    "splits.yaml": splits_yaml,
}

# Log every resolved location first, so both paths are visible even if a
# check below fails.
for input_name, input_path in required_inputs.items():
    logger.info(f"{input_name}: {input_path}")

# Fail fast when an input file is absent.
for input_path in required_inputs.values():
    assert input_path.exists(), f"Missing: {input_path}"

2026-02-12 23:47:36 | INFO | ai-vpn-firewall | flows.parquet: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\flows.parquet
2026-02-12 23:47:36 | INFO | ai-vpn-firewall | splits.yaml: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\configs\splits.yaml


In [2]:
# Read only the three columns needed for per-capture bookkeeping and cast the
# capture id to str so it can serve as a grouping / lookup key.
df = pd.read_parquet(
    flows_parquet,
    columns=["capture_id", "label", "min_packets_ok"],
).assign(capture_id=lambda flows: flows["capture_id"].astype(str))

# One row per capture: first label value, number of rows, and the sum of
# min_packets_ok (presumably a boolean "enough packets" flag -- TODO confirm).
cap = (
    df.groupby("capture_id")
    .agg(
        label=pd.NamedAgg(column="label", aggfunc="first"),
        raw_flows=pd.NamedAgg(column="label", aggfunc="size"),
        trainable_flows=pd.NamedAgg(column="min_packets_ok", aggfunc="sum"),
    )
    .reset_index()
    .astype({"trainable_flows": int})
)

label_counts = cap["label"].value_counts()
logger.info(f"Unique captures: {cap['capture_id'].nunique()}")
logger.info("Captures by label:\n" + str(label_counts))

# Display the 15 captures with the largest trainable_flows value.
cap.sort_values("trainable_flows", ascending=False).head(15)

2026-02-12 23:47:36 | INFO | ai-vpn-firewall | Unique captures: 165
2026-02-12 23:47:36 | INFO | ai-vpn-firewall | Captures by label:
label
0    83
1    82
Name: count, dtype: int64


Unnamed: 0,capture_id,label,raw_flows,trainable_flows
75,nonvpn_vimeo_capture1.pcap,0,1217,1188
74,nonvpn_ssh_capture5.pcap,0,11368,583
10,nonvpn_scp_long_capture1.pcap,0,10555,380
0,nonvpn_netflix_capture1.pcap,0,140,120
79,nonvpn_youtube_capture1.pcap,0,136,114
81,nonvpn_youtube_capture3.pcap,0,124,109
82,nonvpn_youtube_capture4.pcap,0,74,66
72,nonvpn_ssh_capture3.pcap,0,1600,65
39,nonvpn_skype-chat_capture3.pcap,0,86,54
1,nonvpn_netflix_capture2.pcap,0,64,52


In [3]:
# Produce the train/val/test capture lists from the flows table and split config.
splits = make_vnat_capture_split(
    flows_parquet,
    splits_yaml,
    repo_root=paths.repo_root,
)

# Report how many captures ended up in each partition.
for title, split_name in (("Train", "train"), ("Val", "val"), ("Test", "test")):
    logger.info(f"{title} captures: {len(splits[split_name])}")

# write_split_files returns a manifest; print its per-split statistics.
# NOTE(review): presumably this call also writes the split lists to disk -- confirm.
manifest = write_split_files(
    splits,
    flows_parquet,
    splits_yaml,
    repo_root=paths.repo_root,
)
split_stats_json = json.dumps(manifest["split_stats"], indent=2)
print(split_stats_json)

2026-02-12 23:47:36 | INFO | ai-vpn-firewall | Train captures: 115
2026-02-12 23:47:36 | INFO | ai-vpn-firewall | Val captures: 24
2026-02-12 23:47:36 | INFO | ai-vpn-firewall | Test captures: 26
{
  "train": {
    "n_captures": 115,
    "raw_flows": 26925,
    "trainable_flows": 3174,
    "captures_by_label": {
      "0": 61,
      "1": 54
    },
    "raw_flows_by_label": {
      "0": 26757,
      "1": 168
    },
    "trainable_flows_by_label": {
      "0": 3120,
      "1": 54
    }
  },
  "val": {
    "n_captures": 24,
    "raw_flows": 250,
    "trainable_flows": 43,
    "captures_by_label": {
      "0": 9,
      "1": 15
    },
    "raw_flows_by_label": {
      "0": 235,
      "1": 15
    },
    "trainable_flows_by_label": {
      "0": 28,
      "1": 15
    }
  },
  "test": {
    "n_captures": 26,
    "raw_flows": 6536,
    "trainable_flows": 573,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "raw_flows_by_label": {
      "0": 6340,
      "1": 196
    },
    "tra

In [5]:
# Collect the label "1" trainable-flow count from each split's stats, in
# train/val/test order, and print them on a single line.
vpn_trainable = [
    manifest["split_stats"][split_name]["trainable_flows_by_label"]["1"]
    for split_name in ("train", "val", "test")
]
print("VPN TRAINABLE flows (train/val/test):", *vpn_trainable)

VPN TRAINABLE flows (train/val/test): 54 15 14


In [6]:
# Trainable-flow count per split, keyed in train/val/test order.
trainable_by_split = {
    name: manifest["split_stats"][name]["trainable_flows"]
    for name in ("train", "val", "test")
}
total = sum(trainable_by_split.values())

# Print each split's count and its share of all trainable flows.
for k, n in trainable_by_split.items():
    print(k, n, f"ratio={n/total:.3f}")

train 3174 ratio=0.837
val 43 ratio=0.011
test 573 ratio=0.151


In [7]:
# Locations of the three capture lists inside the splits directory.
split_dir = paths.data_splits
train_list = split_dir / "vnat_train_captures.txt"
val_list = split_dir / "vnat_val_captures.txt"
test_list = split_dir / "vnat_test_captures.txt"

# Validate the lists against the flows table; positional order is
# (flows, train, val, test) with the split config passed by keyword.
stats = validate_split_files(flows_parquet, train_list, val_list, test_list, splits_yaml=splits_yaml)

stats_json = json.dumps(stats, indent=2)
print(stats_json)

{
  "train": {
    "n_captures": 115,
    "raw_flows": 26925,
    "trainable_flows": 3174,
    "captures_by_label": {
      "0": 61,
      "1": 54
    },
    "raw_flows_by_label": {
      "0": 26757,
      "1": 168
    },
    "trainable_flows_by_label": {
      "0": 3120,
      "1": 54
    }
  },
  "val": {
    "n_captures": 24,
    "raw_flows": 250,
    "trainable_flows": 43,
    "captures_by_label": {
      "0": 9,
      "1": 15
    },
    "raw_flows_by_label": {
      "0": 235,
      "1": 15
    },
    "trainable_flows_by_label": {
      "0": 28,
      "1": 15
    }
  },
  "test": {
    "n_captures": 26,
    "raw_flows": 6536,
    "trainable_flows": 573,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "raw_flows_by_label": {
      "0": 6340,
      "1": 196
    },
    "trainable_flows_by_label": {
      "0": 559,
      "1": 14
    }
  }
}


In [8]:
# capture_id -> trainable_flows lookup built from the per-capture table.
trainable_by_capture = cap.set_index("capture_id")["trainable_flows"]
cap_map = trainable_by_capture.to_dict()


def top_caps(caps, k=10):
    """Return the ``k`` captures in ``caps`` with the most trainable flows.

    Args:
        caps: Iterable of capture ids.
        k: Maximum number of entries to return.

    Returns:
        A list of ``(capture_id, trainable_flows)`` tuples in descending
        order of trainable flows. Captures missing from ``cap_map`` are
        counted as 0, and ties keep their input order.
    """
    ranked = sorted(
        ((capture, cap_map.get(capture, 0)) for capture in caps),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]


# Show the ten heaviest captures of each split.
for split_name in ("train", "val", "test"):
    print(f"Top {split_name} captures:", top_caps(splits[split_name], 10))

Top train captures: [('nonvpn_vimeo_capture1.pcap', 1188), ('nonvpn_ssh_capture5.pcap', 583), ('nonvpn_scp_long_capture1.pcap', 380), ('nonvpn_youtube_capture1.pcap', 114), ('nonvpn_skype-chat_capture3.pcap', 54), ('nonvpn_netflix_capture2.pcap', 52), ('nonvpn_skype-chat_capture47.pcap', 36), ('nonvpn_skype-chat_capture39.pcap', 32), ('nonvpn_skype-chat_capture12.pcap', 29), ('nonvpn_skype-chat_capture27.pcap', 26)]
Top val captures: [('nonvpn_skype-chat_capture35.pcap', 8), ('nonvpn_voip_capture1.pcap', 4), ('nonvpn_youtube_capture2.pcap', 3), ('nonvpn_rdp_capture2.pcap', 3), ('nonvpn_rdp_capture1.pcap', 3), ('nonvpn_skype-chat_capture1.pcap', 2), ('nonvpn_sftp_capture1.pcap', 2), ('nonvpn_rdp_capture3.pcap', 2), ('vpn_ssh_capture1.pcap', 1), ('vpn_skype-chat_capture46.pcap', 1)]
Top test captures: [('nonvpn_netflix_capture1.pcap', 120), ('nonvpn_youtube_capture3.pcap', 109), ('nonvpn_youtube_capture4.pcap', 66), ('nonvpn_ssh_capture3.pcap', 65), ('nonvpn_scp_capture1.pcap', 35), ('no