In [1]:
from pathlib import Path
import json
import pandas as pd
import yaml

from src.utils.paths import load_paths
from src.utils.logging import setup_logger
from src.splits.make_split import make_vnat_capture_split, write_split_files
from src.splits.validate import validate_split_files

# Resolve the project's path configuration, make sure its directories exist,
# and set up the shared logger used by every cell below.
paths = load_paths()
paths.ensure_dirs()
logger = setup_logger(level="INFO")

# Inputs this notebook needs: the processed VNAT flow table and the split config.
flows_parquet = paths.data_processed / "vnat" / "flows.parquet"
splits_yaml = paths.configs_dir / "splits.yaml"

required_inputs = {"flows.parquet": flows_parquet, "splits.yaml": splits_yaml}

# Log every location first, then fail fast if any of them is missing.
for input_name, input_path in required_inputs.items():
    logger.info(f"{input_name}: {input_path}")

for input_path in required_inputs.values():
    assert input_path.exists(), f"Missing: {input_path}"

2026-02-13 05:16:50 | INFO | ai-vpn-firewall | flows.parquet: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\flows.parquet
2026-02-13 05:16:50 | INFO | ai-vpn-firewall | splits.yaml: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\configs\splits.yaml


In [2]:
# Load only the three columns the per-capture summary needs and normalise the
# capture id to str.
flow_columns = ["capture_id", "label", "min_packets_ok"]
df = pd.read_parquet(flows_parquet, columns=flow_columns)
df["capture_id"] = df["capture_id"].astype(str)

# One row per capture: the first label seen, the number of flow rows, and the
# sum of the min_packets_ok flags (reported as "trainable" flows).
per_capture = df.groupby("capture_id").agg(
    label=pd.NamedAgg(column="label", aggfunc="first"),
    raw_flows=pd.NamedAgg(column="label", aggfunc="size"),
    trainable_flows=pd.NamedAgg(column="min_packets_ok", aggfunc="sum"),
)
cap = per_capture.reset_index().astype({"trainable_flows": int})

n_unique_captures = cap["capture_id"].nunique()
label_counts = cap["label"].value_counts()
logger.info(f"Unique captures: {n_unique_captures}")
logger.info("Captures by label:\n" + str(label_counts))

# Last expression of the cell: shown as its output (largest captures first).
cap.sort_values(by="trainable_flows", ascending=False).head(15)

2026-02-13 05:16:50 | INFO | ai-vpn-firewall | Unique captures: 165
2026-02-13 05:16:50 | INFO | ai-vpn-firewall | Captures by label:
label
0    83
1    82
Name: count, dtype: int64


Unnamed: 0,capture_id,label,raw_flows,trainable_flows
74,nonvpn_ssh_capture5.pcap,0,11368,1552
75,nonvpn_vimeo_capture1.pcap,0,1217,1197
11,nonvpn_scp_newcapture1.pcap,0,1214,1029
10,nonvpn_scp_long_capture1.pcap,0,10555,996
8,nonvpn_rsync_newcapture1.pcap,0,1013,864
16,nonvpn_sftp_newcapture2.pcap,0,648,551
159,vpn_voip_capture1.pcap,1,126,125
0,nonvpn_netflix_capture1.pcap,0,140,120
79,nonvpn_youtube_capture1.pcap,0,136,114
161,vpn_voip_capture3.pcap,1,114,113


In [3]:
# Build the capture-level split; the result is indexed by "train"/"val"/"test".
splits = make_vnat_capture_split(flows_parquet, splits_yaml, repo_root=paths.repo_root)

for title, part in (("Train", "train"), ("Val", "val"), ("Test", "test")):
    logger.info(f"{title} captures: {len(splits[part])}")

# The returned manifest carries per-split statistics under "split_stats".
# NOTE(review): presumably this call also persists the capture lists that the
# validation cell reads back - confirm against src.splits.make_split.
manifest = write_split_files(splits, flows_parquet, splits_yaml, repo_root=paths.repo_root)
split_stats_json = json.dumps(manifest["split_stats"], indent=2)
print(split_stats_json)

2026-02-13 05:16:50 | INFO | ai-vpn-firewall | Train captures: 115
2026-02-13 05:16:50 | INFO | ai-vpn-firewall | Val captures: 24
2026-02-13 05:16:50 | INFO | ai-vpn-firewall | Test captures: 26
{
  "train": {
    "n_captures": 115,
    "raw_flows": 30704,
    "trainable_flows": 7160,
    "captures_by_label": {
      "0": 58,
      "1": 57
    },
    "raw_flows_by_label": {
      "0": 30350,
      "1": 354
    },
    "trainable_flows_by_label": {
      "0": 6811,
      "1": 349
    }
  },
  "val": {
    "n_captures": 24,
    "raw_flows": 2086,
    "trainable_flows": 439,
    "captures_by_label": {
      "0": 12,
      "1": 12
    },
    "raw_flows_by_label": {
      "0": 2074,
      "1": 12
    },
    "trainable_flows_by_label": {
      "0": 427,
      "1": 12
    }
  },
  "test": {
    "n_captures": 26,
    "raw_flows": 921,
    "trainable_flows": 508,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "raw_flows_by_label": {
      "0": 908,
      "1": 13
    },
    "

In [4]:
# Trainable-flow counts for label "1" (the VPN captures) in each split.
vpn_trainable_counts = [
    manifest["split_stats"][part]["trainable_flows_by_label"]["1"]
    for part in ("train", "val", "test")
]
print("VPN TRAINABLE flows (train/val/test):", *vpn_trainable_counts)

VPN TRAINABLE flows (train/val/test): 349 12 13


In [5]:
# Share of all trainable flows that lands in each split.
trainable_per_split = {
    split_name: manifest["split_stats"][split_name]["trainable_flows"]
    for split_name in ["train", "val", "test"]
}
total = sum(trainable_per_split.values())

for k, n in trainable_per_split.items():
    print(k, n, f"ratio={n/total:.3f}")

train 7160 ratio=0.883
val 439 ratio=0.054
test 508 ratio=0.063


In [6]:
# Capture-list files for each split, all located in the splits directory.
splits_dir = paths.data_splits
train_list = splits_dir / "vnat_train_captures.txt"
val_list = splits_dir / "vnat_val_captures.txt"
test_list = splits_dir / "vnat_test_captures.txt"

# Validate the on-disk lists against the flow table and print the resulting stats.
stats = validate_split_files(flows_parquet, train_list, val_list, test_list, splits_yaml=splits_yaml)
print(json.dumps(stats, indent=2))

{
  "train": {
    "n_captures": 115,
    "raw_flows": 30704,
    "trainable_flows": 7160,
    "captures_by_label": {
      "0": 58,
      "1": 57
    },
    "raw_flows_by_label": {
      "0": 30350,
      "1": 354
    },
    "trainable_flows_by_label": {
      "0": 6811,
      "1": 349
    }
  },
  "val": {
    "n_captures": 24,
    "raw_flows": 2086,
    "trainable_flows": 439,
    "captures_by_label": {
      "0": 12,
      "1": 12
    },
    "raw_flows_by_label": {
      "0": 2074,
      "1": 12
    },
    "trainable_flows_by_label": {
      "0": 427,
      "1": 12
    }
  },
  "test": {
    "n_captures": 26,
    "raw_flows": 921,
    "trainable_flows": 508,
    "captures_by_label": {
      "0": 13,
      "1": 13
    },
    "raw_flows_by_label": {
      "0": 908,
      "1": 13
    },
    "trainable_flows_by_label": {
      "0": 495,
      "1": 13
    }
  }
}


In [7]:
# Lookup table: capture_id -> trainable-flow count for that capture.
trainable_by_capture = cap.set_index("capture_id")["trainable_flows"]
cap_map = trainable_by_capture.to_dict()

def top_caps(caps, k=10, flow_counts=None):
    """Return the ``k`` captures with the most trainable flows.

    Args:
        caps: Iterable of capture ids to rank.
        k: Maximum number of entries to return.
        flow_counts: Optional mapping of capture id -> trainable-flow count.
            When omitted, the notebook-level ``cap_map`` is used, so existing
            ``top_caps(caps, k)`` calls behave exactly as before.

    Returns:
        A list of ``(capture_id, count)`` tuples in descending count order.
        Captures missing from the mapping are counted as 0, and captures with
        equal counts keep their input order (the sort is stable).
    """
    counts = cap_map if flow_counts is None else flow_counts
    ranked = sorted(
        ((capture, counts.get(capture, 0)) for capture in caps),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]

# Show the ten largest captures (by trainable flows) assigned to each split.
for part in ("train", "val", "test"):
    print(f"Top {part} captures:", top_caps(splits[part], 10))

Top train captures: [('nonvpn_ssh_capture5.pcap', 1552), ('nonvpn_vimeo_capture1.pcap', 1197), ('nonvpn_scp_newcapture1.pcap', 1029), ('nonvpn_scp_long_capture1.pcap', 996), ('nonvpn_rsync_newcapture1.pcap', 864), ('nonvpn_sftp_newcapture2.pcap', 551), ('vpn_voip_capture1.pcap', 125), ('vpn_voip_capture3.pcap', 113), ('vpn_voip_capture2.pcap', 57), ('nonvpn_netflix_capture2.pcap', 53)]
Top val captures: [('nonvpn_youtube_capture1.pcap', 114), ('nonvpn_ssh_capture3.pcap', 82), ('nonvpn_skype-chat_capture3.pcap', 55), ('nonvpn_skype-chat_capture31.pcap', 29), ('nonvpn_skype-chat_capture11.pcap', 25), ('nonvpn_skype-chat_capture19.pcap', 22), ('nonvpn_skype-chat_capture9.pcap', 19), ('nonvpn_skype-chat_capture44.pcap', 18), ('nonvpn_skype-chat_capture25.pcap', 17), ('nonvpn_skype-chat_capture48.pcap', 16)]
Top test captures: [('nonvpn_netflix_capture1.pcap', 120), ('nonvpn_youtube_capture3.pcap', 109), ('nonvpn_youtube_capture4.pcap', 66), ('nonvpn_skype-chat_capture47.pcap', 36), ('nonvp

In [8]:
import pandas as pd

# Per-capture summary report.
# `cap` (one row per capture_id with label, raw_flows and trainable_flows) was
# already aggregated from flows.parquet in the per-capture aggregation cell
# earlier in this notebook, so it is reused here instead of re-reading and
# re-grouping the same three columns. A new frame is derived so that `cap`
# itself is left untouched.
summary_cols = ["capture_id", "label", "raw_flows", "trainable_flows"]
cap_summary = cap.astype({"raw_flows": int, "trainable_flows": int})

print("captures total:", len(cap_summary))
print("captures by label:\n", cap_summary["label"].value_counts())
print("trainable VPN total:", int(cap_summary.loc[cap_summary.label == 1, "trainable_flows"].sum()))
print("trainable NonVPN total:", int(cap_summary.loc[cap_summary.label == 0, "trainable_flows"].sum()))

# Same top-15 table twice, ranked by a different flow count each time.
for metric, heading in (("raw_flows", "RAW"), ("trainable_flows", "TRAINABLE")):
    print(f"\nTop 15 by {heading} flows:")
    print(cap_summary.sort_values(metric, ascending=False).head(15)[summary_cols])

# NOTE: relative path - the CSV is written to the kernel's current working directory.
cap_summary.to_csv("cap_summary.csv", index=False)
print("\nWrote cap_summary.csv")

captures total: 165
captures by label:
 label
0    83
1    82
Name: count, dtype: int64
trainable VPN total: 374
trainable NonVPN total: 7733

Top 15 by RAW flows:
                       capture_id  label  raw_flows  trainable_flows
74       nonvpn_ssh_capture5.pcap      0      11368             1552
10  nonvpn_scp_long_capture1.pcap      0      10555              996
72       nonvpn_ssh_capture3.pcap      0       1600               82
75     nonvpn_vimeo_capture1.pcap      0       1217             1197
11    nonvpn_scp_newcapture1.pcap      0       1214             1029
9        nonvpn_scp_capture1.pcap      0       1074               40
8   nonvpn_rsync_newcapture1.pcap      0       1013              864
7      nonvpn_rsync_capture1.pcap      0        898               30
15   nonvpn_sftp_newcapture1.pcap      0        709               27
16   nonvpn_sftp_newcapture2.pcap      0        648              551
70       nonvpn_ssh_capture1.pcap      0        298               14
14      