In [1]:
from pathlib import Path
import pandas as pd
import json
import hashlib
import yaml
import numpy as np

from src.utils.paths import load_paths
from src.utils.logging import setup_logger

# Resolve the project path configuration via the project helper.
paths = load_paths()
# NOTE(review): presumably creates any missing project directories -- confirm in src.utils.paths.
paths.ensure_dirs()
logger = setup_logger(level="INFO")

# Log the resolved locations so the run records where data is read from and written to.
logger.info(f"Repo root: {paths.repo_root}")
logger.info(f"VNAT raw dir: {paths.data_raw_vnat}")
logger.info(f"Processed dir: {paths.data_processed}")

2026-02-14 07:11:24 | INFO | ai-vpn-firewall | Repo root: C:\Users\scoti\PycharmProjects\ai-vpn-firewall
2026-02-14 07:11:24 | INFO | ai-vpn-firewall | VNAT raw dir: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\raw\vnat
2026-02-14 07:11:24 | INFO | ai-vpn-firewall | Processed dir: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed


In [2]:
# Locate the raw VNAT HDF5 release and fail fast when it is not present.
h5_path = paths.data_raw_vnat / "VNAT_Dataframe_release_1.h5"
if not h5_path.exists():
    raise FileNotFoundError(f"VNAT H5 not found at: {h5_path}")

logger.info(f"Found VNAT file: {h5_path}")

# Open read-only inside a context manager: the raw file must never be modified,
# and the handle is released even if listing the keys raises.
with pd.HDFStore(h5_path, mode="r") as store:
    keys = store.keys()

# The rest of the notebook reads the "/data" key, so require it up front.
logger.info(f"H5 keys: {keys}")
if "/data" not in keys:
    raise ValueError(f"Expected key '/data' in H5 store. Found: {keys}")

2026-02-14 07:11:24 | INFO | ai-vpn-firewall | Found VNAT file: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\raw\vnat\VNAT_Dataframe_release_1.h5
2026-02-14 07:11:27 | INFO | ai-vpn-firewall | H5 keys: ['/data']


In [3]:
# Load the windowing parameters that drive timestamp repair and truncation.
features_path = paths.configs_dir / "features.yaml"
if not features_path.exists():
    raise FileNotFoundError(f"Missing config: {features_path}")

# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
features_cfg = yaml.safe_load(features_path.read_text(encoding="utf-8")) or {}
if "window" not in features_cfg:
    raise ValueError("configs/features.yaml must contain a 'window' section (e.g. window: {N: 100, eps: 1e-6}).")

# An empty "window:" key parses to None; reject anything that is not a mapping
# with a clear message instead of failing later on ".get".
window_cfg = features_cfg["window"]
if not isinstance(window_cfg, dict):
    raise ValueError(
        f"'window' in {features_path} must be a mapping (e.g. window: {{N: 100, eps: 1e-6}}), "
        f"got {type(window_cfg).__name__}."
    )

N = int(window_cfg.get("N", 100))
EPS = float(window_cfg.get("eps", 1e-6))
MIN_PACKETS = int(window_cfg.get("min_packets", 10))

# N is used as a slice bound and EPS as a forward push on timestamps, so a
# non-positive N or a negative EPS would silently corrupt the output.
if N <= 0:
    raise ValueError(f"window.N must be a positive integer, got {N}.")
if EPS < 0:
    raise ValueError(f"window.eps must be non-negative, got {EPS}.")

logger.info(f"Loaded features config: N={N}, eps={EPS}, min_packets={MIN_PACKETS}")

2026-02-14 07:11:27 | INFO | ai-vpn-firewall | Loaded features config: N=100, eps=1e-06, min_packets=3


In [4]:
# Read the "/data" table (verified to exist in the key probe) into memory.
df = pd.read_hdf(h5_path, key="/data")
logger.info(f"Loaded VNAT df: shape={df.shape}")

# Preview the first rows as the cell output.
df.head()

2026-02-14 07:11:48 | INFO | ai-vpn-firewall | Loaded VNAT df: shape=(33711, 5)


Unnamed: 0,connection,timestamps,sizes,directions,file_names
0,"(10.123.1.2, 1195, 10.123.1.1, 1195, 17)","[1563289706.330096, 1563289706.330207, 1563289...","[120, 88, 120, 88, 120, 88, 120, 120, 152, 120...","[1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, ...",vpn_youtube_capture2.pcap
0,"(10.113.1.2, 22924, 10.115.1.2, 53, 17)","[1561391908.523659, 1561391908.524042]","[63, 79]","[1, 0]",nonvpn_sftp_newcapture1.pcap
1,"(10.113.1.2, 53065, 10.115.1.2, 53, 17)","[1561391908.523706, 1561391908.524059]","[63, 63]","[1, 0]",nonvpn_sftp_newcapture1.pcap
2,"(10.113.1.150, 39816, 10.115.1.123, 22, 6)","[1561391908.524836, 1561391908.525027, 1561391...","[60, 60, 52, 73, 52, 73, 52, 1378, 222, 52, 13...","[1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",nonvpn_sftp_newcapture1.pcap
3,"(10.115.1.2, 6589, 10.113.1.2, 53, 17)","[1561391908.594887, 1561391908.595301]","[51, 102]","[1, 0]",nonvpn_sftp_newcapture1.pcap


In [5]:
# Replace the stored index with a fresh 0..len-1 range; the preview of the raw
# frame shows repeated index values, so the original index cannot identify rows.
df = df.reset_index(drop=True)

# Sanity-check the new index before using it as an identifier.
assert df.index.is_unique
assert df.index.min() == 0
assert df.index.max() == len(df) - 1

# Persist the positional index as an explicit column; it is later used to build flow_id.
df["row_id"] = df.index.astype("int64")

logger.info("Index reset OK: unique, 0..N-1")

2026-02-14 07:11:48 | INFO | ai-vpn-firewall | Index reset OK: unique, 0..N-1


In [6]:
# Columns the rest of the notebook reads from the raw frame.
expected_cols = {"connection", "timestamps", "sizes", "directions", "file_names"}

# Anything required but absent is a hard error.
missing = expected_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

logger.info(f"Columns OK: {list(df.columns)}")

# Force plain strings so the string operations applied later behave uniformly.
df["file_names"] = df["file_names"].astype(str)

2026-02-14 07:11:49 | INFO | ai-vpn-firewall | Columns OK: ['connection', 'timestamps', 'sizes', 'directions', 'file_names', 'row_id']


In [7]:
def derive_label_from_filename(s: str) -> int | None:
    s = str(s).strip().lower()
    if s.startswith("vpn_"):
        return 1
    if s.startswith("nonvpn_"):
        return 0
    return None

# Label every row from its file name; rows matching neither prefix map to a missing value.
derived = df["file_names"].map(derive_label_from_filename)

# Every row must be labelable; show a sample of the offenders otherwise.
unlabeled = derived.isna()
if unlabeled.any():
    bad = df.loc[unlabeled, "file_names"].head(20)
    raise ValueError(f"Found file_names without vpn_/nonvpn_ prefix. Examples:\n{bad}")

if "label" not in df.columns:
    # No stored label: the derived one becomes the label.
    df["label"] = derived.astype(int)
    logger.info("Created label column from file_names.")
else:
    # A stored label exists: it must agree with the file-name-derived label.
    disagrees = df["label"].astype(int) != derived.astype(int)
    mismatch = disagrees.sum()
    logger.info(f"Stored label exists. Mismatches vs derived: {mismatch}")
    if mismatch:
        ex = df.loc[disagrees, ["file_names", "label"]].head(20)
        raise ValueError(f"Label mismatch between stored label and derived label. Examples:\n{ex}")

logger.info("Label distribution:\n" + str(df["label"].value_counts()))

2026-02-14 07:11:49 | INFO | ai-vpn-firewall | Created label column from file_names.
2026-02-14 07:11:49 | INFO | ai-vpn-firewall | Label distribution:
label
0    33332
1      379
Name: count, dtype: int64


In [8]:
# Normalize the file names: lowercase, no surrounding whitespace.
s = df["file_names"].str.lower().str.strip()

# Keep only the base name, accepting both Windows and POSIX separators.
s = s.str.replace("\\", "/", regex=False).str.split("/").str[-1]

# Strip the capture extension only where it terminates the name. A plain
# substring replace would also delete ".pcap" from the middle of a name and
# would turn "x.pcapng" into "xng".
s = s.str.replace(r"\.pcap(?:ng)?$", "", regex=True)

# Drop the VPN / non-VPN prefix, leaving "<app>_<capture...>".
s = s.str.replace(r"^(vpn_|nonvpn_)", "", regex=True)

# The app is everything before the first underscore (e.g. "skype-chat").
df["app"] = s.str.split("_", n=1).str[0]

logger.info("Top apps:\n" + str(df["app"].value_counts().head(15)))
df[["file_names", "app", "label"]].head()

2026-02-14 07:11:50 | INFO | ai-vpn-firewall | Top apps:
app
ssh           13563
scp           12845
rsync          1915
sftp           1670
skype-chat     1301
vimeo          1218
voip            617
youtube         341
netflix         205
rdp              36
Name: count, dtype: int64


Unnamed: 0,file_names,app,label
0,vpn_youtube_capture2.pcap,youtube,1
1,nonvpn_sftp_newcapture1.pcap,sftp,0
2,nonvpn_sftp_newcapture1.pcap,sftp,0
3,nonvpn_sftp_newcapture1.pcap,sftp,0
4,nonvpn_sftp_newcapture1.pcap,sftp,0


In [9]:
def is_listlike(x) -> bool:
    """Return True when ``x`` is a ``list`` or a ``tuple`` (including subclasses)."""
    return isinstance(x, list) or isinstance(x, tuple)

# Each per-packet column must hold a list/tuple in every row.
for col in ["timestamps", "sizes", "directions"]:
    bad = df[~df[col].map(is_listlike)]
    if len(bad) > 0:
        raise ValueError(f"Column {col} has non-listlike entries. Examples:\n{bad[[col, 'file_names']].head()}")

# Per-row lengths of the three per-packet lists.
lens = pd.DataFrame({
    "t": df["timestamps"].map(len),
    "s": df["sizes"].map(len),
    "d": df["directions"].map(len),
})

# The three lists describe the same packets, so their lengths must agree row by row.
mismatch = df[(lens["t"] != lens["s"]) | (lens["t"] != lens["d"])]
logger.info(f"Rows with mismatched packet list lengths: {len(mismatch)}")
if len(mismatch) > 0:
    show = mismatch[["file_names"]].head(10).copy()
    # Look lengths up by the index of the 10-row sample, not of every mismatching
    # row: assigning len(mismatch) values to a 10-row frame raises a length error
    # and would hide the real diagnostic whenever more than 10 rows mismatch.
    show["len_t"] = lens.loc[show.index, "t"].values
    show["len_s"] = lens.loc[show.index, "s"].values
    show["len_d"] = lens.loc[show.index, "d"].values
    raise ValueError(f"Found mismatched list lengths. Examples:\n{show}")

2026-02-14 07:11:51 | INFO | ai-vpn-firewall | Rows with mismatched packet list lengths: 0


In [10]:
def dirs_valid(dirs) -> bool:
    """Return True when every value in ``dirs`` equals 0 or 1 (an empty input is valid)."""
    allowed = {0, 1}
    return set(dirs) <= allowed

# Flag rows whose direction list contains anything other than 0/1.
dirs_ok = df["directions"].map(dirs_valid)
bad_dir = df[~dirs_ok]
logger.info(f"Rows with invalid direction values: {len(bad_dir)}")
if not bad_dir.empty:
    ex = bad_dir[["file_names", "directions"]].head(5)
    raise ValueError(f"Invalid direction values found. Examples:\n{ex}")

2026-02-14 07:11:53 | INFO | ai-vpn-firewall | Rows with invalid direction values: 0


In [11]:
def backward_stats(ts):
    """Summarize the backward (negative) steps in a timestamp sequence.

    Args:
        ts: Sequence of numeric timestamps in packet order.

    Returns:
        A ``(count, min_delta, mean_delta)`` tuple over the consecutive
        differences that are negative; ``(0, 0.0, 0.0)`` when there are none.
        ``min_delta`` is the most negative step.
    """
    deltas = np.diff(np.asarray(ts, dtype=float))
    negative = deltas[deltas < 0]
    if negative.size:
        return int(negative.size), float(negative.min()), float(negative.mean())
    return 0, 0.0, 0.0

# One (count, min, mean) triple per flow.
stats = df["timestamps"].map(backward_stats)

# Unpack the triple into three diagnostic columns; "min_backstep" is the most negative delta.
for position, column in enumerate(("n_backsteps", "min_backstep", "mean_backstep")):
    df[column] = stats.map(lambda triple, i=position: triple[i])

has_backsteps = df["n_backsteps"] > 0
n_bad = has_backsteps.sum()
logger.info(f"Rows with non-monotonic timestamps: {n_bad} / {len(df)} ({100*n_bad/len(df):.4f}%)")

# Preview the affected flows as the cell output.
df.loc[has_backsteps, ["file_names", "n_backsteps", "min_backstep", "mean_backstep"]].head(10)

2026-02-14 07:11:55 | INFO | ai-vpn-firewall | Rows with non-monotonic timestamps: 246 / 33711 (0.7297%)


Unnamed: 0,file_names,n_backsteps,min_backstep,mean_backstep
0,vpn_youtube_capture2.pcap,4,-5.00679e-06,-2.503395e-06
3,nonvpn_sftp_newcapture1.pcap,831,-0.0001358986,-1.711392e-06
651,nonvpn_sftp_newcapture1.pcap,593,-0.0001819134,-1.978914e-06
681,nonvpn_sftp_newcapture1.pcap,701,-3.695488e-05,-1.913811e-06
734,nonvpn_rdp_capture4.pcap,1,-2.861023e-06,-2.861023e-06
739,nonvpn_rdp_capture4.pcap,1,-9.536743e-07,-9.536743e-07
740,vpn_rsync_capture2.pcap,367,-9.298325e-05,-2.12303e-06
787,nonvpn_sftp_capture1.pcap,191,-1.192093e-05,-1.529124e-06
972,vpn_sftp_capture1.pcap,1696,-0.0001020432,-1.606373e-06
1648,nonvpn_scp_long_capture1.pcap,1,-9.536743e-07,-9.536743e-07


In [12]:
def make_non_decreasing(ts, eps: float):
    """Repair timestamp jitter in place of order, never reordering packets.

    Values are converted to ``float`` and emitted in their original order.
    Whenever a value is smaller than the previously emitted one it is
    replaced by ``previous + eps``; because the comparison is against the
    emitted value, one push can cascade into the values that follow.

    Args:
        ts: Iterable of numeric timestamps in packet order.
        eps: Increment added to the previous value when a backward step is
            found (0 yields ties, a positive value yields strict progress).

    Returns:
        A new list of floats with ``out[i] >= out[i-1]`` for ``eps >= 0``.
    """
    repaired = []
    for raw in ts:
        value = float(raw)
        if repaired and value < repaired[-1]:
            value = repaired[-1] + eps
        repaired.append(value)
    return repaired

# Only flows that showed at least one backward step need repairing.
bad_mask = df["n_backsteps"] > 0
rows_repaired = int(bad_mask.sum())
if rows_repaired:
    logger.info(f"Repairing timestamps for {rows_repaired} rows...")
    df.loc[bad_mask, "timestamps"] = df.loc[bad_mask, "timestamps"].map(lambda ts: make_non_decreasing(ts, eps=EPS))
else:
    logger.info("No timestamp repair needed.")

def is_non_decreasing(ts) -> bool:
    """Return True when each element of ``ts`` is <= its successor (empty and single-element inputs pass)."""
    return all(earlier <= later for earlier, later in zip(ts, ts[1:]))

# Re-check every flow after the repair; any remaining backward step is a bug.
monotonic_after = df["timestamps"].map(is_non_decreasing)
bad_ts_after = (~monotonic_after).sum()
logger.info(f"Rows still non-monotonic after repair: {bad_ts_after}")
if bad_ts_after:
    raise ValueError("Timestamp repair failed for some rows. Investigate.")

2026-02-14 07:11:56 | INFO | ai-vpn-firewall | Repairing timestamps for 246 rows...
2026-02-14 07:12:02 | INFO | ai-vpn-firewall | Rows still non-monotonic after repair: 0


In [13]:
# The diagnostic columns still describe the pre-repair timestamps, so they
# summarize what the repair had to fix.
was_repaired = df["n_backsteps"] > 0

repair_summary = {
    "rows_total": int(len(df)),
    "rows_repaired": int(was_repaired.sum()),
    "pct_repaired": float(100 * was_repaired.mean()),
    "max_backsteps_in_row": int(df["n_backsteps"].max()),
    "worst_min_backstep": float(df["min_backstep"].min()),
}

logger.info("Timestamp repair summary:\n" + str(repair_summary))
repair_summary

2026-02-14 07:12:02 | INFO | ai-vpn-firewall | Timestamp repair summary:
{'rows_total': 33711, 'rows_repaired': 246, 'pct_repaired': 0.7297321349114533, 'max_backsteps_in_row': 1696, 'worst_min_backstep': -0.0001819133758544922}


{'rows_total': 33711,
 'rows_repaired': 246,
 'pct_repaired': 0.7297321349114533,
 'max_backsteps_in_row': 1696,
 'worst_min_backstep': -0.0001819133758544922}

In [14]:
# Remove the temporary backstep diagnostics; errors="ignore" keeps this cell
# re-runnable once the columns are already gone.
df = df.drop(columns=["n_backsteps", "min_backstep", "mean_backstep"], errors="ignore")

In [15]:
# Final guard: no flow may contain a backward timestamp step at this point.
monotonic_final = df["timestamps"].map(is_non_decreasing)
bad_ts = df[~monotonic_final]
logger.info(f"Rows with non-monotonic timestamps (final): {len(bad_ts)}")
assert len(bad_ts) == 0

2026-02-14 07:12:05 | INFO | ai-vpn-firewall | Rows with non-monotonic timestamps (final): 0


In [16]:
# Packets per flow, taken from the length of the sizes list (list lengths were
# verified to agree across timestamps/sizes/directions earlier).
df["packet_count"] = df["sizes"].map(len)
logger.info(df["packet_count"].describe().to_string())

2026-02-14 07:12:05 | INFO | ai-vpn-firewall | count    3.371100e+04
mean     1.130292e+03
std      3.741679e+04
min      1.000000e+00
25%      2.000000e+00
50%      2.000000e+00
75%      2.000000e+00
max      3.842411e+06


In [17]:
# Record each flow's untruncated packet count exactly once. The lists are
# truncated in place below, so recomputing this on a second run of the cell
# would silently replace the full count with the windowed one.
if "packet_count_full" not in df.columns:
    df["packet_count_full"] = df["sizes"].map(len)

# Keep only the first N packets of every per-packet list.
for col in ("timestamps", "sizes", "directions"):
    df[col] = df[col].map(lambda xs: xs[:N])

# Packet count after windowing (at most N).
df["packet_count"] = df["sizes"].map(len)

# True when the original flow had enough packets to fill the whole window.
df["window_complete"] = df["packet_count_full"] >= N

# True when the windowed flow still has at least MIN_PACKETS packets.
df["min_packets_ok"] = df["packet_count"] >= MIN_PACKETS

logger.info("Windowed packet_count stats:\n" + df["packet_count"].describe().to_string())
logger.info("Full packet_count_full stats:\n" + df["packet_count_full"].describe().to_string())
logger.info(f"Flows with >= {N} packets (complete window): {100 * df['window_complete'].mean():.2f}%")

2026-02-14 07:12:09 | INFO | ai-vpn-firewall | Windowed packet_count stats:
count    33711.000000
mean         7.659814
std         19.217968
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max        100.000000
2026-02-14 07:12:09 | INFO | ai-vpn-firewall | Full packet_count_full stats:
count    3.371100e+04
mean     1.130292e+03
std      3.741679e+04
min      1.000000e+00
25%      2.000000e+00
50%      2.000000e+00
75%      2.000000e+00
max      3.842411e+06
2026-02-14 07:12:09 | INFO | ai-vpn-firewall | Flows with >= 100 packets (complete window): 2.82%


In [18]:
def sizes_valid(sz, max_size: int = 20000) -> bool:
    """Check that every packet size is a real number in ``[0, max_size)``.

    NumPy scalar types are accepted alongside Python ``int``/``float``:
    ``np.int64`` is not a subclass of ``int``, so a plain
    ``isinstance(x, (int, float))`` test rejects sizes that come back from
    array-backed storage. ``None``, strings and NaN are rejected (NaN fails
    the range comparison).

    Args:
        sz: Iterable of packet sizes.
        max_size: Exclusive upper bound for a plausible packet size.

    Returns:
        True when all elements are valid; an empty input is valid.
    """
    numeric_types = (int, float, np.integer, np.floating)
    return all(isinstance(x, numeric_types) and 0 <= x < max_size for x in sz)

# Flag flows whose (windowed) size list contains an invalid value.
sizes_ok = df["sizes"].map(sizes_valid)
bad_sizes = df[~sizes_ok]
logger.info(f"Rows with invalid sizes: {len(bad_sizes)}")
if not bad_sizes.empty:
    ex = bad_sizes[["file_names", "sizes"]].head(3)
    raise ValueError(f"Invalid packet sizes found. Examples:\n{ex}")

2026-02-14 07:12:09 | INFO | ai-vpn-firewall | Rows with invalid sizes: 0


In [19]:
def normalize_capture_name(s: str) -> str:
    """Return the lowercased, stripped base name of a capture path.

    Backslashes are treated as path separators, so both Windows and POSIX
    style paths reduce to their final component.
    """
    cleaned = str(s).strip().lower().replace("\\", "/")
    return cleaned.rsplit("/", 1)[-1]

# Normalized base file name of the capture each flow came from.
df["capture_name"] = df["file_names"].map(normalize_capture_name)

# capture_id is currently an alias of capture_name (same values, separate column).
df["capture_id"] = df["capture_name"]

logger.info(f"Unique captures: {df['capture_id'].nunique()}")
# Largest captures by number of flows, shown as the cell output.
df["capture_id"].value_counts().head(10)

2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Unique captures: 165


capture_id
nonvpn_ssh_capture5.pcap         11368
nonvpn_scp_long_capture1.pcap    10555
nonvpn_ssh_capture3.pcap          1600
nonvpn_vimeo_capture1.pcap        1217
nonvpn_scp_newcapture1.pcap       1214
nonvpn_scp_capture1.pcap          1074
nonvpn_rsync_newcapture1.pcap     1013
nonvpn_rsync_capture1.pcap         898
nonvpn_sftp_newcapture1.pcap       709
nonvpn_sftp_newcapture2.pcap       648
Name: count, dtype: int64

In [20]:
def conn_to_str(conn) -> str:
    """Render a 5-tuple connection as ``src_ip:src_port-dst_ip:dst_port-p<proto>``.

    Best effort: when ``conn`` cannot be unpacked into five items, or a port
    or protocol cannot be converted with ``int``, the plain ``str(conn)`` is
    returned instead of raising.
    """
    try:
        src_ip, src_port, dst_ip, dst_port, proto = conn
        source = f"{src_ip}:{int(src_port)}"
        destination = f"{dst_ip}:{int(dst_port)}"
        return f"{source}-{destination}-p{int(proto)}"
    except Exception:
        return str(conn)

# Human-readable rendering of the connection 5-tuple.
df["connection_str"] = df["connection"].map(conn_to_str)

# Non-unique key that describes the 5-tuple (useful for analysis)
df["flow_key"] = df["connection_str"]

# Unique per row in the dataset
# (row_id is the reset 0..len-1 index, so the suffix alone already distinguishes rows).
df["flow_id"] = df["capture_id"] + "::" + df["row_id"].astype(str)

logger.info(f"Unique flow_id: {df['flow_id'].nunique()}")
logger.info(f"Unique flow_key: {df['flow_key'].nunique()}")

2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Unique flow_id: 33711
2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Unique flow_key: 32682


In [21]:
# Number of distinct labels seen within each capture; more than one means the
# capture contains both VPN and non-VPN flows.
label_per_capture = df.groupby("capture_id")["label"].nunique()
mixed = (label_per_capture > 1).sum()
logger.info(f"Captures with mixed labels (both VPN and nonVPN): {mixed}")

# Flow counts per capture and label (one column per label value), plus a row total.
cap_stats = df.groupby(["capture_id", "label"]).size().unstack(fill_value=0)
cap_stats["total"] = cap_stats.sum(axis=1)
# Largest captures first, shown as the cell output.
cap_stats.sort_values("total", ascending=False).head(10)

2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Captures with mixed labels (both VPN and nonVPN): 0


label,0,1,total
capture_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nonvpn_ssh_capture5.pcap,11368,0,11368
nonvpn_scp_long_capture1.pcap,10555,0,10555
nonvpn_ssh_capture3.pcap,1600,0,1600
nonvpn_vimeo_capture1.pcap,1217,0,1217
nonvpn_scp_newcapture1.pcap,1214,0,1214
nonvpn_scp_capture1.pcap,1074,0,1074
nonvpn_rsync_newcapture1.pcap,1013,0,1013
nonvpn_rsync_capture1.pcap,898,0,898
nonvpn_sftp_newcapture1.pcap,709,0,709
nonvpn_sftp_newcapture2.pcap,648,0,648


In [22]:
# Recompute the windowed packet count from the truncated size lists.
df["packet_count"] = df["sizes"].map(len)
logger.info(df["packet_count"].describe().to_string())

# Share of flows that reach each packet-count threshold.
for n in (10, 20, 50, 100):
    reaches_threshold = df["packet_count"] >= n
    pct = reaches_threshold.mean() * 100
    logger.info(f"Flows with >= {n} packets: {pct:.2f}%")

2026-02-14 07:12:10 | INFO | ai-vpn-firewall | count    33711.000000
mean         7.659814
std         19.217968
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max        100.000000
2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Flows with >= 10 packets: 10.75%
2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Flows with >= 20 packets: 8.67%
2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Flows with >= 50 packets: 4.71%
2026-02-14 07:12:10 | INFO | ai-vpn-firewall | Flows with >= 100 packets: 2.82%


In [23]:
# Columns persisted to flows.parquet, in output order.
flow_columns = [
    # identifiers
    "capture_id",
    "capture_name",
    "row_id",
    "flow_id",
    "flow_key",
    "connection_str",
    # per-packet sequences (windowed)
    "timestamps",
    "sizes",
    "directions",
    # provenance and targets
    "file_names",
    "app",
    "label",
    # window bookkeeping
    "packet_count",
    "packet_count_full",
    "window_complete",
    "min_packets_ok",
]

# Independent copy so later edits to flows do not touch df.
flows = df.loc[:, flow_columns].copy()

logger.info(f"flows table: shape={flows.shape}")
flows.head()

2026-02-14 07:12:10 | INFO | ai-vpn-firewall | flows table: shape=(33711, 16)


Unnamed: 0,capture_id,capture_name,row_id,flow_id,flow_key,connection_str,timestamps,sizes,directions,file_names,app,label,packet_count,packet_count_full,window_complete,min_packets_ok
0,vpn_youtube_capture2.pcap,vpn_youtube_capture2.pcap,0,vpn_youtube_capture2.pcap::0,10.123.1.2:1195-10.123.1.1:1195-p17,10.123.1.2:1195-10.123.1.1:1195-p17,"[1563289706.330096, 1563289706.330207, 1563289...","[120, 88, 120, 88, 120, 88, 120, 120, 152, 120...","[1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, ...",vpn_youtube_capture2.pcap,youtube,1,100,62283,True,True
1,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,1,nonvpn_sftp_newcapture1.pcap::1,10.113.1.2:22924-10.115.1.2:53-p17,10.113.1.2:22924-10.115.1.2:53-p17,"[1561391908.523659, 1561391908.524042]","[63, 79]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False
2,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,2,nonvpn_sftp_newcapture1.pcap::2,10.113.1.2:53065-10.115.1.2:53-p17,10.113.1.2:53065-10.115.1.2:53-p17,"[1561391908.523706, 1561391908.524059]","[63, 63]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False
3,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,3,nonvpn_sftp_newcapture1.pcap::3,10.113.1.150:39816-10.115.1.123:22-p6,10.113.1.150:39816-10.115.1.123:22-p6,"[1561391908.524836, 1561391908.525027, 1561391...","[60, 60, 52, 73, 52, 73, 52, 1378, 222, 52, 13...","[1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",nonvpn_sftp_newcapture1.pcap,sftp,0,100,2002346,True,True
4,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,4,nonvpn_sftp_newcapture1.pcap::4,10.115.1.2:6589-10.113.1.2:53-p17,10.115.1.2:6589-10.113.1.2:53-p17,"[1561391908.594887, 1561391908.595301]","[51, 102]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False


In [24]:
# flow_id must identify exactly one row.
repeated = flows["flow_id"].duplicated()
dup = repeated.sum()
logger.info(f"Duplicate flow_id values: {dup}")

if dup > 0:
    # keep=False shows every member of each duplicated group, not just the repeats.
    every_copy = flows["flow_id"].duplicated(keep=False)
    ex = flows.loc[every_copy, ["flow_id", "capture_id", "file_names"]].head(10)
    raise ValueError(f"Found duplicate flow_id. Examples:\n{ex}")

2026-02-14 07:12:11 | INFO | ai-vpn-firewall | Duplicate flow_id values: 0


In [25]:
# Coerce every element to a plain Python scalar of the expected type before
# serialization: float timestamps, int sizes and directions.
for column, cast in (("timestamps", float), ("sizes", int), ("directions", int)):
    flows[column] = flows[column].map(lambda xs, cast=cast: [cast(x) for x in xs])

In [26]:
# Output folder for the processed VNAT artifacts; created if missing.
out_dir = paths.data_processed / "vnat"
out_dir.mkdir(parents=True, exist_ok=True)

# Write the flows table without the DataFrame index (row_id already carries it).
out_path = out_dir / "flows.parquet"
flows.to_parquet(out_path, index=False)

logger.info(f"Saved flows parquet: {out_path}")

2026-02-14 07:12:14 | INFO | ai-vpn-firewall | Saved flows parquet: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\flows.parquet


In [27]:
# Read the file back to verify what was actually written to disk.
reloaded = pd.read_parquet(out_path)
logger.info(f"Reloaded parquet: shape={reloaded.shape}")

# Round-trip checks: same columns in the same order, same row count, and the
# same number of distinct captures and flows as the in-memory table.
assert list(reloaded.columns) == list(flows.columns)
assert len(reloaded) == len(flows)
assert reloaded["capture_id"].nunique() == flows["capture_id"].nunique()
assert reloaded["flow_id"].nunique() == flows["flow_id"].nunique()

logger.info("flows.parquet schema and integrity checks passed")

def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``p``.

    The file is read in 1 MiB chunks so it is never loaded into memory whole.
    """
    digest = hashlib.sha256()
    chunk_size = 1024 * 1024
    with open(p, "rb") as stream:
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Provenance record for flows.parquet: inputs, content hashes, summary counts
# and the parameters that shaped the output.
manifest = {
    "dataset": "vnat",
    "source_file": str(h5_path),
    "flows_parquet": str(out_path),
    "flows_sha256": sha256_file(out_path),

    "features_yaml": str(features_path),
    "features_yaml_sha256": hashlib.sha256(features_path.read_bytes()).hexdigest(),

    "rows": int(len(reloaded)),
    "unique_captures": int(reloaded["capture_id"].nunique()),
    "unique_flows": int(reloaded["flow_id"].nunique()),

    # Build plain str -> int pairs so JSON serialization never depends on how
    # pandas boxes index labels and counts.
    "label_counts": {
        str(label): int(count)
        for label, count in reloaded["label"].value_counts().items()
    },

    # min_packets determines the saved "min_packets_ok" column, so it is
    # recorded next to the other window parameters.
    "window": {"N": int(N), "eps": float(EPS), "min_packets": int(MIN_PACKETS)},
    # packet_count is the windowed count (at most N), so ">= N" marks complete windows.
    "pct_window_complete": float((reloaded["packet_count"] >= N).mean() * 100),

    "timestamp_repair": {
        "rows_repaired": int(repair_summary["rows_repaired"]),
        "pct_repaired": float(repair_summary["pct_repaired"]),
        "max_backsteps_in_row": int(repair_summary["max_backsteps_in_row"]),
        "worst_min_backstep": float(repair_summary["worst_min_backstep"]),
    },
}

manifest_path = out_dir / "flows_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
logger.info(f"Saved manifest: {manifest_path}")

reloaded.head()

2026-02-14 07:12:15 | INFO | ai-vpn-firewall | Reloaded parquet: shape=(33711, 16)
2026-02-14 07:12:15 | INFO | ai-vpn-firewall | flows.parquet schema and integrity checks passed
2026-02-14 07:12:15 | INFO | ai-vpn-firewall | Saved manifest: C:\Users\scoti\PycharmProjects\ai-vpn-firewall\data\processed\vnat\flows_manifest.json


Unnamed: 0,capture_id,capture_name,row_id,flow_id,flow_key,connection_str,timestamps,sizes,directions,file_names,app,label,packet_count,packet_count_full,window_complete,min_packets_ok
0,vpn_youtube_capture2.pcap,vpn_youtube_capture2.pcap,0,vpn_youtube_capture2.pcap::0,10.123.1.2:1195-10.123.1.1:1195-p17,10.123.1.2:1195-10.123.1.1:1195-p17,"[1563289706.330096, 1563289706.330207, 1563289...","[120, 88, 120, 88, 120, 88, 120, 120, 152, 120...","[1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, ...",vpn_youtube_capture2.pcap,youtube,1,100,62283,True,True
1,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,1,nonvpn_sftp_newcapture1.pcap::1,10.113.1.2:22924-10.115.1.2:53-p17,10.113.1.2:22924-10.115.1.2:53-p17,"[1561391908.523659, 1561391908.524042]","[63, 79]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False
2,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,2,nonvpn_sftp_newcapture1.pcap::2,10.113.1.2:53065-10.115.1.2:53-p17,10.113.1.2:53065-10.115.1.2:53-p17,"[1561391908.523706, 1561391908.524059]","[63, 63]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False
3,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,3,nonvpn_sftp_newcapture1.pcap::3,10.113.1.150:39816-10.115.1.123:22-p6,10.113.1.150:39816-10.115.1.123:22-p6,"[1561391908.524836, 1561391908.525027, 1561391...","[60, 60, 52, 73, 52, 73, 52, 1378, 222, 52, 13...","[1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",nonvpn_sftp_newcapture1.pcap,sftp,0,100,2002346,True,True
4,nonvpn_sftp_newcapture1.pcap,nonvpn_sftp_newcapture1.pcap,4,nonvpn_sftp_newcapture1.pcap::4,10.115.1.2:6589-10.113.1.2:53-p17,10.115.1.2:6589-10.113.1.2:53-p17,"[1561391908.594887, 1561391908.595301]","[51, 102]","[1, 0]",nonvpn_sftp_newcapture1.pcap,sftp,0,2,2,False,False


In [28]:
# Side-by-side view of the persisted and in-memory schemas.
print("Saved columns:", sorted(reloaded.columns))
print("In-memory columns:", sorted(flows.columns))

# Set differences in both directions; both should be empty.
saved_cols = set(reloaded.columns)
memory_cols = set(flows.columns)
print("Missing in reloaded:", sorted(memory_cols - saved_cols))
print("Extra in reloaded:", sorted(saved_cols - memory_cols))

Saved columns: ['app', 'capture_id', 'capture_name', 'connection_str', 'directions', 'file_names', 'flow_id', 'flow_key', 'label', 'min_packets_ok', 'packet_count', 'packet_count_full', 'row_id', 'sizes', 'timestamps', 'window_complete']
In-memory columns: ['app', 'capture_id', 'capture_name', 'connection_str', 'directions', 'file_names', 'flow_id', 'flow_key', 'label', 'min_packets_ok', 'packet_count', 'packet_count_full', 'row_id', 'sizes', 'timestamps', 'window_complete']
Missing in reloaded: []
Extra in reloaded: []


In [29]:
# Order-insensitive repeat of the round-trip checks: same column set, same
# number of distinct captures, same row count.
assert set(reloaded.columns) == set(flows.columns)
assert reloaded["capture_id"].nunique() == flows["capture_id"].nunique()
assert len(reloaded) == len(flows)

In [30]:
# The schema consumers of flows.parquet can rely on.
EXPECTED_COLS = {
    # identifiers
    "capture_id",
    "capture_name",
    "row_id",
    "flow_id",
    "flow_key",
    "connection_str",
    # per-packet sequences
    "timestamps",
    "sizes",
    "directions",
    # provenance and targets
    "file_names",
    "app",
    "label",
    # window bookkeeping
    "packet_count",
    "packet_count_full",
    "window_complete",
    "min_packets_ok",
}

# Compare the persisted columns against the contract, reporting both directions.
actual_cols = set(reloaded.columns)
assert actual_cols == EXPECTED_COLS, (
    f"Schema mismatch. "
    f"Missing: {EXPECTED_COLS - actual_cols}, "
    f"Extra: {actual_cols - EXPECTED_COLS}"
)

# The table must not be empty in any of its key dimensions.
assert len(reloaded) > 0
assert reloaded["capture_id"].nunique() > 0
assert reloaded["flow_id"].nunique() > 0

logger.info("flows.parquet schema and integrity checks passed")

2026-02-14 07:12:15 | INFO | ai-vpn-firewall | flows.parquet schema and integrity checks passed


In [31]:
# Drop the in-memory copy; the reloaded frame is used from here on.
# NOTE(review): earlier cells that reference `flows` cannot be re-run after this
# without rebuilding the table first.
del flows

In [32]:
# Class balance as persisted (label 1 = VPN, 0 = non-VPN, per the file-name prefix).
reloaded["label"].value_counts()

label
0    33332
1      379
Name: count, dtype: int64

In [33]:
# Distribution of the untruncated packet count, split by label.
df.groupby("label")["packet_count_full"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,33332.0,678.462829,16565.137433,1.0,2.0,2.0,2.0,2002346.0
1,379.0,40867.406332,314731.45051,1.0,3.0,3.0,3.0,3842411.0


In [34]:
# Per-label share of flows whose full packet count reaches each threshold.
full_counts_by_label = df.groupby("label")["packet_count_full"]
for n in (5, 10, 20, 50, 100):
    share = full_counts_by_label.apply(lambda counts: (counts >= n).mean())
    print(n, share)

5 label
0    0.111214
1    0.218997
Name: packet_count_full, dtype: float64
10 label
0    0.106234
1    0.218997
Name: packet_count_full, dtype: float64
20 label
0    0.085203
1    0.218997
Name: packet_count_full, dtype: float64
50 label
0    0.045122
1    0.218997
Name: packet_count_full, dtype: float64
100 label
0    0.025981
1    0.218997
Name: packet_count_full, dtype: float64


In [35]:
# Two views shown as one tuple: distinct captures per label, and the 15 captures
# contributing the most flows.
df.groupby("label")["capture_id"].nunique(), df["capture_id"].value_counts().head(15)

(label
 0    83
 1    82
 Name: capture_id, dtype: int64,
 capture_id
 nonvpn_ssh_capture5.pcap         11368
 nonvpn_scp_long_capture1.pcap    10555
 nonvpn_ssh_capture3.pcap          1600
 nonvpn_vimeo_capture1.pcap        1217
 nonvpn_scp_newcapture1.pcap       1214
 nonvpn_scp_capture1.pcap          1074
 nonvpn_rsync_newcapture1.pcap     1013
 nonvpn_rsync_capture1.pcap         898
 nonvpn_sftp_newcapture1.pcap       709
 nonvpn_sftp_newcapture2.pcap       648
 nonvpn_ssh_capture1.pcap           298
 nonvpn_sftp_capture3.pcap          216
 nonvpn_ssh_capture2.pcap           175
 nonvpn_netflix_capture1.pcap       140
 nonvpn_youtube_capture1.pcap       136
 Name: count, dtype: int64)

In [36]:
# Flow counts per app among the VPN-labelled flows only.
df[df["label"]==1]["app"].value_counts().head(20)

app
voip          299
skype-chat     57
ssh             5
rsync           4
sftp            4
youtube         3
rdp             3
scp             2
netflix         1
vimeo           1
Name: count, dtype: int64

In [37]:
# Flow counts per capture among the VPN-labelled flows only.
df[df["label"]==1]["capture_id"].value_counts().head(20)

capture_id
vpn_voip_capture1.pcap           126
vpn_voip_capture3.pcap           114
vpn_voip_capture2.pcap            59
vpn_rsync_capture2.pcap            2
vpn_youtube_capture2.pcap          1
vpn_ssh_capture2.pcap              1
vpn_sftp_capture1.pcap             1
vpn_skype-chat_capture8.pcap       1
vpn_skype-chat_capture29.pcap      1
vpn_skype-chat_capture16.pcap      1
vpn_skype-chat_capture36.pcap      1
vpn_skype-chat_capture30.pcap      1
vpn_sftp_capture3.pcap             1
vpn_skype-chat_capture13.pcap      1
vpn_skype-chat_capture10.pcap      1
vpn_rsync_capture3.pcap            1
vpn_skype-chat_capture37.pcap      1
vpn_skype-chat_capture31.pcap      1
vpn_skype-chat_capture43.pcap      1
vpn_rdp_capture1.pcap              1
Name: count, dtype: int64