In [1]:
import pandas as pd

# Load the raw VNAT release and take a first look at its shape and columns.
# The path is relative to the notebook's working directory (the same layout the
# export cell in this notebook uses), so the notebook is not tied to one
# user's absolute Windows path.
path = "../data/raw/vnat/VNAT_Dataframe_release_1.h5"
df = pd.read_hdf(path)
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

df.head(3)

Shape: (33711, 5)

Columns:
['connection', 'timestamps', 'sizes', 'directions', 'file_names']


Unnamed: 0,connection,timestamps,sizes,directions,file_names
0,"(10.123.1.2, 1195, 10.123.1.1, 1195, 17)","[1563289706.330096, 1563289706.330207, 1563289...","[120, 88, 120, 88, 120, 88, 120, 120, 152, 120...","[1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, ...",vpn_youtube_capture2.pcap
0,"(10.113.1.2, 22924, 10.115.1.2, 53, 17)","[1561391908.523659, 1561391908.524042]","[63, 79]","[1, 0]",nonvpn_sftp_newcapture1.pcap
1,"(10.113.1.2, 53065, 10.115.1.2, 53, 17)","[1561391908.523706, 1561391908.524059]","[63, 63]","[1, 0]",nonvpn_sftp_newcapture1.pcap


In [2]:
# Every capture name is expected to begin with "vpn_" or "nonvpn_";
# collect the names that match neither prefix.
names = df["file_names"].astype(str)
is_vpn = names.str.startswith("vpn_")
is_nonvpn = names.str.startswith("nonvpn_")
bad = names[~is_vpn & ~is_nonvpn]
print("Bad prefix count:", len(bad))
print(bad.head(20).tolist())

Bad prefix count: 0
[]


In [3]:
import numpy as np
# Flatten every flow's direction sequence into one array, then list the
# distinct codes that occur anywhere in the dataset.
all_directions = np.concatenate(df["directions"].to_numpy())
vals = set(all_directions)
print(vals)

{np.int64(0), np.int64(1)}


**TODO (pipeline rule)**

- Sort each flow's packets by timestamp before IAT computation and before first-N packet selection. `sizes` and `directions` must be reordered with the same permutation so the per-packet arrays stay aligned. Otherwise the CNN will see negative IAT spikes and noise.
- Add an explicit preprocessing rule: packets are sorted by timestamp prior to windowing and IAT computation.

In [4]:
import numpy as np

def count_nonmonotonic(ts):
    """Count the places where a timestamp is smaller than the one before it.

    Parameters
    ----------
    ts : sequence of numbers
        Packet timestamps of one flow, in stored order.

    Returns
    -------
    numpy integer
        Number of consecutive pairs whose difference is negative
        (0 for an empty or single-element sequence).
    """
    deltas = np.diff(np.array(ts, dtype=float))
    # Equal timestamps (delta == 0) are not counted, only strict decreases.
    backward_steps = deltas < 0
    return backward_steps.sum()

# A flow is non-monotonic when at least one of its timestamps goes backwards.
backward_counts = df["timestamps"].apply(count_nonmonotonic)
nonmono = (backward_counts > 0).sum()
print("Flows with non-monotonic timestamps:", nonmono, "/", len(df))

Flows with non-monotonic timestamps: 246 / 33711


~95% of flows are short (< 50 packets; the median flow has only 2 packets!), so only ~2.8% of flows even reach 100 packets --> this affects the CNN

In [5]:
# Packets per flow = length of each flow's `sizes` list.
lens = df["sizes"].map(len)
print(lens.describe())
# Share of flows long enough / too short for a fixed packet window.
print("pct >=100:", lens.ge(100).mean())
print("pct <50:", lens.lt(50).mean())

count    3.371100e+04
mean     1.130292e+03
std      3.741679e+04
min      1.000000e+00
25%      2.000000e+00
50%      2.000000e+00
75%      2.000000e+00
max      3.842411e+06
Name: sizes, dtype: float64
pct >=100: 0.028151048619145087
pct <50: 0.9529233781258343


In [6]:
import pandas as pd

# Label each flow from its capture file name (1 = "vpn_" prefix, 0 otherwise).
cap = df["file_names"].astype(str)
labels = cap.str.startswith("vpn_").astype(int)

# Per capture: number of flows and mean label. A mean of exactly 0.0 or 1.0
# means all flows from that capture carry the same label.
per_flow = pd.DataFrame({"cap": cap, "y": labels})
by_cap = per_flow.groupby("cap")["y"].agg(["count", "mean"])
print(by_cap["mean"].value_counts().head(10))
print("captures:", by_cap.shape[0])

mean
0.0    83
1.0    82
Name: count, dtype: int64
captures: 165


In [None]:
import pandas as pd

# NOTE: this file is written by the export cell at the end of this notebook;
# on a fresh checkout that cell has to run before this check can succeed.
flows = pd.read_parquet("../data/processed/vnat/flows.parquet")

# flow_id must identify each row uniquely, so rows == unique ids and duplicates == 0.
n_rows = len(flows)
n_unique = flows["flow_id"].nunique()
print("rows:", n_rows)
print("unique flow_id:", n_unique)
print("duplicates:", n_rows - n_unique)
print(flows["flow_id"].value_counts().head(10))

rows: 33711
unique flow_id: 33711
duplicates: 0
flow_id
0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

VNAT_H5 = Path("../data/raw/vnat/VNAT_Dataframe_release_1.h5")
OUT_FLOWS = Path("../data/processed/vnat/flows.parquet")
OUT_FLOWS.parent.mkdir(parents=True, exist_ok=True)

# IMPORTANT: the index stored in the VNAT H5 file is NOT unique, so it is
# discarded and the fresh 0..n-1 positional index becomes the flow identifier.
df = pd.read_hdf(VNAT_H5).reset_index(drop=True)
df["flow_id"] = df.index.astype(int)

# Label from the capture-name prefix: 1 = "vpn_", 0 = "nonvpn_", -1 = unrecognised.
fn = df["file_names"].astype(str)
is_vpn = fn.str.startswith("vpn_").to_numpy()
is_nonvpn = fn.str.startswith("nonvpn_").to_numpy()
y = np.select([is_vpn, is_nonvpn], [1, 0], default=-1)

# Refuse to continue with unlabeled rows; show a few offending names.
unknown = y == -1
if unknown.any():
    bad = df.loc[unknown, "file_names"].head(20).tolist()
    raise ValueError(f"Unknown label prefix in file_names, examples: {bad}")

def flow_duration(ts):
    """Return the time span covered by a flow's timestamps.

    Parameters
    ----------
    ts : sequence of numbers
        Packet timestamps of one flow; they do not need to be sorted.

    Returns
    -------
    float
        Largest timestamp minus smallest timestamp, or 0.0 when the flow
        has fewer than two packets.
    """
    if len(ts) >= 2:
        stamps = np.array(ts, dtype=float)
        # max - min rather than last - first, so unsorted input still
        # yields a non-negative span.
        return float(stamps.max() - stamps.min())
    # Zero or one packet: there is no interval to measure.
    return 0.0

# Per-flow summary columns derived from the packet-level lists.
n_packets = df["sizes"].map(len)                  # number of packets in the flow
durations = df["timestamps"].map(flow_duration)

# One row per flow. Note that "pkt_len" holds the packet COUNT of the flow
# (length of its `sizes` list), not a packet size.
flows = pd.DataFrame({
    "flow_id": df["flow_id"].astype(int),
    "capture_id": fn,
    "label": y.astype(int),
    "pkt_len": n_packets.astype(int),
    "duration": durations.astype(float),
})

# Sanity checks (mandatory): nothing dropped, and flow_id is a valid key.
assert len(flows) == len(df), "Row count mismatch!"
assert flows["flow_id"].is_unique, "flow_id is NOT unique!"

flows.to_parquet(OUT_FLOWS, index=False, engine="pyarrow")

print("Saved:", OUT_FLOWS)
print("rows:", len(flows))
print("unique flow_id:", flows["flow_id"].nunique())
flows.head()

Saved: ..\data\processed\vnat\flows.parquet
rows: 33711
unique flow_id: 33711


Unnamed: 0,flow_id,capture_id,label,pkt_len,duration
0,0,vpn_youtube_capture2.pcap,1,62283,800.568697
1,1,nonvpn_sftp_newcapture1.pcap,0,2,0.000383
2,2,nonvpn_sftp_newcapture1.pcap,0,2,0.000353
3,3,nonvpn_sftp_newcapture1.pcap,0,2002346,473.359046
4,4,nonvpn_sftp_newcapture1.pcap,0,2,0.000414
