In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# ---- configuration: inputs, outputs, constants ----
# Inputs: the raw VNAT dataframe and the per-flow metadata table.
VNAT_H5 = Path("../data/raw/vnat/VNAT_Dataframe_release_1.h5")
FLOWS = Path("../data/processed/vnat/flows.parquet")

# Outputs: the feature table and the list of feature column names.
OUT_DIR = Path("../data/processed/vnat")
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_XGB = OUT_DIR / "xgb_features.parquet"
OUT_COLS = Path("../artifacts/features/feature_columns.json")
OUT_COLS.parent.mkdir(parents=True, exist_ok=True)

N = 100      # per-flow window: only the first N packets (by timestamp) are used
EPS = 1e-9   # added to denominators to avoid division by zero

# ---- load VNAT and align indices to flow_id ----
# reset_index(drop=True) is critical: the feature loop looks packets up with
# df.at[flow_id, ...], so df must be labelled 0..len(df)-1.
df = pd.read_hdf(VNAT_H5).reset_index(drop=True)
flows = pd.read_parquet(FLOWS).set_index("flow_id").sort_index()

assert len(df) == len(flows), f"Mismatch df({len(df)}) vs flows({len(flows)})"

# Equal lengths alone do not guarantee alignment: a duplicated or
# non-contiguous flow_id would silently pair flows with the wrong packets.
assert flows.index.is_unique, "flow_id must be unique in flows"
assert np.array_equal(flows.index.to_numpy(), np.arange(len(df))), (
    "flow_id must be exactly 0..len(df)-1 to match df's positional index"
)

# ---- helpers ----
def safe_stats(x: np.ndarray, prefix: str):
    """Summarise ``x`` as a dict of seven ``{prefix}_<stat>`` floats.

    The statistics, in key order, are mean, std, min, max, median, q25 and
    q75. An empty array yields 0.0 for every statistic so callers never
    receive NaN from an empty reduction.
    """
    stat_names = ("mean", "std", "min", "max", "median", "q25", "q75")

    if x.size == 0:
        stat_values = (0.0,) * len(stat_names)
    else:
        stat_values = (
            np.mean(x),
            np.std(x),
            np.min(x),
            np.max(x),
            np.median(x),
            np.quantile(x, 0.25),
            np.quantile(x, 0.75),
        )

    # Cast to plain Python floats so the rows hold no numpy scalar types.
    return {
        f"{prefix}_{name}": float(value)
        for name, value in zip(stat_names, stat_values)
    }

def size_hist_fracs(abs_sizes: np.ndarray, pkt_count: int):
    """Return the fraction of packets falling in each of six size bins.

    The bins are half-open intervals with lower edges 0, 100, 300, 600, 900
    and 1200; the last bin is unbounded above. Half-open intervals leave no
    gaps, so a non-integer size such as 99.5 is still counted (in bin 1).
    For integer sizes the result equals that of the inclusive integer
    ranges 0-99, 100-299, ..., >=1200.

    Parameters
    ----------
    abs_sizes : array of non-negative packet sizes.
    pkt_count : denominator for the fractions; values below 1 are treated
        as 1 so an empty flow yields 0.0 everywhere instead of dividing by 0.

    Returns
    -------
    dict mapping ``size_bin_frac_1`` .. ``size_bin_frac_6`` to floats.
    """
    lower_edges = (0, 100, 300, 600, 900, 1200)
    upper_edges = lower_edges[1:] + (None,)  # None marks the open-ended bin
    denom = max(pkt_count, 1)

    out = {}
    for i, (lo, hi) in enumerate(zip(lower_edges, upper_edges), start=1):
        in_bin = abs_sizes >= lo
        if hi is not None:
            in_bin &= abs_sizes < hi
        out[f"size_bin_frac_{i}"] = int(np.sum(in_bin)) / denom
    return out

def iat_from_times(t: np.ndarray):
    """Return the inter-arrival times (consecutive differences) of ``t``.

    Fewer than two timestamps yield an empty float64 array. Negative
    differences, which occur only when ``t`` is not sorted ascending, are
    replaced by zero. The input array is not modified.
    """
    if t.size >= 2:
        gaps = np.diff(t)  # fresh array, safe to edit in place
        went_backwards = gaps < 0
        gaps[went_backwards] = 0.0
        return gaps
    return np.empty(0, dtype=np.float64)

# ---- build features ----
def _empty_flow_row(flow_id):
    """Return the default feature row for a flow that has no packets.

    Counts, sums, rates, fractions and all summary statistics are zero; the
    two up/down ratios are 1.0, which is what the (x + 1) / (y + 1)
    smoothing used for non-empty flows gives when both counts are zero.
    Keys appear in the same order as in rows built by ``_flow_feature_row``.
    """
    no_values = np.array([], dtype=np.float64)
    row = {
        "flow_id": int(flow_id),
        "pkt_count_total": 0,
        "byte_count_total": 0.0,
        "flow_duration": 0.0,
        "pkt_rate": 0.0,
        "byte_rate": 0.0,
        "pkt_count_up": 0,
        "pkt_count_down": 0,
        "byte_count_up": 0.0,
        "byte_count_down": 0.0,
        "pkt_ratio_up_down": 1.0,
        "byte_ratio_up_down": 1.0,
        "up_fraction_bytes": 0.0,
        "down_fraction_bytes": 0.0,
        "cv_pkt_size": 0.0,
        "cv_iat": 0.0,
        # NOTE: 0.0 here, whereas a non-empty flow without any inter-arrival
        # time evaluates to (0 + EPS) / (0 + EPS) = 1.0.
        "max_iat_over_median_iat": 0.0,
        "pkt_count_observed": 0,
        "window_complete": 0,
    }
    for prefix in ("pkt_size_all", "pkt_size_up", "pkt_size_down",
                   "iat_all", "iat_up", "iat_down"):
        row.update(safe_stats(no_values, prefix))
    row.update({f"size_bin_frac_{i}": 0.0 for i in range(1, 7)})
    return row


def _flow_feature_row(flow_id, ts, sz, dr):
    """Compute the feature row for one flow from its per-packet arrays.

    Parameters
    ----------
    flow_id : flow identifier, stored in the row as ``int(flow_id)``.
    ts, sz, dr : 1-D arrays of packet timestamps, sizes and directions.
        Direction 1 is counted as "up" and direction 0 as "down".

    Packets are sorted by timestamp and only the first ``N`` are used.

    Raises
    ------
    ValueError
        If a non-empty flow has per-packet arrays of differing lengths
        (indexing them with a shared sort order would otherwise silently
        drop packets or fail with an opaque IndexError).
    """
    if ts.size == 0:
        return _empty_flow_row(flow_id)
    if not (ts.size == sz.size == dr.size):
        raise ValueError(
            f"flow {flow_id}: per-packet arrays differ in length "
            f"(timestamps={ts.size}, sizes={sz.size}, directions={dr.size})"
        )

    # sort by time (matches CNN)
    order = np.argsort(ts)
    ts, sz, dr = ts[order], sz[order], dr[order]

    # keep only the first N packets of the flow
    k = int(min(sz.size, N))
    ts, sz, dr = ts[:k], sz[:k], dr[:k]

    up_mask = dr == 1
    down_mask = dr == 0
    sz_up, sz_down = sz[up_mask], sz[down_mask]

    size_all_stats = safe_stats(sz, "pkt_size_all")
    size_up_stats = safe_stats(sz_up, "pkt_size_up")
    size_down_stats = safe_stats(sz_down, "pkt_size_down")

    iat_all_stats = safe_stats(iat_from_times(ts), "iat_all")
    iat_up_stats = safe_stats(iat_from_times(ts[up_mask]), "iat_up")
    iat_down_stats = safe_stats(iat_from_times(ts[down_mask]), "iat_down")

    duration = float(ts[-1] - ts[0]) if ts.size >= 2 else 0.0
    pkt_count_total = k
    byte_count_total = float(np.sum(sz))

    pkt_count_up = int(sz_up.size)
    pkt_count_down = int(sz_down.size)
    byte_count_up = float(np.sum(sz_up))      # sum of an empty array is 0.0
    byte_count_down = float(np.sum(sz_down))

    row = {
        "flow_id": int(flow_id),

        "pkt_count_total": pkt_count_total,
        "byte_count_total": byte_count_total,
        "flow_duration": duration,
        # EPS keeps the rates finite for zero-duration flows
        "pkt_rate": pkt_count_total / (duration + EPS),
        "byte_rate": byte_count_total / (duration + EPS),

        "pkt_count_up": pkt_count_up,
        "pkt_count_down": pkt_count_down,
        "byte_count_up": byte_count_up,
        "byte_count_down": byte_count_down,
        # +1 smoothing keeps the ratios defined when one direction is absent
        "pkt_ratio_up_down": (pkt_count_up + 1) / (pkt_count_down + 1),
        "byte_ratio_up_down": (byte_count_up + 1.0) / (byte_count_down + 1.0),
        "up_fraction_bytes": byte_count_up / (byte_count_total + EPS),
        "down_fraction_bytes": byte_count_down / (byte_count_total + EPS),

        "cv_pkt_size": float(
            size_all_stats["pkt_size_all_std"] / (size_all_stats["pkt_size_all_mean"] + EPS)
        ),
        "cv_iat": float(
            iat_all_stats["iat_all_std"] / (iat_all_stats["iat_all_mean"] + EPS)
        ),
        "max_iat_over_median_iat": float(
            (iat_all_stats["iat_all_max"] + EPS) / (iat_all_stats["iat_all_median"] + EPS)
        ),

        "pkt_count_observed": pkt_count_total,
        "window_complete": 1 if pkt_count_total == N else 0,
    }

    # Update order fixes the column order of the output table.
    for stats in (size_all_stats, size_up_stats, size_down_stats,
                  iat_all_stats, iat_up_stats, iat_down_stats):
        row.update(stats)
    row.update(size_hist_fracs(np.abs(sz), pkt_count_total))
    return row


rows = []
for flow_id in flows.index.to_numpy():
    rows.append(
        _flow_feature_row(
            flow_id,
            np.asarray(df.at[flow_id, "timestamps"], dtype=np.float64),
            np.asarray(df.at[flow_id, "sizes"], dtype=np.float64),
            np.asarray(df.at[flow_id, "directions"], dtype=np.int64),
        )
    )

xgb_df = pd.DataFrame(rows).sort_values("flow_id").reset_index(drop=True)

# attach capture_id + label so the features file is self-contained
meta = flows.reset_index()[["flow_id", "capture_id", "label"]]
# validate="one_to_one" makes a duplicated flow_id raise instead of silently
# multiplying feature rows.
xgb_df = xgb_df.merge(meta, on="flow_id", how="left", validate="one_to_one")

assert not xgb_df["label"].isna().any(), "label missing after merge"
assert not xgb_df["capture_id"].isna().any(), "capture_id missing after merge"

# optional: log transforms for heavy tails (helps generalization)
# NOTE: these columns keep their names but hold log1p-transformed values in
# the saved file; ratios and fractions above were computed from raw values.
for col in ["byte_count_total", "byte_count_up", "byte_count_down", "pkt_rate", "byte_rate", "flow_duration"]:
    xgb_df[col] = np.log1p(xgb_df[col].astype(float))

# Save
xgb_df.to_parquet(OUT_XGB, index=False, engine="pyarrow")

# Save ONLY feature columns (exclude meta)
feature_cols = [c for c in xgb_df.columns if c not in ["flow_id", "capture_id", "label"]]
OUT_COLS.write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

print("Saved:", OUT_XGB)
print("Rows:", len(xgb_df), "Cols:", len(xgb_df.columns))
print("Feature columns saved:", OUT_COLS)
print("Example feature cols:", feature_cols[:10])
xgb_df.head()

Saved: ..\data\processed\vnat\xgb_features.parquet
Rows: 33711 Cols: 69
Feature columns saved: ..\artifacts\features\feature_columns.json
Example feature cols: ['pkt_count_total', 'byte_count_total', 'flow_duration', 'pkt_rate', 'byte_rate', 'pkt_count_up', 'pkt_count_down', 'byte_count_up', 'byte_count_down', 'pkt_ratio_up_down']


Unnamed: 0,flow_id,pkt_count_total,byte_count_total,flow_duration,pkt_rate,byte_rate,pkt_count_up,pkt_count_down,byte_count_up,byte_count_down,...,iat_down_q25,iat_down_q75,size_bin_frac_1,size_bin_frac_2,size_bin_frac_3,size_bin_frac_4,size_bin_frac_5,size_bin_frac_6,capture_id,label
0,0,100,11.120431,2.46797,2.328308,8.741171,39,61,8.934982,11.001183,...,4.1e-05,0.000519,0.03,0.52,0.01,0.05,0.01,0.38,vpn_youtube_capture2.pcap,1
1,1,2,4.962845,0.000383,8.561072,12.823563,1,1,4.158883,4.382027,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nonvpn_sftp_newcapture1.pcap,0
2,2,2,4.844187,0.000353,8.642086,12.785047,1,1,4.158883,4.158883,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,nonvpn_sftp_newcapture1.pcap,0
3,3,100,10.84285,0.284168,5.721195,11.955581,57,43,10.71295,8.737773,...,0.000163,0.002826,0.33,0.3,0.05,0.01,0.01,0.3,nonvpn_sftp_newcapture1.pcap,0
4,4,2,5.036953,0.000414,8.483251,12.820337,1,1,3.951244,4.634729,...,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,nonvpn_sftp_newcapture1.pcap,0


In [2]:
import pandas as pd

# Sanity check: reload the saved feature table and report its shape,
# whether any value is missing, and the first few column names.
xgb = pd.read_parquet("../data/processed/vnat/xgb_features.parquet")

has_missing = xgb.isna().any().any()
leading_columns = xgb.columns[:12].tolist()

print(xgb.shape)
print("Any NaNs:", has_missing)
print("Example columns:", leading_columns)

(33711, 69)
Any NaNs: False
Example columns: ['flow_id', 'pkt_count_total', 'byte_count_total', 'flow_duration', 'pkt_rate', 'byte_rate', 'pkt_count_up', 'pkt_count_down', 'byte_count_up', 'byte_count_down', 'pkt_ratio_up_down', 'byte_ratio_up_down']
