In [3]:
import os
import glob
import pandas as pd

# ==== USER SETTINGS ====
INPUT_DIR = r"D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023"   # folder where all your CSVs are
OUTPUT_FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"
CHUNK_SIZE  = 500_000   # adjust if you want smaller/larger memory footprint
# =======================

os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# 1) Find CSVs
csv_files = sorted(glob.glob(os.path.join(INPUT_DIR, "**", "*.csv"), recursive=True))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found under: {INPUT_DIR}")
print(f"Found {len(csv_files)} CSV files.")

# 2) Build the union of columns (without loading full data)
union_cols = []
seen = set()
for fp in csv_files:
    try:
        hdr = pd.read_csv(fp, nrows=0, low_memory=False)
        cols = [c.strip() for c in hdr.columns]
        for c in cols:
            if c not in seen:
                seen.add(c)
                union_cols.append(c)
    except Exception as e:
        print(f"[warn] Skipping unreadable header: {fp} -> {e}")

if not union_cols:
    raise RuntimeError("Could not read any headers to determine columns.")

print(f"Total columns in union: {len(union_cols)}")

# 3) Stream through files chunk-by-chunk and append to one CSV
wrote_header = False
rows_written = 0

for idx, fp in enumerate(csv_files, 1):
    print(f"[{idx}/{len(csv_files)}] Processing: {fp}")
    try:
        # Use chunksize to avoid loading whole file
        for chunk in pd.read_csv(fp, chunksize=CHUNK_SIZE, low_memory=False):
            # Normalize column names
            chunk.columns = [c.strip() for c in chunk.columns]
            # Reindex to union columns (missing -> NaN); extra cols are dropped
            chunk = chunk.reindex(columns=union_cols)
            # Append to output
            chunk.to_csv(OUTPUT_FILE, mode='a', index=False, header=not wrote_header)
            wrote_header = True
            rows_written += len(chunk)
    except Exception as e:
        print(f"[warn] Skipping file due to read error: {fp} -> {e}")

print(f"\n[OK] Wrote {rows_written} rows to {OUTPUT_FILE}")
print("Done (streaming merge, memory-safe).")


Found 169 CSV files.
Total columns in union: 47
[1/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[2/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[3/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[4/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[5/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[6/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
[7/169] Processing: D:/PARA/projects/FYP/IDS2/CIC_dataset/wataiData/csv/CICIoT2023\part-00006-363d1ba3-8ab5-4f96

In [4]:
import pandas as pd

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"

# 1) Peek first few rows
df_head = pd.read_csv(FILE, nrows=5)
print("HEAD:")
print(df_head)

# 2) Peek column names
cols = list(df_head.columns)
print("\nColumns:", cols)
print("\nTotal columns:", len(cols))

# 3) Peek dtypes (from first 5 rows only)
print("\nDtypes (quick):")
print(df_head.dtypes)


HEAD:
   flow_duration  Header_Length  Protocol Type  Duration         Rate  \
0       0.000000          54.00           6.00     64.00     0.329807   
1       0.000000          57.04           6.33     64.00     4.290556   
2       0.000000           0.00           1.00     64.00    33.396799   
3       0.328175       76175.00          17.00     64.00  4642.133010   
4       0.117320         101.73           6.11     65.91     6.202211   

         Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  ...  \
0     0.329807    0.0              1.0              0.0              1.0  ...   
1     4.290556    0.0              0.0              0.0              0.0  ...   
2    33.396799    0.0              0.0              0.0              0.0  ...   
3  4642.133010    0.0              0.0              0.0              0.0  ...   
4     6.202211    0.0              0.0              1.0              0.0  ...   

         Std  Tot size           IAT  Number   Magnitue     Radius  

In [5]:
possible_labels = {"label","Label","attack","Attack","class","Class","target","Target","y",
                   "Attack_type","attack_type","subcategory","Category","category"}

label_col = None
for c in cols:
    if c in possible_labels:
        label_col = c

print("Detected label column:", label_col)


Detected label column: label


In [6]:
import pandas as pd
from collections import Counter

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"
CHUNK = 500_000

label_counts = Counter()

for chunk in pd.read_csv(FILE, usecols=["label"], chunksize=CHUNK):
    chunk["label"] = chunk["label"].astype(str)
    label_counts.update(chunk["label"])

print("\n=== CLASS DISTRIBUTION ===")
for k, v in label_counts.most_common():
    print(f"{k}: {v}")



=== CLASS DISTRIBUTION ===
DDoS-ICMP_Flood: 7200504
DDoS-UDP_Flood: 5412287
DDoS-TCP_Flood: 4497667
DDoS-PSHACK_Flood: 4094755
DDoS-SYN_Flood: 4059190
DDoS-RSTFINFlood: 4045285
DDoS-SynonymousIP_Flood: 3598138
DoS-UDP_Flood: 3318595
DoS-TCP_Flood: 2671445
DoS-SYN_Flood: 2028834
BenignTraffic: 1098195
Mirai-greeth_flood: 991866
Mirai-udpplain: 890576
Mirai-greip_flood: 751682
DDoS-ICMP_Fragmentation: 452489
MITM-ArpSpoofing: 307593
DDoS-UDP_Fragmentation: 286925
DDoS-ACK_Fragmentation: 285104
DNS_Spoofing: 178911
Recon-HostDiscovery: 134378
Recon-OSScan: 98259
Recon-PortScan: 82284
DoS-HTTP_Flood: 71864
VulnerabilityScan: 37382
DDoS-HTTP_Flood: 28790
DDoS-SlowLoris: 23426
DictionaryBruteForce: 13064
BrowserHijacking: 5859
CommandInjection: 5409
SqlInjection: 5245
XSS: 3846
Backdoor_Malware: 3218
Recon-PingSweep: 2262
Uploading_Attack: 1252


In [7]:
FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"

# Only read a small sample (no memory issues)
df = pd.read_csv(FILE, nrows=5000, low_memory=False)

# Show columns
print("\n=== COLUMN LIST ===")
print(df.columns.tolist())
print("\nTotal columns:", len(df.columns))

# Show dtypes
print("\n=== DTYPES ===")
print(df.dtypes)

# Show first 5 rows
print("\n=== SAMPLE ROWS ===")
print(df.head())


=== COLUMN LIST ===
['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label']

Total columns: 47

=== DTYPES ===
flow_duration      float64
Header_Length      float64
Protocol Type      float64
Duration           float64
Rate               float64
Srate              float64
Drate              float64
fin_flag_number    float64
syn_flag_number    float64
rst_flag_number    float64
psh_flag_number    float64
ack_flag_number    float64
ece_flag_number    float64
cwr_flag_number    float64
ack_count          float64
syn_count

In [8]:
import pandas as pd
import numpy as np

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"

df = pd.read_csv(FILE, nrows=200000)  # sample safely
df = df.drop(columns=['label'])

# replace inf with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# calculate number of unique values per column
uniq = df.nunique()

print("=== UNIQUE VALUES PER COLUMN (top problematic) ===")
print(uniq.sort_values().head(15))

# calculate variance
variance = df.var(numeric_only=True)

print("\n=== LOW VARIANCE COLUMNS ===")
print(variance.sort_values().head(15))


=== UNIQUE VALUES PER COLUMN (top problematic) ===
ece_flag_number    1
cwr_flag_number    1
SMTP               1
Telnet             1
IRC                1
DHCP               1
fin_flag_number    2
syn_flag_number    2
DNS                2
HTTP               2
TCP                2
HTTPS              2
rst_flag_number    2
ack_flag_number    2
psh_flag_number    2
dtype: int64

=== LOW VARIANCE COLUMNS ===
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
SMTP               0.000000e+00
Telnet             0.000000e+00
IRC                0.000000e+00
DHCP               0.000000e+00
Drate              1.191480e-07
SSH                4.999775e-05
ARP                6.999545e-05
IPv                1.199862e-04
LLC                1.199862e-04
DNS                1.799685e-04
HTTP               4.660374e-02
HTTPS              5.209539e-02
Variance           5.450901e-02
dtype: float64


In [10]:
df_full = pd.read_csv(FILE, nrows=200000, low_memory=False)

# drop the 6 columns
df_full = df_full.drop(columns=drop_cols, errors='ignore')

# handle inf → nan
df_full = df_full.replace([np.inf, -np.inf], np.nan)

# fill missing values
df_full = df_full.fillna(0)

print(df_full.head())
print("Columns AFTER cleaning:", len(df_full.columns))


   flow_duration  Header_Length  Protocol Type  Duration         Rate  \
0       0.000000          54.00           6.00     64.00     0.329807   
1       0.000000          57.04           6.33     64.00     4.290556   
2       0.000000           0.00           1.00     64.00    33.396799   
3       0.328175       76175.00          17.00     64.00  4642.133010   
4       0.117320         101.73           6.11     65.91     6.202211   

         Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  ...  \
0     0.329807    0.0              1.0              0.0              1.0  ...   
1     4.290556    0.0              0.0              0.0              0.0  ...   
2    33.396799    0.0              0.0              0.0              0.0  ...   
3  4642.133010    0.0              0.0              0.0              0.0  ...   
4     6.202211    0.0              0.0              1.0              0.0  ...   

         Std  Tot size           IAT  Number   Magnitue     Radius  \
0   

In [14]:
import pandas as pd
import numpy as np
import os

# === paths ===
FILE     = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_raw.csv"
OUT_FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz"  # final output

# === settings ===
CHUNK = 500_000
LABEL_COL = "label"
BENIGN = "BenignTraffic"

# columns to drop
drop_cols = [
    'ece_flag_number',
    'cwr_flag_number',
    'SMTP',
    'Telnet',
    'IRC',
    'DHCP'
]

os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)

wrote_header = False
total_rows = 0
benign_ct = 0
attack_ct = 0
kept_cols = None

for i, chunk in enumerate(pd.read_csv(FILE, chunksize=CHUNK, low_memory=False), 1):

    # 1) drop 6 useless columns
    chunk = chunk.drop(columns=drop_cols, errors='ignore')

    # 2) ensure label exists
    if LABEL_COL not in chunk.columns:
        raise ValueError(f"'{LABEL_COL}' missing in chunk {i}.")

    # 3) clean values
    chunk = chunk.replace([np.inf, -np.inf], np.nan).fillna(0)

    # 4) create binary label
    chunk[LABEL_COL] = chunk[LABEL_COL].astype(str)
    chunk["label_bin"] = (chunk[LABEL_COL] != BENIGN).astype("int8")

    # 5) convert numeric columns → float32 (safe, reduces space)
    numeric_cols = chunk.select_dtypes(include=[np.number]).columns.tolist()
    if "label_bin" in numeric_cols:
        numeric_cols.remove("label_bin")
    chunk[numeric_cols] = chunk[numeric_cols].astype("float32")

    # 6) lock column order on first chunk
    if kept_cols is None:
        kept_cols = list(chunk.columns)

    out = chunk.reindex(columns=kept_cols)

    # 7) stats
    vc = out["label_bin"].value_counts()
    benign_ct += int(vc.get(0, 0))
    attack_ct += int(vc.get(1, 0))
    total_rows += len(out)

    # 8) write final CSV (compressed)
    out.to_csv(
        OUT_FILE,
        index=False,
        header=not wrote_header,
        mode='a',
        compression='gzip'
    )
    wrote_header = True

print("\n[OK] Final cleaned dataset created.")
print(f"Saved to: {OUT_FILE}")
print(f"Total rows: {total_rows:,}")
print(f"Benign (0): {benign_ct:,}   |   Attack (1): {attack_ct:,}")
print(f"Total columns written: {len(kept_cols)}")



[OK] Final cleaned dataset created.
Saved to: D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz
Total rows: 46,686,579
Benign (0): 1,098,195   |   Attack (1): 45,588,384
Total columns written: 42


In [15]:


FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz"

df = pd.read_csv(FILE, nrows=5)
print(df.columns.tolist())
print("Total columns:", len(df.columns))
print(df.head())


['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label', 'label_bin']
Total columns: 42
   flow_duration  Header_Length  Protocol Type  Duration         Rate  \
0       0.000000          54.00           6.00     64.00     0.329807   
1       0.000000          57.04           6.33     64.00     4.290556   
2       0.000000           0.00           1.00     64.00    33.396800   
3       0.328175       76175.00          17.00     64.00  4642.133000   
4       0.117320         101.73           6.11     65.91     6.202211   

         Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_numbe

In [16]:
print(df.dtypes)


flow_duration      float64
Header_Length      float64
Protocol Type      float64
Duration           float64
Rate               float64
Srate              float64
Drate              float64
fin_flag_number    float64
syn_flag_number    float64
rst_flag_number    float64
psh_flag_number    float64
ack_flag_number    float64
ack_count          float64
syn_count          float64
fin_count          float64
urg_count          float64
rst_count          float64
HTTP               float64
HTTPS              float64
DNS                float64
SSH                float64
TCP                float64
UDP                float64
ARP                float64
ICMP               float64
IPv                float64
LLC                float64
Tot sum            float64
Min                float64
Max                float64
AVG                float64
Std                float64
Tot size           float64
IAT                float64
Number             float64
Magnitue           float64
Radius             float64
C

In [17]:
from collections import Counter

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz"
CHUNK = 500000

# --- CHECK NULL AND INF ---
null_found = False
inf_found = False

for chunk in pd.read_csv(FILE, chunksize=CHUNK, low_memory=False):
    if chunk.isna().any().any():
        null_found = True
        break
    if np.isinf(chunk.select_dtypes(include=[float, int])).any().any():
        inf_found = True
        break

print("\nNull present:", null_found)
print("Inf present:", inf_found)

# --- CHECK LABEL_BIN UNIQUE ---
df = pd.read_csv(FILE, nrows=10)
print("\nUnique label_bin:", df["label_bin"].unique())

# --- CLASS COUNTS ---
counter = Counter()
for chunk in pd.read_csv(FILE, usecols=["label_bin"], chunksize=CHUNK):
    counter.update(chunk["label_bin"])

print("\nBinary class counts:", counter)



Null present: False
Inf present: False

Unique label_bin: [1]

Binary class counts: Counter({1: 45588384, 0: 1098195})


In [18]:

from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz"
CHUNK = 500_000
SEED  = 42
rng = np.random.RandomState(SEED)

# ---- config you approved ----
BENIGN_NAME = "BenignTraffic"
BENIGN_CAP  = 1_000_000
ATTACK_CAP  = 100_000
MIN_KEEP    = 500  # drop any attack class with < 500 samples

# explicit drop-list (your list)
DROP_ATTACKS = {
    "BrowserHijacking","CommandInjection","SqlInjection","XSS",
    "Backdoor_Malware","Recon-PingSweep","Uploading_Attack"
}

# ========== PASS 1: count per attack label + benign ==========
label_counts = Counter()
for chunk in pd.read_csv(FILE, usecols=["label"], chunksize=CHUNK, low_memory=False):
    chunk["label"] = chunk["label"].astype(str)
    label_counts.update(chunk["label"])

benign_total = label_counts.get(BENIGN_NAME, 0)
# Build quotas
attack_labels = [lab for lab,cnt in label_counts.items()
                 if lab != BENIGN_NAME]

quotas = {}  # per-label target to collect
dropped_labels = set()

for lab in attack_labels:
    cnt = label_counts[lab]
    if lab in DROP_ATTACKS or cnt < MIN_KEEP:
        dropped_labels.add(lab)
        continue
    quotas[lab] = min(cnt, ATTACK_CAP)

benign_quota = min(benign_total, BENIGN_CAP)

print("Summary:")
print(f"  Benign total: {benign_total:,} -> cap {benign_quota:,}")
kept_attack_total_target = sum(quotas.values())
print(f"  Attack labels kept: {len(quotas)} (target rows {kept_attack_total_target:,})")
print(f"  Dropped labels: {len(dropped_labels)} → {sorted(list(dropped_labels))[:10]}{' ...' if len(dropped_labels)>10 else ''}")

# ========== PASS 2: collect rows to meet quotas ==========
frames = []
kept_per_label = defaultdict(int)
kept_benign = 0

for chunk in pd.read_csv(FILE, chunksize=CHUNK, low_memory=False):
    chunk["label"] = chunk["label"].astype(str)

    # Benign
    if kept_benign < benign_quota:
        ben = chunk[chunk["label"] == BENIGN_NAME]
        need_b = benign_quota - kept_benign
        if not ben.empty and need_b > 0:
            take_b = ben.iloc[:min(len(ben), need_b)]
            frames.append(take_b)
            kept_benign += len(take_b)

    # Attacks (kept labels only)
    att = chunk[chunk["label"] != BENIGN_NAME]
    if not att.empty:
        # filter out dropped labels
        att = att[att["label"].isin(quotas.keys())]
        if not att.empty:
            # iterate by label to respect per-class caps
            for lab, sub in att.groupby("label"):
                need = quotas[lab] - kept_per_label[lab]
                if need <= 0:
                    continue
                take = sub.iloc[:min(len(sub), need)]
                if len(take):
                    frames.append(take)
                    kept_per_label[lab] += len(take)

    # stop early if all quotas satisfied
    if kept_benign >= benign_quota and all(kept_per_label[l] >= quotas[l] for l in quotas):
        break

# Concatenate
df_bal = pd.concat(frames, ignore_index=True)
# Shuffle once
df_bal = df_bal.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(f"\nCollected: {len(df_bal):,} rows "
      f"(Benign {kept_benign:,} + Attacks {sum(kept_per_label.values()):,})")
print("Top kept attacks (label: kept/target):",
      ", ".join([f"{lab}:{kept_per_label[lab]}/{quotas[lab]}" for lab in list(quotas)[:10]]), "...")

# ========== Build X/y ==========
feature_cols = [c for c in df_bal.columns if c not in ("label","label_bin")]
X = df_bal[feature_cols].astype("float32").values
y = df_bal["label_bin"].astype("int8").values

# ========== Stratified 70/15/15 split ==========
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, stratify=y_tmp, random_state=SEED
)

print("\nShapes:",
      "\n  X_train", X_train.shape, " y_train", y_train.shape,
      "\n  X_val  ", X_val.shape,   " y_val  ", y_val.shape,
      "\n  X_test ", X_test.shape,  " y_test ", y_test.shape)

# ========== Fit scaler on TRAIN ONLY ==========
scaler = StandardScaler()
scaler.fit(X_train)                 # train-only
X_train = scaler.transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# ========== Save tensors (single compact file) ==========
OUT_NPZ = r"D:/PARA/projects/FYP/IDS2/magnum_opus/binary_balanced_tensors.npz"
np.savez_compressed(
    OUT_NPZ,
    X_train=X_train, y_train=y_train,
    X_val=X_val,     y_val=y_val,
    X_test=X_test,   y_test=y_test,
    feature_names=np.array(feature_cols, dtype=object),
    kept_per_label=np.array(sorted([(k, kept_per_label[k]) for k in kept_per_label], key=lambda x:x[0]), dtype=object),
    dropped_labels=np.array(sorted(list(dropped_labels)), dtype=object),
    seed=np.array([SEED])
)
print(f"\n[OK] Saved -> {OUT_NPZ}")


Summary:
  Benign total: 1,098,195 -> cap 1,000,000
  Attack labels kept: 26 (target rows 2,255,069)
  Dropped labels: 7 → ['Backdoor_Malware', 'BrowserHijacking', 'CommandInjection', 'Recon-PingSweep', 'SqlInjection', 'Uploading_Attack', 'XSS']

Collected: 3,255,069 rows (Benign 1,000,000 + Attacks 2,255,069)
Top kept attacks (label: kept/target): DDoS-RSTFINFlood:100000/100000, DoS-TCP_Flood:100000/100000, DDoS-ICMP_Flood:100000/100000, DoS-UDP_Flood:100000/100000, DoS-SYN_Flood:100000/100000, Mirai-greeth_flood:100000/100000, DDoS-SynonymousIP_Flood:100000/100000, Mirai-udpplain:100000/100000, DDoS-SYN_Flood:100000/100000, DDoS-PSHACK_Flood:100000/100000 ...

Shapes: 
  X_train (2278548, 40)  y_train (2278548,) 
  X_val   (488260, 40)  y_val   (488260,) 
  X_test  (488261, 40)  y_test  (488261,)

[OK] Saved -> D:/PARA/projects/FYP/IDS2/magnum_opus/binary_balanced_tensors.npz


In [20]:

from sklearn.preprocessing import StandardScaler

# Ensure float32 for memory + DL frameworks
X_train = X_train.astype("float32", copy=False)
X_val   = X_val.astype("float32",   copy=False)
X_test  = X_test.astype("float32",  copy=False)

# 1) Fit scaler on TRAIN only (no leakage)
scaler = StandardScaler()
scaler.fit(X_train)

# 2) Transform all splits
X_train = scaler.transform(X_train).astype("float32", copy=False)
X_val   = scaler.transform(X_val).astype("float32",   copy=False)
X_test  = scaler.transform(X_test).astype("float32",  copy=False)

# 3) (Optional but recommended) Save one compact file for reuse
OUT_NPZ = r"D:/PARA/projects/FYP/IDS2/magnum_opus/binary_balanced_tensors.npz"
np.savez_compressed(
    OUT_NPZ,
    X_train=X_train, y_train=y_train.astype("int8"),
    X_val=X_val,     y_val=y_val.astype("int8"),
    X_test=X_test,   y_test=y_test.astype("int8"),
)
print("[OK] Tensors saved:", OUT_NPZ,
      "\n  X_train", X_train.shape, " y_train", y_train.shape,
      "\n  X_val  ", X_val.shape,   " y_val  ", y_val.shape,
      "\n  X_test ", X_test.shape,  " y_test ", y_test.shape)


[OK] Tensors saved: D:/PARA/projects/FYP/IDS2/magnum_opus/binary_balanced_tensors.npz 
  X_train (2278548, 40)  y_train (2278548,) 
  X_val   (488260, 40)  y_val   (488260,) 
  X_test  (488261, 40)  y_test  (488261,)


In [35]:
# KronNet binary training on prebuilt tensors
# - architecture: 40 -> 64 -> 32 -> (4 -> 2x2 -> left-mult 2x2) -> head(2)
# - loss: 0.7 * Focal(gamma=2) + 0.3 * CrossEntropy, with Class-Balanced (CB) weights
# - sampler: WeightedRandomSampler from original y_train
# - scheduler: ReduceLROnPlateau on val macro-F1 (factor 0.5, patience 2, min_lr 1e-5)
# - early stop: patience 8; fail-fast: <0.70 by epoch 8 abort; target-stop: >=0.85

import os, json, math, random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# ------------------- config -------------------
NPZ_PATH   = r"D:/PARA/projects/FYP/IDS2/magnum_opus/binary_balanced_tensors.npz"
OUT_DIR    = r"D:/PARA/projects/FYP/IDS2/magnum_opus"
SEED       = 42
BATCH_SIZE = 4096
LR         = 1e-3
WD         = 1e-5
MAX_EPOCHS = 50
PATIENCE   = 8
GAMMA_FOCAL= 2.0
CB_BETA    = 0.999    # effective number of samples
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(OUT_DIR, exist_ok=True)

# ------------------- utils -------------------
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False  # allow speed
    torch.backends.cudnn.benchmark = True

def class_balanced_weights(y, beta=CB_BETA, num_classes=2):
    # y: numpy array of ints in [0, C)
    counts = np.bincount(y.astype(np.int64), minlength=num_classes)
    eff_num = 1.0 - np.power(beta, counts)
    w = (1.0 - beta) / np.maximum(eff_num, 1e-12)
    w = w / np.sum(w) * num_classes  # normalize to mean ~1
    return w.astype(np.float32), counts

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction
    def forward(self, logits, target):
        # target: (N,) int64; logits: (N,C)
        ce = F.cross_entropy(logits, target, weight=self.weight, reduction='none')
        pt = torch.softmax(logits, dim=1).gather(1, target.view(-1,1)).squeeze(1)
        loss = ((1 - pt) ** self.gamma) * ce
        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss

# ------------------- model -------------------
class KronNetBinary(nn.Module):
    def __init__(self, in_dim, p_drop=0.05):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.map4 = nn.Linear(32, 4)     # -> reshape to (2,2)
        # learnable left 2x2 multiply
        self.L = nn.Parameter(torch.eye(2, dtype=torch.float32))
        self.head = nn.Linear(4, 2)
        self.drop = nn.Dropout(p_drop)

        # init
        for m in [self.fc1, self.fc2, self.map4, self.head]:
            nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5))
            if m.bias is not None: nn.init.zeros_(m.bias)
        with torch.no_grad():
            self.L.copy_(torch.tensor([[1.0,0.0],[0.0,1.0]]))

    def forward(self, x):
        x = self.drop(F.relu(self.fc1(x)))
        x = self.drop(F.relu(self.fc2(x)))
        x = self.map4(x)           # (N,4)
        M = x.view(-1, 2, 2)       # (N,2,2)
        L = self.L                 # (2,2)
        M2 = torch.matmul(L, M)    # left multiply -> (2,2)
        z = M2.view(-1, 4)         # (N,4)
        out = self.head(z)         # (N,2)
        return out

# ------------------- data -------------------
set_seed(SEED)
data = np.load(NPZ_PATH, allow_pickle=False)
X_train, y_train = data["X_train"], data["y_train"]
X_val,   y_val   = data["X_val"],   data["y_val"]
X_test,  y_test  = data["X_test"],  data["y_test"]
in_dim = X_train.shape[1]

# tensors
Xtr = torch.from_numpy(X_train).to(torch.float32)
ytr = torch.from_numpy(y_train).to(torch.long)
Xv  = torch.from_numpy(X_val).to(torch.float32)
yv  = torch.from_numpy(y_val).to(torch.long)
Xte = torch.from_numpy(X_test).to(torch.float32)
yte = torch.from_numpy(y_test).to(torch.long)

# sampler from original y_train (no oversampled labels)
# inverse-frequency weights per-sample
counts = np.bincount(y_train, minlength=2)
w_inv = 1.0 / np.maximum(counts, 1)
sample_w = torch.where(ytr==0, torch.tensor(w_inv[0]), torch.tensor(w_inv[1])).float()
sampler = WeightedRandomSampler(sample_w, num_samples=len(sample_w), replacement=True)

train_loader = DataLoader(TensorDataset(Xtr, ytr), batch_size=BATCH_SIZE, sampler=sampler, num_workers=0, pin_memory=(DEVICE=="cuda"))
val_loader   = DataLoader(TensorDataset(Xv,  yv ), batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=(DEVICE=="cuda"))
test_loader  = DataLoader(TensorDataset(Xte, yte), batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=(DEVICE=="cuda"))

# ------------------- loss / opt / sched -------------------
cb_weights_np, cls_counts = class_balanced_weights(y_train, beta=CB_BETA, num_classes=2)
cb_weights = torch.tensor(cb_weights_np, dtype=torch.float32, device=DEVICE)

criterion_focal = FocalLoss(gamma=GAMMA_FOCAL, weight=cb_weights)
criterion_ce    = nn.CrossEntropyLoss(weight=cb_weights)

model = KronNetBinary(in_dim).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", factor=0.5, patience=2, min_lr=1e-5, verbose=True)

# ------------------- train/eval helpers -------------------
@torch.no_grad()
def eval_loader(loader, mdl):
    mdl.eval()
    all_p, all_t = [], []
    for xb, yb in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)
        logits = mdl(xb)
        preds = torch.argmax(logits, dim=1)
        all_p.append(preds.cpu().numpy())
        all_t.append(yb.cpu().numpy())
    y_pred = np.concatenate(all_p)
    y_true = np.concatenate(all_t)
    acc   = accuracy_score(y_true, y_pred)
    f1_ma = f1_score(y_true, y_pred, average="macro", zero_division=0)
    f1_wt = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    f1_pc = f1_score(y_true, y_pred, average=None, zero_division=0)
    cm    = confusion_matrix(y_true, y_pred, labels=[0,1])
    return {"acc":acc, "f1_macro":f1_ma, "f1_weighted":f1_wt, "f1_per_class":f1_pc.tolist(), "cm":cm.tolist()}

def train_epoch(loader, mdl, opt):
    mdl.train()
    total = 0; loss_sum = 0.0
    for xb, yb in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)
        opt.zero_grad(set_to_none=True)
        logits = mdl(xb)
        # hybrid loss
        lf = criterion_focal(logits, yb)
        lce= criterion_ce(logits, yb)
        loss = 0.7*lf + 0.3*lce
        loss.backward()
        opt.step()
        loss_sum += loss.item() * xb.size(0)
        total += xb.size(0)
    return loss_sum / max(total,1)

# ------------------- training loop -------------------
best_f1 = -1.0
best_state = None
epochs_no_improve = 0

print("Class counts (train):", cls_counts.tolist())
for epoch in range(1, MAX_EPOCHS+1):
    tr_loss = train_epoch(train_loader, model, opt)
    val_metrics = eval_loader(val_loader, model)
    sched.step(val_metrics["f1_macro"])

    curr_f1 = val_metrics["f1_macro"]
    print(f"[Epoch {epoch:02d}] loss={tr_loss:.4f}  val_f1_macro={curr_f1:.4f}  acc={val_metrics['acc']:.4f}")

    # target reached?
    if curr_f1 >= 0.85:
        best_f1 = curr_f1
        best_state = {k:v.cpu() for k,v in model.state_dict().items()}
        print("Target reached (val macro-F1 ≥ 0.85). Stopping.")
        break

    # track best
    if curr_f1 > best_f1:
        best_f1 = curr_f1
        best_state = {k:v.cpu() for k,v in model.state_dict().items()}
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # fail-fast guard
    if epoch == 8 and best_f1 < 0.70:
        print("Fail-fast: best val macro-F1 < 0.70 by epoch 8. Aborting to revisit data/config.")
        break

    # early stopping
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping (no improve {PATIENCE} epochs).")
        break

# ------------------- save + final test -------------------
if best_state is not None:
    model.load_state_dict(best_state)
torch.save(model.state_dict(), os.path.join(OUT_DIR, "model_kronnet_binary.pt"))

val_report = eval_loader(val_loader, model)
test_report= eval_loader(test_loader, model)

metrics = {
    "seed": SEED,
    "device": DEVICE,
    "best_val_macroF1": best_f1,
    "val": val_report,
    "test": test_report,
    "train_counts": cls_counts.tolist(),
}
with open(os.path.join(OUT_DIR, "metrics_binary.json"), "w") as f:
    json.dump(metrics, f, indent=2)

print("\n[Done]")
print("Best val macro-F1:", best_f1)
print("Test metrics:", json.dumps(test_report, indent=2))
print("Saved:", os.path.join(OUT_DIR, "model_kronnet_binary.pt"),
      "and", os.path.join(OUT_DIR, "metrics_binary.json"))




Class counts (train): [700000, 1578548]
[Epoch 01] loss=0.1168  val_f1_macro=0.9132  acc=0.9220
Target reached (val macro-F1 ≥ 0.85). Stopping.

[Done]
Best val macro-F1: 0.9131790043404009
Test metrics: {
  "acc": 0.9226356395452432,
  "f1_macro": 0.913821147299156,
  "f1_weighted": 0.9244480652881837,
  "f1_per_class": [
    0.8862598913606418,
    0.9413824032376702
  ],
  "cm": [
    [
      147167,
      2833
    ],
    [
      34941,
      303320
    ]
  ]
}
Saved: D:/PARA/projects/FYP/IDS2/magnum_opus\model_kronnet_binary.pt and D:/PARA/projects/FYP/IDS2/magnum_opus\metrics_binary.json


In [25]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import numpy as np
import torch

# model already loaded as "model"
model.eval()

# we recompute the test predictions
y_true = []
y_pred = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        logits = model(xb)
        preds = torch.argmax(logits, dim=1)

        y_true.append(yb.cpu().numpy())
        y_pred.append(preds.cpu().numpy())

y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Macro-F1:", f1_score(y_true, y_pred, average="macro"))
print("Weighted-F1:", f1_score(y_true, y_pred, average="weighted"))
print("Per-class F1:", f1_score(y_true, y_pred, average=None))
print("\nConfusion matrix:\n", confusion_matrix(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))


Accuracy: 0.937269206428529
Macro-F1: 0.9292021575158557
Weighted-F1: 0.938416740416477
Per-class F1: [0.90530384 0.95310048]

Confusion matrix:
 [[146408   3592]
 [ 27037 311224]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8441    0.9761    0.9053    150000
           1     0.9886    0.9201    0.9531    338261

    accuracy                         0.9373    488261
   macro avg     0.9164    0.9481    0.9292    488261
weighted avg     0.9442    0.9373    0.9384    488261



In [27]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import torch

model.eval()

train_true = []
train_pred = []

with torch.no_grad():
    for xb, yb in train_loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        logits = model(xb)
        preds = torch.argmax(logits, dim=1)

        train_true.append(yb.cpu().numpy())
        train_pred.append(preds.cpu().numpy())

train_true = np.concatenate(train_true)
train_pred = np.concatenate(train_pred)

print("=== TRAIN METRICS ===")
print("Accuracy:", accuracy_score(train_true, train_pred))
print("Macro-F1:", f1_score(train_true, train_pred, average="macro"))
print("Weighted-F1:", f1_score(train_true, train_pred, average="weighted"))
print("Per-class F1:", f1_score(train_true, train_pred, average=None))
print("Confusion matrix:\n", confusion_matrix(train_true, train_pred))


=== TRAIN METRICS ===
Accuracy: 0.9477294311991672
Macro-F1: 0.9476884373735617
Weighted-F1: 0.9476878306765111
Per-class F1: [0.94915283 0.94622404]
Confusion matrix:
 [[1111616   27186]
 [  91915 1047831]]


In [28]:
correct = 0
for i in range(100):
    idx = np.random.randint(0, len(X_test))
    sample = X_test[idx].reshape(1,-1)
    true = y_test[idx]

    with torch.no_grad():
        out = model(torch.from_numpy(sample).to(DEVICE))
    pred = torch.argmax(out,1).item()

    print(f"Sample {i}: True={true} Pred={pred}")
    if pred == true: correct += 1

print("Correct out of 100:", correct)


Sample 0: True=1 Pred=1
Sample 1: True=0 Pred=0
Sample 2: True=1 Pred=1
Sample 3: True=0 Pred=0
Sample 4: True=0 Pred=0
Sample 5: True=0 Pred=0
Sample 6: True=1 Pred=1
Sample 7: True=1 Pred=1
Sample 8: True=1 Pred=0
Sample 9: True=1 Pred=1
Sample 10: True=0 Pred=0
Sample 11: True=0 Pred=0
Sample 12: True=1 Pred=0
Sample 13: True=1 Pred=1
Sample 14: True=1 Pred=1
Sample 15: True=1 Pred=1
Sample 16: True=0 Pred=0
Sample 17: True=1 Pred=1
Sample 18: True=1 Pred=1
Sample 19: True=0 Pred=0
Sample 20: True=1 Pred=1
Sample 21: True=1 Pred=1
Sample 22: True=1 Pred=1
Sample 23: True=1 Pred=1
Sample 24: True=0 Pred=0
Sample 25: True=0 Pred=0
Sample 26: True=1 Pred=1
Sample 27: True=1 Pred=1
Sample 28: True=1 Pred=1
Sample 29: True=1 Pred=1
Sample 30: True=0 Pred=0
Sample 31: True=1 Pred=1
Sample 32: True=1 Pred=1
Sample 33: True=0 Pred=0
Sample 34: True=1 Pred=1
Sample 35: True=1 Pred=0
Sample 36: True=1 Pred=1
Sample 37: True=0 Pred=0
Sample 38: True=0 Pred=0
Sample 39: True=1 Pred=1
Sample 40:

In [36]:
# Build confusion matrices & F1 plots and create a clean deployment bundle
# Uses your exact filenames from the folder screenshot.

import os, json, shutil, datetime, math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

ROOT = r"D:/PARA/projects/FYP/IDS2/magnum_opus"

# Files exactly as in your folder
NPZ_PATH   = os.path.join(ROOT, "binary_balanced_tensors.npz")
MODEL_PT_5 = os.path.join(ROOT, "model_kronnet_binary_ep5.pt")  # may or may not exist
MODEL_PT_1 = os.path.join(ROOT, "model_kronnet_binary.pt")      # exists per screenshot
SCALER_PKL = os.path.join(ROOT, "scaler.pkl")                   # optional

# Choose best available checkpoint (prefer 5-epoch)
MODEL_PT = MODEL_PT_5 if os.path.exists(MODEL_PT_5) else MODEL_PT_1
print("[info] Using model:", os.path.basename(MODEL_PT))

BUNDLE  = os.path.join(ROOT, "deployment_bundle")
FIG_DIR = os.path.join(BUNDLE, "figures")
os.makedirs(FIG_DIR, exist_ok=True)

# ---- Load tensors ----
data = np.load(NPZ_PATH, allow_pickle=False)
X_train, y_train = data["X_train"], data["y_train"]
X_val,   y_val   = data["X_val"],   data["y_val"]
X_test,  y_test  = data["X_test"],  data["y_test"]

in_dim = X_train.shape[1]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---- KronNet (same shape as training) ----
class KronNetBinary(nn.Module):
    def __init__(self, in_dim, p_drop=0.05):
        super().__init__()
        self.fc1  = nn.Linear(in_dim, 64)
        self.fc2  = nn.Linear(64, 32)
        self.map4 = nn.Linear(32, 4)   # -> (2,2)
        self.L    = nn.Parameter(torch.eye(2))
        self.head = nn.Linear(4, 2)
        self.drop = nn.Dropout(p_drop)
        for m in [self.fc1, self.fc2, self.map4, self.head]:
            nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5))
            if m.bias is not None: nn.init.zeros_(m.bias)
    def forward(self, x):
        x = self.drop(torch.relu(self.fc1(x)))
        x = self.drop(torch.relu(self.fc2(x)))
        x = self.map4(x).view(-1,2,2)
        x = torch.matmul(self.L, x).view(-1,4)
        return self.head(x)

model = KronNetBinary(in_dim).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PT, map_location=DEVICE))
model.eval()

# ---- Eval helpers ----
def eval_numpy(X, y, batch=8192):
    y_true, y_pred = [], []
    with torch.no_grad():
        for i in range(0, len(X), batch):
            xb = torch.from_numpy(X[i:i+batch]).to(torch.float32).to(DEVICE)
            pred = torch.argmax(model(xb), dim=1).cpu().numpy()
            y_pred.append(pred); y_true.append(y[i:i+batch])
    y_true = np.concatenate(y_true); y_pred = np.concatenate(y_pred)
    return {
        "acc": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "f1_weighted": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "f1_per_class": f1_score(y_true, y_pred, average=None, zero_division=0).tolist(),
        "cm": confusion_matrix(y_true, y_pred, labels=[0,1]).tolist()
    }

def plot_cm(cm, title, path):
    cm = np.array(cm)
    fig, ax = plt.subplots(figsize=(4.8,4.2), dpi=140)
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_title(title); ax.set_xlabel('Predicted'); ax.set_ylabel('True')
    ax.set_xticks([0,1]); ax.set_yticks([0,1])
    ax.set_xticklabels(['Benign','Attack']); ax.set_yticklabels(['Benign','Attack'])
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, f"{cm[i,j]:,}", ha="center", va="center")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    plt.tight_layout(); fig.savefig(path, bbox_inches="tight"); plt.close(fig)

def plot_f1(train_f1, test_f1, path):
    labels = ['Benign','Attack']; x = np.arange(len(labels)); w = 0.35
    fig, ax = plt.subplots(figsize=(5.2,3.8), dpi=140)
    ax.bar(x - w/2, train_f1, w, label='Train'); ax.bar(x + w/2, test_f1, w, label='Test')
    ax.set_ylim(0.0, 1.0); ax.set_xticks(x); ax.set_xticklabels(labels)
    ax.set_ylabel('F1-score'); ax.set_title('Per-class F1: Train vs Test'); ax.legend()
    plt.tight_layout(); fig.savefig(path, bbox_inches="tight"); plt.close(fig)

# ---- Compute metrics ----
train_rep = eval_numpy(X_train, y_train)
val_rep   = eval_numpy(X_val,   y_val)
test_rep  = eval_numpy(X_test,  y_test)

# ---- Save figures ----
plot_cm(train_rep["cm"], "Confusion Matrix (Train)", os.path.join(FIG_DIR, "cm_train.png"))
plot_cm(test_rep["cm"],  "Confusion Matrix (Test)",  os.path.join(FIG_DIR, "cm_test.png"))
plot_f1(train_rep["f1_per_class"], test_rep["f1_per_class"], os.path.join(FIG_DIR, "f1_train_vs_test.png"))

# ---- Write fresh metrics JSON into bundle ----
metrics_json_path = os.path.join(BUNDLE, "metrics_binary.json")
with open(metrics_json_path, "w") as f:
    json.dump({
        "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
        "model_file_used": os.path.basename(MODEL_PT),
        "train": train_rep,
        "val":   val_rep,
        "test":  test_rep
    }, f, indent=2)

# ---- Copy artifacts ----
shutil.copy2(MODEL_PT, BUNDLE)
if os.path.exists(SCALER_PKL):
    shutil.copy2(SCALER_PKL, BUNDLE)
# If you also want to include tensors (large), uncomment:
# shutil.copy2(NPZ_PATH, BUNDLE)

# ---- Minimal manifest ----
manifest = {
    "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
    "model_file": os.path.basename(MODEL_PT),
    "metrics_file": os.path.basename(metrics_json_path),
    "scaler_file": os.path.basename(SCALER_PKL) if os.path.exists(SCALER_PKL) else None,
    "figures": {
        "cm_train": "figures/cm_train.png",
        "cm_test":  "figures/cm_test.png",
        "f1_bars":  "figures/f1_train_vs_test.png"
    },
    "input_dim": int(in_dim),
    "class_names": ["Benign","Attack"],
    "notes": "KronNet binary IDS (64→32→4→2). Check metrics_binary.json for scores."
}
with open(os.path.join(BUNDLE, "manifest.json"), "w") as f:
    json.dump(manifest, f, indent=2)

print("\n[OK] Bundle ready at:", BUNDLE)
for root, _, files in os.walk(BUNDLE):
    for name in files:
        print(os.path.relpath(os.path.join(root, name), BUNDLE))


[info] Using model: model_kronnet_binary.pt

[OK] Bundle ready at: D:/PARA/projects/FYP/IDS2/magnum_opus\deployment_bundle
manifest.json
metrics_binary.json
model_kronnet_binary.pt
figures\cm_test.png
figures\cm_train.png
figures\f1_train_vs_test.png


  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",


In [38]:
import pandas as pd

df = pd.read_csv("D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz", nrows=5)
print(df.columns.tolist())


['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label', 'label_bin']


In [40]:
# Rebuild StandardScaler (train-only) from combined_clean_binary.csv, memory-safe
# - reproduces the same balanced selection you used for tensors
# - assigns rows into 70/15/15 per-class quotas in a streaming manner
# - fits scaler ONLY on TRAIN rows via Welford running stats
# - saves scaler.pkl for deployment

import os
import math
import json
import joblib
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.preprocessing import StandardScaler

ROOT = r"D:/PARA/projects/FYP/IDS2/magnum_opus"
SRC  = os.path.join(ROOT, "combined_clean_binary.csv.gz")   # your file as in screenshot
OUT  = os.path.join(ROOT, "scaler.pkl")

SEED = 42
np.random.seed(SEED)

# Feature columns in correct order (your list)
FEATURE_COLS = [
 'flow_duration','Header_Length','Protocol Type','Duration','Rate','Srate','Drate',
 'fin_flag_number','syn_flag_number','rst_flag_number','psh_flag_number','ack_flag_number',
 'ack_count','syn_count','fin_count','urg_count','rst_count','HTTP','HTTPS','DNS',
 'SSH','TCP','UDP','ARP','ICMP','IPv','LLC','Tot sum','Min','Max','AVG','Std',
 'Tot size','IAT','Number','Magnitue','Radius','Covariance','Variance','Weight'
]
TARGET_COL  = "label"
BIN_COL     = "label_bin"

# Balance settings you used
BENIGN_NAME = "BenignTraffic"
BENIGN_CAP  = 1_000_000
ATTACK_CAP  = 100_000
MIN_KEEP    = 500
DROP_ATTACKS = {
    "BrowserHijacking","CommandInjection","SqlInjection","XSS",
    "Backdoor_Malware","Recon-PingSweep","Uploading_Attack"
}

CHUNK = 500_000

# ---------- PASS 1: count labels to compute quotas ----------
label_counts = Counter()
for chunk in pd.read_csv(SRC, usecols=[TARGET_COL], chunksize=CHUNK, low_memory=False):
    label_counts.update(chunk[TARGET_COL].astype(str))

benign_total = label_counts.get(BENIGN_NAME, 0)
attack_labels = [k for k in label_counts if k != BENIGN_NAME]

quotas_total = {}  # total rows to keep per label (after caps/drops)
for lab in attack_labels:
    cnt = label_counts[lab]
    if lab in DROP_ATTACKS or cnt < MIN_KEEP:
        continue
    quotas_total[lab] = min(cnt, ATTACK_CAP)

if benign_total > 0:
    quotas_total[BENIGN_NAME] = min(benign_total, BENIGN_CAP)

# split quotas: 70/15/15 per label
def split_quota(n):
    tr = int(round(n * 0.70))
    va = int(round(n * 0.15))
    te = n - tr - va
    return tr, va, te

split_need = {}  # label -> {"train": int, "val": int, "test": int}
for lab, n in quotas_total.items():
    tr, va, te = split_quota(n)
    split_need[lab] = {"train": tr, "val": va, "test": te}

# ---------- Welford running stats for TRAIN ONLY ----------
D = len(FEATURE_COLS)
count = 0
mean  = np.zeros(D, dtype=np.float64)
M2    = np.zeros(D, dtype=np.float64)

def update_welford(Xbatch):
    global count, mean, M2
    # Xbatch: (B, D) float
    for x in Xbatch:
        count += 1
        delta = x - mean
        mean += delta / count
        delta2 = x - mean
        M2 += delta * delta2

# ---------- PASS 2: stream rows, assign to splits, update scaler stats on TRAIN ----------
kept_counters = defaultdict(lambda: {"train":0, "val":0, "test":0})
total_seen = 0

for chunk in pd.read_csv(SRC, chunksize=CHUNK, low_memory=False):
    # enforce dtypes
    sub = chunk[[*FEATURE_COLS, TARGET_COL, BIN_COL]].copy()
    sub[TARGET_COL] = sub[TARGET_COL].astype(str)

    # filter to labels we actually keep
    mask_keep = sub[TARGET_COL].isin(quotas_total.keys())
    if not mask_keep.any():
        continue
    sub = sub[mask_keep]

    # iterate by label for simple quota assignment
    for lab, grp in sub.groupby(TARGET_COL):
        need = split_need.get(lab, None)
        if not need:
            continue
        # compute remaining per split
        rem_tr = need["train"] - kept_counters[lab]["train"]
        rem_va = need["val"]   - kept_counters[lab]["val"]
        rem_te = need["test"]  - kept_counters[lab]["test"]
        if rem_tr <= 0 and rem_va <= 0 and rem_te <= 0:
            continue

        # take as many as needed for each split in order: train -> val -> test
        take_tr = max(0, min(rem_tr, len(grp)))
        if take_tr > 0:
            X_tr = grp.iloc[:take_tr][FEATURE_COLS].to_numpy(np.float64, copy=False)
            update_welford(X_tr)
            kept_counters[lab]["train"] += take_tr
            grp = grp.iloc[take_tr:]

        # (we skip collecting val/test rows; we only need train rows for scaler)

        # early exit if all quotas for this label are done
        # Note: speeds up overall
        # (Remaining val/test quotas will be satisfied by later chunks, but not needed for scaler.)

    total_seen += len(sub)

# finalize stats
if count == 0:
    raise RuntimeError("No TRAIN rows were processed for scaler fitting. Check quotas/file paths.")

var = M2 / max(count - 1, 1)
scale = np.sqrt(np.maximum(var, 1e-12))

scaler = StandardScaler()
# setattr the learned parameters explicitly
scaler.mean_ = mean.astype(np.float64)
scaler.var_  = var.astype(np.float64)
scaler.scale_= scale.astype(np.float64)
scaler.n_samples_seen_ = np.array([count], dtype=np.int64)

# tiny sanity test on a small batch
test_batch = pd.read_csv(SRC, nrows=10, low_memory=False)[FEATURE_COLS].to_numpy(np.float64)
_ = scaler.transform(test_batch)  # should run without error and keep shape

joblib.dump(scaler, OUT)
print(f"[OK] scaler.pkl saved to: {OUT}")
print(f"TRAIN samples used for stats: {count:,}")
print("Per-label train rows used (top few):")
print({k: v['train'] for k, v in list(kept_counters.items())[:5]})


[OK] scaler.pkl saved to: D:/PARA/projects/FYP/IDS2/magnum_opus\scaler.pkl
TRAIN samples used for stats: 2,278,548
Per-label train rows used (top few):
{'BenignTraffic': 700000, 'DDoS-ACK_Fragmentation': 70000, 'DDoS-HTTP_Flood': 20153, 'DDoS-ICMP_Flood': 70000, 'DDoS-ICMP_Fragmentation': 70000}


In [8]:
import pandas as pd

FILE = r"D:/PARA/projects/FYP/IDS2/magnum_opus/combined_clean_binary.csv.gz"

# Read only header (no data loaded)
cols = pd.read_csv(FILE, nrows=0).columns.tolist()

print("All columns:")
print(cols)

print("\nFeature columns (X):")
feature_cols = [c for c in cols if c not in ["label", "label_bin"]]
print(feature_cols)

print("\nNumber of features:", len(feature_cols))



All columns:
['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label', 'label_bin']

Feature columns (X):
['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight']

Number of features: 40
