In [None]:
import re, numpy as np, pandas as pd

# Input CSV loaded with pd.read_csv further down in this cell; edit to match your environment.
RAW = "/content/UNSW_NB15_training-set.csv"   # <-- your path
# Destination of the normalised flow table written by flows.to_csv at the end of this cell.
OUT = "/content/unsw_flows_labeled.csv"

def norm(s):
    """Return a canonical form of a column header.

    The value is stringified and lower-cased, then every run of characters
    outside ``a-z0-9`` is removed, so "Attack_Cat", "attack cat" and
    "attackcat" all collapse to the same key.
    """
    lowered = str(s).lower()
    # Splitting on the separator runs and re-joining drops them entirely.
    return "".join(re.split(r"[^a-z0-9]+", lowered))

def find_one(df, *candidates):
    """Locate the first column of ``df`` matching any of ``candidates``.

    Matching is done on the ``norm``-alised form of both the column names and
    the candidates. Candidates are tried in the order given; the actual
    (un-normalised) column name is returned, or ``None`` when nothing matches.
    If several columns normalise to the same key, the last one wins.
    """
    by_key = {}
    for column in df.columns:
        by_key[norm(column)] = column

    wanted = map(norm, candidates)
    return next((by_key[key] for key in wanted if key in by_key), None)

def proto_to_str(x):
    """Render a protocol value as a string.

    Whole-number inputs (ints, integral floats, numeric strings) are looked up
    in a small protocol-number table: 6 -> "TCP", 17 -> "UDP", 1 -> "ICMP";
    other whole numbers are returned as their decimal string. Anything that
    is not a whole number (protocol names such as "udp", NaN, None,
    non-integral floats) is returned unchanged via ``str(x)``.

    Note: names that are already strings are passed through as-is, so their
    case is whatever the input used (e.g. "udp"), while mapped numbers come
    out upper-case.
    """
    try:
        number = int(x)
        # int() truncates (6.9 -> 6); only treat the value as a protocol
        # number when it really is a whole number.
        if float(x) != number:
            return str(x)
    except (TypeError, ValueError, OverflowError):
        # Not numeric at all (e.g. "udp", None, NaN, inf): keep the text form.
        return str(x)
    return {6: "TCP", 17: "UDP", 1: "ICMP"}.get(number, str(number))

# Common service→port guesses (extend as needed).
# Keys are lower-case service names; services missing from this table end up
# with dst_port 0 where the table is used as a fallback for a missing port column.
SERVICE_PORT = {
    "http":80, "https":443, "dns":53, "ssh":22, "telnet":23, "smtp":25, "imap":143, "pop3":110,
    "ftp":21, "ftp-data":20, "mysql":3306, "microsoft-ds":445, "smb":445, "rdp":3389, "ntp":123,
    "snmp":161, "ldap":389, "kerberos":88, "dhcp":67, "dhcpv6":546, "irc":6667, "redis":6379,
    "mongodb":27017, "postgresql":5432, "sql":1433, "smtp-ssl":465, "imap-ssl":993, "pop3-ssl":995,
    "sip":5060, "rtsp":554, "ftp-control":21, "http-alt":8080, "ssl":443,
    # 'radius' appears among the dataset's services but previously had no mapping.
    "radius":1812,
}

def _to_port(col):
    """Coerce a raw port column to int; missing or non-numeric entries become 0.

    A plain ``astype(int)`` raises on placeholder or otherwise non-numeric
    strings, so values are parsed with ``errors="coerce"`` first.
    """
    return pd.to_numeric(col, errors="coerce").fillna(0).astype(int)


def _binary_label(value):
    """Map a binary label value to "Normal" or "Attack".

    "normal"/"benign" (any case) and anything numerically equal to zero
    ("0", 0, 0.0, "0.0") count as Normal; everything else, including missing
    values, is Attack.
    """
    text = str(value).strip().lower()
    if text in {"0", "normal", "benign"}:
        return "Normal"
    try:
        # Covers labels parsed as floats, whose string form is "0.0".
        return "Normal" if float(text) == 0 else "Attack"
    except ValueError:
        return "Attack"


df_raw = pd.read_csv(RAW)
print("Total cols:", len(df_raw.columns))

# Find core UNSW fields (each lookup tolerates a few header spellings)
dur    = find_one(df_raw, "dur", "duration")
spkts  = find_one(df_raw, "spkts", "src_pkts", "s_pkts")
dpkts  = find_one(df_raw, "dpkts", "dst_pkts", "d_pkts")
sbytes = find_one(df_raw, "sbytes", "src_bytes", "s_bytes")
dbytes = find_one(df_raw, "dbytes", "dst_bytes", "d_bytes")
sport  = find_one(df_raw, "sport", "src_port", "s_port")
dport  = find_one(df_raw, "dport", "dst_port", "d_port")
proto  = find_one(df_raw, "proto", "protocol")
service= find_one(df_raw, "service")

# Ports and service are optional; the rest are required to build the features.
needed = [dur, spkts, dpkts, sbytes, dbytes, proto]
if any(x is None for x in needed):
    raise SystemExit("Missing core UNSW columns (dur/spkts/dpkts/sbytes/dbytes/proto). Share df_raw.columns if unsure.")

flows = pd.DataFrame()
# Clip to a tiny positive floor so the rate computation never divides by zero.
flows["duration"] = df_raw[dur].astype(float).clip(lower=1e-6)
flows["packets"]  = df_raw[spkts].fillna(0).astype(float) + df_raw[dpkts].fillna(0).astype(float)
flows["bytes"]    = df_raw[sbytes].fillna(0).astype(float) + df_raw[dbytes].fillna(0).astype(float)
# Zero-packet flows would divide by zero; treat their bytes-per-packet as 0.
flows["bytes_per_packet"]   = (flows["bytes"] / flows["packets"].replace(0, np.nan)).fillna(0)
flows["packets_per_second"] = (flows["packets"] / flows["duration"]).replace([np.inf, -np.inf], 0).fillna(0)

# Fields not present in tabular UNSW → 0
for c in ["iat_mean","iat_std","tcp_syn","tcp_ack","tcp_rst","tcp_fin"]:
    flows[c] = 0

# Protocol as string
flows["protocol"] = df_raw[proto].apply(proto_to_str)

# Ports: prefer real columns; otherwise infer dst_port from service, set src_port=0
if sport is not None:
    flows["src_port"] = _to_port(df_raw[sport])
else:
    flows["src_port"] = 0

if dport is not None:
    flows["dst_port"] = _to_port(df_raw[dport])
else:
    if service is not None:
        svc = df_raw[service].astype(str).str.lower().str.strip()
        # Services without an entry in SERVICE_PORT map to NaN and become 0.
        flows["dst_port"] = svc.map(SERVICE_PORT).fillna(0).astype(int)
        print("Info: inferred dst_port from 'service'. Unique services:", sorted(svc.unique())[:25], "...")
    else:
        flows["dst_port"] = 0
        print("Warning: no dport/service — dst_port set to 0.")

# Labels: prefer the attack category column; fall back to the binary label.
attack_cat = find_one(df_raw, "attack_cat", "attackcat")
label_num  = find_one(df_raw, "label")
if attack_cat is not None:
    # Missing or blank categories are treated as Normal traffic.
    flows["label"] = df_raw[attack_cat].fillna("Normal").astype(str).str.strip().replace({"": "Normal"})
elif label_num is not None:
    flows["label"] = df_raw[label_num].apply(_binary_label)
else:
    raise SystemExit("Couldn’t find 'attack_cat' or 'label' for targets.")

print(flows.head(3))
flows.to_csv(OUT, index=False)
print(" Wrote:", OUT, "rows:", len(flows))

Total cols: 45
Info: inferred dst_port from 'service'. Unique services: ['-', 'dhcp', 'dns', 'ftp', 'ftp-data', 'http', 'irc', 'pop3', 'radius', 'smtp', 'snmp', 'ssh', 'ssl'] ...
   duration  packets   bytes  bytes_per_packet  packets_per_second  iat_mean  \
0  0.000011      2.0   496.0             248.0       181818.181818         0   
1  0.000008      2.0  1762.0             881.0       250000.000000         0   
2  0.000005      2.0  1068.0             534.0       400000.000000         0   

   iat_std  tcp_syn  tcp_ack  tcp_rst  tcp_fin protocol  src_port  dst_port  \
0        0        0        0        0        0      udp         0         0   
1        0        0        0        0        0      udp         0         0   
2        0        0        0        0        0      udp         0         0   

    label  
0  Normal  
1  Normal  
2  Normal  
 Wrote: /content/unsw_flows_labeled.csv rows: 82332


In [None]:
# Run the external trainmodel.py script on the CSV written by the previous cell.
# Flag semantics are defined by trainmodel.py (not shown in this notebook);
# the captured log below lists the artifacts this run saved.
!python trainmodel.py \
  --train_csv /content/unsw_flows_labeled.csv \
  --out_dir artifacts --basename unsw --fast

[INFO] FAST mode: fitting baseline RF…
[OK] Saved preprocessor → /content/artifacts/unsw_preprocessor.pkl
[OK] Saved classifier   → /content/artifacts/unsw_clf.pkl
[OK] Saved feature importances → /content/artifacts/feature_importances.csv
[OK] Saved metrics → /content/artifacts/metrics.json
[OK] Saved confusion matrix → /content/artifacts/confusion_matrix.csv
[INFO] Fitting IsolationForest on 29600 Normal training samples…
[OK] Saved IsolationForest → /content/artifacts/unsw_iso.pkl
[OK] Saved iso meta       → /content/artifacts/unsw_iso_meta.json
[DONE] All artifacts & metrics saved in: /content/artifacts
