In [8]:
# ========= Mount & IO paths =========
import os, pathlib, json, numpy as np, pandas as pd
if "COLAB_RELEASE_TAG" in os.environ or os.path.exists("/content"):
    try:
        from google.colab import drive
        if not os.path.exists("/content/drive"):
            drive.mount("/content/drive")
    except Exception as e:
        print("Colab mount skipped:", e)

DATA_DIR = "/content/drive/MyDrive/INCS870/data" if os.path.exists("/content/drive") else "data"
RESULTS  = "/content/drive/MyDrive/INCS870/results" if os.path.exists("/content/drive") else "results"
pathlib.Path(f"{RESULTS}/logs").mkdir(parents=True, exist_ok=True)
pathlib.Path(f"{RESULTS}/figures").mkdir(parents=True, exist_ok=True)
pathlib.Path(f"{RESULTS}/tables").mkdir(parents=True, exist_ok=True)

with open(f"{DATA_DIR}/ciciot2023_split_meta.json","r") as f:
    meta = json.load(f)
class_names = meta["class_names"]
OBJECTIVE   = "multiclass" if meta["task_mode"]=="multiclass" else "binary"
NUM_CLASS   = len(class_names) if OBJECTIVE=="multiclass" else None

Xtr = pd.read_parquet(f"{DATA_DIR}/train.parquet"); ytr = Xtr.pop("label").values
Xva = pd.read_parquet(f"{DATA_DIR}/val.parquet");   yva = Xva.pop("label").values
Xte = pd.read_parquet(f"{DATA_DIR}/test.parquet");  yte = Xte.pop("label").values

M_RAW = Xtr.shape[1]; print("M_raw =", M_RAW, "| shapes:", Xtr.shape, Xva.shape, Xte.shape)


M_raw = 39 | shapes: (10073978, 39) (2158123, 39) (2158079, 39)


In [14]:
# ========= Train function =========
# !pip -q install lightgbm scikit-learn matplotlib pyarrow
import time, json, matplotlib.pyplot as plt, lightgbm as lgb
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_fscore_support

def per_class_recall(y_true, y_pred):
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None, zero_division=0)
    return {int(i): float(r[i]) for i in range(len(r))}

def train_lgbm(Xtr, ytr, Xva, yva, *, use_goss=False, enable_efb=False, top_rate=0.1, other_rate=0.1):
    params = dict(
        objective=OBJECTIVE,
        num_class=NUM_CLASS if OBJECTIVE=="multiclass" else None,
        metric=["multi_logloss"] if OBJECTIVE=="multiclass" else ["auc","binary_logloss"],
        boosting_type="goss" if use_goss else "gbdt",
        learning_rate=0.05,
        num_leaves=64,
        min_data_in_leaf=50,
        max_bin=255,
        lambda_l2=1.0,
        feature_fraction=1.0,
        bagging_fraction=1.0,      # GOSS 下保持 1.0
        enable_bundle=enable_efb,  # ← EFB 开关
        force_col_wise=True,
        deterministic=True,
        seed=42,
        verbose=-1,
        num_threads=-1,
        two_round=True,
        bin_construct_sample_cnt=200000,
    )
    if use_goss:
        params.update(dict(top_rate=top_rate, other_rate=other_rate))

    dtr = lgb.Dataset(Xtr, ytr, free_raw_data=False)
    dva = lgb.Dataset(Xva, yva, reference=dtr, free_raw_data=False)

    # ==== 用回调实现早停 & 打印日志（兼容 v3/v4） ====
    callbacks = []
    if hasattr(lgb, "early_stopping"):
        callbacks.append(lgb.early_stopping(stopping_rounds=100, verbose=True))
    if hasattr(lgb, "log_evaluation"):
        callbacks.append(lgb.log_evaluation(period=50))

    t0 = time.time()
    booster = lgb.train(
    params, dtr,
    num_boost_round=3000,                         # 给个上限：3000（或 2500）
    valid_sets=[dtr, dva],
    valid_names=["train", "val"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, first_metric_only=True, verbose=True),
        lgb.log_evaluation(period=25),
    ],
)
    elapsed = time.time() - t0

    # 兼容式取得 best_iteration（v3/v4）
    best_it = getattr(booster, "best_iteration", None)

    # 计算验证集 macro-F1 作为摘要
    proba = booster.predict(Xva, num_iteration=best_it)
    yhat  = proba.argmax(1) if OBJECTIVE=="multiclass" else (proba >= 0.5).astype(int)
    macro_f1 = f1_score(yva, yhat, average="macro" if OBJECTIVE=="multiclass" else "binary")

    return booster, dict(
        time_sec=float(elapsed),
        best_iter=int(best_it if best_it is not None else 0),
        macro_f1=float(macro_f1),
        n_features_input=int(Xtr.shape[1]),
        n_features_effective=len(booster.feature_name()),
    )
    Xtr = pd.read_parquet(f"{DATA_DIR}/train.parquet"); ytr = Xtr.pop("label").values
Xva = pd.read_parquet(f"{DATA_DIR}/val.parquet");   yva = Xva.pop("label").values
Xte = pd.read_parquet(f"{DATA_DIR}/test.parquet");  yte = Xte.pop("label").values

with open(f"{DATA_DIR}/ciciot2023_split_meta.json","r") as f:
    meta = json.load(f)
class_names = meta["class_names"]
OBJECTIVE   = "multiclass" if meta["task_mode"]=="multiclass" else "binary"
NUM_CLASS   = len(class_names) if OBJECTIVE=="multiclass" else None

# >>> 新增：把字符串标签映射为整数 <<<
import numpy as np
if OBJECTIVE == "multiclass":
    label2id = {c:i for i,c in enumerate(class_names)}
else:
    # 二分类时确保 0/1；按 class_names 的顺序来（比如 ['Attack','Benign']）
    label2id = {class_names[0]:0, class_names[1]:1}

ytr = np.array([label2id[s] for s in ytr], dtype=np.int32)
yva = np.array([label2id[s] for s in yva], dtype=np.int32)
yte = np.array([label2id[s] for s in yte], dtype=np.int32)

# （可选安全检查：确保特征没有 object 列）
for df in (Xtr, Xva, Xte):
    obj_cols = df.select_dtypes(include=["object"]).columns
    if len(obj_cols):
        df[obj_cols] = df[obj_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0).astype("float32")

M_RAW = Xtr.shape[1]
print("M_raw =", M_RAW, "| shapes:", Xtr.shape, Xva.shape, Xte.shape)
print("classes -> id:", label2id)

M_raw = 39 | shapes: (10073978, 39) (2158123, 39) (2158079, 39)
classes -> id: {'Backdoor_Malware': 0, 'Benign_Final': 1, 'DDoS-TCP_Flood': 2, 'DDoS-UDP_Flood': 3, 'DoS-SYN_Flood': 4, 'Mirai-udpplain': 5, 'Recon-PortScan': 6, 'VulnerabilityScan': 7, 'XSS': 8}


In [15]:
# ========= Run (Baseline: gbdt + no EFB) =========
USE_GOSS=False; ENABLE_EFB=False; TOP_RATE=0.1; OTHER_RATE=0.1
SETTING_NAME = "gbdt_noEFB"

booster, result = train_lgbm(Xtr,ytr,Xva,yva,use_goss=USE_GOSS,enable_efb=ENABLE_EFB,
                             top_rate=TOP_RATE,other_rate=OTHER_RATE)
proba = booster.predict(Xva, num_iteration=getattr(booster, "best_iteration", None))
yhat  = proba.argmax(1) if OBJECTIVE=="multiclass" else (proba>=0.5).astype(int)

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
rec = per_class_recall(yva, yhat)
cm  = confusion_matrix(yva, yhat).tolist()
print(json.dumps({**result,"setting":SETTING_NAME}, indent=2))
print(classification_report(yva, yhat, target_names=class_names[:len(np.unique(yva))], digits=4))

# save
import pathlib
with open(f"{RESULTS}/logs/{SETTING_NAME}.json","w") as f:
    json.dump({**result,"setting":SETTING_NAME,"per_class_recall":rec,"cm":cm}, f, indent=2)
booster.save_model(f"{RESULTS}/{SETTING_NAME}.txt")

row = {
    "setting": SETTING_NAME, "time_sec": result["time_sec"], "best_iter": result["best_iter"],
    "macro_f1": result["macro_f1"], "M_raw": result["n_features_input"],
    "M_effective": result["n_features_effective"], "use_goss": USE_GOSS, "enable_efb": ENABLE_EFB,
    "top_rate": None, "other_rate": None
}
tbl = f"{RESULTS}/tables/efb_goss_ablation.csv"
pd.DataFrame([row]).to_csv(tbl, mode="a", header=not pathlib.Path(tbl).exists(), index=False)

# simple per-class recall bar
plt.figure(figsize=(9,3)); plt.bar(range(len(rec)), list(rec.values()))
plt.xticks(range(len(rec)), [class_names[i] for i in rec.keys()], rotation=45, ha='right'); plt.ylim(0,1)
plt.title(f"Per-class Recall: {SETTING_NAME}"); plt.tight_layout()
plt.savefig(f"{RESULTS}/figures/recall_{SETTING_NAME}.png", dpi=150); plt.show()




Training until validation scores don't improve for 100 rounds
[50]	train's multi_logloss: 0.060194	val's multi_logloss: 0.0605994
[100]	train's multi_logloss: 0.0441392	val's multi_logloss: 0.0450515
[150]	train's multi_logloss: 0.0425168	val's multi_logloss: 0.043856
[200]	train's multi_logloss: 0.0416202	val's multi_logloss: 0.0433623
[250]	train's multi_logloss: 0.0409627	val's multi_logloss: 0.0430776
[300]	train's multi_logloss: 0.0403971	val's multi_logloss: 0.0428449
[350]	train's multi_logloss: 0.0398711	val's multi_logloss: 0.0426485
[400]	train's multi_logloss: 0.0394225	val's multi_logloss: 0.0425075
[450]	train's multi_logloss: 0.038996	val's multi_logloss: 0.0423834
[500]	train's multi_logloss: 0.0385974	val's multi_logloss: 0.0422696
[550]	train's multi_logloss: 0.0382419	val's multi_logloss: 0.0421821
[600]	train's multi_logloss: 0.0379308	val's multi_logloss: 0.0421218
[650]	train's multi_logloss: 0.0376041	val's multi_logloss: 0.0420602
[700]	train's multi_logloss: 0.0

KeyboardInterrupt: 