# IDS Full Benchmark (One Notebook)
- **Dataset chính:** CIC-IDS2017 + CIC-DDoS2019 (thư mục `dataset/CICDDoS2017` và `dataset/CICDDoS2019`; đọc từng file, liệt kê nhãn), sau đó gộp.
- **Pipeline giữ nguyên:** gộp → chuẩn hóa nhãn → **split 8/2** (stratify) → **Scaler + SMOTE** → train → đánh giá.
- **Phase‑1 (Binary):** Benign vs DDoS (mọi nhãn khác `Benign` đều được gộp thành `DDoS`).
- **Phase‑2 (Multiclass):** chỉ trên các mẫu DDoS, phân loại loại tấn công (AttackType nhóm lại giống notebook cũ).
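- **Benchmark:** so sánh nhiều model cho **cả 2 phase**. Phase‑1: LightGBM, XGBoost, CatBoost, HistGBDT, RF, ExtraTrees, BalancedRF*, LogReg, LinearSVC-Calib, SVC‑RBF, Ridge, KNN, GaussianNB. Phase‑2 dùng tập con: LightGBM, XGBoost, CatBoost, HistGBDT, ExtraTrees, RF, LogReg‑OvR, LinearSVC‑OvR, KNN. (*) BalancedRF cần `imblearn`.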
- **Tùy chọn:** phần cuối có **NSL-KDD** và **UNSW_NB15** chạy độc lập nếu thư mục có dữ liệu.

In [None]:

# ==== Imports & flags ====
import os, time, math, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, classification_report,
                             confusion_matrix)

# Base models
LGBM_AVAILABLE = True; XGB_AVAILABLE = True; CAT_AVAILABLE = True
try:
    import lightgbm as lgb
except Exception:
    LGBM_AVAILABLE = False

try:
    import xgboost as xgb
except Exception:
    XGB_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
except Exception:
    CAT_AVAILABLE = False

from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Optional extras
BRF_AVAILABLE = True
try:
    from imblearn.ensemble import BalancedRandomForestClassifier
except Exception:
    BRF_AVAILABLE = False

from imblearn.over_sampling import SMOTE
import joblib, pickle, pyarrow

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Report which optional model backends were importable.
print(dict(
    lightgbm=LGBM_AVAILABLE,
    xgboost=XGB_AVAILABLE,
    catboost=CAT_AVAILABLE,
    BRF=BRF_AVAILABLE,
))


In [None]:

# ==== Utils ====
def plot_cm(cm, labels, title):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: Matrix of integer counts passed to ``sns.heatmap``.
        labels: Tick labels used on both axes; their count also sets the
            figure size.
        title: Figure title.
    """
    n_labels = len(labels)
    # Grow the canvas with the number of classes, but never below 6 x 5.
    width = max(6, n_labels * 0.6)
    height = max(5, n_labels * 0.5)
    plt.figure(figsize=(width, height))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.tight_layout()
    plt.show()


## 1) Load CICDDoS2017 + CICDDoS2019 (main) & liệt kê nhãn

In [None]:

def collect_2019(root=r'dataset/CICDDoS2019'):
    """Find the 2019 training/testing parquet files under *root*.

    The tree is walked in sorted order so the returned path lists (and
    therefore the row order of any frame concatenated from them) do not
    depend on the filesystem's directory listing order.

    Args:
        root: Directory to search recursively.

    Returns:
        A tuple ``(train_paths, test_paths, labels)``: paths of files ending
        in ``-training.parquet``, paths of files ending in
        ``-testing.parquet``, and the sorted unique file-name prefixes
        (text before the first ``-``) of those files.
    """
    train_paths, test_paths, labels = [], [], set()
    for dirname, subdirs, files in os.walk(root):
        # In-place sort makes os.walk descend into sub-directories in a fixed order.
        subdirs.sort()
        for fn in sorted(files):
            if fn.endswith('-training.parquet'):
                bucket = train_paths
            elif fn.endswith('-testing.parquet'):
                bucket = test_paths
            else:
                continue
            bucket.append(os.path.join(dirname, fn))
            labels.add(fn.split('-')[0])
    return train_paths, test_paths, sorted(labels)

def collect_2017(root=r'dataset/CICDDoS2017'):
    """Find every parquet file under *root*.

    The tree is walked in sorted order so the returned path list (and the
    row order of any frame concatenated from it) does not depend on the
    filesystem's directory listing order.

    Args:
        root: Directory to search recursively.

    Returns:
        A tuple ``(paths, labels)``: paths of files ending in ``.parquet``
        and the sorted unique file-name prefixes (text before the first
        ``-``) of those files.
    """
    paths, labels = [], set()
    for dirname, subdirs, files in os.walk(root):
        # In-place sort makes os.walk descend into sub-directories in a fixed order.
        subdirs.sort()
        for fn in sorted(files):
            if not fn.endswith('.parquet'):
                continue
            paths.append(os.path.join(dirname, fn))
            labels.add(fn.split('-')[0])
    return paths, sorted(labels)

# Discover the dataset files once and report how many were found.
p2019_train, p2019_test, labels2019 = collect_2019()
p2017, labels2017 = collect_2017()
print("2019 train files:", len(p2019_train), "| 2017 files:", len(p2017))

# List at most the first 20 file-name labels of each dataset.
print("\nCác nhãn (2019) ví dụ:")
for lb in labels2019[:20]:
    print("-", lb)
print("\nCác nhãn (2017) ví dụ:")
for lb in labels2017[:20]:
    print("-", lb)


In [None]:

# ==== Read & merge 2017 + 2019 ====
# Each dataset becomes a single frame; a dataset with no files becomes an empty frame.
if len(p2017):
    df_2017 = pd.concat([pd.read_parquet(p) for p in p2017], ignore_index=True)
else:
    df_2017 = pd.DataFrame()
if len(p2019_train):
    df_2019 = pd.concat([pd.read_parquet(p) for p in p2019_train], ignore_index=True)
else:
    df_2019 = pd.DataFrame()
assert len(df_2017) or len(df_2019), "Không có dữ liệu 2017/2019 để gộp."

df = pd.concat([df_2017, df_2019], ignore_index=True)
# Keep the fine-grained label in AttackType before Label is collapsed to two classes.
df['AttackType'] = df['Label']
df['Label'] = df['Label'].apply(lambda s: 'Benign' if s=='Benign' else 'DDoS')
print(df.shape, df['Label'].value_counts())


## 2) Danh sách đặc trưng (giữ nguyên form) + split 8/2 + Scaler + SMOTE

In [None]:

# Names of the flow-feature columns selected from the merged frame; the same
# list builds the model input matrix for both the binary and the multiclass phase.
features_names = [
    'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max',
    'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean',
    'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean',
    'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
    'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
    'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
    'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
    'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
    'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Avg Packet Size',
    'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
    'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std',
    'Idle Max', 'Idle Min'
]

# Clean the frame: +/-inf in the feature columns is turned into NaN first
# (MinMaxScaler accepts NaN but rejects infinite values), then every missing
# value is zero-filled.
df[features_names] = df[features_names].replace([np.inf, -np.inf], np.nan)
df = df.fillna(0)
X = df[features_names].astype(np.float32).values
# Binary target: 0 = Benign, 1 = everything else.
y_bin = (df['Label']!='Benign').astype(int).values

# Stratified 80/20 split; the scaler is fitted on the training part only.
X_tr, X_te, y_tr, y_te = train_test_split(X, y_bin, test_size=0.2, random_state=42, stratify=y_bin)
scaler = MinMaxScaler().fit(X_tr)
X_tr_s = scaler.transform(X_tr)
X_te_s = scaler.transform(X_te)

# SMOTE on the training split only (kept in the spirit of the old notebook):
# oversampling runs on the unscaled features and the resampled set is scaled afterwards.
# SMOTE is already imported by the notebook's import cell.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_tr, y_tr)
X_res_s = scaler.transform(X_res)


## 3) Phase‑1 (Binary) — Model zoo & Benchmark

In [None]:

def model_zoo_binary():
    """Build the binary classifiers to benchmark.

    LightGBM, XGBoost, CatBoost and BalancedRF are added only when their
    availability flag (set by the import cell) is true. Every estimator that
    accepts a seed gets ``random_state=42`` so repeated runs are comparable.

    Returns:
        dict mapping a display name to an unfitted estimator, in the order
        the models should be benchmarked.
    """
    models = {}
    if LGBM_AVAILABLE:
        models["LightGBM"] = lgb.LGBMClassifier(n_estimators=800, learning_rate=0.07, max_depth=8,
                                                num_leaves=256, subsample=0.8, colsample_bytree=0.8,
                                                class_weight="balanced", n_jobs=-1, random_state=42)
    if XGB_AVAILABLE:
        models["XGBoost"] = xgb.XGBClassifier(n_estimators=800, learning_rate=0.06, max_depth=8,
                                              subsample=0.8, colsample_bytree=0.8, reg_lambda=2.0,
                                              tree_method="hist", n_jobs=-1, eval_metric="logloss", random_state=42)
    if CAT_AVAILABLE:
        models["CatBoost"] = CatBoostClassifier(iterations=1000, learning_rate=0.07, depth=8,
                                                l2_leaf_reg=5.0, loss_function="Logloss",
                                                auto_class_weights="Balanced", verbose=False, random_seed=42)
    # Seeded: the estimator's random_state drives its internal validation split
    # when early stopping is active.
    models["HistGBDT"] = HistGradientBoostingClassifier(max_depth=None, learning_rate=0.08, max_iter=600,
                                                        random_state=42)
    models["RandomForest"] = RandomForestClassifier(n_estimators=600, n_jobs=-1, class_weight="balanced_subsample", random_state=42)
    models["ExtraTrees"] = ExtraTreesClassifier(n_estimators=700, n_jobs=-1, random_state=42)
    if BRF_AVAILABLE:
        models["BalancedRF"] = BalancedRandomForestClassifier(n_estimators=600, random_state=42, n_jobs=-1)
    models["LogReg"] = LogisticRegression(max_iter=1200, n_jobs=-1)
    # LinearSVC has no predict_proba; the calibration wrapper adds it.
    models["LinearSVC-Calib"] = CalibratedClassifierCV(LinearSVC(random_state=42), cv=3)
    models["SVC-RBF"] = SVC(kernel='rbf', C=2.0, gamma='scale')
    models["RidgeCls"] = RidgeClassifier()
    models["KNN-11"] = KNeighborsClassifier(n_neighbors=11)
    models["GaussianNB"] = GaussianNB()
    return models

def run_benchmark_binary(models: dict, Xtr, ytr, Xte, yte):
    """Fit and score each binary classifier, returning a ranked results table.

    A model that raises at any point (fit, predict or scoring) is reported
    with a ``[SKIP]`` line and left out of the table, so one failing model
    does not abort the whole benchmark.

    Args:
        models: Mapping of display name to unfitted estimator.
        Xtr, ytr: Training features and binary labels.
        Xte, yte: Hold-out features and binary labels.

    Returns:
        DataFrame with columns ``model, accuracy, f1_macro, f1_weighted,
        roc_auc, train_time`` sorted by ``f1_macro`` descending.
        ``train_time`` is a string such as ``"12.3s"``; ``roc_auc`` is NaN
        when the model exposes neither ``predict_proba`` nor
        ``decision_function``.
    """
    rows = []
    for name, mdl in models.items():
        t0 = time.time()
        try:
            mdl.fit(Xtr, ytr)
            train_s = f"{time.time()-t0:.1f}s"
            try:
                yhat = mdl.predict(Xte)
            except Exception:
                # Fall back to thresholding the positive-class probability.
                yhat = (mdl.predict_proba(Xte)[:, 1] > 0.5).astype(int)
            acc = accuracy_score(yte, yhat)
            f1m = f1_score(yte, yhat, average='macro')
            f1w = f1_score(yte, yhat, average='weighted')
            # AUC needs a continuous score: prefer probabilities, then the
            # decision function, otherwise record NaN.
            try:
                proba = mdl.predict_proba(Xte)[:, 1]
                auc = roc_auc_score(yte, proba)
            except Exception:
                try:
                    proba = mdl.decision_function(Xte)
                    auc = roc_auc_score(yte, proba)
                except Exception:
                    auc = np.nan
            rows.append([name, acc, f1m, f1w, auc, train_s])
            print(f"{name:16s} | acc={acc:.4f} f1m={f1m:.4f} f1w={f1w:.4f} auc={auc:.4f} time={train_s}")
        except Exception as e:
            print(f"[SKIP] {name}: {e}")
    cols = ["model", "accuracy", "f1_macro", "f1_weighted", "roc_auc", "train_time"]
    return pd.DataFrame(rows, columns=cols).sort_values("f1_macro", ascending=False).reset_index(drop=True)

# Phase-1 benchmark: fit on the SMOTE-resampled, scaled training set and
# evaluate on the scaled hold-out set.
models_bin = model_zoo_binary()
res_bin = run_benchmark_binary(
    models=models_bin,
    Xtr=X_res_s,
    ytr=y_res,
    Xte=X_te_s,
    yte=y_te,
)
res_bin


In [None]:

# Plot & export (Phase‑1)
# Bar chart of macro-F1 per model, in the table's (already ranked) order.
plt.figure(figsize=(10, 5))
sns.barplot(data=res_bin, x="model", y="f1_macro")
plt.xticks(rotation=45, ha='right')
plt.title("Binary F1-macro (Benign vs DDoS)")
plt.tight_layout()
plt.show()

# Save the full table as CSV and Excel, then preview the top rows.
res_bin.to_csv("benchmark_binary_results.csv", index=False)
res_bin.to_excel("benchmark_binary_results.xlsx", index=False)
res_bin.head()


## 4) Phase‑2 (Multiclass trên DDoS) — Model zoo & Benchmark

In [None]:

# Group raw AttackType labels into coarse families, as in the old notebook.
attack_group_map = {
    # Reflection / amplification variants
    'DrDoS_DNS': 'DrDoS',
    'DrDoS_SNMP': 'DrDoS',
    'DrDoS_NTP': 'DrDoS',
    'DrDoS_MSSQL': 'DrDoS',
    'DrDoS_SSDP': 'DrDoS',
    'DrDoS_UDP': 'DrDoS',
    'TFTP': 'TFTP',
    'UDP': 'UDP',
    'UDPLag': 'UDP',
    'Syn': 'Syn',
    'MSSQL': 'MSSQL',
    'LDAP': 'LDAP',
    # Application-layer DoS tools
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'Heartbleed': 'Other',
    # Web attacks (the separator character is kept exactly as it appears in the keys)
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Infiltration': 'Other',
    'Bot': 'Other',
    'PortScan': 'PortScan',
    'NetBIOS': 'Other',
}


def group_attack(x):
    """Return the coarse attack family for raw label *x*.

    'Benign' is returned unchanged; a label that is not a key of
    ``attack_group_map`` falls back to 'Other'.
    """
    if x == 'Benign':
        return 'Benign'
    return attack_group_map.get(x, 'Other')

# Collapse the raw attack labels into coarse families (overwrites the column in place).
# NOTE(review): re-running this cell regroups already-grouped values; family
# names that are not keys of attack_group_map (e.g. 'DrDoS', 'DoS') would become 'Other'.
df['AttackType'] = df['AttackType'].apply(group_attack)

# Keep only the rows labelled as attacks in Phase-1.
is_attack = df['Label']=='DDoS'
ddos = df[is_attack].copy()
Xa = ddos[features_names].astype(np.float32).values
ya = ddos['AttackType'].astype(str).values

# Encode the family names as integer class ids.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ya_enc = le.fit_transform(ya)
n_classes = len(le.classes_)
print("Classes:", list(le.classes_))

# Stratified 80/20 split; the scaler is fitted on the training part only.
Xa_tr, Xa_te, ya_tr, ya_te = train_test_split(Xa, ya_enc, test_size=0.2, random_state=42, stratify=ya_enc)
scaler_a = MinMaxScaler().fit(Xa_tr)
Xa_tr_s = scaler_a.transform(Xa_tr)
Xa_te_s = scaler_a.transform(Xa_te)

# Oversample the unscaled training split, then scale the resampled set.
sm2 = SMOTE(random_state=42)
Xa_res, ya_res = sm2.fit_resample(Xa_tr, ya_tr)
Xa_res_s = scaler_a.transform(Xa_res)


In [None]:

from sklearn.multiclass import OneVsRestClassifier

def model_zoo_multi(nc:int):
    """Build the multiclass classifiers to benchmark.

    LightGBM, XGBoost and CatBoost are added only when their availability
    flag (set by the import cell) is true. Every estimator that accepts a
    seed gets ``random_state=42`` so repeated runs are comparable.

    Args:
        nc: Number of target classes; passed to XGBoost as ``num_class``.

    Returns:
        dict mapping a display name to an unfitted estimator, in the order
        the models should be benchmarked.
    """
    models = {}
    if LGBM_AVAILABLE:
        models["LightGBM"] = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, max_depth=9,
                                                num_leaves=256, subsample=0.9, colsample_bytree=0.9,
                                                class_weight="balanced", n_jobs=-1, random_state=42)
    if XGB_AVAILABLE:
        models["XGBoost"] = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth=9,
                                              subsample=0.9, colsample_bytree=0.9, reg_lambda=2.0,
                                              tree_method="hist", n_jobs=-1, objective="multi:softprob",
                                              num_class=nc, random_state=42)
    if CAT_AVAILABLE:
        models["CatBoost"] = CatBoostClassifier(iterations=1200, learning_rate=0.05, depth=8, l2_leaf_reg=5.0,
                                                loss_function="MultiClass", auto_class_weights="Balanced",
                                                verbose=False, random_seed=42)
    # Seeded: the estimator's random_state drives its internal validation split
    # when early stopping is active.
    models["HistGBDT"] = HistGradientBoostingClassifier(max_iter=900, random_state=42)
    models["ExtraTrees"] = ExtraTreesClassifier(n_estimators=800, n_jobs=-1, random_state=42)
    models["RandomForest"] = RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=42)
    models["LogReg-OvR"] = OneVsRestClassifier(LogisticRegression(max_iter=1500, n_jobs=-1))
    models["LinearSVC-OvR"] = OneVsRestClassifier(LinearSVC(random_state=42))
    models["KNN-11"] = KNeighborsClassifier(n_neighbors=11)
    return models

def run_benchmark_multi(models: dict, Xtr, ytr, Xte, yte):
    """Fit and score each multiclass classifier, returning a ranked results table.

    A model that raises at any point (fit, predict or scoring) is reported
    with a ``[SKIP]`` line and left out of the table, so one failing model
    does not abort the whole benchmark.

    Args:
        models: Mapping of display name to unfitted estimator.
        Xtr, ytr: Training features and class labels.
        Xte, yte: Hold-out features and class labels.

    Returns:
        DataFrame with columns ``model, accuracy, f1_macro, f1_weighted,
        train_time`` sorted by ``f1_macro`` descending. ``train_time`` is a
        string such as ``"12.3s"``.
    """
    rows = []
    for name, mdl in models.items():
        t0 = time.time()
        try:
            mdl.fit(Xtr, ytr)
            tr = f"{time.time()-t0:.1f}s"
            yhat = mdl.predict(Xte)
            acc = accuracy_score(yte, yhat)
            f1m = f1_score(yte, yhat, average='macro')
            f1w = f1_score(yte, yhat, average='weighted')
            rows.append([name, acc, f1m, f1w, tr])
            print(f"{name:16s} | acc={acc:.4f} f1m={f1m:.4f} f1w={f1w:.4f} time={tr}")
        except Exception as e:
            print(f"[SKIP] {name}: {e}")
    cols = ["model", "accuracy", "f1_macro", "f1_weighted", "train_time"]
    return pd.DataFrame(rows, columns=cols).sort_values("f1_macro", ascending=False).reset_index(drop=True)

# Phase-2 benchmark: fit on the SMOTE-resampled, scaled attack-only training
# set and evaluate on the scaled hold-out set.
models_multi = model_zoo_multi(nc=n_classes)
res_multi = run_benchmark_multi(
    models=models_multi,
    Xtr=Xa_res_s,
    ytr=ya_res,
    Xte=Xa_te_s,
    yte=ya_te,
)
res_multi


In [None]:

# Plot & export (Phase‑2)
# Bar chart of macro-F1 per model, in the table's (already ranked) order.
plt.figure(figsize=(10, 5))
sns.barplot(data=res_multi, x="model", y="f1_macro")
plt.xticks(rotation=45, ha='right')
plt.title("Multiclass F1-macro (DDoS Attack Types)")
plt.tight_layout()
plt.show()

# Save the full table as CSV and Excel, then preview the top rows.
res_multi.to_csv("benchmark_multiclass_results.csv", index=False)
res_multi.to_excel("benchmark_multiclass_results.xlsx", index=False)
res_multi.head()


## 5) Tuỳ chọn: NSL‑KDD & UNSW_NB15 (độc lập, cùng pipeline 8/2)

In [None]:

def read_folder_any(root):
    """Return the paths of all ``.csv``, ``.txt`` and ``.parquet`` files under *root*.

    The extension check is case-insensitive. The tree is walked in sorted
    order so the result (and the row order of any frame concatenated from
    it) does not depend on the filesystem's directory listing order.

    Args:
        root: Directory to search recursively; a missing directory yields
            an empty list.

    Returns:
        List of file paths.
    """
    wanted = ('.csv', '.txt', '.parquet')
    paths = []
    for dirname, subdirs, files in os.walk(root):
        # In-place sort makes os.walk descend into sub-directories in a fixed order.
        subdirs.sort()
        paths.extend(
            os.path.join(dirname, fn)
            for fn in sorted(files)
            if fn.lower().endswith(wanted)
        )
    return paths

def generic_binary_eval(paths, dataset_name):
    """Run the binary benchmark on an arbitrary tabular dataset.

    Reads every file in *paths*, guesses the label column, builds a
    normal-vs-attack target, then applies the 80/20 stratified split,
    MinMax scaling, SMOTE and the binary model zoo. The results table is
    also written to ``{dataset_name}_binary_results.csv``.

    Args:
        paths: File paths to load (``.parquet`` via ``read_parquet``,
            anything else via ``read_csv`` with a whitespace-separated,
            header-less retry).
        dataset_name: Name used in log lines and in the output file name.

    Returns:
        The benchmark DataFrame, or ``None`` when no file could be read, no
        numeric feature column remains, or the target has a single class.
    """
    if not paths:
        print(f"[{dataset_name}] Không tìm thấy file -> bỏ qua.")
        return None

    frames = []
    for p in paths:
        try:
            # Case-insensitive, matching the extension filter used when listing files.
            if p.lower().endswith('.parquet'):
                frames.append(pd.read_parquet(p))
            else:
                try:
                    frames.append(pd.read_csv(p))
                except Exception:
                    frames.append(pd.read_csv(p, sep='\\s+', header=None))
        except Exception as e:
            print("Skip", p, e)
    if not frames:
        print(f"[{dataset_name}] Không đọc được bảng hợp lệ.")
        return None
    df = pd.concat(frames, ignore_index=True).fillna(0)

    # Guess the label column: first known name present, else the last column.
    label_candidates = ['label', 'Label', 'attack', 'attack_cat', 'class', 'Class', 'target']
    label_col = None
    for c in label_candidates:
        if c in df.columns:
            label_col = c
            break
    if label_col is None:
        label_col = df.columns[-1]

    # Exclude every label-like column from the features; a numeric label
    # (e.g. 0/1) would otherwise leak the target into X.
    drop_cols = {c for c in label_candidates if c in df.columns} | {label_col}
    X = df.drop(columns=list(drop_cols)).select_dtypes(include=[np.number]).values
    if X.shape[1] == 0:
        print(f"[{dataset_name}] Không có cột đặc trưng số -> bỏ qua.")
        return None

    y_raw = df[label_col].astype(str).values
    # '0.0' covers integer labels that became floats when NaNs were filled after concat.
    normal_values = {'benign', 'normal', '0', '0.0'}
    y = np.array([0 if s.strip().lower() in normal_values else 1 for s in y_raw])
    if np.unique(y).size < 2:
        # Stratified splitting and SMOTE both need two classes.
        print(f"[{dataset_name}] Chỉ có một lớp nhãn -> bỏ qua.")
        return None

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    scaler = MinMaxScaler().fit(X_tr)
    X_tr_s = scaler.transform(X_tr)
    X_te_s = scaler.transform(X_te)
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_tr_s, y_tr)

    models = model_zoo_binary()
    res = run_benchmark_binary(models, X_res, y_res, X_te_s, y_te)
    res.to_csv(f"{dataset_name}_binary_results.csv", index=False)
    print(f"[{dataset_name}] Done.")
    return res

# Optional datasets: each one is evaluated independently, and the result is
# None when its folder yields no readable files.

# NSL-KDD
paths_kdd = read_folder_any(r"dataset/NSL-KDD")
res_kdd = generic_binary_eval(paths=paths_kdd, dataset_name="NSL_KDD")

# UNSW
paths_unsw = read_folder_any(r"dataset/UNSW_NB15")
res_unsw = generic_binary_eval(paths=paths_unsw, dataset_name="UNSW_NB15")
