
# IDS DDoS Pipeline (CICDDoS2017+2019 là chính) + KDD99 + UNSW-NB15 — LightGBM (Nhanh & Chính xác)

Notebook này **giữ nguyên pipeline** như bạn đang dùng:  
- Đọc từng file theo thư mục  
- Gộp dữ liệu (2017 + 2019 là chính, thêm KDD99 & UNSW-NB15)  
- Chuẩn hoá, SMOTE, scaler, split **8/2**  
- **Phase 1**: Nhị phân Benign vs DDoS  
- **Phase 2**: Phân lớp kiểu tấn công trong nhóm DDoS (ưu tiên mapping từ 2017/2019; dữ liệu ngoài nhóm này gán `Other`)  

Chỉ **thay model** sang **LightGBM** (rất nhanh, phù hợp dữ liệu tabular), không đụng vào logic/mạch pipeline.


In [None]:

# (Tuỳ chọn) Cài đặt nếu thiếu thư viện — KHÔNG bắt buộc nếu môi trường đã có
# %pip install lightgbm imbalanced-learn seaborn joblib pyarrow fastparquet

import os, glob, warnings, pickle, joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score)
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Render matplotlib figures at 120 DPI.
plt.rcParams['figure.dpi'] = 120


ModuleNotFoundError: No module named 'imblearn'

In [None]:

# ==== Path configuration (EDIT these to match your machine) ====
CIC2019_DIR = r'D:\DACN\dataset\CICDDoS2019'   # contains *-training.parquet / *-testing.parquet
CIC2017_DIR = r'D:\DACN\dataset\CICDDoS2017'   # contains *.parquet
KDD99_DIR   = r'D:\DACN\dataset\KDD99'         # newly added (csv or parquet both work)
UNSW15_DIR  = r'D:\DACN\dataset\UNSW-NB15'     # newly added (csv or parquet both work)


In [None]:

# ==== Feature list, kept UNCHANGED from the old code ====
# Column names requested from the merged dataframe; a later cell keeps only
# the names that are actually present in the data.
features_names = [
    'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max',
    'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean',
    'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean',
    'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
    'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
    'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
    'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
    'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
    'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Avg Packet Size',
    'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
    'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std',
    'Idle Max', 'Idle Min'
]
# Report how many feature names are declared above.
print("Số features khai báo:", len(features_names))


In [None]:

# ==== Utils: file reading & label-column detection ====
# Candidate label-column names, checked in this priority order.
LABEL_CANDS = ["Label","label","Attack","attack","attack_cat","class","Class","target","category","Category","result"]

def detect_label_col(df: pd.DataFrame):
    """Return the name of the label column of *df*.

    An exact match against ``LABEL_CANDS`` (in list order) wins. If none is
    found, the column names are compared again after stripping surrounding
    whitespace and ignoring case, so headers such as ``" Label"`` or
    ``"CLASS"`` are still recognised. The returned value is always the
    column name exactly as it appears in ``df.columns``.

    Raises:
        ValueError: if no candidate matches any column.
    """
    for c in LABEL_CANDS:
        if c in df.columns:
            return c

    # Fallback pass: normalised name -> first real column carrying that name.
    normalized = {}
    for col in df.columns:
        normalized.setdefault(str(col).strip().lower(), col)
    for c in LABEL_CANDS:
        key = c.lower()
        if key in normalized:
            return normalized[key]

    raise ValueError("Không tìm thấy cột nhãn trong dataframe.")

def safe_read_any(path: str) -> pd.DataFrame:
    """Read a ``.parquet`` or ``.csv`` file, returning an empty frame on failure.

    CSV files are tried with several encodings in turn; only a decoding
    error moves on to the next encoding. Any other failure (missing file,
    parser error, ...) is reported once with a ``[WARN]`` line and yields an
    empty DataFrame, as does an unsupported file extension (without a
    warning).

    Args:
        path: file path; the extension is matched case-insensitively.

    Returns:
        The loaded DataFrame, or an empty DataFrame if the file was skipped.
    """
    lowered = path.lower()
    try:
        if lowered.endswith(".parquet"):
            return pd.read_parquet(path)
        if lowered.endswith(".csv"):
            for enc in ("utf-8-sig", "utf-8", "cp1252", "latin1"):
                try:
                    return pd.read_csv(path, encoding=enc)
                except UnicodeError:
                    # Wrong encoding guess: try the next one. Other errors are
                    # not encoding-related, so retrying would only repeat them.
                    continue
            # Last resort with pandas' default encoding.
            return pd.read_csv(path)
        return pd.DataFrame()
    except Exception as e:
        # Best-effort loader: one unreadable file must not abort the whole run.
        print(f"[WARN] skip {path}: {e}")
        return pd.DataFrame()

def load_dir_parquet_csv(root: str, pattern=("*.parquet","*.csv")) -> list:
    """Recursively load every file under *root* that matches *pattern*.

    Files are visited in sorted path order: ``glob`` returns matches in
    arbitrary order, and the order of the returned frames decides the row
    order of the concatenated dataset (and therefore what a seeded split
    selects). Duplicate matches from overlapping patterns are read once.

    Args:
        root: directory to scan; a non-directory yields an empty list.
        pattern: glob patterns applied at every depth below *root*.

    Returns:
        A list of non-empty DataFrames, one per successfully read file.
    """
    if not os.path.isdir(root):
        return []

    matches = set()
    for p in pattern:
        matches.update(glob.glob(os.path.join(root, "**", p), recursive=True))

    # normpath drops a trailing separator so the progress label is not empty.
    folder_name = os.path.basename(os.path.normpath(root))

    out = []
    for f in tqdm(sorted(matches), desc=f"Đọc {folder_name}"):
        df = safe_read_any(f)
        if not df.empty:
            out.append(df)
    return out


In [None]:

# ==== Load CICDDoS2019 (train/test) + CICDDoS2017 (all) ====
df2019_train_paths, df2019_test_paths = [], []
labels_2019 = set()
for folder, _subdirs, names in os.walk(CIC2019_DIR):
    for name in names:
        # Route each parquet file to the training or testing list by suffix.
        if name.endswith('-training.parquet'):
            bucket = df2019_train_paths
        elif name.endswith('-testing.parquet'):
            bucket = df2019_test_paths
        else:
            continue
        bucket.append(os.path.join(folder, name))
        # The file-name part before the first '-' is recorded as the label.
        labels_2019.add(name.split('-')[0])

print("Các nhãn (label) tìm thấy trong CICDDos2019:")
for label in sorted(labels_2019):
    print("-", label)

df2017_paths, labels_2017 = [], set()
for folder, _subdirs, names in os.walk(CIC2017_DIR):
    for name in names:
        if not name.endswith('.parquet'):
            continue
        df2017_paths.append(os.path.join(folder, name))
        labels_2017.add(name.split('-')[0])

print("Các nhãn (label) tìm thấy trong CICDDos2017:")
for label in sorted(labels_2017):
    print("-", label)

# Only the 2019 *training* files are read here; the testing paths are
# collected above but not loaded in this cell.
if df2017_paths:
    df_2017 = pd.concat([pd.read_parquet(p) for p in df2017_paths], ignore_index=True)
else:
    df_2017 = pd.DataFrame()

if df2019_train_paths:
    df_2019 = pd.concat([pd.read_parquet(p) for p in df2019_train_paths], ignore_index=True)
else:
    df_2019 = pd.DataFrame()

print("CIC2017 shape:", df_2017.shape, "| CIC2019 train shape:", df_2019.shape)


In [None]:

# ==== Load KDD99 + UNSW-NB15 (optional) ====
dfs_kdd = load_dir_parquet_csv(KDD99_DIR)
dfs_unsw = load_dir_parquet_csv(UNSW15_DIR)

# An empty frame stands in for a dataset whose directory produced no files.
if dfs_kdd:
    df_kdd = pd.concat(dfs_kdd, ignore_index=True)
else:
    df_kdd = pd.DataFrame()

if dfs_unsw:
    df_unsw = pd.concat(dfs_unsw, ignore_index=True)
else:
    df_unsw = pd.DataFrame()

print("KDD99 shape:", df_kdd.shape, "| UNSW-NB15 shape:", df_unsw.shape)


In [None]:

# ==== Normalise labels for KDD/UNSW: Label -> Benign/attack; keep AttackType or create Other ====
def normalize_ext_dataset(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Give an external dataset the ``Label`` / ``AttackType`` columns used downstream.

    The detected label column is renamed to ``Label`` and its values are
    stripped of surrounding whitespace. Rows are rewritten to ``"Benign"``
    when the value, ignoring case and a trailing ``.``, is one of
    normal / benign / non-attack / good (KDD99 raw labels look like
    ``"normal."``). When the column holds only a 0/1 flag, ``0`` is treated
    as benign as well (assumes the usual 0 = normal convention, as in
    UNSW-NB15's ``label`` column — TODO confirm for other sources).
    If no ``AttackType`` column exists it is created with ``"Benign"`` for
    benign rows and ``"Other"`` for everything else.

    Args:
        df: raw external dataset; an empty frame is returned unchanged.
        dataset_name: name used only in the warning message.

    Returns:
        A normalised copy, or an empty DataFrame if normalisation failed.
    """
    if df.empty:
        return df
    try:
        lbl = detect_label_col(df)
        df = df.copy()
        df.rename(columns={lbl: "Label"}, inplace=True)
        df["Label"] = df["Label"].astype(str).str.strip()

        # Comparison key only; the stored Label keeps its original spelling.
        key = df["Label"].str.lower().str.rstrip(".")
        benign = key.isin(["normal", "benign", "non-attack", "good"])
        # Pure binary flag column: 0 marks normal traffic.
        if set(key.unique()) <= {"0", "1", "0.0", "1.0"}:
            benign = benign | key.isin(["0", "0.0"])
        df.loc[benign, "Label"] = "Benign"

        if "AttackType" not in df.columns:
            df["AttackType"] = np.where(df["Label"] == "Benign", "Benign", "Other")
        return df
    except Exception as e:
        # Best-effort: an external dataset that cannot be normalised is dropped.
        print(f"[WARN] {dataset_name}: không chuẩn hoá được label ({e})")
        return pd.DataFrame()

# Normalise the labels of both optional datasets; an empty frame is returned unchanged.
df_kdd  = normalize_ext_dataset(df_kdd,  "KDD99")
df_unsw = normalize_ext_dataset(df_unsw, "UNSW-NB15")


In [None]:

# ==== Merge data: 2017+2019 are the core; KDD/UNSW are added for extra variety ====
df_main = pd.concat([df_2017, df_2019], ignore_index=True) if not df_2017.empty or not df_2019.empty else pd.DataFrame()
df_extra = []
if not df_kdd.empty:  df_extra.append(df_kdd)
if not df_unsw.empty: df_extra.append(df_unsw)
df = pd.concat([df_main] + df_extra, ignore_index=True) if df_extra else df_main
if df.empty:
    raise SystemExit("Không có dữ liệu sau khi đọc các thư mục. Kiểm tra đường dẫn!")

# Derive AttackType from Label wherever it is missing. After concatenating
# frames that already carry AttackType with frames that do not, the column
# exists but is NaN for the latter rows, so a column-presence check alone
# would leave those rows without an attack type.
if "Label" in df.columns:
    if "AttackType" not in df.columns:
        df["AttackType"] = df["Label"]
    else:
        df["AttackType"] = df["AttackType"].fillna(df["Label"])

# Attack-group mapping (2017/2019 labels take priority); anything else -> Other
attack_group_map = {
    'DrDoS_DNS':'DrDoS','DrDoS_SNMP':'DrDoS','DrDoS_NTP':'DrDoS','DrDoS_MSSQL':'DrDoS',
    'DrDoS_SSDP':'DrDoS','DrDoS_UDP':'DrDoS','TFTP':'TFTP',
    'UDP':'UDP','UDPLag':'UDP','Syn':'Syn','MSSQL':'MSSQL','LDAP':'LDAP',
    'DoS slowloris':'DoS','DoS Slowhttptest':'DoS','DoS Hulk':'DoS','DoS GoldenEye':'DoS',
    'Heartbleed':'Other',
    # The separator in these labels is a mis-encoded dash (U+FFFD), written
    # as an escape so the source stays encoding-safe.
    'Web Attack \ufffd Brute Force':'Web Attack','Web Attack \ufffd XSS':'Web Attack','Web Attack \ufffd Sql Injection':'Web Attack',
    'FTP-Patator':'Brute Force','SSH-Patator':'Brute Force','Infiltration':'Other','Bot':'Other',
    'PortScan':'PortScan','NetBIOS':'Other'
}
def group_attack_type(x):
    """Map a raw attack label to its attack group.

    Missing values map to ``'Other'`` and the exact value ``'Benign'`` is
    kept. Any label starting with ``'Web Attack'`` maps to ``'Web Attack'``
    regardless of how the separator character was encoded. All other labels
    are looked up in ``attack_group_map`` after stripping surrounding
    whitespace; unknown labels map to ``'Other'``.
    """
    if pd.isna(x):
        return 'Other'
    if x == 'Benign':
        return 'Benign'
    name = str(x).strip()
    if name.startswith('Web Attack'):
        return 'Web Attack'
    return attack_group_map.get(name, 'Other')

def _to_binary_label(v):
    """Keep the exact value 'Benign'; every other value becomes 'DDoS'."""
    if str(v) == 'Benign':
        return 'Benign'
    return 'DDoS'

# Collapse raw attack names into groups, then reduce Label to two classes.
df["AttackType"] = df["AttackType"].apply(group_attack_type)
df["Label"] = df["Label"].apply(_to_binary_label)

print("Label uniq:", df["Label"].unique())
print("AttackType uniq:", df["AttackType"].unique())

# Keep only the declared features that exist in df (declaration order preserved).
used_features = []
for name in features_names:
    if name in df.columns:
        used_features.append(name)
print(f"Using {len(used_features)} / {len(features_names)} features.")


In [None]:

# ==== Data clean-up & 80/20 split (logic kept as is) ====
# Turn +/-inf into NaN first so one fillna(0) zeroes both kinds of bad value.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0)

feature_frame = df[used_features].astype(np.float32)
X_full = feature_frame.values
# 1 for every non-benign row, 0 for benign rows.
is_attack = df["Label"] != "Benign"
y_bin = is_attack.astype(int).values

X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_bin,
    test_size=0.2,
    random_state=42,
    stratify=y_bin,
)

# The scaler is fitted on the training part only and saved to disk.
scaler = MinMaxScaler().fit(X_train)
joblib.dump(scaler, 'scaler_2019.pkl')

X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

# SMOTE (as in the old code), applied to the scaled training part only.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_s, y_train)
print("After SMOTE:", X_res.shape, Counter(y_res))


In [None]:

# ==== PHASE 1: Binary Benign vs DDoS (LightGBM) ====
clf_bin = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# Early stopping picks the number of trees, so it must not look at the test
# split: doing so tunes the model on the data used for the final report.
# A validation part is carved out of the resampled training data instead.
# NOTE(review): this validation part contains SMOTE-synthesised rows, so it
# is an optimistic stopping signal; splitting before SMOTE would be stricter.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_res, y_res, test_size=0.1, random_state=42, stratify=y_res
)

clf_bin.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    eval_metric=['auc','binary_logloss'],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

joblib.dump(clf_bin, 'lgbm_binary_ddos.joblib')

# Final evaluation on the untouched test split, thresholded at 0.5.
y_prob = clf_bin.predict_proba(X_test_s)[:,1]
y_pred = (y_prob >= 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign','DDoS'], yticklabels=['Benign','DDoS'])
plt.title("Confusion Matrix - Phase 1 (Binary - LightGBM)")
plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.tight_layout(); plt.show()

print("Phase 1 report:")
print(classification_report(y_test, y_pred, target_names=['Benign','DDoS']))
try:
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))
except ValueError as e:
    # Report why the score is missing instead of silently dropping it.
    print(f"[WARN] Không tính được ROC-AUC: {e}")


In [None]:

# ==== PHASE 2: Multiclass within the DDoS group ====
df_attack = df[df['Label']=='DDoS'].copy()

if df_attack.empty:
    print("[WARN] Không có mẫu DDoS cho Phase 2.")
else:
    # Split the rows BEFORE any resampling or fitting. Training on all attack
    # rows and then scoring on attack rows taken from `test_df` would score
    # the model on rows it was trained on.
    # NOTE(review): same size, seed and stratification as the Phase 1 split,
    # so this should reproduce the same row partition — confirm if either
    # call is changed.
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Label'], random_state=42)
    df_attack_train = train_df[train_df['Label']=='DDoS']
    df_attack_test = test_df[test_df['Label']=='DDoS'].copy()

    # The encoder sees every attack name so evaluation rows can never hit an
    # unseen label; only the class names are shared, not the features.
    le_attack = LabelEncoder()
    le_attack.fit(df_attack['AttackType'].astype(str).values)
    joblib.dump(le_attack, "attack_label_encoder.pkl")
    n_classes = len(le_attack.classes_)
    print("Các lớp attack:", list(le_attack.classes_))

    X_attack = df_attack_train[used_features].astype(np.float32).values
    y_attack = df_attack_train['AttackType'].astype(str).values
    y_attack_enc = le_attack.transform(y_attack)

    # Oversample the training rows only.
    X_attack_s = scaler.transform(X_attack)
    X_attack_res, y_attack_res = SMOTE(random_state=42).fit_resample(X_attack_s, y_attack_enc)

    # Inverse-frequency weights computed on the resampled labels.
    inv_freq = pd.Series(y_attack_res).value_counts()
    inv_ratio = inv_freq.max() / inv_freq
    sample_weight = pd.Series(y_attack_res).map(inv_ratio).values

    # Xa_te / ya_te come from the training rows and serve only as the
    # early-stopping validation set.
    Xa_tr, Xa_te, ya_tr, ya_te, sw_tr, sw_te = train_test_split(
        X_attack_res, y_attack_res, sample_weight, test_size=0.2, random_state=42, stratify=y_attack_res
    )

    clf_multi = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=96,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        n_jobs=-1,
        random_state=42
    )

    clf_multi.fit(
        Xa_tr, ya_tr,
        sample_weight=sw_tr,
        eval_set=[(Xa_te, ya_te)],
        eval_metric=['multi_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    joblib.dump(clf_multi, 'lgbm_attack_classifier.joblib')

    # Evaluate on the attack rows of the held-out 20% only.
    if not df_attack_test.empty:
        X_attack_test = scaler.transform(df_attack_test[used_features].astype(np.float32).values)
        y_attack_test = le_attack.transform(df_attack_test['AttackType'].astype(str).values)
        y_attack_pred = clf_multi.predict(X_attack_test)

        labels_multi = le_attack.classes_
        # Pass the full class-id list so the matrix and the reports keep one
        # row per class even when a class is absent from the test rows
        # (target_names alone must match the labels actually present).
        class_ids = np.arange(n_classes)
        cm2 = confusion_matrix(y_attack_test, y_attack_pred, labels=class_ids)

        plt.figure(figsize=(12,10))
        sns.heatmap(cm2, annot=True, fmt='d', cmap='YlGnBu', xticklabels=labels_multi, yticklabels=labels_multi)
        plt.title("Confusion Matrix - Phase 2 (Multiclass - LightGBM)")
        plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.tight_layout(); plt.show()

        print("Phase 2 report:")
        print(classification_report(y_attack_test, y_attack_pred, labels=class_ids, target_names=labels_multi))

        # Export the per-class rows (first len(labels_multi) rows) to Excel and HTML.
        report = classification_report(y_attack_test, y_attack_pred, labels=class_ids, target_names=labels_multi, output_dict=True)
        df_report = pd.DataFrame(report).transpose()
        df_report_main = df_report.iloc[:len(labels_multi), :4].round(2)
        df_report_main.rename(columns={'precision':'Độ chính xác','recall':'Độ bao phủ','f1-score':'F1','support':'Số mẫu'}, inplace=True)
        df_report_main.to_excel("bang_phan_loai_ddos.xlsx", index=True)
        with open("classification_report.html", "w", encoding="utf-8") as f:
            f.write(df_report_main.to_html(border=1))
    else:
        print("[WARN] Không có mẫu DDoS trong tập test để đánh giá Phase 2.")


In [None]:

# (Optional) Quick look at the Label and AttackType distributions
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: row count per binary label.
label_counts = df['Label'].value_counts()
label_counts.plot(kind='bar', ax=ax[0], title='Label distribution')

# Right panel: the ten most frequent attack groups.
top_attack_counts = df['AttackType'].value_counts().head(10)
top_attack_counts.plot(kind='bar', ax=ax[1], title='Top-10 AttackType')

plt.tight_layout()
plt.show()
