
# **Imports & cấu hình chung**

In [3]:
# %% [markdown]
# # IDS Notebook — Pipeline 2-Phase + Save/Plot + Resume
# - Lưu mô hình dạng `.h5` vào E:\DACN\results\training\models
# - Ảnh biểu đồ vào E:\DACN\results\training\plots
# - Bảng so sánh vào E:\DACN\results\training\tables
# - Có thể **tiếp tục train** (resume) mà **không phải chạy lại** tiền xử lý.

import os, glob, io, time, json, warnings, joblib, random, pickle, math
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_fscore_support, accuracy_score
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# ==== PATHS (adjust for your machine) ====
CIC2019_DIR = r'E:\DACN\dataset\CICDDoS2019'
CIC2017_DIR = r'E:\DACN\dataset\CICDDoS2017'
UNSW15_DIR  = r'E:\DACN\dataset\UNSW_NB15'
NSLKDD_DIR  = r'E:\DACN\dataset\NSL-KDD'   # contains KDDTrain+.txt, KDDTest+.txt

# ==== where results are saved ====
ROOT_SAVE = Path(r"E:\DACN\results\training")
DIR_MODELS = ROOT_SAVE / "models"
DIR_PLOTS  = ROOT_SAVE / "plots"
DIR_TABLES = ROOT_SAVE / "tables"
# Create the output folders up front so later save calls cannot fail on a missing directory.
for p in [DIR_MODELS, DIR_PLOTS, DIR_TABLES]:
    p.mkdir(parents=True, exist_ok=True)

# ==== ID / time columns to exclude from the feature set ====
# Besides identifiers and timestamps, the set also lists categorical columns
# ('Service', 'state', ...) and raw label columns ('attack_cat', 'label').
EXCLUDE_ID_COLUMNS = True
ID_LIKE_COLS = set([
    'Flow ID','FlowID','Timestamp','StartTime','Start Time','stime','time','Date','datetime',
    'Src IP','Dst IP','Source IP','Destination IP',
    'srcip','dstip','srcip_addr','dstip_addr', 
    'Src Port','Dst Port','Sport','Dport','srcport','dstport',
    'ProtocolName','ProtoName','Service','service','state','attack_cat','label',
    'Unnamed: 0','id','No.','Index'
])
# Candidate label column names, checked in this order (first match wins).
LABEL_CANDS = ["Label","label","Attack","attack","attack_cat","class","Class","target","category","Category","result"]

# Controls distribution skew coming from UNSW (treated as all-attack by this pipeline)
MAX_UNSW_RATIO = 0.30   # at most 30%
RANDOM_STATE = 42
# Seed both NumPy's legacy global RNG and Python's `random` module.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)



# **Giới hạn CPU + TensorFlow threads**

In [4]:
# %%
import multiprocessing as mp

def limit_cpu(fraction: float = 0.90) -> int:
    """
    Cap CPU usage at roughly ``fraction`` of the logical cores.

    The fraction is clamped to [0.1, 1.0]. The resulting thread count is
    exported through the usual BLAS/OpenMP/NumExpr environment variables,
    applied via ``threadpoolctl`` and used to pin the process with
    ``psutil`` CPU affinity. Both of those steps are best-effort: any
    failure (including the package being absent) is ignored.

    Returns:
        The allowed thread count (always >= 1), suitable for
        ``n_jobs`` / ``num_threads`` style parameters.
    """
    upper_bounded = min(1.0, float(fraction))
    fraction = max(0.1, upper_bounded)

    total = os.cpu_count() or mp.cpu_count() or 1
    allow = max(1, math.floor(total * fraction))

    # Thread-count knobs read by the common numeric backends.
    thread_env_vars = (
        "OMP_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "MKL_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_MAX_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
    for var_name in thread_env_vars:
        os.environ[var_name] = str(allow)

    # Best-effort: limit thread pools of libraries that are already loaded.
    try:
        from threadpoolctl import threadpool_limits
        threadpool_limits(allow)
    except Exception:
        pass

    # Best-effort: pin this process to the first `allow` logical CPUs.
    try:
        import psutil
        psutil.Process().cpu_affinity(list(range(allow)))
    except Exception:
        pass

    print(f"[CPU] total={total} | allow={allow} threads (~{fraction*100:.0f}%)")
    return allow

# Apply the CPU cap once; the returned thread count is kept for later use.
threads_allowed = limit_cpu(0.90)

import tensorflow as tf
# Intra-op parallelism gets the full allowance; inter-op gets half of it (at least 1).
tf.config.threading.set_intra_op_parallelism_threads(threads_allowed)
tf.config.threading.set_inter_op_parallelism_threads(max(1, threads_allowed//2))


[CPU] total=28 | allow=25 threads (~90%)



# **Hàm I/O an toàn + Chuẩn hoá nhãn**

In [5]:
def safe_read_any(path: str, nrows=None) -> pd.DataFrame:
    """
    Read a dataset file into a DataFrame, never raising on a read failure.

    Supported inputs:
      * ``*.parquet``: read with ``pd.read_parquet`` (the whole file is
        loaded, then truncated to ``nrows`` if requested);
      * NSL-KDD ``*.txt`` files whose path contains ``kddtrain``/``kddtest``:
        header-less CSV; columns are named ``feat_0..feat_40`` plus
        ``label`` (and ``difficulty`` when 43 columns are present);
      * anything else: CSV, trying several encodings in turn.

    Args:
        path: File path (``str`` or any ``os.PathLike``).
        nrows: Optional maximum number of rows to return.

    Returns:
        The loaded DataFrame, or an empty DataFrame if reading failed
        (a ``[WARN]`` line is printed in that case).
    """
    # Accept pathlib.Path as well as str; str-only methods are used below.
    path = os.fspath(path)
    low = path.lower()
    try:
        if low.endswith(".parquet"):
            df = pd.read_parquet(path)
            return df if nrows is None else df.head(nrows)
        # NSL-KDD .txt files have no header row
        if low.endswith(".txt") and ("kddtrain" in low or "kddtest" in low):
            # Let the parser stop after `nrows` instead of reading everything.
            df = pd.read_csv(path, header=None, nrows=nrows)
            if df.shape[1] == 43:
                cols = [f"feat_{i}" for i in range(41)] + ["label","difficulty"]
            elif df.shape[1] == 42:
                cols = [f"feat_{i}" for i in range(41)] + ["label"]
            else:
                cols = [f"col_{i}" for i in range(df.shape[1])]
            df.columns = cols
            return df
        # Generic CSV: try the likely encodings before falling back to the default
        for enc in ("utf-8-sig","utf-8","cp1252","latin1"):
            try:
                return pd.read_csv(path, encoding=enc, compression="infer", low_memory=False, nrows=nrows)
            except Exception:
                continue
        return pd.read_csv(path, compression="infer", low_memory=False, nrows=nrows)
    except Exception as e:
        print(f"[WARN] skip {os.path.basename(path)}: {e}")
        return pd.DataFrame()

def find_label_col(df: pd.DataFrame):
    """Return the first name in LABEL_CANDS that is a column of ``df``, or None."""
    return next((cand_name for cand_name in LABEL_CANDS if cand_name in df.columns), None)

# Maps raw attack labels to the coarse attack families used as Phase-2 classes.
# Labels that are not listed here are mapped to 'Other' by normalize_labels().
# NOTE(review): the three 'Web Attack' keys contain the U+FFFD replacement
# character, so they only match labels that carry that exact character —
# confirm against the actual label values in the data files.
attack_group_map = {
    'DrDoS_DNS':'DrDoS','DrDoS_SNMP':'DrDoS','DrDoS_NTP':'DrDoS','DrDoS_MSSQL':'DrDoS',
    'DrDoS_SSDP':'DrDoS','DrDoS_UDP':'DrDoS','TFTP':'TFTP',
    'UDP':'UDP','UDPLag':'UDP','Syn':'Syn','MSSQL':'MSSQL','LDAP':'LDAP',
    'DoS slowloris':'DoS','DoS Slowhttptest':'DoS','DoS Hulk':'DoS','DoS GoldenEye':'DoS',
    'Heartbleed':'Other',
    'Web Attack � Brute Force':'Web Attack','Web Attack � XSS':'Web Attack','Web Attack � Sql Injection':'Web Attack',
    'FTP-Patator':'Brute Force','SSH-Patator':'Brute Force','Infiltration':'Other','Bot':'Other',
    'PortScan':'PortScan','NetBIOS':'Other'
}

def normalize_labels(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with unified ``Label`` and ``AttackType`` columns.

    * The label column is located with ``find_label_col`` and renamed to
      ``Label``; if none is found an empty DataFrame is returned.
    * ``Label`` becomes binary: ``'Benign'`` or ``'DDoS'`` (the name used by
      this pipeline for every non-benign row).
    * ``AttackType`` (created from ``Label`` when absent) is collapsed into
      the attack families of ``attack_group_map``; unknown values become
      ``'Other'``.

    Benign detection is case-insensitive and also accepts a numeric zero
    label (``"0"`` / ``"0.0"``), so datasets that encode the class as 0/1
    do not end up with every row flagged as an attack.
    """
    lbl = find_label_col(df)
    if lbl is None:
        return pd.DataFrame()

    df = df.copy()
    df.rename(columns={lbl: "Label"}, inplace=True)
    df["Label"] = df["Label"].astype(str).str.strip()

    # Textual benign markers plus the numeric "0" used by 0/1-encoded labels.
    benign_tokens = ["normal", "benign", "non-attack", "good", "0", "0.0"]
    df.loc[df["Label"].str.lower().isin(benign_tokens), "Label"] = "Benign"

    if "AttackType" not in df.columns:
        df["AttackType"] = df["Label"]

    def group_attack_type(x):
        if pd.isna(x): return 'Other'
        if x == 'Benign': return 'Benign'
        key = str(x)
        if key in attack_group_map:
            return attack_group_map[key]
        # Every 'Web Attack ...' key in the map goes to 'Web Attack'; match by
        # prefix so the result does not depend on how the separator character
        # was decoded (dash, U+FFFD, ...).
        if key.startswith('Web Attack'):
            return 'Web Attack'
        return 'Other'

    df["AttackType"] = df["AttackType"].apply(group_attack_type)
    # Binary target: keep 'Benign', everything else becomes 'DDoS'.
    df["Label"] = df["Label"].where(df["Label"] == "Benign", "DDoS")
    return df



# **Liệt kê file & union features**

In [6]:
# CIC-2019 train/test: walk the dataset folder and split files by filename suffix
cic19_train, cic19_test = [], []
for root,_,files in os.walk(CIC2019_DIR):
    for fn in files:
        if fn.endswith("-training.parquet"): cic19_train.append(os.path.join(root, fn))
        if fn.endswith("-testing.parquet"):  cic19_test.append(os.path.join(root, fn))

# CIC-2017: every parquet file under the folder (recursive)
cic17_files = glob.glob(os.path.join(CIC2017_DIR, "**", "*.parquet"), recursive=True)

# UNSW: skip *_features.csv, *_LIST_EVENTS.csv and *_GT.csv.
# The first two filters are substring matches on the lower-cased basename.
unsw_all = glob.glob(os.path.join(UNSW15_DIR, "**", "*.csv"), recursive=True)
unsw_files = [p for p in unsw_all if ("features" not in os.path.basename(p).lower()
                                      and "list_events" not in os.path.basename(p).lower()
                                      and not os.path.basename(p).lower().endswith("_gt.csv"))]

# NSL-KDD: only .txt files whose name contains "kddtrain" or "kddtest"
nsl_all = glob.glob(os.path.join(NSLKDD_DIR, "**", "*.txt"), recursive=True)
nsl_files = [p for p in nsl_all if ("kddtrain" in os.path.basename(p).lower() or
                                    "kddtest" in os.path.basename(p).lower())]

# Quick sanity check of how many files were discovered per dataset
print("CIC19 train:", len(cic19_train), "CIC19 test:", len(cic19_test))
print("CIC17:", len(cic17_files), "UNSW:", len(unsw_files), "NSL:", len(nsl_files))

def infer_numeric_cols(files: List[str]) -> set:
    """
    Collect the names of numeric feature columns from a sample of files.

    Only the first 10 files are inspected, and only the first 200 rows of
    each. Columns listed in ID_LIKE_COLS and the ``Label`` / ``AttackType``
    columns are never reported. ``"dataset_id"`` is always included.
    """
    reserved = ("Label", "AttackType")
    numeric_names = set()
    for file_path in files[:10]:
        sample = safe_read_any(file_path, nrows=200)
        if sample.empty:
            continue
        sample = normalize_labels(sample)
        if sample.empty:
            continue
        numeric_names.update(
            col
            for col in sample.columns
            if col not in ID_LIKE_COLS
            and col not in reserved
            and pd.api.types.is_numeric_dtype(sample[col])
        )
    numeric_names.add("dataset_id")
    return numeric_names

# Union of the numeric columns seen in each dataset family.
union_cols = set()
union_cols |= infer_numeric_cols(cic19_train + cic19_test)
union_cols |= infer_numeric_cols(cic17_files)
union_cols |= infer_numeric_cols(unsw_files)
union_cols |= infer_numeric_cols(nsl_files)

# Sort so the feature order is deterministic, then persist it next to the results.
FEATURES = sorted(list(union_cols))
print("Tổng số cột numeric union:", len(FEATURES))
joblib.dump({"feature_order": FEATURES}, ROOT_SAVE / "feature_order_union.pkl")


CIC19 train: 7 CIC19 test: 10
CIC17: 8 UNSW: 6 NSL: 4
Tổng số cột numeric union: 156


['E:\\DACN\\results\\training\\feature_order_union.pkl']


# **Load & Normalize datasets + Gộp + Lưu parquet**

In [7]:
def load_and_normalize(files: List[str], dataset_id: int) -> pd.DataFrame:
    """
    Load every file, normalise its labels and tag rows with ``dataset_id``.

    Files that cannot be read, or in which no label column is found, are
    skipped. Returns the row-wise concatenation of the remaining frames, or
    an empty DataFrame when nothing could be loaded.
    """
    frames = []
    for file_path in tqdm(files, desc=f"Load ds{dataset_id}"):
        raw = safe_read_any(file_path)
        if raw.empty:
            continue
        labelled = normalize_labels(raw)
        if labelled.empty:
            continue
        labelled["dataset_id"] = dataset_id
        frames.append(labelled)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Load each dataset family; CIC-2019 train and test share dataset_id 2.
df17  = load_and_normalize(cic17_files, 1)
df19t = load_and_normalize(cic19_train, 2)
# NOTE(review): df19e is loaded here but not referenced again in the visible code.
df19e = load_and_normalize(cic19_test, 2)
dfUN  = load_and_normalize(unsw_files, 3)
dfNSL = load_and_normalize(nsl_files, 4)

print("Shapes:", {k:v.shape for k,v in {"CIC17":df17,"CIC19_train":df19t,"UNSW":dfUN,"NSL":dfNSL}.items()})

# Main merge (2017 + 2019 train)
df_main = pd.concat([df17, df19t], ignore_index=True)

# Limit UNSW: keep only its 'DDoS' rows, capped at MAX_UNSW_RATIO times the
# number of 'DDoS' rows already present in df_main.
if not dfUN.empty:
    cur_ddos = (df_main["Label"]=="DDoS").sum()
    cap = int(MAX_UNSW_RATIO * max(1, cur_ddos))
    dfUN_ddos = dfUN[dfUN["Label"]=="DDoS"]
    if len(dfUN_ddos) > cap:
        dfUN_ddos = dfUN_ddos.sample(cap, random_state=RANDOM_STATE)
    dfUN = dfUN_ddos

df_all = pd.concat([df_main, dfUN, dfNSL], ignore_index=True)
assert not df_all.empty, "Không có dữ liệu!"

# Replace +/-inf with NaN, then fill every missing value (including columns a
# dataset does not have) with 0.
df_all = df_all.replace([np.inf, -np.inf], np.nan).fillna(0)
# Cast object columns to str so each column holds a single type before writing parquet.
for c in df_all.columns:
    if df_all[c].dtype == "object":
        df_all[c] = df_all[c].astype(str)

# Persist the merged frame so later cells can start from the parquet file.
parq_path = ROOT_SAVE / "df_all_union.parquet"
df_all.to_parquet(parq_path, index=False)
print("Đã lưu:", parq_path)


Load ds1: 100%|██████████| 8/8 [00:01<00:00,  4.18it/s]
Load ds2: 100%|██████████| 7/7 [00:00<00:00, 41.07it/s]
Load ds2: 100%|██████████| 10/10 [00:00<00:00, 28.14it/s]
Load ds3: 100%|██████████| 6/6 [00:09<00:00,  1.51s/it]
Load ds4: 100%|██████████| 4/4 [00:00<00:00, 10.45it/s]


Shapes: {'CIC17': (2313810, 80), 'CIC19_train': (125170, 80), 'UNSW': (257673, 47), 'NSL': (185559, 45)}
Đã lưu: E:\DACN\results\training\df_all_union.parquet



# **Đọc lại parquet + Tạo tập train/test + SMOTE + Chuẩn bị DL**

In [8]:
# Reload the merged dataset from disk.
# NOTE(review): this cell still needs FEATURES from the feature-union cell;
# on a fresh kernel it could be restored from feature_order_union.pkl.
df_all = pd.read_parquet(ROOT_SAVE / "df_all_union.parquet")
print("Đọc lại:", df_all.shape)

# Target columns are never features; ID/time-like columns are dropped when enabled.
drop_cols = {'Label','AttackType'}
if EXCLUDE_ID_COLUMNS:
    drop_cols |= {c for c in df_all.columns if c in ID_LIKE_COLS}
feature_candidates = [c for c in FEATURES if c not in drop_cols and c in df_all.columns]
print("Số cột dùng:", len(feature_candidates))

# Feature matrix in the fixed FEATURES order; binary target: 1 = not Benign.
X = df_all.reindex(columns=feature_candidates, fill_value=0.0).astype(np.float32)
y_bin = (df_all['Label'] != 'Benign').astype(int).values

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_bin, test_size=0.2, random_state=RANDOM_STATE, stratify=y_bin
)
print("Train:", X_train_raw.shape, "Test:", X_test_raw.shape)

# Fit the scaler on the training split only, then persist it for later reuse.
scaler = MinMaxScaler()
scaler.fit(X_train_raw.values)
joblib.dump(scaler, ROOT_SAVE / 'scaler_union.pkl')

X_train_s = scaler.transform(X_train_raw.values)
X_test_s  = scaler.transform(X_test_raw.values)

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = sm.fit_resample(X_train_s, y_train)
print("After SMOTE:", X_res.shape, "| pos_ratio:", y_res.mean().round(4))

# Data for the DL models (each feature is treated as one "time step"):
# reshaped to (samples, n_features, 1).
X_train_dl = X_res.astype(np.float32).reshape(-1, X_res.shape[1], 1)
X_test_dl  = X_test_s.astype(np.float32).reshape(-1, X_test_s.shape[1], 1)
y_train_dl = y_res.astype(np.int32)
y_test_dl  = y_test.astype(np.int32)

print("DL shapes:", X_train_dl.shape, X_test_dl.shape)


Đọc lại: (2749109, 166)
Số cột dùng: 156
Train: (2199287, 156) Test: (549822, 156)
After SMOTE: (3386240, 156) | pos_ratio: 0.5
DL shapes: (3386240, 156, 1) (549822, 156, 1)



# **Tiện ích Save/Load .h5 + Plot (CM + Val)**

In [9]:
import h5py

def _stamp():
    return time.strftime("%Y%m%d-%H%M%S")

def save_fig_current(fig, name: str):
    """
    Save ``fig`` as a timestamped PNG inside DIR_PLOTS.

    Returns the path of the written file as a string.
    """
    target = DIR_PLOTS / f"{name}-{_stamp()}.png"
    fig.savefig(target, dpi=150, bbox_inches="tight")
    print(f"[SAVE] Figure -> {target}")
    return str(target)

def save_model_h5_any(model, name: str, extra: dict | None = None):
    """
    Save a model to ``DIR_MODELS/<name>-<timestamp>.h5``.

    * Keras models are written with ``model.save``; ``extra`` (if given) goes
      to a sibling ``.meta.json`` file.
    * Any other model (scikit-learn / XGBoost / LightGBM ...) is pickled and
      stored as the ``"pickle"`` dataset of an HDF5 file, with ``extra``
      serialised into the ``extra_json`` attribute.

    If the native Keras save fails, a warning is printed and the model is
    pickled instead, so the call stays best-effort.

    Args:
        model: The model object to persist.
        name: Base file name (a timestamp and ``.h5`` are appended).
        extra: Optional JSON-serialisable metadata.

    Returns:
        The path of the written ``.h5`` file as a string.
    """
    path = DIR_MODELS / f"{name}-{_stamp()}.h5"

    # Detect Keras by type. Comparing `model.save` against the type of
    # `tf.keras.Model.save` never matches: the former is a bound method, the
    # latter a plain function.
    try:
        import tensorflow as tf
        is_keras = isinstance(model, tf.keras.Model)
    except ImportError:
        is_keras = False

    if is_keras:
        try:
            model.save(str(path))
        except Exception as e:
            # Keep the original best-effort behaviour, but say what happened.
            print(f"[WARN] Keras save failed ({e}); falling back to pickle-in-HDF5")
        else:
            print(f"[SAVE] Keras model -> {path}")
            if extra:
                # Swap only the final suffix; str.replace would rewrite every ".h5" in the path.
                with open(path.with_suffix(".meta.json"), "w", encoding="utf-8") as f:
                    json.dump(extra, f, ensure_ascii=False, indent=2, default=str)
            return str(path)

    # Non-Keras (or Keras fallback): pickle stored as an opaque HDF5 blob.
    blob = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    with h5py.File(path, "w") as h5:
        h5.create_dataset("pickle", data=np.void(blob))
        if extra:
            try:
                h5.attrs["extra_json"] = json.dumps(extra, default=str)
            except Exception:
                h5.attrs["extra_json"] = "{}"
    print(f"[SAVE] Pickled model-in-HDF5 -> {path}")
    return str(path)

def load_model_h5_any(path: str):
    """
    Load a model written by ``save_model_h5_any``.

    The file is first checked for the ``"pickle"`` dataset used for
    non-Keras models; if present, the blob is unpickled. Otherwise (including
    when the file is not HDF5 at all) it is passed to
    ``tf.keras.models.load_model`` and any error from Keras propagates to the
    caller, rather than being replaced by a ``KeyError`` on ``"pickle"``.

    SECURITY: unpickling executes arbitrary code. Only load files that were
    produced by this notebook or another trusted source.
    """
    blob = None
    try:
        with h5py.File(path, "r") as h5:
            if "pickle" in h5:
                blob = bytes(h5["pickle"][()])
    except OSError:
        # Not readable as HDF5: let Keras decide whether it can load it.
        blob = None

    if blob is not None:
        return pickle.loads(blob)

    # Imported lazily so loading a pickled model does not require TensorFlow.
    import tensorflow as tf
    return tf.keras.models.load_model(path)

# --- Plot: Binary
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

def show_cm_and_valacc(model_name, y_true, y_prob, threshold=0.5, savepath=None):
    """
    Plot a binary confusion matrix next to an accuracy bar.

    Predictions are ``y_prob >= threshold``. Accuracy and ROC-AUC are shown
    in the figure title. When ``savepath`` is given the figure is also
    written there before being displayed.
    """
    hard_pred = (y_prob >= threshold).astype(int)
    matrix = confusion_matrix(y_true, hard_pred, labels=[0,1])
    acc = accuracy_score(y_true, hard_pred)
    auc = roc_auc_score(y_true, y_prob)

    fig, (ax_cm, ax_acc) = plt.subplots(1, 2, figsize=(10,4))

    # Left panel: confusion matrix with the raw count written in each cell.
    heat = ax_cm.imshow(matrix, cmap="Blues")
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            ax_cm.text(col, row, str(matrix[row, col]), ha="center", va="center", fontsize=10)
    ax_cm.set_xticks([0,1])
    ax_cm.set_xticklabels(["Benign","DDoS"])
    ax_cm.set_yticks([0,1])
    ax_cm.set_yticklabels(["Benign","DDoS"])
    ax_cm.set_xlabel("Predicted")
    ax_cm.set_ylabel("Actual")
    ax_cm.set_title(f"{model_name} — Confusion Matrix")
    fig.colorbar(heat, ax=ax_cm, fraction=0.046, pad=0.04)

    # Right panel: a single bar showing accuracy, labelled with its value.
    ax_acc.bar([0], [acc], width=0.5)
    ax_acc.set_ylim(0, 1.0)
    ax_acc.set_xticks([0])
    ax_acc.set_xticklabels(["Accuracy"])
    ax_acc.set_ylabel("Value")
    ax_acc.set_title(f"{model_name} — Validation Accuracy")
    label_y = min(acc+0.03, 0.98)
    ax_acc.text(0, label_y, f"{acc:.6f}", ha="center", va="center", fontsize=11, fontweight="bold")

    plt.suptitle(f"{model_name}  |  ACC={acc:.6f}  AUC={auc:.6f}  thr={threshold}", y=1.04, fontsize=11)
    plt.tight_layout()
    if savepath:
        plt.savefig(savepath, dpi=140, bbox_inches="tight")
    plt.show()

# --- Plot: Multiclass
def show_cm_and_valacc_multiclass(model_name, y_true, y_pred, labels, savepath=None):
    """
    Plot a row-normalised multiclass confusion matrix next to an accuracy bar.

    Args:
        model_name: Name used in the plot titles.
        y_true: True class indices (0 .. len(labels)-1).
        y_pred: Predicted class indices.
        labels: Class names, indexed by class id; used for the tick labels.
        savepath: Optional path; when given the figure is saved there.

    Cell colours come from the row-normalised matrix while the text in each
    cell is the raw count. Rows for classes with no true samples are shown as
    all zeros instead of NaN.
    """
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(labels)))

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Guard the division: a class absent from y_true has a zero row sum.
    row_sums = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm.astype('float'), row_sums,
        out=np.zeros(cm.shape, dtype=float), where=row_sums > 0
    )
    im = axes[0].imshow(cm_norm, cmap="Blues", vmin=0, vmax=1)
    for (i, j), v in np.ndenumerate(cm):
        # White text on dark cells, black on light ones.
        axes[0].text(j, i, str(v), ha="center", va="center", fontsize=8,
                     color="white" if cm_norm[i, j] > 0.5 else "black")
    axes[0].set_xticks(np.arange(len(labels))); axes[0].set_xticklabels(labels, rotation=90)
    axes[0].set_yticks(np.arange(len(labels))); axes[0].set_yticklabels(labels)
    axes[0].set_xlabel("Predicted"); axes[0].set_ylabel("Actual")
    axes[0].set_title(f"{model_name} — Confusion Matrix (Normalized)")
    fig.colorbar(im, ax=axes[0], fraction=0.046, pad=0.04)

    axes[1].bar([0], [acc], width=0.5)
    axes[1].set_ylim(0, 1.0)
    axes[1].set_xticks([0]); axes[1].set_xticklabels(["Accuracy"])
    axes[1].set_ylabel("Value")
    axes[1].set_title(f"{model_name} — Validation Accuracy")
    axes[1].text(0, min(acc+0.03, 0.98), f"{acc:.4f}", ha="center", va="center", fontsize=11, fontweight="bold")

    plt.suptitle(f"{model_name}  |  ACC={acc:.4f}", y=1.03, fontsize=11)
    plt.tight_layout()
    if savepath:
        plt.savefig(savepath, dpi=150, bbox_inches="tight")
        print(f"[Saved] {savepath}")
    plt.show()


In [10]:
# ===== EarlyStop theo "cải thiện quá nhỏ" giữa 2 epoch liên tiếp (Keras) =====
from tensorflow.keras import layers, models, callbacks  # type: ignore

class StopOnTinyChange(callbacks.Callback):
    """
    Stop training when the monitored metric barely moves between two
    consecutive epochs.

    - monitor: e.g. 'val_auc' for the binary task or 'val_accuracy' for the
      multiclass task.
    - min_delta: minimum change required to keep training. Training stops as
      soon as |current - previous| < min_delta; the absolute value is used,
      so a small change in either direction triggers the stop.
    """
    def __init__(self, monitor="val_auc", min_delta=1e-4):
        super().__init__()
        self.monitor = monitor
        self.min_delta = float(min_delta)
        self.prev = None  # value of the metric at the previous epoch

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is None:
            # Metric not reported this epoch: keep the previous reference value.
            return

        previous, self.prev = self.prev, current
        if previous is None:
            # First observation: nothing to compare against yet.
            return

        delta = current - previous
        if abs(delta) < self.min_delta:
            print(f"\n[STOP] Δ{self.monitor}={delta:.6f} < {self.min_delta} tại epoch {epoch+1}. Kết thúc huấn luyện.")
            self.model.stop_training = True


In [11]:
RESULTS_BIN = []  # one metrics row per evaluated Phase-1 model, for the comparison table

def eval_binary(y_true, y_prob, name, threshold=0.5):
    """
    Evaluate binary predictions, print a report and log a row in RESULTS_BIN.

    Predictions are ``y_prob >= threshold``. Returns the hard predictions.
    """
    hard_pred = (y_prob >= threshold).astype(int)
    accuracy = accuracy_score(y_true, hard_pred)
    precision, recall, f1_value, _support = precision_recall_fscore_support(
        y_true, hard_pred, average="binary", zero_division=0
    )
    roc_auc = roc_auc_score(y_true, y_prob)

    row = {
        "Model": name,
        "ACC": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1_value,
        "AUC": roc_auc,
    }
    RESULTS_BIN.append(row)

    print(f"\n=== {name} ===")
    print(classification_report(y_true, hard_pred, target_names=["Benign","DDoS"]))
    print("ROC-AUC:", roc_auc)
    return hard_pred



# **Phase 1: Kết hợp (Ensemble Weighted Soft Voting) giữa XGBoost + LightGBM + LSTM.**

In [None]:
# %% Phase-1 BEST — Ensemble XGB + LGBM + LSTM SOTA (opt weights, min recall=0.85)
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score, accuracy_score

# Collect whichever per-model probability vectors already exist in the kernel.
cand = []
if 'yprob_xgb' in globals():         cand.append(('XGB',  yprob_xgb))
if 'yprob_lgb' in globals():         cand.append(('LGBM', yprob_lgb))
if 'yprob_lstm_sota' in globals():   cand.append(('LSTM', yprob_lstm_sota)) # type: ignore
assert len(cand)>=1, "Thiếu yprob_* để ensemble."

# names: tuple of model names; P: one row of probabilities per model.
# y is the Phase-1 test target the ensemble is scored against.
names, probs = zip(*cand)
P = np.vstack(probs); y = y_test

def best_thr(y_true, y_prob, min_rec=0.85):
    """
    Pick the decision threshold with the best F1 among those whose recall is
    at least ``min_rec``; if none qualifies, pick the best F1 overall.

    Returns (threshold, precision, recall, f1) as Python floats.
    """
    p, r, thr = precision_recall_curve(y_true, y_prob)
    # The curve has one more (precision, recall) point than thresholds; drop it.
    prec_at_thr = p[:-1]
    rec_at_thr = r[:-1]
    f1 = 2*prec_at_thr*rec_at_thr/(prec_at_thr+rec_at_thr+1e-9)

    eligible = np.flatnonzero(rec_at_thr >= min_rec)
    if eligible.size > 0:
        idx = eligible[f1[eligible].argmax()]
    else:
        idx = f1.argmax()
    return float(thr[idx]), float(prec_at_thr[idx]), float(rec_at_thr[idx]), float(f1[idx])

def score(w):
    """
    Score one weight vector for the soft-voting ensemble.

    Weights are normalised to sum to 1, the per-model probabilities in ``P``
    are blended, and the threshold is chosen by ``best_thr`` with a minimum
    recall of 0.85. Returns a dict with the normalised weights (``w``), the
    threshold and the resulting P / R / F1 / AUC / ACC.
    """
    weights = np.asarray(w)/(np.sum(w)+1e-12)
    blended = np.dot(weights, P)
    thr, best_p, best_r, best_f1 = best_thr(y, blended, 0.85)
    roc_auc = roc_auc_score(y, blended)
    hard_pred = (blended>=thr).astype(int)
    return {
        "w": weights,
        "thr": thr,
        "P": best_p,
        "R": best_r,
        "F1": best_f1,
        "AUC": roc_auc,
        "ACC": accuracy_score(y, hard_pred),
    }

# Exhaustive search over weight combinations in steps of 0.1 per model.
# score() normalises the weights, so proportional combinations score the same.
# NOTE(review): weights and threshold are selected on y_test, so the reported
# metrics are optimistic — consider tuning on a separate validation split.
grid = np.arange(0,1.1,0.1); best=None
for w in np.array(np.meshgrid(*([grid]*len(names)))).T.reshape(-1,len(names)):
    if w.sum()<=0: continue  # skip the all-zero combination
    res = score(w)
    # Prefer the higher F1; break ties with the higher AUC.
    if (best is None) or (res['F1']>best['F1']) or (res['F1']==best['F1'] and res['AUC']>best['AUC']):
        best = res

print(f"[Ensemble-Binary] {names} -> weights={dict(zip(names, np.round(best['w'],3)))} | thr={best['thr']:.4f}")
print(f"   P={best['P']:.3f} R={best['R']:.3f} F1={best['F1']:.3f} AUC={best['AUC']:.4f} ACC={best['ACC']:.4f}")

# Record the ensemble in the Phase-1 comparison table.
if 'RESULTS_BIN' not in globals(): RESULTS_BIN = []
RESULTS_BIN.append({"Model": f"Ensemble[{'+'.join(names)}] (thr@{best['thr']:.3f})",
                    "ACC": best['ACC'], "Precision": best['P'], "Recall": best['R'],
                    "F1": best['F1'], "AUC": best['AUC']})


[✓] Epoch 1/3 — best_iter: 542


In [None]:
# %% Phase-1 BEST — Ensemble INLINE Validation (ROC/PR + CM + Report, no saving)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, roc_auc_score, precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report, accuracy_score, f1_score,
    precision_score, recall_score
)

# --- helper: best threshold theo F1 (có ràng buộc recall nếu muốn)
def _best_thr(y_true, y_prob, min_rec=0.85):
    """
    Return (threshold, precision, recall, f1) for the best-F1 threshold whose
    recall is at least ``min_rec``; falls back to the best F1 overall when no
    threshold reaches that recall.
    """
    p, r, thr = precision_recall_curve(y_true, y_prob)
    # Align precision/recall with the thresholds array (one element shorter).
    prec_at_thr = p[:-1]
    rec_at_thr = r[:-1]
    f1s = 2*prec_at_thr*rec_at_thr / np.maximum(prec_at_thr + rec_at_thr, 1e-12)

    eligible = np.flatnonzero(rec_at_thr >= min_rec)
    if eligible.size > 0:
        idx = eligible[f1s[eligible].argmax()]
    else:
        idx = int(np.nanargmax(f1s))
    return float(thr[idx]), float(prec_at_thr[idx]), float(rec_at_thr[idx]), float(f1s[idx])

def _show_confusion(y_true, y_pred, title):
    """Draw the 2x2 confusion matrix (labels 0 and 1) with counts and return it."""
    matrix = confusion_matrix(y_true, y_pred, labels=[0,1])
    fig, ax = plt.subplots(figsize=(4.5, 4.0), dpi=140)
    heat = ax.imshow(matrix, interpolation='nearest')
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks([0,1])
    ax.set_yticks([0,1])
    # Write the count in the middle of each cell.
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            ax.text(col, row, f"{matrix[row, col]}", ha='center', va='center')
    fig.colorbar(heat, ax=ax, fraction=0.046, pad=0.04)
    plt.tight_layout()
    plt.show()
    return matrix

# --- dùng kết quả 'best' & 'P' & 'names' từ cell trước
# Re-normalise the best weights from the previous cell and blend the model probabilities.
w_best = np.asarray(best["w"]) / (np.sum(best["w"]) + 1e-12)
yprob_ensemble = np.dot(w_best, P)  # kept for later use if needed

# --- ROC ---
fpr, tpr, _ = roc_curve(y, yprob_ensemble)
auc = roc_auc_score(y, yprob_ensemble)
plt.figure(figsize=(4.8, 4.0), dpi=140)
plt.plot(fpr, tpr, label=f"AUC = {auc:.4f}")
plt.plot([0,1], [0,1], linestyle="--")  # chance-level diagonal
plt.title(f"ROC — Ensemble[{'+'.join(names)}]")
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.legend(loc="lower right")
plt.tight_layout(); plt.show()

# --- PR ---
prec, rec, _ = precision_recall_curve(y, yprob_ensemble)
ap = average_precision_score(y, yprob_ensemble)
plt.figure(figsize=(4.8, 4.0), dpi=140)
plt.plot(rec, prec, label=f"AP = {ap:.4f}")
plt.title(f"Precision–Recall — Ensemble[{'+'.join(names)}]")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.legend(loc="lower left")
plt.tight_layout(); plt.show()

# --- evaluate at two thresholds: 0.5 and the best threshold (min recall = 0.85)
thr_best, p_best, r_best, f1_best = _best_thr(y, yprob_ensemble, min_rec=0.85)
for th, tag in [(0.5, "0.50"), (thr_best, f"F1*={thr_best:.4f}")]:
    ypred = (yprob_ensemble >= th).astype(int)
    acc = accuracy_score(y, ypred)
    f1  = f1_score(y, ypred)
    ppv = precision_score(y, ypred, zero_division=0)
    rr  = recall_score(y, ypred)

    print(f"\n=== Ensemble[{'+'.join(names)}] @ th={tag} ===")
    print(f"ACC={acc:.6f} | F1={f1:.6f} | P={ppv:.6f} | R={rr:.6f} | AUC={auc:.6f} | AP={ap:.6f}")
    print("Classification Report:")
    print(classification_report(y, ypred, digits=4))
    _ = _show_confusion(y, ypred, f"Ensemble — Confusion (th={tag})")

# --- Print the optimal weights (from the previous cell) again for clarity
print("\n[Weights] " + ", ".join([f"{n}={w:.3f}" for n, w in zip(names, w_best)]))
print(f"[Best-threshold] th={thr_best:.6f} | P={p_best:.6f} | R={r_best:.6f} | F1={f1_best:.6f}")

# --- Update RESULTS_BIN (record the ensemble row at the optimal threshold)
# NOTE(review): the previous cell already appends an ensemble row, so running
# both cells leaves two ensemble rows in RESULTS_BIN.
if 'RESULTS_BIN' not in globals():
    RESULTS_BIN = []
RESULTS_BIN.append({
    "Model": f"Ensemble[{'+'.join(names)}] (thr@{thr_best:.3f})",
    "ACC": accuracy_score(y, (yprob_ensemble>=thr_best).astype(int)),
    "Precision": p_best,
    "Recall": r_best,
    "F1": f1_best,
    "AUC": auc
})

# (optional) show the summary table if earlier models have been recorded.
# Any failure here is silently ignored.
try:
    import pandas as pd
    df_res = pd.DataFrame(RESULTS_BIN)
    display(df_res.sort_values(["F1","AUC"], ascending=[False, False]).reset_index(drop=True))
except Exception:
    pass



# **Phase 2: Chuẩn bị dữ liệu đa lớp (AttackType)**

In [None]:
# Lấy tất cả mẫu DDoS
# Phase-2 works on attack traffic only: every row whose binary Label is 'DDoS'.
df_attack = df_all[df_all['Label']=='DDoS'].copy()
X_attack = df_attack.reindex(columns=feature_candidates, fill_value=0.0).astype(np.float32)
# The scaler was fitted on a plain ndarray, so pass .values here as well.
X_attack_s = scaler.transform(X_attack.values)

# Encode the AttackType family names as integer class ids and persist the encoder.
y_attack_txt = df_attack['AttackType'].astype(str).values
le_attack = LabelEncoder()
y_attack = le_attack.fit_transform(y_attack_txt)
num_classes = len(le_attack.classes_)
joblib.dump(le_attack, ROOT_SAVE / "attack_label_encoder_union.pkl")
print("Classes:", list(le_attack.classes_))

# Split BEFORE oversampling, as Phase 1 does. Running SMOTE on the full set
# first would put synthetic points (interpolated from training neighbours)
# into the test split and inflate the reported metrics.
Xa_tr_raw, Xa_te, ya_tr_raw, ya_te = train_test_split(
    X_attack_s, y_attack, test_size=0.2, random_state=RANDOM_STATE, stratify=y_attack
)

# SMOTE multiclass on the training split only. k_neighbors is lowered when the
# rarest class has fewer than 6 training samples (the default k is 5).
min_class_count = int(np.bincount(ya_tr_raw).min())
smote_k = max(1, min(5, min_class_count - 1))
X_attack_res, y_attack_res = SMOTE(
    random_state=RANDOM_STATE, k_neighbors=smote_k
).fit_resample(Xa_tr_raw, ya_tr_raw)

# float16 halves the memory footprint of the tabular matrices
# (at the cost of precision on the scaled features).
Xa_tr = X_attack_res.astype(np.float16)
ya_tr = y_attack_res
Xa_te = Xa_te.astype(np.float16)
print("Train:", Xa_tr.shape, "Test:", Xa_te.shape)

# Stored for the Phase-2 DL models: (samples, n_features, 1) float32 views.
Xa_tr_dl = Xa_tr.astype(np.float32).reshape(-1, Xa_tr.shape[1], 1)
Xa_te_dl = Xa_te.astype(np.float32).reshape(-1, Xa_te.shape[1], 1)


# **Phase 2 - Hybrid Weighted Soft-Voting Ensemble giữa XGBoost (tree model) và LSTM SOTA (deep learning model)**

In [None]:
# %% Phase-2 BEST — Soft-vote XGB + LSTM SOTA (self-healing)
import os, glob, joblib, numpy as np
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# -------- 0) Ensure Phase-2 data exists (Xa_tr, Xa_te, ya_tr, ya_te, Xa_te_dl) --------
# NOTE(review): only the four tabular arrays are checked; Xa_te_dl is not, so
# it is assumed to exist whenever the other four do.
need_build = False
for v in ["Xa_tr","Xa_te","ya_tr","ya_te"]:
    if v not in globals():
        need_build = True
        break

if need_build:
    print("[INFO] Rebuild Phase-2 data from df_all ...")
    # Requires the base variables: df_all, scaler, feature_candidates, RANDOM_STATE
    assert 'df_all' in globals() and 'scaler' in globals() and 'feature_candidates' in globals(), \
        "Thiếu df_all/scaler/feature_candidates. Hãy chạy các cell chuẩn bị dữ liệu trước."
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE  # only needed if SMOTE is enabled below; can be dropped for speed

    # Take the DDoS rows and encode AttackType
    df_attack = df_all[df_all["Label"]=="DDoS"].copy()
    X_attack = df_attack.reindex(columns=feature_candidates, fill_value=0.0).astype(np.float32).values
    X_attack_s = scaler.transform(X_attack)
    y_attack_txt = df_attack["AttackType"].astype(str).values

    if 'le_attack' not in globals():
        le_attack = LabelEncoder()
        y_attack = le_attack.fit_transform(y_attack_txt)
    else:
        # Reuse the existing encoder (if one is already defined)
        y_attack = le_attack.transform(y_attack_txt)

    # NO SMOTE here, for speed (to use SMOTE, enable the two lines below)
    # X_attack_res, y_attack_res = SMOTE(random_state=RANDOM_STATE).fit_resample(X_attack_s, y_attack)
    # Xa_tr, Xa_te, ya_tr, ya_te = train_test_split(X_attack_res, y_attack_res, test_size=0.2, random_state=RANDOM_STATE, stratify=y_attack_res)
    Xa_tr, Xa_te, ya_tr, ya_te = train_test_split(
        X_attack_s, y_attack, test_size=0.2, random_state=RANDOM_STATE, stratify=y_attack
    )

    # DL view used by the LSTM SOTA multiclass model: (samples, n_features, 1)
    Xa_tr_dl = Xa_tr.astype(np.float32).reshape(-1, Xa_tr.shape[1], 1)
    Xa_te_dl = Xa_te.astype(np.float32).reshape(-1, Xa_te.shape[1], 1)

# -------- 1) Ensure xgb_multi exists: load saved pack or (re)train fast --------
if 'xgb_multi' not in globals():
    print("[INFO] xgb_multi not in RAM -> try load saved joblib ...")
    # Look in the configured models directory first (DIR_MODELS, when defined),
    # then in the legacy hard-coded location, then in the working directory.
    patterns = []
    if 'DIR_MODELS' in globals():
        patterns += [
            os.path.join(str(DIR_MODELS), "xgb_attack_union.h5"),
            os.path.join(str(DIR_MODELS), "*.h5"),
        ]
    patterns += [
        r"D:\DACN\results\training\models\xgb_attack_union.h5",
        r"D:\DACN\results\training\models\*.h5",
        "xgb_attack_union.joblib",
        "*.h5",
    ]
    cand = []
    for pat in patterns:
        cand.extend(glob.glob(pat))
    # The patterns overlap, so drop duplicates (keeping order) before sorting
    # newest-first; otherwise the same file would be tried more than once.
    cand = [p for p in dict.fromkeys(cand) if os.path.isfile(p)]
    cand = sorted(cand, key=os.path.getmtime, reverse=True)

    loaded = False
    for p in cand:
        try:
            # Only joblib "packs" (a dict with a "model" entry) are accepted;
            # any other file is skipped.
            pack = joblib.load(p)
            if isinstance(pack, dict) and "model" in pack:
                xgb_multi = pack["model"]
                # Sync metadata from the pack only where the kernel has none.
                if "encoder" in pack and 'le_attack' not in globals(): le_attack = pack["encoder"]
                if "scaler" in pack and 'scaler' not in globals():     scaler = pack["scaler"]
                if "features" in pack and 'feature_candidates' not in globals(): feature_candidates = pack["features"]
                print(f"[OK] Loaded XGB multiclass from: {p}")
                loaded = True
                break
        except Exception as e:
            print(f"[WARN] Failed to load {p}: {e}")

    if not loaded:
        print("[INFO] No saved pack -> quick train XGB multiclass (hist, sample_weight)")
        # Inverse class-frequency weights (fast alternative to SMOTE).
        # Indexing `inv` by label assumes the labels are 0..K-1 and that every
        # class occurs in ya_tr.
        unique, counts = np.unique(ya_tr, return_counts=True)
        inv = counts.max() / counts
        w_tr = inv[ya_tr]

        xgb_multi = XGBClassifier(
            objective="multi:softprob", num_class=len(unique),
            n_estimators=500, max_depth=7, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9,
            tree_method="hist", max_bin=256,
            random_state=RANDOM_STATE, eval_metric="mlogloss"
        )
        xgb_multi.fit(Xa_tr, ya_tr, sample_weight=w_tr, eval_set=[(Xa_te, ya_te)], verbose=False)

proba_xgb = xgb_multi.predict_proba(Xa_te)  # (n, C)

# -------- 2) Optional: DL proba from LSTM SOTA multiclass --------
# Used only when an `lstm_sota_mc` model is present in the kernel; a failed
# prediction is reported and the DL branch is then disabled.
proba_dl = None
if 'lstm_sota_mc' in globals():
    try:
        proba_dl = lstm_sota_mc.predict(Xa_te_dl, batch_size=512, verbose=0)  # type: ignore # (n, C)
    except Exception as e:
        print(f"[WARN] LSTM SOTA multiclass predict failed: {e}")
        proba_dl = None

# -------- 3) Soft-vote or fallback --------
if proba_dl is None:
    # XGBoost alone
    y_pred_best = proba_xgb.argmax(axis=1)
    name = "XGBoost (Multiclass)"
else:
    # Weighted average of the two probability matrices, then argmax per row.
    w_xgb, w_dl = 0.6, 0.4  # 0.7/0.3 is another option to try
    y_pred_best = (w_xgb*proba_xgb + w_dl*proba_dl).argmax(axis=1)
    name = "Ensemble[XGB + LSTM SOTA]"

from sklearn.metrics import classification_report
acc = accuracy_score(ya_te, y_pred_best)
print(f"[Phase-2 BEST] {name} — ACC={acc:.4f}")
print(classification_report(ya_te, y_pred_best, target_names=le_attack.classes_))

# Alias for the final Phase-2 summary cell.
# NOTE(review): despite the name, this may hold XGBoost-only or ensemble predictions.
y_pred_lstm = y_pred_best


In [None]:
# %% Phase-2 BEST — Multiclass INLINE Validation (CM + Reports + AUC/AP OvR)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    f1_score, precision_score, recall_score,
    roc_auc_score, average_precision_score, top_k_accuracy_score
)
import itertools

# --- lấy proba đang dùng để suy ra nhãn (từ cell trước)
# Rebuild the probability matrix behind y_pred_best from the previous cell.
if 'proba_dl' in globals() and proba_dl is not None:
    w_xgb, w_dl = 0.6, 0.4  # must match the weights used in the previous cell
    yproba_mc = w_xgb * proba_xgb + w_dl * proba_dl
else:
    yproba_mc = proba_xgb

# Class names in encoder order, and the number of classes.
classes = list(le_attack.classes_)
C = len(classes)

# --- helpers ---
def _plot_cm(cm, labels, title="Confusion Matrix", normalize=False):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        cm: Square 2-D array indexed as [actual, predicted].
        labels: Class names, one per row/column of `cm`, in matrix order.
        title: Figure title.
        normalize: If True, divide every row by its sum and print cells with
            two decimals; otherwise print the cells as integers.

    Raises:
        ValueError: If `cm` is not square with one row/column per label.
    """
    cm = np.asarray(cm)
    # Size everything from the arguments rather than from a notebook global,
    # so the helper works for any label set.
    n = len(labels)
    if cm.shape != (n, n):
        raise ValueError(f"cm has shape {cm.shape}, expected ({n}, {n}) for {n} labels")

    if normalize:
        # The denominator is floored so rows without any samples yield zeros, not NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            cm = cm.astype('float') / np.maximum(cm.sum(axis=1, keepdims=True), 1e-12)

    fig, ax = plt.subplots(figsize=(1.2 + 0.5*n, 1.0 + 0.5*n), dpi=140)
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks(range(n)); ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_yticks(range(n)); ax.set_yticklabels(labels)

    # Annotate each cell; text turns white on cells above half of the maximum value.
    fmt = ".2f" if normalize else "d"
    thresh = np.nanmax(cm) / 2.0 if np.isfinite(cm).all() else 0.5
    for i, j in itertools.product(range(n), range(n)):
        val = cm[i, j]
        ax.text(j, i, format(val, fmt),
                ha="center", va="center",
                color="white" if np.isfinite(val) and val > thresh else "black")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    plt.tight_layout(); plt.show()

# Integer class ids 0..C-1, shared by every metric that needs the full label set.
label_ids = np.arange(C)

# --- Confusion matrices ---
cm_raw = confusion_matrix(ya_te, y_pred_best, labels=label_ids)
_plot_cm(cm_raw, classes, title=f"{name} — Confusion Matrix (counts)", normalize=False)
_plot_cm(cm_raw, classes, title=f"{name} — Confusion Matrix (row-normalized)", normalize=True)

# --- Classification report (per-class) ---
# `labels` keeps the report aligned with `target_names` even when a class is
# missing from both the test labels and the predictions.
print(f"\n=== {name} — Classification Report ===")
print(classification_report(
    ya_te, y_pred_best,
    labels=label_ids, target_names=classes, digits=4, zero_division=0,
))

# --- Overall metrics (macro/micro/weighted) ---
acc  = accuracy_score(ya_te, y_pred_best)
f1_macro  = f1_score(ya_te, y_pred_best, average="macro")
f1_micro  = f1_score(ya_te, y_pred_best, average="micro")
f1_weight = f1_score(ya_te, y_pred_best, average="weighted")
p_macro   = precision_score(ya_te, y_pred_best, average="macro", zero_division=0)
r_macro   = recall_score(ya_te, y_pred_best, average="macro", zero_division=0)

# --- AUC (OvR) & Average Precision (OvR) from the class probabilities ---
# Each probability-based metric is best-effort: on failure it is reported as NaN,
# and the reason is printed instead of being silently discarded.
try:
    auc_ovr_macro = roc_auc_score(ya_te, yproba_mc, multi_class="ovr", average="macro")
    auc_ovr_weight = roc_auc_score(ya_te, yproba_mc, multi_class="ovr", average="weighted")
except Exception as e:
    print(f"[WARN] AUC OvR not available: {e}")
    auc_ovr_macro = np.nan
    auc_ovr_weight = np.nan

try:
    ap_macro = average_precision_score(ya_te, yproba_mc, average="macro")
    ap_micro = average_precision_score(ya_te, yproba_mc, average="micro")
    ap_weight = average_precision_score(ya_te, yproba_mc, average="weighted")
except Exception as e:
    print(f"[WARN] Average precision not available: {e}")
    ap_macro = ap_micro = ap_weight = np.nan

# --- Top-k accuracy from the class probabilities ---
try:
    top2 = top_k_accuracy_score(ya_te, yproba_mc, k=2, labels=label_ids)
    top3 = top_k_accuracy_score(ya_te, yproba_mc, k=min(3, C), labels=label_ids)
except Exception as e:
    print(f"[WARN] Top-k accuracy not available: {e}")
    top2 = top3 = np.nan

print(f"\n=== {name} — Summary Metrics ===")
print(f"ACC={acc:.6f} | F1(macro)={f1_macro:.6f} | F1(micro)={f1_micro:.6f} | F1(weighted)={f1_weight:.6f}")
print(f"P(macro)={p_macro:.6f} | R(macro)={r_macro:.6f}")
print(f"AUC OvR(macro)={auc_ovr_macro:.6f} | AUC OvR(weighted)={auc_ovr_weight:.6f}")
print(f"AP(macro)={ap_macro:.6f} | AP(micro)={ap_micro:.6f} | AP(weighted)={ap_weight:.6f}")
print(f"Top-2 Acc={top2:.6f} | Top-3 Acc={top3:.6f}")

# --- (optional) fold this run into the Phase-2 comparison table RESULTS_MC ---
# Best-effort: a failure here must not break the cell, but it is reported.
try:
    import pandas as pd
    if 'RESULTS_MC' not in globals():
        RESULTS_MC = []
    # Drop any earlier row for the same model so that re-running this cell
    # updates the entry instead of piling up duplicates.
    RESULTS_MC[:] = [
        row for row in RESULTS_MC
        if not (isinstance(row, dict) and row.get("Model") == name)
    ]
    RESULTS_MC.append({
        "Model": name,
        "ACC": acc,
        "F1_macro": f1_macro,
        "F1_micro": f1_micro,
        "AUC_OvR_macro": auc_ovr_macro,
        "AP_macro": ap_macro,
        "Top2": top2
    })
    df_mc = pd.DataFrame(RESULTS_MC)
    df_mc_sorted = df_mc.sort_values(["F1_macro","ACC"], ascending=[False, False]).reset_index(drop=True)
    if 'display' in globals() or hasattr(__builtins__, 'display'):
        display(df_mc_sorted)
    else:
        # Outside IPython there is no rich display; fall back to plain text.
        print(df_mc_sorted)
except Exception as e:
    print(f"[WARN] Could not update RESULTS_MC: {e}")


# **Dọn RAM**

In [None]:
import gc, sys, types, numpy as np, pandas as pd

KEEP = {
    # artifacts that must survive the cleanup
    "feature_candidates", "scaler", "le_attack",
    "xgb_bin","lgb_bin","cat_bin","lstm","gru","cnn","ae",
    "metrics_phase1","metrics_phase2","best_thresholds",
    # config/seed
    "RANDOM_STATE","split_info"
}

# Only variables of at least this size (in MB) are removed. Small objects
# (paths, constants, imported classes, this cell's own variables) are left alone.
SIZE_MB_THRESHOLD = 100

def nbytes(obj):
    """Return a best-effort size of `obj` in bytes.

    NumPy arrays and pandas objects report their data size; anything else falls
    back to `sys.getsizeof`, which is shallow for containers. Returns 0 if the
    size cannot be determined.
    """
    try:
        if isinstance(obj, np.ndarray): return obj.nbytes
        if isinstance(obj, pd.DataFrame): return obj.memory_usage(deep=True).sum()
        if isinstance(obj, pd.Series): return obj.memory_usage(deep=True)
        return sys.getsizeof(obj)
    except Exception:
        return 0

def sweep_large_globals(namespace, keep, threshold_mb):
    """Delete large variables from `namespace` and report what was removed.

    Running the loop inside a function keeps its loop variables local, so the
    sweep cannot overwrite or delete the notebook variables it is iterating over.

    Args:
        namespace: Mutable mapping of variable name -> object (e.g. `globals()`).
        keep: Names that must never be deleted.
        threshold_mb: Minimum size in MB for a variable to be deleted.

    Returns:
        List of `(name, "<size> MB")` tuples for the deleted variables.
    """
    removed = []
    # Iterate over a snapshot because entries are deleted during the walk.
    for var_name, value in list(namespace.items()):
        if var_name.startswith("_") or var_name in keep:
            continue
        # Modules, functions and classes are code, not data: never delete them.
        if isinstance(value, (types.ModuleType, types.FunctionType, type)):
            continue
        size_mb = nbytes(value) / (1024**2)
        if size_mb >= threshold_mb:
            del namespace[var_name]
            removed.append((var_name, f"{size_mb:.1f} MB"))
    return removed

deleted = sweep_large_globals(globals(), KEEP, SIZE_MB_THRESHOLD)

gc.collect()
print("Đã dọn:", deleted[:10], "... tổng:", len(deleted))
