
# **Imports & cấu hình chung**

In [3]:
# %% [markdown]
# # IDS Notebook — Pipeline 2-Phase + Save/Plot + Resume
# - Lưu mô hình dạng `.h5` vào E:\DACN\results\training\models
# - Ảnh biểu đồ vào E:\DACN\results\training\plots
# - Bảng so sánh vào E:\DACN\results\training\tables
# - Có thể **tiếp tục train** (resume) mà **không phải chạy lại** tiền xử lý.

import os, glob, io, time, json, warnings, joblib, random, pickle, math
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_fscore_support, accuracy_score
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

warnings.filterwarnings("ignore")

# ==== PATHS (adjust for your machine) ====
# Root folders of the four raw datasets; they are walked/globbed recursively later.
CIC2019_DIR = r'E:\DACN\dataset\CICDDoS2019'
CIC2017_DIR = r'E:\DACN\dataset\CICDDoS2017'
UNSW15_DIR  = r'E:\DACN\dataset\UNSW_NB15'
NSLKDD_DIR  = r'E:\DACN\dataset\NSL-KDD'   # contains KDDTrain+.txt, KDDTest+.txt

# ==== output locations ====
# Every artefact (models, plots, tables, cached parquet, scaler) is written under ROOT_SAVE.
ROOT_SAVE = Path(r"E:\DACN\results\training")
DIR_MODELS = ROOT_SAVE / "models"
DIR_PLOTS  = ROOT_SAVE / "plots"
DIR_TABLES = ROOT_SAVE / "tables"
# Create the output folders up front so later save calls cannot fail on a missing directory.
for p in [DIR_MODELS, DIR_PLOTS, DIR_TABLES]:
    p.mkdir(parents=True, exist_ok=True)

# ==== identifier / time-like columns ====
# When EXCLUDE_ID_COLUMNS is True, any column named in ID_LIKE_COLS is kept out of the feature set.
EXCLUDE_ID_COLUMNS = True
ID_LIKE_COLS = set([
    'Flow ID','FlowID','Timestamp','StartTime','Start Time','stime','time','Date','datetime',
    'Src IP','Dst IP','Source IP','Destination IP',
    'srcip','dstip','srcip_addr','dstip_addr', 
    'Src Port','Dst Port','Sport','Dport','srcport','dstport',
    'ProtocolName','ProtoName','Service','service','state','attack_cat','label',
    'Unnamed: 0','id','No.','Index'
])
# Candidate names for the label column, tried in this order (first match wins).
LABEL_CANDS = ["Label","label","Attack","attack","attack_cat","class","Class","target","category","Category","result"]

# Limits the distribution skew introduced by UNSW (treated by this notebook as almost all attack).
# The cap is this fraction of the DDoS rows already present in the CIC-2017 + CIC-2019 training data.
MAX_UNSW_RATIO = 0.30   # at most 30%
# Single seed shared by NumPy, `random`, the train/test splits and SMOTE.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)



# **Giới hạn CPU + TensorFlow threads**

In [4]:
# %%
import multiprocessing as mp

def limit_cpu(fraction: float = 0.90) -> int:
    """
    Restrict this process to roughly `fraction` of the machine's logical CPUs.

    The fraction is clamped to [0.1, 1.0]. The resulting thread count is
    exported through the usual BLAS/OpenMP/NumExpr environment variables,
    applied via `threadpoolctl` when available, and the process is pinned to
    the first `allow` cores via `psutil` when available. Both optional steps
    are best-effort: any failure is ignored.

    Returns the thread count, suitable for `n_jobs` / `num_threads`.
    """
    share = max(0.1, min(1.0, float(fraction)))
    total = os.cpu_count() or mp.cpu_count() or 1
    allow = max(1, math.floor(total * share))

    # Thread-count knobs read by the numeric back-ends.
    thread_env_vars = (
        "OMP_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "MKL_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_MAX_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
    for var in thread_env_vars:
        os.environ[var] = str(allow)

    # Optional: limit thread pools that are already loaded.
    try:
        from threadpoolctl import threadpool_limits
        threadpool_limits(allow)
    except Exception:
        pass

    # Optional: pin the process to cores 0..allow-1.
    try:
        import psutil
        psutil.Process().cpu_affinity(list(range(allow)))
    except Exception:
        pass

    print(f"[CPU] total={total} | allow={allow} threads (~{share*100:.0f}%)")
    return allow

# Apply the CPU cap once; `threads_allowed` is the thread budget used below.
threads_allowed = limit_cpu(0.90)

# TensorFlow is imported only after limit_cpu() has set the thread environment variables
# (presumably so they are in place before TF initialises its runtime — TODO confirm).
import tensorflow as tf
# Intra-op pool gets the full budget; inter-op pool gets half of it (at least 1).
tf.config.threading.set_intra_op_parallelism_threads(threads_allowed)
tf.config.threading.set_inter_op_parallelism_threads(max(1, threads_allowed//2))


[CPU] total=28 | allow=25 threads (~90%)



# **Hàm I/O an toàn + Chuẩn hoá nhãn**

In [5]:
def safe_read_any(path: str, nrows=None) -> pd.DataFrame:
    """
    Read a dataset file into a DataFrame without ever raising.

    Supported inputs:
      * ``.parquet`` files (read fully, then truncated to `nrows` if given);
      * header-less NSL-KDD ``.txt`` files whose path contains "kddtrain" or
        "kddtest" — columns are named ``feat_0..feat_40`` plus ``label``
        (and ``difficulty`` when 43 columns are present);
      * anything else is treated as CSV and tried with several encodings.

    Parameters
    ----------
    path : str
        File to read.
    nrows : int | None
        Maximum number of rows to return; ``None`` returns everything.

    Returns
    -------
    pd.DataFrame
        The data, or an empty DataFrame (after printing a warning) on failure.
    """
    low = path.lower()
    try:
        if low.endswith(".parquet"):
            frame = pd.read_parquet(path)
            return frame if nrows is None else frame.head(nrows)

        # NSL-KDD .txt files have no header row
        if low.endswith(".txt") and ("kddtrain" in low or "kddtest" in low):
            # Let the parser stop after `nrows` rows instead of loading the
            # whole file and slicing afterwards.
            frame = pd.read_csv(path, header=None, nrows=nrows)
            n_cols = frame.shape[1]
            if n_cols == 43:
                cols = [f"feat_{i}" for i in range(41)] + ["label", "difficulty"]
            elif n_cols == 42:
                cols = [f"feat_{i}" for i in range(41)] + ["label"]
            else:
                cols = [f"col_{i}" for i in range(n_cols)]
            frame.columns = cols
            return frame

        # Generic CSV: only a decoding failure justifies retrying with another
        # encoding. Any other error (missing file, parse error, ...) would
        # fail identically for every encoding, so it goes straight to the
        # outer handler instead of re-reading the file several times.
        for enc in ("utf-8-sig", "utf-8", "cp1252", "latin1"):
            try:
                return pd.read_csv(path, encoding=enc, compression="infer",
                                   low_memory=False, nrows=nrows)
            except UnicodeError:
                continue
        return pd.read_csv(path, compression="infer", low_memory=False, nrows=nrows)
    except Exception as e:
        print(f"[WARN] skip {os.path.basename(path)}: {e}")
        return pd.DataFrame()

def find_label_col(df: pd.DataFrame):
    """Return the first entry of LABEL_CANDS that is a column of `df`, or None if none match."""
    return next((cand for cand in LABEL_CANDS if cand in df.columns), None)

# Maps a raw attack label to the coarse attack family used as the multiclass target.
# Labels that are not listed here are mapped to 'Other' by normalize_labels().
# NOTE(review): the 'Web Attack' keys contain U+FFFD (replacement character); presumably this
# matches the mis-encoded dash in the label strings as they are loaded — confirm against the
# data before "fixing" these keys, otherwise those rows silently fall into 'Other'.
attack_group_map = {
    'DrDoS_DNS':'DrDoS','DrDoS_SNMP':'DrDoS','DrDoS_NTP':'DrDoS','DrDoS_MSSQL':'DrDoS',
    'DrDoS_SSDP':'DrDoS','DrDoS_UDP':'DrDoS','TFTP':'TFTP',
    'UDP':'UDP','UDPLag':'UDP','Syn':'Syn','MSSQL':'MSSQL','LDAP':'LDAP',
    'DoS slowloris':'DoS','DoS Slowhttptest':'DoS','DoS Hulk':'DoS','DoS GoldenEye':'DoS',
    'Heartbleed':'Other',
    'Web Attack � Brute Force':'Web Attack','Web Attack � XSS':'Web Attack','Web Attack � Sql Injection':'Web Attack',
    'FTP-Patator':'Brute Force','SSH-Patator':'Brute Force','Infiltration':'Other','Bot':'Other',
    'PortScan':'PortScan','NetBIOS':'Other'
}

def normalize_labels(df: pd.DataFrame) -> pd.DataFrame:
    """
    Harmonise the label columns of one dataset.

    The first column found by `find_label_col` is renamed to ``Label``. The
    result carries two label columns:

      * ``Label``      – binary: ``'Benign'`` or ``'DDoS'`` (any attack);
      * ``AttackType`` – ``'Benign'`` or an attack family from
        `attack_group_map` (``'Other'`` when the raw label is not in the map).

    Benign is recognised from the textual tokens normal/benign/non-attack/good
    (case-insensitive) and from the numeric encodings ``0`` / ``0.0``. Without
    the numeric tokens, a 0/1-coded label column (e.g. UNSW-NB15 ``label``)
    would be marked entirely as attack, including its benign rows.

    Returns an empty DataFrame when no label column exists. The input frame is
    not modified.
    """
    lbl = find_label_col(df)
    if lbl is None:
        return pd.DataFrame()

    out = df.rename(columns={lbl: "Label"})  # rename() returns a new frame

    benign_tokens = {"normal", "benign", "non-attack", "good", "0", "0.0"}
    labels = out["Label"].astype(str).str.strip()
    is_benign = labels.str.lower().isin(benign_tokens)
    labels = labels.where(~is_benign, "Benign")
    out["Label"] = labels

    # Keep a pre-existing AttackType column; otherwise derive it from Label.
    if "AttackType" not in out.columns:
        out["AttackType"] = labels

    def group_attack_type(x):
        if pd.isna(x): return 'Other'
        if x == 'Benign': return 'Benign'
        return attack_group_map.get(str(x), 'Other')

    out["AttackType"] = out["AttackType"].apply(group_attack_type)
    # Collapse every non-benign label into the single positive class.
    out["Label"] = np.where(is_benign, "Benign", "DDoS")
    return out



# **Liệt kê file & union features**

In [6]:
# CIC-2019: collect training and testing parquet files separately, based on the file-name suffix
cic19_train, cic19_test = [], []
for root,_,files in os.walk(CIC2019_DIR):
    for fn in files:
        if fn.endswith("-training.parquet"): cic19_train.append(os.path.join(root, fn))
        if fn.endswith("-testing.parquet"):  cic19_test.append(os.path.join(root, fn))

# CIC-2017: every parquet file under the dataset folder
cic17_files = glob.glob(os.path.join(CIC2017_DIR, "**", "*.parquet"), recursive=True)

# UNSW: skip the auxiliary files *_features.csv, *_LIST_EVENTS.csv and *_GT.csv
unsw_all = glob.glob(os.path.join(UNSW15_DIR, "**", "*.csv"), recursive=True)
unsw_files = [p for p in unsw_all if ("features" not in os.path.basename(p).lower()
                                      and "list_events" not in os.path.basename(p).lower()
                                      and not os.path.basename(p).lower().endswith("_gt.csv"))]

# NSL-KDD: only the .txt files whose name contains "kddtrain" or "kddtest"
nsl_all = glob.glob(os.path.join(NSLKDD_DIR, "**", "*.txt"), recursive=True)
nsl_files = [p for p in nsl_all if ("kddtrain" in os.path.basename(p).lower() or
                                    "kddtest" in os.path.basename(p).lower())]

# Quick sanity check of how many files were discovered per dataset
print("CIC19 train:", len(cic19_train), "CIC19 test:", len(cic19_test))
print("CIC17:", len(cic17_files), "UNSW:", len(unsw_files), "NSL:", len(nsl_files))

def infer_numeric_cols(files: List[str]) -> set:
    """
    Collect the names of numeric feature columns from a sample of `files`.

    Only the first 10 files are inspected, and only the first 200 rows of
    each. Columns listed in ID_LIKE_COLS and the two label columns are
    ignored. ``"dataset_id"`` is always part of the returned set.
    """
    label_cols = ("Label", "AttackType")
    numeric_names = set()
    for path in files[:10]:
        sample = safe_read_any(path, nrows=200)
        if sample.empty:
            continue
        sample = normalize_labels(sample)
        if sample.empty:
            continue
        numeric_names.update(
            col for col in sample.columns
            if col not in ID_LIKE_COLS
            and col not in label_cols
            and pd.api.types.is_numeric_dtype(sample[col])
        )
    numeric_names.add("dataset_id")
    return numeric_names

# Union of the numeric feature names seen in any of the four datasets.
union_cols = set()
union_cols |= infer_numeric_cols(cic19_train + cic19_test)
union_cols |= infer_numeric_cols(cic17_files)
union_cols |= infer_numeric_cols(unsw_files)
union_cols |= infer_numeric_cols(nsl_files)

# Sorted so the feature order is deterministic; the order is persisted next to the other artefacts.
FEATURES = sorted(list(union_cols))
print("Tổng số cột numeric union:", len(FEATURES))
joblib.dump({"feature_order": FEATURES}, ROOT_SAVE / "feature_order_union.pkl")


CIC19 train: 7 CIC19 test: 10
CIC17: 8 UNSW: 6 NSL: 4
Tổng số cột numeric union: 156


['E:\\DACN\\results\\training\\feature_order_union.pkl']


# **Load & Normalize datasets + Gộp + Lưu parquet**

In [7]:
def load_and_normalize(files: List[str], dataset_id: int) -> pd.DataFrame:
    """
    Load every file in `files`, normalise its labels and stack the results.

    Files that cannot be read, or that have no recognisable label column, are
    skipped. Each kept frame is tagged with a ``dataset_id`` column. Returns
    an empty DataFrame when nothing could be loaded.
    """
    frames = []
    for path in tqdm(files, desc=f"Load ds{dataset_id}"):
        frame = safe_read_any(path)
        if not frame.empty:
            frame = normalize_labels(frame)
        if frame.empty:
            continue
        frame["dataset_id"] = dataset_id
        frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Load each dataset; CIC-2019 train and test files share dataset_id 2.
df17  = load_and_normalize(cic17_files, 1)
df19t = load_and_normalize(cic19_train, 2)
# NOTE(review): df19e is loaded here but is not referenced again in the visible code
# (it is not part of df_all) — confirm whether it is meant as a held-out set or can be dropped.
df19e = load_and_normalize(cic19_test, 2)
dfUN  = load_and_normalize(unsw_files, 3)
dfNSL = load_and_normalize(nsl_files, 4)

print("Shapes:", {k:v.shape for k,v in {"CIC17":df17,"CIC19_train":df19t,"UNSW":dfUN,"NSL":dfNSL}.items()})

# main set (CIC-2017 + CIC-2019 training files)
df_main = pd.concat([df17, df19t], ignore_index=True)

# cap UNSW: only its rows labelled DDoS are kept, and at most
# MAX_UNSW_RATIO x (number of DDoS rows already in df_main) of them
if not dfUN.empty:
    cur_ddos = (df_main["Label"]=="DDoS").sum()
    cap = int(MAX_UNSW_RATIO * max(1, cur_ddos))
    dfUN_ddos = dfUN[dfUN["Label"]=="DDoS"]
    if len(dfUN_ddos) > cap:
        dfUN_ddos = dfUN_ddos.sample(cap, random_state=RANDOM_STATE)
    dfUN = dfUN_ddos

df_all = pd.concat([df_main, dfUN, dfNSL], ignore_index=True)
assert not df_all.empty, "Không có dữ liệu!"

# Replace +/-inf and missing values (including columns absent from a given dataset) with 0.
df_all = df_all.replace([np.inf, -np.inf], np.nan).fillna(0)
# Cast object columns to str so that mixed-type columns can be written to parquet.
for c in df_all.columns:
    if df_all[c].dtype == "object":
        df_all[c] = df_all[c].astype(str)

# Cache the merged frame so later cells can start from the parquet file.
parq_path = ROOT_SAVE / "df_all_union.parquet"
df_all.to_parquet(parq_path, index=False)
print("Đã lưu:", parq_path)


Load ds1: 100%|██████████| 8/8 [00:01<00:00,  4.18it/s]
Load ds2: 100%|██████████| 7/7 [00:00<00:00, 41.07it/s]
Load ds2: 100%|██████████| 10/10 [00:00<00:00, 28.14it/s]
Load ds3: 100%|██████████| 6/6 [00:09<00:00,  1.51s/it]
Load ds4: 100%|██████████| 4/4 [00:00<00:00, 10.45it/s]


Shapes: {'CIC17': (2313810, 80), 'CIC19_train': (125170, 80), 'UNSW': (257673, 47), 'NSL': (185559, 45)}
Đã lưu: E:\DACN\results\training\df_all_union.parquet



# **Đọc lại parquet + Tạo tập train/test + SMOTE + Chuẩn bị DL**

In [8]:
# Reload the cached merged dataset (lets this cell run without repeating the loading cell).
df_all = pd.read_parquet(ROOT_SAVE / "df_all_union.parquet")
print("Đọc lại:", df_all.shape)

# Columns that must never be used as features: the labels and (optionally) ID/time-like columns.
drop_cols = {'Label','AttackType'}
if EXCLUDE_ID_COLUMNS:
    drop_cols |= {c for c in df_all.columns if c in ID_LIKE_COLS}
# NOTE(review): FEATURES is taken from the earlier cell's in-memory variable, not from
# feature_order_union.pkl, so a fresh kernel must still run that cell first — confirm intent.
feature_candidates = [c for c in FEATURES if c not in drop_cols and c in df_all.columns]
print("Số cột dùng:", len(feature_candidates))

# Feature matrix (float32) and binary target: 1 for any non-benign row, 0 for benign.
X = df_all.reindex(columns=feature_candidates, fill_value=0.0).astype(np.float32)
y_bin = (df_all['Label'] != 'Benign').astype(int).values

# Stratified 80/20 split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_bin, test_size=0.2, random_state=RANDOM_STATE, stratify=y_bin
)
print("Train:", X_train_raw.shape, "Test:", X_test_raw.shape)

# The scaler is fitted on the training split only and persisted for later reuse.
scaler = MinMaxScaler()
scaler.fit(X_train_raw.values)
joblib.dump(scaler, ROOT_SAVE / 'scaler_union.pkl')

X_train_s = scaler.transform(X_train_raw.values)
X_test_s  = scaler.transform(X_test_raw.values)

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = sm.fit_resample(X_train_s, y_train)
print("After SMOTE:", X_res.shape, "| pos_ratio:", y_res.mean().round(4))

# Data for the deep-learning models: each feature is treated as one "time step" -> (n, n_features, 1)
X_train_dl = X_res.astype(np.float32).reshape(-1, X_res.shape[1], 1)
X_test_dl  = X_test_s.astype(np.float32).reshape(-1, X_test_s.shape[1], 1)
y_train_dl = y_res.astype(np.int32)
y_test_dl  = y_test.astype(np.int32)

print("DL shapes:", X_train_dl.shape, X_test_dl.shape)


Đọc lại: (2749109, 166)
Số cột dùng: 156
Train: (2199287, 156) Test: (549822, 156)
After SMOTE: (3386240, 156) | pos_ratio: 0.5
DL shapes: (3386240, 156, 1) (549822, 156, 1)



# **Tiện ích Save/Load .h5 + Plot (CM + Val)**

In [9]:
import h5py

def _stamp():
    return time.strftime("%Y%m%d-%H%M%S")

def save_fig_current(fig, name: str):
    """
    Write `fig` to DIR_PLOTS as ``<name>-<timestamp>.png`` (150 dpi, tight
    bounding box) and return the file path as a string.
    """
    target = DIR_PLOTS / "{}-{}.png".format(name, _stamp())
    fig.savefig(target, dpi=150, bbox_inches="tight")
    print(f"[SAVE] Figure -> {target}")
    return str(target)

def save_model_h5_any(model, name: str, extra: dict | None = None):
    """
    Save a model to ``DIR_MODELS/<name>-<timestamp>.h5`` and return the path.

    * Keras models are written with ``model.save``; `extra` (if given) goes to
      a side-car ``<same name>.meta.json`` file.
    * Any other model (scikit-learn, XGBoost, LightGBM, ...) is pickled into
      an HDF5 dataset named ``"pickle"``; `extra` is stored as JSON in the
      ``extra_json`` attribute of the file.

    Parameters
    ----------
    model : object
        The estimator to persist.
    name : str
        File-name prefix (a timestamp is appended).
    extra : dict | None
        Optional metadata; values that are not JSON-serialisable are
        stringified.

    Notes
    -----
    A Keras model compiled with a custom loss needs ``custom_objects`` (or
    ``compile=False``) when it is loaded back.
    """
    path = DIR_MODELS / f"{name}-{_stamp()}.h5"

    # Detect Keras models by type. The previous check compared the *bound
    # method* `model.save` with the type of the plain function
    # `tf.keras.Model.save`, which is never true, so Keras models were
    # silently pickled and the side-car metadata was never written.
    try:
        import tensorflow as tf
        is_keras = isinstance(model, tf.keras.Model)
    except ImportError:
        is_keras = False

    if is_keras:
        try:
            model.save(str(path))
        except Exception as exc:
            # Best effort: keep the original fallback, but say why it is used.
            print(f"[WARN] Keras save failed ({exc}); falling back to pickle-in-HDF5")
        else:
            print(f"[SAVE] Keras model -> {path}")
            if extra:
                # with_suffix only touches the final ".h5", never a ".h5"
                # occurring elsewhere in the path.
                with open(path.with_suffix(".meta.json"), "w", encoding="utf-8") as f:
                    json.dump(extra, f, ensure_ascii=False, indent=2, default=str)
            return str(path)

    # Non-Keras model (or Keras fallback): pickle into an HDF5 container
    blob = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    with h5py.File(path, "w") as h5:
        h5.create_dataset("pickle", data=np.void(blob))
        if extra:
            try:
                h5.attrs["extra_json"] = json.dumps(extra, default=str)
            except Exception:
                h5.attrs["extra_json"] = "{}"
    print(f"[SAVE] Pickled model-in-HDF5 -> {path}")
    return str(path)

def load_model_h5_any(path: str):
    """
    Load a model written by `save_model_h5_any`.

    The file is first inspected for the ``"pickle"`` dataset used for
    non-Keras models; if present, the model is unpickled. Otherwise the path
    is handed to ``tf.keras.models.load_model`` and any error from Keras is
    propagated as-is, rather than being hidden behind a ``KeyError: 'pickle'``.

    SECURITY: unpickling executes arbitrary code. Only load files that you
    created yourself or otherwise trust.
    """
    blob = None
    try:
        with h5py.File(path, "r") as h5:
            if "pickle" in h5:
                blob = bytes(h5["pickle"][()])
    except OSError:
        # Not an HDF5 file (e.g. another Keras format) — let Keras try below.
        blob = None

    if blob is not None:
        return pickle.loads(blob)

    # TensorFlow is imported only when it is actually needed, so pickled
    # models can be loaded without it.
    import tensorflow as tf
    return tf.keras.models.load_model(path)

# --- Plot: Binary
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

def show_cm_and_valacc(model_name, y_true, y_prob, threshold=0.5, savepath=None):
    """
    Plot a binary confusion matrix next to an accuracy bar for one model.

    `y_prob` is thresholded at `threshold` to obtain the predictions; ROC-AUC
    is computed from the raw probabilities and shown in the figure title.
    The figure is saved to `savepath` when given, then displayed.
    """
    class_names = ["Benign", "DDoS"]
    y_pred = (y_prob >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    fig, axes = plt.subplots(1, 2, figsize=(10,4))
    ax_cm, ax_acc = axes[0], axes[1]

    # Left panel: confusion matrix with raw counts
    im = ax_cm.imshow(cm, cmap="Blues")
    for (row, col), count in np.ndenumerate(cm):
        ax_cm.text(col, row, str(count), ha="center", va="center", fontsize=10)
    ax_cm.set_xticks([0,1])
    ax_cm.set_xticklabels(class_names)
    ax_cm.set_yticks([0,1])
    ax_cm.set_yticklabels(class_names)
    ax_cm.set_xlabel("Predicted")
    ax_cm.set_ylabel("Actual")
    ax_cm.set_title(f"{model_name} — Confusion Matrix")
    fig.colorbar(im, ax=ax_cm, fraction=0.046, pad=0.04)

    # Right panel: single accuracy bar with its value printed above it
    ax_acc.bar([0], [acc], width=0.5)
    ax_acc.set_ylim(0, 1.0)
    ax_acc.set_xticks([0])
    ax_acc.set_xticklabels(["Accuracy"])
    ax_acc.set_ylabel("Value")
    ax_acc.set_title(f"{model_name} — Validation Accuracy")
    label_y = min(acc+0.03, 0.98)
    ax_acc.text(0, label_y, f"{acc:.6f}", ha="center", va="center", fontsize=11, fontweight="bold")

    fig.suptitle(f"{model_name}  |  ACC={acc:.6f}  AUC={auc:.6f}  thr={threshold}", y=1.04, fontsize=11)
    fig.tight_layout()
    if savepath:
        plt.savefig(savepath, dpi=140, bbox_inches="tight")
    plt.show()

# --- Plot: Multiclass
def show_cm_and_valacc_multiclass(model_name, y_true, y_pred, labels, savepath=None):
    """
    Plot a row-normalised multiclass confusion matrix next to an accuracy bar.

    Parameters
    ----------
    model_name : str
        Used in the panel titles.
    y_true, y_pred : array-like of int
        Encoded class indices in ``0 .. len(labels)-1``.
    labels : sequence of str
        Class names, in encoder order; used as tick labels.
    savepath : str | None
        If given, the figure is also written to this path.

    The colour of a cell is its share of the true-class row; the text in the
    cell is the raw count. A class with no true samples gets an all-zero row
    instead of NaN values from a division by zero.
    """
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(labels)))

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Normalise per true class; rows summing to 0 stay 0.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm, row_totals,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_totals > 0)
    im = axes[0].imshow(cm_norm, cmap="Blues", vmin=0, vmax=1)
    for (i, j), v in np.ndenumerate(cm):
        # White text on dark cells, black on light ones
        axes[0].text(j, i, str(v), ha="center", va="center", fontsize=8,
                     color="white" if cm_norm[i, j] > 0.5 else "black")
    axes[0].set_xticks(np.arange(len(labels))); axes[0].set_xticklabels(labels, rotation=90)
    axes[0].set_yticks(np.arange(len(labels))); axes[0].set_yticklabels(labels)
    axes[0].set_xlabel("Predicted"); axes[0].set_ylabel("Actual")
    axes[0].set_title(f"{model_name} — Confusion Matrix (Normalized)")
    fig.colorbar(im, ax=axes[0], fraction=0.046, pad=0.04)

    axes[1].bar([0], [acc], width=0.5)
    axes[1].set_ylim(0, 1.0)
    axes[1].set_xticks([0]); axes[1].set_xticklabels(["Accuracy"])
    axes[1].set_ylabel("Value")
    axes[1].set_title(f"{model_name} — Validation Accuracy")
    axes[1].text(0, min(acc+0.03, 0.98), f"{acc:.4f}", ha="center", va="center", fontsize=11, fontweight="bold")

    plt.suptitle(f"{model_name}  |  ACC={acc:.4f}", y=1.03, fontsize=11)
    plt.tight_layout()
    if savepath:
        plt.savefig(savepath, dpi=150, bbox_inches="tight")
        print(f"[Saved] {savepath}")
    plt.show()


In [10]:
# ===== EarlyStop theo "cải thiện quá nhỏ" giữa 2 epoch liên tiếp (Keras) =====
from tensorflow.keras import layers, models, callbacks  # type: ignore

class StopOnTinyChange(callbacks.Callback):
    """
    Stop training when the monitored metric barely changes between two
    consecutive epochs.

    - monitor: e.g. 'val_auc' for the binary task, 'val_accuracy' for multiclass.
    - min_delta: minimum absolute change; training stops when |delta| < min_delta.

    The remembered value is cleared at the start of every ``fit()`` call, so
    one instance can be reused across several models or resumed runs without
    the first epoch being compared against a metric from a previous training.
    """
    def __init__(self, monitor="val_auc", min_delta=1e-4):
        super().__init__()
        self.monitor = monitor
        self.min_delta = float(min_delta)
        self.prev = None  # metric value of the previous epoch in the current fit()

    def on_train_begin(self, logs=None):
        # Forget the value left over from an earlier fit().
        self.prev = None

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        curr = logs.get(self.monitor)
        if curr is None:
            # Metric not reported this epoch: nothing to compare.
            return
        if self.prev is not None and abs(curr - self.prev) < self.min_delta:
            print(f"\n[STOP] Δ{self.monitor}={curr - self.prev:.6f} < {self.min_delta} tại epoch {epoch+1}. Kết thúc huấn luyện.")
            self.model.stop_training = True
        self.prev = curr


In [11]:
RESULTS_BIN = []  # one metrics row per evaluated Phase-1 model, for the comparison table

def eval_binary(y_true, y_prob, name, threshold=0.5):
    """
    Evaluate binary predictions, record the metrics and print a report.

    `y_prob` is thresholded at `threshold`. Accuracy, precision, recall, F1
    and ROC-AUC are appended to RESULTS_BIN under `name`. Returns the
    thresholded predictions.
    """
    y_pred = (y_prob >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    roc_auc = roc_auc_score(y_true, y_prob)

    metrics_row = {
        "Model": name,
        "ACC": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1_score,
        "AUC": roc_auc,
    }
    RESULTS_BIN.append(metrics_row)

    print(f"\n=== {name} ===")
    print(classification_report(y_true, y_pred, target_names=["Benign","DDoS"]))
    print("ROC-AUC:", roc_auc)
    return y_pred



# **Phase 1: Deep Learning (Binary — LSTM/GRU/CNN) — Train/Resume + Lưu + Plot**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks # type: ignore

# Training budget for the Phase-1 deep-learning models.
EPOCHS_DL = 20
BATCH = 2048

# Callbacks, all monitoring validation AUC. The same callback objects are reused for all three fits below.
cb = [
    callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor="val_auc", mode="max"),
    callbacks.ReduceLROnPlateau(patience=2, factor=0.5, min_lr=1e-5, monitor="val_auc", mode="max"),
    StopOnTinyChange(monitor="val_auc", min_delta=1e-4),
]

def compile_binary(model):
    """Compile `model` for binary classification (Adam 1e-3, binary cross-entropy, AUC metric) and return it."""
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss="binary_crossentropy",
                  metrics=[tf.keras.metrics.AUC(name="auc")])
    return model

# ----- LSTM -----
# The model is built only if the name is not already defined in the kernel, so re-running
# this cell continues training the existing model (resume) instead of starting over.
if 'lstm' not in globals():
    lstm = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(16, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(lstm)
# NOTE(review): validation_data is the test split, so early stopping and LR scheduling are
# driven by the same data the model is evaluated on below — consider a separate validation split.
lstm.fit(
    X_train_dl, y_train_dl,
    epochs=EPOCHS_DL, batch_size=BATCH,
    validation_data=(X_test_dl, y_test_dl),
    callbacks=cb,
    verbose=1                                      # <<< detailed per-epoch logging enabled
)
yprob_lstm = lstm.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
_ = eval_binary(y_test, yprob_lstm, "LSTM (Binary)")
save_model_h5_any(lstm, "LSTM_Binary_Phase1")
show_cm_and_valacc("LSTM (Binary)", y_test, yprob_lstm, threshold=0.5,
                   savepath=str(DIR_PLOTS / f"LSTM_Binary_Phase1-{_stamp()}.png"))

# ----- GRU -----
# Same build-if-missing / train / evaluate / save / plot sequence as for the LSTM.
if 'gru' not in globals():
    gru = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.GRU(64, return_sequences=True),
        layers.GRU(32),
        layers.Dense(16, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(gru)
gru.fit(X_train_dl, y_train_dl, epochs=EPOCHS_DL, batch_size=BATCH,
        validation_data=(X_test_dl, y_test_dl), callbacks=cb, verbose=0)
yprob_gru = gru.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
_ = eval_binary(y_test, yprob_gru, "GRU (Binary)")
save_model_h5_any(gru, "GRU_Binary_Phase1")
show_cm_and_valacc("GRU (Binary)", y_test, yprob_gru, threshold=0.5,
                   savepath=str(DIR_PLOTS / f"GRU_Binary_Phase1-{_stamp()}.png"))

# ----- 1D-CNN -----
# Convolutions slide over the feature axis (each feature is one step of the input sequence).
if 'cnn' not in globals():
    cnn = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.Conv1D(64, kernel_size=5, activation="relu"),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, activation="relu"),
        layers.GlobalAveragePooling1D(),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(cnn)
cnn.fit(X_train_dl, y_train_dl, epochs=EPOCHS_DL, batch_size=BATCH,
        validation_data=(X_test_dl, y_test_dl), callbacks=cb, verbose=0)
yprob_cnn = cnn.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
_ = eval_binary(y_test, yprob_cnn, "1D-CNN (Binary)")
save_model_h5_any(cnn, "CNN1D_Binary_Phase1")
show_cm_and_valacc("1D-CNN (Binary)", y_test, yprob_cnn, threshold=0.5,
                   savepath=str(DIR_PLOTS / f"CNN1D_Binary_Phase1-{_stamp()}.png"))


[✓] Epoch 1/3 — best_iter: 542


In [None]:
#mới thêm có thể xóa (thêm để test độ hiệu quả)
# %% DL (Binary) — nâng chất & tối ưu ngưỡng cho LSTM/GRU/CNN
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks  # type: ignore
import numpy as np
from sklearn.metrics import precision_recall_curve, classification_report

# EPOCHS_DL = 25         # 20 -> 25 (early stopping still applies)
# BATCH = 2048
# Short run with a larger batch; these values override the EPOCHS_DL/BATCH set by the earlier Phase-1 cell.
EPOCHS_DL = 3
BATCH = 4096

# ----- Callbacks: same trio as the earlier Phase-1 cell (EarlyStopping patience is 4 here) -----
cb = [
    callbacks.EarlyStopping(patience=4, restore_best_weights=True, monitor="val_auc", mode="max"),
    callbacks.ReduceLROnPlateau(patience=2, factor=0.5, min_lr=1e-5, monitor="val_auc", mode="max"),
    StopOnTinyChange(monitor="val_auc", min_delta=1e-4),
]

# ===== Focal loss & tối ưu ngưỡng =====
def focal_binary_loss(gamma=2.0, alpha=0.25):
    """
    Build a binary focal-loss function for Keras.

    Positive samples are weighted by `alpha`, negatives by ``1 - alpha``;
    every sample is further scaled by ``(1 - p_t) ** gamma``, where ``p_t``
    is the probability the model assigns to the true class. The returned
    callable takes ``(y_true, y_pred)`` and yields the mean loss.
    """
    def loss(y_true, y_pred):
        eps = tf.keras.backend.epsilon()
        target = tf.cast(y_true, tf.float32)
        # Keep probabilities away from 0 and 1 so the log stays finite.
        prob = tf.clip_by_value(y_pred, eps, 1.0-eps)
        is_positive = tf.equal(target, 1)
        pt = tf.where(is_positive, prob, 1-prob)
        class_weight = tf.where(is_positive, alpha, 1-alpha)
        weighted_log = class_weight * tf.pow(1-pt, gamma) * tf.math.log(pt)
        return -tf.reduce_mean(weighted_log)
    return loss

def find_best_threshold(y_true, y_prob, min_recall=None):
    """
    Choose the decision threshold with the highest F1 on the PR curve.

    When `min_recall` is given, only thresholds whose recall is at least
    `min_recall` are considered; if none qualifies, the unconstrained best-F1
    threshold is used instead.

    Returns ``(threshold, precision, recall, f1)`` as floats.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # The final precision/recall point has no matching threshold, so drop it.
    f1_scores = 2*precision[:-1]*recall[:-1]/(precision[:-1]+recall[:-1] + 1e-9)

    best = f1_scores.argmax()
    if min_recall is not None:
        eligible = recall[:-1] >= float(min_recall)
        if eligible.any():
            best = np.flatnonzero(eligible)[f1_scores[eligible].argmax()]

    return (float(thresholds[best]), float(precision[best]),
            float(recall[best]), float(f1_scores[best]))

# ===== compile tiện dụng (focal + jit) =====
def compile_binary(model, use_focal=True, lr=1e-3):
    """
    Compile `model` for binary classification and return it.

    Uses Adam with learning rate `lr`, an AUC metric named ``"auc"`` and XLA
    (``jit_compile=True``). The loss is the focal loss (gamma=2.0, alpha=0.25)
    when `use_focal` is true, plain binary cross-entropy otherwise.
    """
    loss_fn = focal_binary_loss(gamma=2.0, alpha=0.25) if use_focal else "binary_crossentropy"
    optimizer = tf.keras.optimizers.Adam(lr)
    auc_metric = tf.keras.metrics.AUC(name="auc")
    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[auc_metric],
        jit_compile=True  # XLA compilation, intended as a speed-up on TF 2.16
    )
    return model

# =========================
# LSTM — slightly upgraded architecture
# =========================
# NOTE(review): the model is only built when `lstm` is not already defined. If the earlier
# Phase-1 cell has run in this kernel, the bidirectional architecture below is NOT used and
# the existing model is re-compiled and trained further — confirm this is intended.
if 'lstm' not in globals():
    lstm = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True,
                                         dropout=0.2, recurrent_dropout=0.2)),
        layers.Bidirectional(layers.LSTM(64, return_sequences=False,
                                         dropout=0.2, recurrent_dropout=0.2)),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(lstm, use_focal=True, lr=1e-3)
lstm.fit(X_train_dl, y_train_dl, epochs=EPOCHS_DL, batch_size=BATCH,
         validation_data=(X_test_dl, y_test_dl), callbacks=cb, verbose=1)

yprob_lstm = lstm.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
# NOTE(review): the threshold is chosen on the test split and then evaluated on that same split.
best_thr_lstm, bp, br, bf1 = find_best_threshold(y_test, yprob_lstm, min_recall=0.85)  # pass None to simply maximise F1
print(f"[LSTM] Best thr={best_thr_lstm:.4f} | P={bp:.3f} R={br:.3f} F1={bf1:.3f}")
print(classification_report(y_test, (yprob_lstm >= best_thr_lstm).astype(int), target_names=['Benign','DDoS']))

# record the result in the Phase-1 summary table using the tuned threshold
_ = eval_binary(y_test, yprob_lstm, f"LSTM (Binary, thr@{best_thr_lstm:.3f})", threshold=best_thr_lstm)
save_model_h5_any(lstm, "LSTM_Binary_Phase1", extra={"best_threshold": best_thr_lstm})
show_cm_and_valacc("LSTM (Binary)", y_test, yprob_lstm, threshold=best_thr_lstm,
                   savepath=str(DIR_PLOTS / f"LSTM_Binary_Phase1-{_stamp()}.png"))

# =========================
# GRU — focal loss + jit + threshold selection
# =========================
if 'gru' not in globals():
    gru = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        layers.GRU(64,  return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(gru, use_focal=True, lr=1e-3)
gru.fit(X_train_dl, y_train_dl, epochs=EPOCHS_DL, batch_size=BATCH,
        validation_data=(X_test_dl, y_test_dl), callbacks=cb, verbose=1)

yprob_gru = gru.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
best_thr_gru, bp, br, bf1 = find_best_threshold(y_test, yprob_gru, min_recall=0.85)
print(f"[GRU] Best thr={best_thr_gru:.4f} | P={bp:.3f} R={br:.3f} F1={bf1:.3f}")
_ = eval_binary(y_test, yprob_gru, f"GRU (Binary, thr@{best_thr_gru:.3f})", threshold=best_thr_gru)
save_model_h5_any(gru, "GRU_Binary_Phase1", extra={"best_threshold": best_thr_gru})
show_cm_and_valacc("GRU (Binary)", y_test, yprob_gru, threshold=best_thr_gru,
                   savepath=str(DIR_PLOTS / f"GRU_Binary_Phase1-{_stamp()}.png"))

# =========================
# 1D-CNN — focal loss + jit + threshold selection
# =========================
if 'cnn' not in globals():
    cnn = models.Sequential([
        layers.Input(shape=(X_train_dl.shape[1], 1)),
        layers.Conv1D(64, kernel_size=5, activation="relu"),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, activation="relu"),
        layers.GlobalAveragePooling1D(),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid")
    ])
compile_binary(cnn, use_focal=True, lr=1e-3)
cnn.fit(X_train_dl, y_train_dl, epochs=EPOCHS_DL, batch_size=BATCH,
        validation_data=(X_test_dl, y_test_dl), callbacks=cb, verbose=1)

yprob_cnn = cnn.predict(X_test_dl, batch_size=BATCH, verbose=0).ravel()
best_thr_cnn, bp, br, bf1 = find_best_threshold(y_test, yprob_cnn, min_recall=0.85)
print(f"[CNN] Best thr={best_thr_cnn:.4f} | P={bp:.3f} R={br:.3f} F1={bf1:.3f}")
_ = eval_binary(y_test, yprob_cnn, f"1D-CNN (Binary, thr@{best_thr_cnn:.3f})", threshold=best_thr_cnn)
save_model_h5_any(cnn, "CNN1D_Binary_Phase1", extra={"best_threshold": best_thr_cnn})
show_cm_and_valacc("1D-CNN (Binary)", y_test, yprob_cnn, threshold=best_thr_cnn,
                   savepath=str(DIR_PLOTS / f"CNN1D_Binary_Phase1-{_stamp()}.png"))



# **Phase 2: Chuẩn bị dữ liệu đa lớp (AttackType)**

In [None]:
# Take every attack (DDoS) row; Phase 2 classifies the attack family only.
df_attack = df_all[df_all['Label']=='DDoS'].copy()
X_attack = df_attack.reindex(columns=feature_candidates, fill_value=0.0).astype(np.float32)
# The scaler was fitted on a plain ndarray, so pass an ndarray here as well.
X_attack_s = scaler.transform(X_attack.values)

# Encode the AttackType labels as integers and persist the encoder.
y_attack_txt = df_attack['AttackType'].astype(str).values
le_attack = LabelEncoder()
y_attack = le_attack.fit_transform(y_attack_txt)
num_classes = len(le_attack.classes_)
joblib.dump(le_attack, ROOT_SAVE / "attack_label_encoder_union.pkl")
print("Classes:", list(le_attack.classes_))

# Split FIRST, then oversample the training part only (same order as Phase 1).
# Oversampling before the split lets synthetic points interpolated from
# future test rows into the training set and fills the test set with
# synthetic samples, which inflates every reported metric.
Xa_tr_raw, Xa_te, ya_tr_raw, ya_te = train_test_split(
    X_attack_s, y_attack, test_size=0.2, random_state=RANDOM_STATE, stratify=y_attack
)

# SMOTE (multiclass) on the training split. k_neighbors keeps SMOTE's default
# of 5 unless the smallest class has fewer than 6 training samples.
_, train_counts = np.unique(ya_tr_raw, return_counts=True)
smote_k = max(1, min(5, int(train_counts.min()) - 1))
X_attack_res, y_attack_res = SMOTE(
    random_state=RANDOM_STATE, k_neighbors=smote_k
).fit_resample(Xa_tr_raw, ya_tr_raw)

# Training set = resampled data; test set = untouched real samples.
Xa_tr, ya_tr = X_attack_res, y_attack_res
# Stored as float16 to reduce memory use.
Xa_tr = Xa_tr.astype(np.float16)
Xa_te = Xa_te.astype(np.float16)
print("Train:", Xa_tr.shape, "Test:", Xa_te.shape)

# 3-D float32 views (samples, features, 1) for the Phase-2 deep-learning models.
Xa_tr_dl = Xa_tr.astype(np.float32).reshape(-1, Xa_tr.shape[1], 1)
Xa_te_dl = Xa_te.astype(np.float32).reshape(-1, Xa_te.shape[1], 1)


In [None]:
# Experimental variant of the Phase-2 cell above.
# %% DL (Multiclass) — LSTM/GRU/CNN for AttackType (Phase-2)
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks  # type: ignore
from sklearn.metrics import classification_report, accuracy_score

EPOCHS_DL2 = 5
BATCH2 = 4096
num_classes = len(le_attack.classes_)
input_len = Xa_tr_dl.shape[1]

# Callbacks shared by the three experimental runs.
cb_mc = [
    callbacks.EarlyStopping(monitor="val_accuracy", patience=4, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(factor=0.5, patience=2, min_lr=1e-5),
    StopOnTinyChange(monitor="val_accuracy", min_delta=1e-4),
]


def mc_head_layers():
    """Return fresh Dense(64) -> Dropout(0.2) -> softmax layers shared by all three nets."""
    return [
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation="softmax"),
    ]


def run_mc_experiment(model, label, tag):
    """Compile, fit, evaluate, save and plot one experimental multiclass model.

    model: Keras model taking (input_len, 1) inputs.
    label: human-readable name used in console output and the plot title.
    tag:   file-name prefix used for the saved model and the PNG.
    Returns the predicted class ids for Xa_te_dl.
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
        jit_compile=True,
    )
    model.fit(
        Xa_tr_dl, ya_tr,
        epochs=EPOCHS_DL2, batch_size=BATCH2,
        validation_data=(Xa_te_dl, ya_te),
        callbacks=cb_mc, verbose=1,
    )
    class_probs = model.predict(Xa_te_dl, batch_size=BATCH2, verbose=0)
    predicted = tf.argmax(class_probs, axis=1).numpy()
    print(f"\n{label} (Multiclass) — ACC:", accuracy_score(ya_te, predicted))
    print(classification_report(ya_te, predicted, target_names=le_attack.classes_))
    save_model_h5_any(
        model, f"{tag}_Multiclass_Phase2",
        extra={"classes": list(le_attack.classes_), "feature_order": feature_candidates},
    )
    show_cm_and_valacc_multiclass(
        f"{label} (Multiclass AttackType)",
        ya_te, predicted, labels=le_attack.classes_,
        savepath=str(DIR_PLOTS / f"{tag}_Multiclass_Phase2-{_stamp()}.png"),
    )
    return predicted


# ---------- LSTM (Multiclass) ----------
# Reuse the model already living in the kernel (resume); build it only once.
if 'lstm_mc' not in globals():
    lstm_mc = models.Sequential([
        layers.Input(shape=(input_len, 1)),
        layers.Bidirectional(
            layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        layers.Bidirectional(
            layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)),
        *mc_head_layers(),
    ])
y_pred_lstm_mc = run_mc_experiment(lstm_mc, "LSTM", "LSTM")

# ---------- GRU (Multiclass) ----------
if 'gru_mc' not in globals():
    gru_mc = models.Sequential([
        layers.Input(shape=(input_len, 1)),
        layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        layers.GRU(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
        *mc_head_layers(),
    ])
y_pred_gru_mc = run_mc_experiment(gru_mc, "GRU", "GRU")

# ---------- 1D-CNN (Multiclass) ----------
if 'cnn_mc' not in globals():
    cnn_mc = models.Sequential([
        layers.Input(shape=(input_len, 1)),
        layers.Conv1D(64, kernel_size=5, activation="relu"),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, activation="relu"),
        layers.GlobalAveragePooling1D(),
        *mc_head_layers(),
    ])
y_pred_cnn_mc = run_mc_experiment(cnn_mc, "1D-CNN", "CNN1D")



# **Phase 2: Deep Learning (Multiclass — LSTM/GRU/CNN) — Train/Resume + Lưu + Plot**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks # type: ignore
from sklearn.metrics import accuracy_score

print("\n=== Phase 2 — Deep Learning (Multiclass AttackType) ===")
EPOCHS_DL2 = 25
BATCH2 = 2048
num_classes = len(le_attack.classes_)
print("Attack classes:", list(le_attack.classes_))

# Early stopping on validation accuracy plus LR decay, shared by all three models.
cb2 = [
    callbacks.EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(factor=0.5, patience=2, min_lr=1e-5),
]

# Every model consumes one scalar per feature position: (n_features, 1).
seq_shape = (Xa_tr.shape[1], 1)


def dense_softmax_head():
    """Return fresh Dense(64, relu) -> Dense(num_classes, softmax) layers."""
    return [
        layers.Dense(64, activation="relu"),
        layers.Dense(num_classes, activation="softmax"),
    ]


def train_eval_phase2(model, label, tag):
    """Compile, train, evaluate, save and plot one Phase-2 multiclass model.

    model: Keras model accepting `seq_shape` inputs.
    label: display name used in console output and the plot title.
    tag:   file-name prefix for the saved model and the PNG.
    Returns the predicted class ids for Xa_te_dl.
    """
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    print(f"\n[Train] {label} (Multiclass)")
    model.fit(
        Xa_tr_dl, ya_tr,
        epochs=EPOCHS_DL2, batch_size=BATCH2,
        validation_data=(Xa_te_dl, ya_te),
        callbacks=cb2, verbose=1,
    )
    class_probs = model.predict(Xa_te_dl, batch_size=BATCH2, verbose=0)
    predicted = np.argmax(class_probs, axis=1)
    print(f"{label} (Multiclass) — Accuracy:", accuracy_score(ya_te, predicted))
    print(classification_report(ya_te, predicted, target_names=le_attack.classes_))
    save_model_h5_any(model, f"{tag}_Multiclass_Phase2", extra={"classes": list(le_attack.classes_)})
    show_cm_and_valacc_multiclass(
        f"{label} (Multiclass AttackType)", ya_te, predicted, labels=le_attack.classes_,
        savepath=str(DIR_PLOTS / f"{tag}_Multiclass_Phase2-{_stamp()}.png"),
    )
    return predicted


# LSTM — built only when no model of that name is alive in the kernel (resume).
if 'lstm_multi' not in globals():
    lstm_multi = models.Sequential([
        layers.Input(shape=seq_shape),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        *dense_softmax_head(),
    ])
y_pred_lstm = train_eval_phase2(lstm_multi, "LSTM", "LSTM")

# GRU
if 'gru_multi' not in globals():
    gru_multi = models.Sequential([
        layers.Input(shape=seq_shape),
        layers.GRU(64, return_sequences=True),
        layers.GRU(32),
        *dense_softmax_head(),
    ])
y_pred_gru = train_eval_phase2(gru_multi, "GRU", "GRU")

# 1D-CNN
if 'cnn_multi' not in globals():
    cnn_multi = models.Sequential([
        layers.Input(shape=seq_shape),
        layers.Conv1D(64, kernel_size=5, activation="relu"),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, activation="relu"),
        layers.GlobalAveragePooling1D(),
        *dense_softmax_head(),
    ])
y_pred_cnn = train_eval_phase2(cnn_multi, "1D-CNN", "CNN1D")


In [None]:
# ===== Phase-2 (Optimized) - Multiclass: LSTM / GRU / 1D-CNN / Hybrid + Ensemble =====
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks # type: ignore
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report

# reproducible-ish
tf.random.set_seed(42)
np.random.seed(42)

print("\n=== Phase 2 — Deep Learning (Multiclass AttackType) — OPTIMIZED ===")
EPOCHS_DL2 = 25
BATCH2 = 512                    # smaller batch size for steadier gradients
NUM_CLASSES = len(le_attack.classes_)
print("Attack classes:", list(le_attack.classes_))

# ----- 1) Preprocessing: scaling + reshape -----
# A dedicated name is used on purpose: rebinding the global `scaler` would replace
# the fitted Phase-1 scaler that the Phase-2 data-preparation cell applies to raw
# features, breaking any re-run of that cell.
scaler_std = StandardScaler()
# Xa_tr / Xa_te are cached as float16; standardise in float32 to avoid precision loss.
Xa_tr_s = scaler_std.fit_transform(Xa_tr.astype(np.float32))
Xa_te_s = scaler_std.transform(Xa_te.astype(np.float32))
# The *_OPT models only work on standardised inputs, so keep the scaler with them.
joblib.dump(scaler_std, ROOT_SAVE / "phase2_opt_standard_scaler.pkl")
# Separate names so the un-standardised Xa_tr_dl / Xa_te_dl used by the other
# Phase-2 cells are not silently overwritten.
Xa_tr_dl_std = Xa_tr_s[..., np.newaxis]   # (N, T, 1)
Xa_te_dl_std = Xa_te_s[..., np.newaxis]
INPUT_LEN = Xa_tr_dl_std.shape[1]

# ----- 2) class weights -----
# Map each weight to its actual class id instead of assuming ids are 0..k-1.
classes_present = np.unique(ya_tr)
cw_vals = compute_class_weight('balanced', classes=classes_present, y=ya_tr)
class_weights = {int(c): float(w) for c, w in zip(classes_present, cw_vals)}
print("[INFO] class_weights sample:", list(class_weights.items())[:4])

# ----- 3) Callbacks -----
stamp = _stamp()
ckpt_base = f"Phase2_best-{{name}}-{stamp}.h5"  # {name} is filled in per model


def make_cbs(name):
    """Build a fresh callback list whose checkpoint file is specific to `name`.

    Checkpoints are model files, so they are written to DIR_MODELS.
    """
    path = os.path.join(str(DIR_MODELS), ckpt_base.format(name=name))
    return [
        callbacks.ModelCheckpoint(filepath=path, monitor="val_accuracy", save_best_only=True, verbose=1),
        callbacks.ReduceLROnPlateau(monitor="val_accuracy", factor=0.5, patience=2, min_lr=1e-6, verbose=1),
        callbacks.EarlyStopping(monitor="val_accuracy", patience=5, restore_best_weights=True, verbose=1)
    ]


# Not used by the training below; kept only so existing references keep working.
common_cbs = make_cbs("tmp")

# ----- 4) Models definitions (improved) -----

def _finish_classifier(inp, x, name, dense_units, drop_rate, lr):
    """Append Dense -> Dropout -> softmax to `x`, then build and compile the model."""
    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(drop_rate)(x)
    out = layers.Dense(NUM_CLASSES, activation="softmax")(x)
    m = models.Model(inputs=inp, outputs=out, name=name)
    m.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
              loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return m


def build_lstm(name="LSTM"):
    """Two stacked LSTMs (128 -> 64) with dropout and layer normalisation."""
    inp = layers.Input(shape=(INPUT_LEN, 1))
    x = layers.LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.15)(inp)
    x = layers.LayerNormalization()(x)
    x = layers.LSTM(64, dropout=0.3, recurrent_dropout=0.15)(x)
    return _finish_classifier(inp, x, name, dense_units=128, drop_rate=0.4, lr=1e-4)


def build_gru(name="GRU"):
    """Two stacked GRUs (128 -> 64) with dropout and layer normalisation."""
    inp = layers.Input(shape=(INPUT_LEN, 1))
    x = layers.GRU(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.15)(inp)
    x = layers.LayerNormalization()(x)
    x = layers.GRU(64, dropout=0.3, recurrent_dropout=0.15)(x)
    return _finish_classifier(inp, x, name, dense_units=128, drop_rate=0.4, lr=5e-4)


def build_cnn(name="CNN1D"):
    """Three Conv1D stages (128, 128, 64 filters) followed by global average pooling."""
    inp = layers.Input(shape=(INPUT_LEN, 1))
    x = layers.Conv1D(128, kernel_size=7, activation="relu", padding="same")(inp)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(128, kernel_size=5, activation="relu", padding="same")(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(64, kernel_size=3, activation="relu", padding="same")(x)
    x = layers.GlobalAveragePooling1D()(x)
    return _finish_classifier(inp, x, name, dense_units=128, drop_rate=0.4, lr=5e-4)


def build_hybrid(name="Hybrid_BiLSTM_GRU"):
    """Bidirectional LSTM(64) feeding a GRU(64), with a wider dense head."""
    inp = layers.Input(shape=(INPUT_LEN, 1))
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.15))(inp)
    x = layers.LayerNormalization()(x)
    x = layers.GRU(64, dropout=0.3, recurrent_dropout=0.15)(x)
    return _finish_classifier(inp, x, name, dense_units=256, drop_rate=0.45, lr=1e-4)


# build models
lstm_opt = build_lstm("LSTM_OPT")
gru_opt  = build_gru("GRU_OPT")
cnn_opt  = build_cnn("CNN1D_OPT")
hybrid   = build_hybrid("HYBRID_OPT")

# ----- 5) Train models (with class_weight) -----

def train_eval_opt(model, label, tag, train_label=None):
    """Train one optimised model, then evaluate, save and plot it.

    model:       compiled Keras model taking (INPUT_LEN, 1) inputs.
    label:       display name used in the accuracy line and the plot title.
    tag:         file-name prefix for the checkpoint, saved model and PNG.
    train_label: optional name for the "[Train]" banner (defaults to `label`).
    Returns (class probabilities, predicted class ids) for the test tensor, so
    the ensemble can reuse the probabilities without a second predict pass.

    NOTE(review): the test split doubles as validation data for checkpointing
    and early stopping, so the reported accuracy is presumably optimistic.
    """
    print(f"\n[Train] {train_label or label} (Optimized)")
    model.fit(Xa_tr_dl_std, ya_tr, epochs=EPOCHS_DL2, batch_size=BATCH2,
              validation_data=(Xa_te_dl_std, ya_te), callbacks=make_cbs(f"{tag}_OPT"),
              class_weight=class_weights, verbose=1)
    probs = model.predict(Xa_te_dl_std, batch_size=BATCH2, verbose=0)   # (N, NUM_CLASSES)
    y_pred = np.argmax(probs, axis=1)
    print(f"{label} (Optimized) — Accuracy:", accuracy_score(ya_te, y_pred))
    print(classification_report(ya_te, y_pred, target_names=le_attack.classes_))
    save_model_h5_any(model, f"{tag}_Multiclass_Phase2_OPT", extra={"classes": list(le_attack.classes_)})
    show_cm_and_valacc_multiclass(f"{label} (Multiclass AttackType) — OPT", ya_te, y_pred,
                                  labels=le_attack.classes_,
                                  savepath=str(DIR_PLOTS / f"{tag}_Multiclass_Phase2_OPT-{_stamp()}.png"))
    return probs, y_pred


p_lstm, y_pred_lstm   = train_eval_opt(lstm_opt, "LSTM", "LSTM")
p_gru,  y_pred_gru    = train_eval_opt(gru_opt, "GRU", "GRU")
p_cnn,  y_pred_cnn    = train_eval_opt(cnn_opt, "1D-CNN", "CNN1D")
p_hyb,  y_pred_hybrid = train_eval_opt(hybrid, "Hybrid", "HYBRID", train_label="Hybrid BiLSTM+GRU")

# ----- 6) Ensemble: soft voting (average predicted probabilities) -----
print("\n[ENSEMBLE] Soft voting of LSTM + GRU + CNN + Hybrid")

# equal weights (adjust here to favour a stronger model)
p_avg = (p_lstm + p_gru + p_cnn + p_hyb) / 4.0
y_pred_ens = np.argmax(p_avg, axis=1)

print("Ensemble (soft voting) — Accuracy:", accuracy_score(ya_te, y_pred_ens))
print(classification_report(ya_te, y_pred_ens, target_names=le_attack.classes_))
show_cm_and_valacc_multiclass("Ensemble (LSTM+GRU+CNN+Hybrid) — OPT", ya_te, y_pred_ens, labels=le_attack.classes_,
                              savepath=str(DIR_PLOTS / f"Ensemble_Multiclass_Phase2_OPT-{_stamp()}.png"))


In [None]:
# %% DCN v2 — Phase-2 (Multiclass AttackType)
# Yêu cầu sẵn có: Xa_tr, Xa_te, ya_tr, ya_te, le_attack, DIR_PLOTS, _stamp,
# save_model_h5_any, show_cm_and_valacc_multiclass, feature_candidates
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks # type: ignore
from sklearn.metrics import classification_report, accuracy_score

def DCN_v2_block(x, depth=4, name="dcn_mc"):
    """Stack `depth` DCN-v2 cross layers on top of `x`.

    Each layer computes x_{l+1} = x0 * (W_l x_l + b_l) + x_l, where x0 is the
    block input, `*` is the element-wise product, and W_l / b_l are the kernel
    and bias of a Dense layer as wide as the input.

    x:     symbolic Keras tensor whose last dimension is the feature width.
    depth: number of cross layers; 0 returns `x` unchanged.
    name:  prefix for the generated layer names (must be unique per model).
    Returns the output tensor of the last cross layer (same width as `x`).
    """
    x0 = x
    width = x.shape[-1]
    for i in range(depth):
        xl = layers.Dense(width, use_bias=True, name=f"{name}_w{i+1}")(x)
        # Explicit Multiply layer instead of `x0 * xl`: the product becomes a
        # named layer in the graph rather than an anonymous op wrapper.
        crossed = layers.Multiply(name=f"{name}_mul{i+1}")([x0, xl])
        x = layers.Add(name=f"{name}_add{i+1}")([crossed, x])
    return x

# Number of output units = number of encoded AttackType classes.
num_classes = len(le_attack.classes_)

# DCN works on the flat feature vector, so Xa_tr / Xa_te are used directly
# (not the (N, T, 1) tensors built for the recurrent/convolutional models).
inp = layers.Input(shape=(Xa_tr.shape[1],))
# Cross (wide) branch
cross = DCN_v2_block(inp, depth=4, name="dcn_mc")
# Deep branch: 256 -> 128 -> 64 MLP with dropout after the first two layers
deep = layers.Dense(256, activation="relu")(inp)
deep = layers.Dropout(0.2)(deep)
deep = layers.Dense(128, activation="relu")(deep)
deep = layers.Dropout(0.2)(deep)
deep = layers.Dense(64, activation="relu")(deep)
# Combine both branches and classify
h = layers.Concatenate()([cross, deep])
h = layers.Dense(128, activation="relu")(h)
out = layers.Dense(num_classes, activation="softmax")(h)

dcn_mc = models.Model(inp, out, name="DCNv2_Multiclass")
dcn_mc.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
               loss="sparse_categorical_crossentropy",
               metrics=["accuracy"])

# NOTE: this rebinds the global `cb2` that the earlier Phase-2 cell also defines.
cb2 = [
    callbacks.EarlyStopping(patience=4, restore_best_weights=True, monitor="val_accuracy"),
    callbacks.ReduceLROnPlateau(patience=2, factor=0.5, min_lr=1e-5),
]

# NOTE(review): the test split is also the validation data that drives early
# stopping, so the accuracy printed below is presumably optimistic.
dcn_mc.fit(Xa_tr, ya_tr, epochs=30, batch_size=2048,
           validation_data=(Xa_te, ya_te), callbacks=cb2, verbose=1)

# Predicted class id = index of the largest softmax output per row.
ya_pred_dcn = tf.argmax(dcn_mc.predict(Xa_te, batch_size=4096, verbose=0), axis=1).numpy()

print("\nDCN v2 (Multiclass) — Accuracy:", accuracy_score(ya_te, ya_pred_dcn))
print(classification_report(ya_te, ya_pred_dcn, target_names=le_attack.classes_))

# Best effort: if the call with `savepath` raises for any reason, call the
# helper again without it.
try:
    show_cm_and_valacc_multiclass("DCN v2 (Multiclass AttackType)",
                                  ya_te, ya_pred_dcn, labels=le_attack.classes_,
                                  savepath=str(DIR_PLOTS / f"DCNv2_Multiclass_Phase2-{_stamp()}.png"))
except Exception:
    show_cm_and_valacc_multiclass("DCN v2 (Multiclass AttackType)",
                                  ya_te, ya_pred_dcn, labels=le_attack.classes_)

# Persist the model with the class names and feature order needed at inference.
save_model_h5_any(dcn_mc, "DCNv2_Multiclass_Phase2",
                  extra={"classes": list(le_attack.classes_),
                         "feature_order": feature_candidates})


# **Dọn RAM**

In [None]:
import gc, sys, types, numpy as np, pandas as pd

KEEP = {
    # artifacts that must survive the cleanup
    "feature_candidates", "scaler", "le_attack",
    "xgb_bin","lgb_bin","cat_bin","lstm","gru","cnn","ae",
    "metrics_phase1","metrics_phase2","best_thresholds",
    # config/seed
    "RANDOM_STATE","split_info"
}

# Only variables of at least this many MB are removed. A threshold of 0 would
# delete every non-function global (paths, constants, imported classes, even
# KEEP itself), which breaks the rest of the notebook.
SIZE_MB_THRESHOLD = 100

# Names that are never removed, whatever the threshold: IPython internals and
# the cleaner's own configuration.
PROTECTED = {
    "In", "Out", "get_ipython", "exit", "quit",
    "KEEP", "PROTECTED", "SIZE_MB_THRESHOLD",
}


def nbytes(obj):
    """Return the approximate size of `obj` in bytes (0 if it cannot be measured).

    NumPy arrays and pandas objects report their data size; anything else falls
    back to `sys.getsizeof`, which is shallow.
    """
    try:
        if isinstance(obj, np.ndarray): return obj.nbytes
        if isinstance(obj, pd.DataFrame): return obj.memory_usage(deep=True).sum()
        if isinstance(obj, pd.Series): return obj.memory_usage(deep=True)
        return sys.getsizeof(obj)
    except Exception:
        return 0


def free_large_globals(ns, keep, min_mb, protected=()):
    """Delete large entries from the namespace dict `ns` and report them.

    ns:        namespace to clean, e.g. `globals()`; modified in place.
    keep:      names that must not be deleted.
    min_mb:    entries whose size (per `nbytes`) is >= this many MB are deleted.
    protected: further names to leave alone.
    Names starting with "_", modules, functions, builtins and classes are
    always skipped.
    Returns a list of (name, "<size> MB") tuples for the deleted entries.
    """
    skip_types = (types.ModuleType, types.FunctionType, types.BuiltinFunctionType, type)
    removed = []
    # Iterate over a snapshot because entries are deleted during the loop.
    for name, val in list(ns.items()):
        if name.startswith("_") or name in keep or name in protected:
            continue
        if isinstance(val, skip_types):
            continue
        mb = nbytes(val) / (1024**2)
        if mb >= min_mb:
            del ns[name]
            removed.append((name, f"{mb:.1f} MB"))
    return removed


deleted = free_large_globals(globals(), KEEP, SIZE_MB_THRESHOLD, PROTECTED)

gc.collect()
print("Đã dọn:", deleted[:10], "... tổng:", len(deleted))
