In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler # <<< Quay lại StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import time
import warnings
from sklearn.metrics import f1_score
import joblib
# Disable pandas' column/row truncation so printed DataFrames/Series are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Select the compute device (GPU if available, otherwise CPU)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Sử dụng thiết bị: {DEVICE}\n")

# ===================================================================
# CÁC LỚP MODEL (Deep AE Enhanced Capacity)
# ===================================================================

class AdvancedDimReducerAE(nn.Module):
    """
    Model 1: dimensionality-reducing autoencoder.

    Encoder: input_dim -> 128 -> 64 -> latent_dim.
    Decoder: latent_dim -> 64 -> 128 -> input_dim.
    Every hidden layer is Linear + BatchNorm1d + ReLU; the latent and the
    output projections are plain Linear layers.
    """

    def __init__(self, input_dim, latent_dim=32):
        super().__init__()
        wide, narrow = 128, 64

        # Encoder layers are instantiated before decoder layers so that
        # parameter initialisation consumes the RNG in a fixed order.
        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.BatchNorm1d(wide),
            nn.ReLU(),
            nn.Linear(wide, narrow),
            nn.BatchNorm1d(narrow),
            nn.ReLU(),
            nn.Linear(narrow, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(latent_dim, narrow),
            nn.BatchNorm1d(narrow),
            nn.ReLU(),
            nn.Linear(narrow, wide),
            nn.BatchNorm1d(wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent), latent

Sử dụng thiết bị: cuda



In [2]:
class DeepAnomalyAE(nn.Module):
    """
    Model 2: autoencoder used as the anomaly detector.

    Encoder: input_dim -> 256 -> 128 -> latent_dim.
    Decoder: latent_dim -> 128 -> 256 -> input_dim.
    Each hidden stage is Linear + BatchNorm1d + LeakyReLU(0.1) + Dropout(0.3);
    the bottleneck and the output projections are plain Linear layers.
    """

    def __init__(self, input_dim=32, latent_dim=8):
        super().__init__()

        def hidden_stage(n_in, n_out):
            # One regularised hidden stage; returned as a list so it can be
            # unpacked into nn.Sequential while keeping flat layer indices.
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.LeakyReLU(0.1),
                nn.Dropout(0.3),
            ]

        # input_dim -> 256 -> 128 -> latent_dim
        self.encoder = nn.Sequential(
            *hidden_stage(input_dim, 256),
            *hidden_stage(256, 128),
            nn.Linear(128, latent_dim),
        )
        # latent_dim -> 128 -> 256 -> input_dim
        self.decoder = nn.Sequential(
            *hidden_stage(latent_dim, 128),
            *hidden_stage(128, 256),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of the input batch ``x``."""
        return self.decoder(self.encoder(x))

In [3]:
# ===================================================================
# HÀM HUẤN LUYỆN (Optimized)
# ===================================================================

def train_model(model, train_loader, val_loader, epochs, patience, learning_rate, weight_decay, model_save_path, loss_fn):
    """Train a reconstruction model with Adam, LR-on-plateau decay and early stopping.

    Both loaders must yield 1-tuples ``(batch,)``; the batch is used as input and
    as reconstruction target. The model may return the reconstruction directly or
    a tuple whose first element is the reconstruction.

    Args:
        model: ``nn.Module`` to train (moved to ``DEVICE``).
        train_loader: loader over the training samples.
        val_loader: loader over the validation samples.
        epochs: maximum number of epochs.
        patience: number of consecutive epochs without validation improvement
            (by more than 1e-6) after which training stops.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        model_save_path: file the best ``state_dict`` is written to.
        loss_fn: loss module called as ``loss_fn(reconstruction, batch)``.

    Returns:
        ``model``. If at least one checkpoint was written during this run, the
        best checkpoint is loaded back into it; otherwise the model is returned
        in its final state and any pre-existing file at ``model_save_path`` is
        left untouched and NOT loaded.
    """
    model.to(DEVICE)
    criterion = loss_fn
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # Halve the LR after 8 epochs without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=8)

    best_val_loss = float('inf')
    epochs_no_improve = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale file from an
    # earlier run is never mistaken for the best model of this run.
    checkpoint_saved = False

    print(f"Bắt đầu huấn luyện model '{model.__class__.__name__}' với loss '{criterion.__class__.__name__}'...")
    start_time = time.time()

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        epoch_train_loss = 0.0
        for (data,) in train_loader:
            data = data.to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            reconstructed = output[0] if isinstance(output, tuple) else output
            loss = criterion(reconstructed, data)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch value is a per-sample mean.
            epoch_train_loss += loss.item() * data.size(0)

        avg_epoch_train_loss = epoch_train_loss / len(train_loader.dataset) if len(train_loader.dataset) > 0 else 0

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for (data,) in val_loader:
                data = data.to(DEVICE)
                output = model(data)
                reconstructed = output[0] if isinstance(output, tuple) else output
                loss = criterion(reconstructed, data)
                val_loss += loss.item() * data.size(0)

        avg_val_loss = val_loss / len(val_loader.dataset) if len(val_loader.dataset) > 0 else 0
        scheduler.step(avg_val_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_epoch_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

        # A NaN validation loss never satisfies this comparison, so a diverged
        # run writes no checkpoint at all.
        if avg_val_loss < best_val_loss - 1e-6:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), model_save_path)
            checkpoint_saved = True
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping được kích hoạt tại epoch {epoch+1}.")
            break

    end_time = time.time()
    best_loss_str = f"{best_val_loss:.6f}" if best_val_loss != float('inf') else "N/A"

    if not checkpoint_saved:
        # Nothing was saved in this run: do not claim otherwise and do not load
        # whatever file may already exist at model_save_path.
        print(f"Huấn luyện hoàn tất trong {end_time - start_time:.2f} giây. Best Val Loss: {best_loss_str}.")
        print(f"Warning: no checkpoint was written during this run; '{model_save_path}' was not loaded.\n")
        return model

    print(f"Huấn luyện hoàn tất trong {end_time - start_time:.2f} giây. Best Val Loss: {best_loss_str}. Model tốt nhất đã lưu vào '{model_save_path}'.\n")
    try:
        # map_location keeps the restore valid regardless of the device the
        # tensors were serialised from.
        model.load_state_dict(torch.load(model_save_path, map_location=DEVICE))
    except FileNotFoundError:
        print(f"Warning: Could not load {model_save_path}.")
    except Exception as e:
        # Best effort: keep the final-epoch weights if the restore fails.
        print(f"Warning: Error loading {model_save_path}: {e}.")
    return model

In [4]:
# ===================================================================
# HÀM HỖ TRỢ ĐÁNH GIÁ & TRỰC QUAN HÓA (More Robust)
# ===================================================================

def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics (positive class = 1).

    Args:
        y_true: ground-truth labels, expected to be 0/1.
        y_pred: predicted labels, expected to be 0/1.

    Returns:
        ``(accuracy, precision, recall, f1, fpr, fnr, (tn, fp, fn, tp))``.
        ``fpr`` / ``fnr`` are 0.0 when their denominator is zero.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    try:
        # labels=[0, 1] forces a 2x2 matrix even when only one class is present
        # in the data, so the four counts can always be unpacked.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    except ValueError:
        # Raised by scikit-learn when y_true contains neither 0 nor 1.
        tn, fp, fn, tp = 0, 0, 0, 0
        print("Warning: CM calculation issue.")
    fpr = fp/(fp+tn) if (fp+tn)>0 else 0.0
    fnr = fn/(fn+tp) if (fn+tp)>0 else 0.0
    return accuracy, precision, recall, f1, fpr, fnr, (tn, fp, fn, tp)

def plot_evaluation(y_true, scores, y_pred, threshold, title):
    """Print metrics and draw a 3-panel evaluation figure.

    Panels: score distribution per class with the threshold line, confusion
    matrix heat-map, and ROC curve with the operating point. The figure is
    saved as ``evaluation_<title with spaces replaced by underscores>.png`` in
    the current working directory and then shown.

    Args:
        y_true: ground-truth labels (1 = anomaly).
        scores: anomaly scores, higher = more anomalous.
        y_pred: predicted labels (1 = anomaly).
        threshold: decision threshold drawn on the score histogram.
        title: figure title; also used to build the output file name.
    """
    print(f"\n--- Đang vẽ biểu đồ cho: {title} ---")
    auc = 0.5               # value printed when the AUC cannot be computed
    auc_available = False   # explicit flag, so a genuine AUC <= 0.5 still gets its ROC curve
    try:
        y_true = np.asarray(y_true).astype(int)
        scores = np.asarray(scores)
        y_pred = np.asarray(y_pred).astype(int)
        if not np.issubdtype(scores.dtype, np.number):
            raise TypeError("scores non-numeric")
        # Replace NaN with the median and +/-inf with the finite extrema.
        scores = np.nan_to_num(scores, nan=np.nanmedian(scores), posinf=np.nanmax(scores[np.isfinite(scores)]), neginf=np.nanmin(scores[np.isfinite(scores)]))
        if not np.all(np.isfinite(scores)):
            raise ValueError("Non-finite scores remain")
        unique_labels_true = np.unique(y_true)
        if len(unique_labels_true) < 2:
            print(f"Warning: Only one class ({unique_labels_true}). Cannot calc AUC.")
        else:
            auc = roc_auc_score(y_true, scores)
            auc_available = True
    except ValueError as e:
        print(f"Warning: Cannot calc AUC. Error: {e}")
    except TypeError as e:
        print(f"Error calc AUC: {e}")

    accuracy, precision, recall, f1, fpr, fnr, (tn, fp, fn, tp) = calculate_metrics(y_true, y_pred)
    print(f"AUC: {auc:.4f} | F1-Score: {f1:.4f}")
    print(f"Precision: {precision:.4f} | Recall (TPR): {recall:.4f}")
    print(f"FPR: {fpr:.4f} | FNR: {fnr:.4f}")
    print(f"TP: {tp} | FP: {fp} | TN: {tn} | FN: {fn}")

    plt.figure(figsize=(18, 6))
    plt.suptitle(title, fontsize=16)

    # --- Panel 1: score distribution ---
    plt.subplot(1, 3, 1)
    unique_labels_plot = np.unique(y_true)
    if len(unique_labels_plot) > 1:
        if 0 in unique_labels_plot:
            sns.histplot(scores[y_true==0], color='blue', label='Normal Scores', kde=True, bins=50, stat="density")
        if 1 in unique_labels_plot:
            sns.histplot(scores[y_true==1], color='red', label='Abnormal Scores', kde=True, bins=50, stat="density")
    elif len(unique_labels_plot) == 1:
        label_text = 'Normal' if unique_labels_plot[0] == 0 else 'Abnormal'
        sns.histplot(scores, color='purple', label=f'{label_text} Scores Only', kde=True, bins=50, stat="density")
    else:
        plt.text(0.5, 0.5, 'No data', ha='center')
    plt.axvline(threshold, color='green', linestyle='--', label=f'Threshold ({threshold:.4f})')
    plt.title('Phân phối Điểm/Lỗi Bất thường')
    plt.legend()

    # --- Panel 2: confusion matrix ---
    plt.subplot(1, 3, 2)
    labels_cm = sorted(np.unique(y_true))
    if not np.all(np.isin(np.unique(y_pred), labels_cm)):
        print(f"Warning: Predictions contain labels not in y_true.")
    cm = confusion_matrix(y_true, y_pred, labels=labels_cm)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[f'{l}' for l in labels_cm], yticklabels=[f'{l}' for l in labels_cm])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # --- Panel 3: ROC curve ---
    plt.subplot(1, 3, 3)
    if auc_available and len(unique_labels_plot) > 1:
        fpr_roc, tpr_roc, _ = roc_curve(y_true, scores)
        plt.plot(fpr_roc, tpr_roc, label=f'AUC = {auc:.4f}')
        if (tn + fp) > 0 and (tp + fn) > 0:
            plt.scatter(fpr, recall, marker='o', color='red', zorder=5, s=100, label=f'Operating Point\n(FPR={fpr:.2f}, TPR={recall:.2f})')
        # Only request a legend when there is something labelled to show.
        plt.legend()
    else:
        plt.text(0.5, 0.5, 'Cannot draw ROC Curve', ha='center')
    plt.title('ROC Curve')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.grid(True)

    plt.tight_layout(rect=[0, 0.03, 1, 0.93])
    plt.savefig(f"evaluation_{title.replace(' ', '_')}.png")
    plt.show()


In [5]:
# Seed the PyTorch and NumPy global random number generators with a fixed value.
torch.manual_seed(42)
np.random.seed(42)

def make_dataloader_from_array(X, batch_size=128, shuffle=True):
    """Wrap an array-like ``X`` in a ``DataLoader`` of float32 1-tuples.

    ``X`` is copied into a float32 tensor; every item the loader yields is a
    tuple ``(batch,)``. The final, possibly smaller, batch is kept.
    """
    dataset = TensorDataset(torch.tensor(X, dtype=torch.float32))
    loader_options = dict(batch_size=batch_size, shuffle=shuffle, drop_last=False)
    return DataLoader(dataset, **loader_options)

def compute_reconstruction_errors(model, X, batch_size=1024):
    """Return the per-sample reconstruction MSE of ``model`` on ``X``.

    Args:
        model: autoencoder; its forward pass returns either the reconstruction
            or a tuple whose first element is the reconstruction.
        X: 2-D array-like of samples (rows) by features (columns).
        batch_size: number of rows scored per forward pass.

    Returns:
        1-D NumPy array with one mean-squared error per row of ``X``, in input
        order. An empty float32 array is returned when ``X`` has no rows.
    """
    model.to(DEVICE)
    model.eval()
    dloader = make_dataloader_from_array(X, batch_size=batch_size, shuffle=False)
    errors = []
    with torch.no_grad():
        for (batch,) in dloader:
            batch = batch.to(DEVICE)
            output = model(batch)
            recon = output[0] if isinstance(output, tuple) else output
            # Mean over the feature axis -> one MSE value per sample.
            per_sample = torch.mean((recon - batch)**2, dim=1).cpu().numpy()
            errors.append(per_sample)
    if not errors:
        # np.concatenate rejects an empty list; an empty input has no errors.
        return np.empty((0,), dtype=np.float32)
    return np.concatenate(errors, axis=0)

def get_latent(encoder_model, X, batch_size=1024):
    """Encode ``X`` into latent vectors with ``encoder_model``.

    The latent is taken, in order of preference, from:
      1. the second element of the forward output when it is a tuple
         (e.g. ``(reconstructed, z)``);
      2. ``encoder_model.encoder(batch)`` when the model exposes an
         ``encoder`` attribute;
      3. the forward output itself (the model is assumed to be a bare encoder).

    Args:
        encoder_model: ``nn.Module`` to encode with (moved to ``DEVICE``).
        X: 2-D array-like of samples (rows) by features (columns).
        batch_size: number of rows encoded per forward pass.

    Returns:
        NumPy array of latent vectors, one row per row of ``X``, in input order.

    Raises:
        ValueError: if ``X`` contains no samples.
    """
    encoder_model.to(DEVICE)
    encoder_model.eval()
    # Looked up once; errors raised by the encoder itself are allowed to
    # propagate instead of silently falling back to the forward output.
    sub_encoder = getattr(encoder_model, "encoder", None)
    dloader = make_dataloader_from_array(X, batch_size=batch_size, shuffle=False)
    latents = []
    with torch.no_grad():
        for (batch,) in dloader:
            batch = batch.to(DEVICE)
            out = encoder_model(batch)
            if isinstance(out, tuple):
                z = out[1]  # forward returns (reconstructed, z)
            elif sub_encoder is not None:
                z = sub_encoder(batch)  # whole autoencoder given: encode explicitly
            else:
                z = out  # bare encoder: its output is the latent
            latents.append(z.cpu().numpy())
    if not latents:
        raise ValueError("get_latent received no samples to encode")
    return np.concatenate(latents, axis=0)

def choose_threshold_by_val(scores_val, y_val, maximize="f1"):
    """Pick the score threshold that maximises F1 on a validation set.

    Candidate thresholds are the unique values among 200 evenly spaced
    percentiles of ``scores_val``. A sample is predicted anomalous (1) when
    ``score >= threshold``. Ties keep the lowest threshold.

    Args:
        scores_val: anomaly scores, higher = more anomalous.
        y_val: ground-truth labels (1 = anomaly).
        maximize: currently unused; F1 is always the optimised metric.

    Returns:
        ``(best_threshold, best_metric)``. When ``y_val`` holds fewer than two
        classes every candidate is scored 0, so the lowest candidate is
        returned with metric 0.
    """
    scores_val = np.asarray(scores_val)
    y_val = np.asarray(y_val)

    grid = np.unique(np.percentile(scores_val, np.linspace(0, 100, 200)))

    # The class-count check does not depend on the threshold: evaluate it once
    # instead of re-running np.unique(y_val) for every candidate.
    if len(np.unique(y_val)) < 2:
        if len(grid) == 0:
            return None, -1
        return grid[0], 0

    best_thr = None
    best_metric = -1
    for thr in grid:
        pred = (scores_val >= thr).astype(int)  # 1 = anomaly
        metric = f1_score(y_val, pred, zero_division=0)
        if metric > best_metric:
            best_metric = metric
            best_thr = thr
    return best_thr, best_metric

def run_full_pipeline(csv_path,
                      label_col=None,
                      latent_dim=64,
                      bAE_epochs=100, bAE_lr=1e-4,
                      deepAE_epochs=80, deepAE_lr=1e-4,
                      ocsvm_nu=0.01,
                      batch_size=128,
                      test_size=0.2, val_size=0.2,
                      model_dir="models"):
    """Train and evaluate the bAE -> (One-Class SVM + DeepAnomalyAE) pipeline.

    Steps:
      1. Load the CSV, drop ``Timestamp``, ``Flow IAT Min`` and ``Fwd IAT Min``,
         and binarise the label column (``> 0`` becomes 1 = anomaly).
      2. Stratified train / validation / test split.
      3. Train ``AdvancedDimReducerAE`` on all training rows and encode every
         split into the latent space.
      4. Fit a One-Class SVM and a ``DeepAnomalyAE`` on the latents of NORMAL
         training rows only.
      5. Min-max scale both anomaly scores with ranges taken from the
         validation split, average them, choose the F1-optimal threshold on
         validation and evaluate on the test split.

    Args:
        csv_path: path of the CSV file to load.
        label_col: name of the label column. When ``None`` or not present in
            the file, ``'Label'`` is used.
        latent_dim: latent size of the dimensionality reducer (also the input
            size of the detector autoencoder).
        bAE_epochs, bAE_lr: epochs / learning rate of the reducer.
        deepAE_epochs, deepAE_lr: epochs / learning rate of the detector.
        ocsvm_nu: ``nu`` of the One-Class SVM.
        batch_size: training batch size of both autoencoders.
        test_size: fraction of all rows held out as the test split.
        val_size: NOTE(review): currently ignored. The validation split is a
            fixed 5% of the non-test rows; honouring the 0.2 default would
            change existing results, so confirm the intended value first.
        model_dir: directory models and artifacts are written to.

    Returns:
        dict with keys ``bAE``, ``deepAE``, ``ocsvm``, ``Z_train``, ``Z_val``,
        ``Z_test``, ``combined_test_scores``, ``y_test``, ``y_pred_test`` and
        ``threshold``.

    NOTE(review): features are fed to the networks unscaled and NaN/Inf values
    are only reported, not handled — confirm the CSV is already normalised.
    """
    os.makedirs(model_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # 1) Load CSV
    # ------------------------------------------------------------------
    df = pd.read_csv(csv_path)
    print("Before drop cols: \n", df.dtypes)

    df = df.drop(['Timestamp','Flow IAT Min','Fwd IAT Min'], axis=1)
    print(df.dtypes)

    # Honour label_col when it names a real column; otherwise fall back to the
    # column this pipeline has always used.
    if label_col is not None and label_col in df.columns:
        target_col = label_col
    else:
        if label_col is not None and label_col != 'Label':
            print(f"Warning: label column '{label_col}' not found; using 'Label' instead.")
        target_col = 'Label'

    # Collapse every positive label into a single "anomaly" class.
    df[target_col] = (df[target_col] > 0).astype(int)

    print("🧩 Kiểm tra giá trị bất thường trong dữ liệu...")
    print("NaN:", np.isnan(df).sum())
    print("Inf:", np.isinf(df).sum())
    print("Max:", np.nanmax(df))
    print("Min:", np.nanmin(df))

    print(df[target_col].head(50))
    X_all = df.drop(columns=[target_col]).values.astype(np.float32)
    y_all = df[target_col].astype(int).values
    print(f"Loaded X shape: {X_all.shape}, y distribution: {np.bincount(y_all)}")

    # ------------------------------------------------------------------
    # 2) Stratified train / val / test split
    # ------------------------------------------------------------------
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X_all, y_all, test_size=test_size, random_state=42, stratify=y_all
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.05, stratify=y_trainval, random_state=42
    )

    print("Train:", X_train.shape, "\nVal: ", X_val.shape, "\nTest:", X_test.shape)

    # ------------------------------------------------------------------
    # 3) Train bAE (unsupervised) on ALL train samples
    #    (both normal + abnormal is OK for the dim-reducer)
    # ------------------------------------------------------------------
    input_dim = X_train.shape[1]
    print(f"Input dimension: {input_dim}")

    bAE = AdvancedDimReducerAE(input_dim=input_dim, latent_dim=latent_dim)
    train_loader = make_dataloader_from_array(X_train, batch_size=batch_size, shuffle=True)
    val_loader = make_dataloader_from_array(X_val, batch_size=batch_size, shuffle=False)

    print("Training bottleneck AutoEncoder (bAE)...")
    bAE_path = os.path.join(model_dir, f"bAE_latent{latent_dim}.pth")
    bAE = train_model(bAE, train_loader, val_loader,
                      epochs=bAE_epochs, patience=15,
                      learning_rate=bAE_lr, weight_decay=1e-5,
                      model_save_path=bAE_path, loss_fn=nn.MSELoss())

    # ------------------------------------------------------------------
    # 4) Encode latent for train/val/test
    # ------------------------------------------------------------------
    print("Encoding latent representations...")
    # bAE.forward returns (reconstructed, z); get_latent extracts z.
    Z_train = get_latent(bAE, X_train, batch_size=1024)
    Z_val   = get_latent(bAE, X_val, batch_size=1024)
    Z_test  = get_latent(bAE, X_test, batch_size=1024)
    print("Latent shapes:", Z_train.shape, Z_test.shape)

    # Both detectors model "normal" behaviour, so they must only see normal
    # rows (y == 0). Fall back to everything if a split has no normal rows.
    train_is_normal = (y_train == 0)
    Z_train_norm = Z_train[train_is_normal] if train_is_normal.any() else Z_train
    Z_val_norm = Z_val[y_val == 0] if np.sum(y_val == 0) > 0 else Z_val

    # ------------------------------------------------------------------
    # 5) Train OneClassSVM on latent of normals only (y==0)
    # ------------------------------------------------------------------
    print(f"Huấn luyện OCSVM với nu = {ocsvm_nu} (ưu tiên giảm FP)...")
    ocsvm = OneClassSVM(nu=ocsvm_nu, kernel="rbf", gamma='auto', cache_size=1000)

    start_ocsvm = time.time()
    ocsvm.fit(Z_train_norm)
    end_ocsvm = time.time()

    print(f"OCSVM training took {end_ocsvm - start_ocsvm:.2f} seconds.")

    joblib.dump(ocsvm, os.path.join(model_dir, "ocsvm_latent.joblib"))

    # decision_function: higher -> more inlier. Negate so higher -> more anomalous.
    scores_ocsvm_val = -ocsvm.decision_function(Z_val)
    scores_ocsvm_test = -ocsvm.decision_function(Z_test)
    # The scores are arrays, so report summary statistics instead of
    # formatting the array itself with a float format spec.
    print(f"OCSVM anomaly scores (test): min={np.min(scores_ocsvm_test):.4f} | "
          f"mean={np.mean(scores_ocsvm_test):.4f} | max={np.max(scores_ocsvm_test):.4f}\n")

    # ------------------------------------------------------------------
    # 6) Train DeepAnomalyAE on latent normals only (unsupervised AE detector)
    # ------------------------------------------------------------------
    deepAE = DeepAnomalyAE(input_dim=latent_dim, latent_dim=max(8, latent_dim//4))
    train_loader_deep = make_dataloader_from_array(Z_train_norm, batch_size=batch_size, shuffle=True)
    val_loader_deep = make_dataloader_from_array(Z_val_norm, batch_size=batch_size, shuffle=False)

    deepAE_path = os.path.join(model_dir, f"deepAE_latent{latent_dim}.pth")
    print("Training DeepAnomalyAE (detector) on latent normals ...")
    deepAE = train_model(deepAE, train_loader_deep, val_loader_deep,
                         epochs=deepAE_epochs, patience=12,
                         learning_rate=deepAE_lr, weight_decay=1e-5,
                         model_save_path=deepAE_path, loss_fn=nn.MSELoss())

    # ------------------------------------------------------------------
    # 7) Per-sample reconstruction errors from DeepAE on val and test latent
    # ------------------------------------------------------------------
    recon_err_val = compute_reconstruction_errors(deepAE, Z_val, batch_size=1024)
    recon_err_test = compute_reconstruction_errors(deepAE, Z_test, batch_size=1024)

    # ------------------------------------------------------------------
    # 8) Combine scores: min-max scale each detector, then average.
    #    The range is fitted on validation and reused for test so that the
    #    threshold chosen on validation lives on the same scale as the test
    #    scores. Test values outside the validation range fall outside [0, 1].
    # ------------------------------------------------------------------
    def fit_range(reference):
        ref = np.asarray(reference, dtype=float)
        return np.nanmin(ref), np.nanmax(ref)

    def apply_range(arr, lo, hi):
        a = np.asarray(arr, dtype=float)
        span = hi - lo
        if not np.isfinite(span) or np.isclose(span, 0.0):
            return np.zeros_like(a)  # constant reference: no usable scale
        return np.nan_to_num((a - lo) / span, nan=0.0)

    ocsvm_lo, ocsvm_hi = fit_range(scores_ocsvm_val)
    recon_lo, recon_hi = fit_range(recon_err_val)

    s_ocsvm_val = apply_range(scores_ocsvm_val, ocsvm_lo, ocsvm_hi)
    s_recon_val = apply_range(recon_err_val, recon_lo, recon_hi)
    combined_val = 0.5 * s_ocsvm_val + 0.5 * s_recon_val

    s_ocsvm_test = apply_range(scores_ocsvm_test, ocsvm_lo, ocsvm_hi)
    s_recon_test = apply_range(recon_err_test, recon_lo, recon_hi)
    combined_test = 0.5 * s_ocsvm_test + 0.5 * s_recon_test

    # ------------------------------------------------------------------
    # 9) Choose threshold on validation set (maximize F1) and evaluate
    # ------------------------------------------------------------------
    best_thr, best_f1 = choose_threshold_by_val(combined_val, y_val)
    print(f"Chosen threshold on validation: {best_thr:.6f} (val F1={best_f1:.4f})")

    y_pred_test = (combined_test >= best_thr).astype(int)  # 1 = anomalous
    plot_evaluation(y_true=y_test, scores=combined_test, y_pred=y_pred_test, threshold=best_thr, title=f"Pipeline_Result_latent{latent_dim}")

    # ------------------------------------------------------------------
    # Save models & artifacts
    # ------------------------------------------------------------------
    torch.save(bAE.state_dict(), bAE_path)
    torch.save(deepAE.state_dict(), deepAE_path)
    # The score ranges and threshold are required to score new data the same way.
    joblib.dump({
        'scaler': None,  # no feature scaler is used by this pipeline
        'ocsvm_score_range': (float(ocsvm_lo), float(ocsvm_hi)),
        'recon_err_range': (float(recon_lo), float(recon_hi)),
        'threshold': float(best_thr),
    }, os.path.join(model_dir, "artifacts.joblib"))

    print("Pipeline finished. Models and evaluation saved in:", model_dir)
    return {
        "bAE": bAE,
        "deepAE": deepAE,
        "ocsvm": ocsvm,
        "Z_train": Z_train,
        "Z_val": Z_val,
        "Z_test": Z_test,
        "combined_test_scores": combined_test,
        "y_test": y_test,
        "y_pred_test": y_pred_test,
        "threshold": best_thr
    }

# Run the whole pipeline on the merged Gotham-2025 CSV (dataset name taken from the file name).
# NOTE(review): the path is absolute and machine-specific — consider reading it from a configurable location.
result = run_full_pipeline("C:/Users/hoang/Documents/Dataset_KLTN/gotham2025_extracted/gotham2025_proced_merged.csv", label_col="label", latent_dim=64)


Before drop cols: 
 Flow ID              float64
Src IP               float64
Src Port             float64
Dst IP               float64
Dst Port             float64
Protocol             float64
Timestamp             object
Flow Duration        float64
Tot Fwd Pkts         float64
Tot Bwd Pkts         float64
TotLen Fwd Pkts      float64
TotLen Bwd Pkts      float64
Fwd Pkt Len Max      float64
Fwd Pkt Len Min      float64
Fwd Pkt Len Mean     float64
Fwd Pkt Len Std      float64
Bwd Pkt Len Max      float64
Bwd Pkt Len Min      float64
Bwd Pkt Len Mean     float64
Bwd Pkt Len Std      float64
Flow Byts/s          float64
Flow Pkts/s          float64
Flow IAT Mean        float64
Flow IAT Std         float64
Flow IAT Max         float64
Flow IAT Min         float64
Fwd IAT Tot          float64
Fwd IAT Mean         float64
Fwd IAT Std          float64
Fwd IAT Max          float64
Fwd IAT Min          float64
Bwd IAT Tot          float64
Bwd IAT Mean         float64
Bwd IAT Std          fl

KeyboardInterrupt: 