<a href="https://colab.research.google.com/github/NZodasic/Cyber-Threat-Detection/blob/main/Super_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# train_all_models.py
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Third-party model libraries required by the imports below:
# pip install pytorch-tabnet rtdl tabpfn

from pytorch_tabnet.tab_model import TabNetClassifier
import rtdl
from tabpfn import TabPFNClassifier

# -----------------------------
# 1. Load Data
# -----------------------------
CSV_PATH = "pe_features_extended.csv"
df = pd.read_csv(CSV_PATH)

# Binary target: 1 where the label equals "malware" (case-insensitive), else 0.
y = (df['label'].str.lower() == 'malware').astype(int).values
df = df.drop(columns=['label'])

# Numeric columns only, for simplicity (text features could be added as well);
# missing values are replaced by 0.
X = df.select_dtypes(include=[np.number]).fillna(0).values

# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows influence the mean/variance used to transform the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from train rows only
X_test = scaler.transform(X_test)
# Keep `X` as a scaled matrix (as before) for any later code that reads it.
X = scaler.transform(X)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rows of [model name, accuracy, precision, recall, F1, ROC AUC].
results = []

# -----------------------------
# 2. Define Models
# -----------------------------

# ---- MLP ----
class MLP(nn.Module):
    """Fully connected binary classifier that emits one raw logit per row.

    Every hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    maps the last hidden width to a single output.

    Args:
        in_dim: Number of input features.
        hidden: Iterable of hidden-layer widths, in order. The default is an
            immutable tuple so it cannot be shared and mutated across
            instances; lists are still accepted.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, in_dim, hidden=(256, 128), dropout=0.3):
        super().__init__()
        layers = []
        last_dim = in_dim
        for h in hidden:
            layers += [nn.Linear(last_dim, h), nn.ReLU(), nn.Dropout(dropout)]
            last_dim = h
        layers.append(nn.Linear(last_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits with the trailing size-1 dimension removed."""
        return self.net(x).squeeze(1)

def train_mlp(X_train, X_test, y_train, y_test, epochs=10, batch=64):
    """Fit an ``MLP`` on the training arrays and score the test arrays.

    Training uses Adam (lr=1e-3) with ``BCEWithLogitsLoss`` over shuffled
    mini-batches of size ``batch`` for ``epochs`` passes.

    Returns:
        ``(preds, probs)``: two Python lists with one entry per test row.
        ``probs`` holds the sigmoid of the model output and ``preds`` is 1
        where that value is >= 0.5, else 0.
    """
    def _to_dataset(features, labels):
        # Both features and labels are stored as float32 tensors.
        return TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.float32),
        )

    model = MLP(X_train.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss()
    opt = optim.Adam(model.parameters(), lr=1e-3)

    train_loader = DataLoader(_to_dataset(X_train, y_train), batch_size=batch, shuffle=True)
    test_loader = DataLoader(_to_dataset(X_test, y_test), batch_size=256)

    for _ in range(epochs):
        model.train()
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)
            opt.zero_grad()
            criterion(model(features), targets).backward()
            opt.step()

    model.eval()
    preds = []
    probs = []
    with torch.no_grad():
        for features, _ in test_loader:
            scores = torch.sigmoid(model(features.to(device))).cpu().numpy()
            preds.extend((scores >= 0.5).astype(int).ravel().tolist())
            probs.extend(scores.ravel().tolist())

    return preds, probs

# ---- ResNet for tabular ----
class ResNetTabular(nn.Module):
    """Residual fully connected network producing one logit per input row.

    The input is projected to ``hidden`` units, passed through ``depth``
    residual blocks (each ``Linear -> ReLU -> Dropout(0.2) -> Linear`` with a
    skip connection followed by ReLU), then mapped to a single output.
    """

    @staticmethod
    def _residual_branch(width):
        """Build the transform that is added to the skip path in one block."""
        return nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(width, width),
        )

    def __init__(self, in_dim, hidden=256, depth=4):
        super().__init__()
        self.fc_in = nn.Linear(in_dim, hidden)
        branches = []
        for _ in range(depth):
            branches.append(self._residual_branch(hidden))
        self.blocks = nn.ModuleList(branches)
        self.fc_out = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return logits with the trailing size-1 dimension removed."""
        state = self.fc_in(x)
        for branch in self.blocks:
            state = torch.relu(branch(state) + state)
        logits = self.fc_out(state)
        return logits.squeeze(1)

def train_resnet(X_train, X_test, y_train, y_test, epochs=10, batch=64):
    """Fit a ``ResNetTabular`` on the training arrays and score the test arrays.

    Training uses Adam (lr=1e-3) with ``BCEWithLogitsLoss`` over shuffled
    mini-batches of size ``batch`` for ``epochs`` passes.

    Returns:
        ``(preds, probs)``: two Python lists with one entry per test row.
        ``probs`` holds the sigmoid of the model output and ``preds`` is 1
        where that value is >= 0.5, else 0.
    """
    model = ResNetTabular(X_train.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss()
    opt = optim.Adam(model.parameters(), lr=1e-3)

    # Features and labels are both converted to float32 tensors.
    train_x = torch.tensor(X_train, dtype=torch.float32)
    train_y = torch.tensor(y_train, dtype=torch.float32)
    test_x = torch.tensor(X_test, dtype=torch.float32)
    test_y = torch.tensor(y_test, dtype=torch.float32)
    train_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch, shuffle=True)
    test_loader = DataLoader(TensorDataset(test_x, test_y), batch_size=256)

    for _ in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            opt.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            opt.step()

    model.eval()
    preds, probs = [], []
    with torch.no_grad():
        for inputs, _ in test_loader:
            scores = torch.sigmoid(model(inputs.to(device))).cpu().numpy()
            preds += (scores >= 0.5).astype(int).ravel().tolist()
            probs += scores.ravel().tolist()

    return preds, probs

# ---- FT-Transformer ----
def train_ft_transformer(X_train, X_test, y_train, y_test, epochs=10, batch=64):
    """Fit an rtdl FT-Transformer on numeric features and score the test arrays.

    Training uses AdamW (lr=1e-3, weight_decay=1e-5) with
    ``BCEWithLogitsLoss`` over shuffled mini-batches of size ``batch`` for
    ``epochs`` passes.

    Returns:
        ``(preds, probs)``: two Python lists with one entry per test row.
        ``probs`` holds the sigmoid of the model output and ``preds`` is 1
        where that value is >= 0.5, else 0.
    """
    # rtdl's factories are keyword-only and name their inputs
    # `n_num_features` / `cat_cardinalities`. `make_baseline` additionally
    # requires every architecture hyper-parameter, so `make_default` (which
    # supplies them) is the one that matches this minimal call.
    # NOTE(review): written against the rtdl 0.0.x API — confirm with the
    # installed version.
    model = rtdl.FTTransformer.make_default(
        n_num_features=X_train.shape[1],
        cat_cardinalities=None,  # numeric features only
        d_out=1,  # binary classification
    ).to(device)

    def _logits(x_num):
        # forward() takes (x_num, x_cat); there are no categorical inputs.
        # The (batch, d_out=1) output is squeezed to (batch,) so it matches
        # the 1-D float targets expected by BCEWithLogitsLoss.
        return model(x_num, None).squeeze(1)

    criterion = nn.BCEWithLogitsLoss()
    opt = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

    train_ds = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                             torch.tensor(y_train, dtype=torch.float32))
    test_ds  = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                             torch.tensor(y_test, dtype=torch.float32))
    train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=256)

    for _ in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = criterion(_logits(xb), yb)
            loss.backward()
            opt.step()

    model.eval()
    preds, probs = [], []
    with torch.no_grad():
        for xb, _ in test_loader:
            out = torch.sigmoid(_logits(xb.to(device))).cpu().numpy()
            preds.extend((out >= 0.5).astype(int).ravel().tolist())
            probs.extend(out.ravel().tolist())

    return preds, probs

# -----------------------------
# 3. Train & Evaluate
# -----------------------------
def evaluate_model(name, preds, probs, y_test):
    """Score one model and append its row to the module-level ``results``.

    The appended row is ``[name, accuracy, precision, recall, F1, ROC AUC]``;
    the first four metrics compare ``preds`` with ``y_test`` and ROC AUC
    compares ``probs`` with ``y_test``.
    """
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    row = [name]
    for metric in label_metrics:
        row.append(metric(y_test, preds))
    row.append(roc_auc_score(y_test, probs))
    results.append(row)

print("Training MLP...")
mlp_preds, mlp_probs = train_mlp(X_train, X_test, y_train, y_test)
evaluate_model("MLP", mlp_preds, mlp_probs, y_test)

print("Training ResNet...")
res_preds, res_probs = train_resnet(X_train, X_test, y_train, y_test)
evaluate_model("ResNet", res_preds, res_probs, y_test)

print("Training TabNet...")
# TabNet monitors `eval_set` for early stopping, so passing the test set there
# would tune the model on the very rows it is scored on. Hold out a validation
# slice of the training data for that purpose instead.
X_tab_fit, X_tab_val, y_tab_fit, y_tab_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)
tabnet = TabNetClassifier(verbose=0, seed=42)
tabnet.fit(X_tab_fit, y_tab_fit, eval_set=[(X_tab_val, y_tab_val)], eval_metric=['auc'])
tab_preds = tabnet.predict(X_test)
tab_probs = tabnet.predict_proba(X_test)[:,1]
evaluate_model("TabNet", tab_preds, tab_probs, y_test)

print("Training FT-Transformer...")
ft_preds, ft_probs = train_ft_transformer(X_train, X_test, y_train, y_test)
evaluate_model("FT-Transformer", ft_preds, ft_probs, y_test)

print("Training TabPFN...")
# NOTE(review): `N_ensemble_configurations` looks like the keyword used by the
# early TabPFN releases — confirm it matches the installed version.
tabpfn = TabPFNClassifier(N_ensemble_configurations=32)
tabpfn.fit(X_train, y_train)
tp_preds = tabpfn.predict(X_test)
tp_probs = tabpfn.predict_proba(X_test)[:,1]
evaluate_model("TabPFN", tp_preds, tp_probs, y_test)

# -----------------------------
# 4. Final results
# -----------------------------
# One row per model, in the order the models were evaluated above.
df_results = pd.DataFrame(results, columns=["Model","Accuracy","Precision","Recall","F1","ROC AUC"])
print("\n=== Final Comparison ===")
print(df_results)
