In [None]:
# Delete any existing /content/DIS_Hughen directory so the notebook can be
# re-run from the top (presumably this is a previous clone of the repository;
# git refuses to clone into a non-empty directory).
!rm -rf '/content/DIS_Hughen'

In [None]:
# Clone the project repository into the current working directory.
# NOTE(review): the dataset is later read from /content/DIS_Hughen/..., so this
# assumes the working directory is /content (the Colab default) — confirm.
!git clone https://github.com/NU-Academics/DIS_Hughen.git

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import shap
import time
import torch
import torch.nn as nn
import torch.optim as optim
from imblearn.over_sampling import SMOTE
from scipy.io import arff
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import  accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

In [None]:
# Report whether PyTorch can see a CUDA device. torch.cuda.get_device_name()
# raises when no device is available, so only query the name after the
# availability check succeeds.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("GPU Name: no CUDA device detected")

In [None]:
# Load the dataset CSV from the cloned repository into a DataFrame.
DATASET_CSV = "/content/DIS_Hughen/undersampled_CIC2019_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Encode the class labels as integers and build the numeric feature matrix.
le = LabelEncoder()

# df["label"] is deliberately left untouched: overwriting it with the encoded
# integers would make a second run of this cell fit the encoder on integers and
# lose the original class names stored in le.classes_.
y = pd.Series(le.fit_transform(df["label"]), index=df.index, name="label")

# Keep numeric predictors only; +/-inf is mapped to NaN and every NaN to 0.
X = (
    df.drop(columns=["label"], errors="ignore")
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Class name -> integer code, in the order LabelEncoder assigned them.
label_mapping = {class_name: code for code, class_name in enumerate(le.classes_)}
print(label_mapping)

In [None]:
def reduce_correlation(X, threshold=0.8):
    """Drop every column that is highly correlated with an earlier column.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so a column is removed when its absolute correlation with any
    column positioned before it in ``X`` exceeds ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    threshold : float, default 0.8
        Absolute-correlation cut-off; values strictly greater trigger a drop.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the columns that were kept.
    """
    corr_matrix = X.corr().abs()
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(above_diagonal)
    # Masked and undefined (NaN) entries compare False, so they never cause a drop.
    to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()
    print(f"Dropping {len(to_drop)} highly correlated features.")
    # DataFrame.drop returns a new object, so no defensive copy of X is needed.
    return X.drop(columns=to_drop)

# Run the correlation filter on a copy of the feature matrix and report how
# many columns remain.
X_corr = X.copy()
X_reduced = reduce_correlation(X_corr, threshold=0.9)

n_features_before = len(X_corr.columns)
n_features_after = len(X_reduced.columns)
print("Original features:", n_features_before)
print("Reduced features:", n_features_after)

In [None]:
# Stratified 80/20 split with a fixed seed; the feature frames are then cast to
# float32.
split_parts = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test = (features.astype("float32") for features in split_parts[:2])
y_train, y_test = split_parts[2:]

In [None]:
# Oversample the training split with SMOTE and time the step. Only the training
# data is resampled; the test split is left as drawn.
start_time = time.perf_counter()
# imbalanced-learn names the neighbour-count parameter `k_neighbors`; passing
# `k=` raises TypeError in current releases.
smote = SMOTE(random_state=42, k_neighbors=2)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Original training class distribution:")
print(y_train.value_counts())
print("\nAfter SMOTE:")
print(pd.Series(y_train_res).value_counts())
end_time = time.perf_counter()
# Wall-clock seconds for resampling plus the reporting above.
elapsed_time_pandas = end_time - start_time
elapsed_time_pandas

In [None]:
# Baseline gradient-boosted classifier trained on the SMOTE-resampled split.
# The task is multi-class (mlogloss metric, one-vs-rest ROC-AUC downstream), so
# the objective is declared as multi-class instead of "binary:logistic", which
# contradicted the rest of the configuration.
model = XGBClassifier(
    objective="multi:softprob",
    n_estimators=2000,
    max_depth=8,
    learning_rate=0.03,
    reg_lambda=2,
    reg_alpha=1,
    gamma=0.1,
    max_bin=512,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="mlogloss",
    random_state=42,
    tree_method="hist",
    device="cuda"
)
model.fit(X_train_res, y_train_res)

In [None]:
# Score the full-feature model on the held-out split: hard predictions for
# accuracy and F1, class probabilities for one-vs-rest ROC-AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
for heading, averaging in (("Weighted F1:", "weighted"), ("Macro F1:", "macro")):
    print(heading, f1_score(y_test, y_pred, average=averaging))
print(
    "ROC-AUC:",
    roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted"),
)

In [None]:
# Per-class precision / recall / F1 for the full-feature model.
# classification_report is already imported in the notebook's import cell, so
# it is not re-imported here.
print(classification_report(y_test, y_pred, target_names=le.classes_))

In [None]:
# Plot the 15 largest built-in importance scores of the fitted model.
importances = model.feature_importances_
indices = np.argsort(importances)[-15:]

# The model was fitted on the correlation-reduced columns, so positions in
# `importances` index into X_train.columns. Labelling with X.columns (the
# unreduced frame) would attach the wrong names to the bars.
feature_names = X_train.columns[indices]

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(range(len(indices)), importances[indices])
ax.set_yticks(range(len(indices)))
ax.set_yticklabels(feature_names)
ax.set_xlabel("Feature importance")
ax.set_title("Top 15 Important IDS Features (XGBoost)")
fig.tight_layout()
plt.show()

In [None]:
# Attribute the model's held-out predictions to features with TreeSHAP.
# NOTE(review): the ranking below is derived from X_test and later used to pick
# the features that are evaluated on X_test again — consider ranking on
# training data to avoid optimistic scores.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Collapse the attributions to one mean-|SHAP| value per feature. The container
# returned by SHAP differs between releases, so branch on what came back.
if isinstance(shap_values, list):
    # Older SHAP: a list with one array per class.
    per_class_means = [
        np.abs(class_values).mean(axis=0) for class_values in shap_values
    ]
    mean_shap = np.mean(per_class_means, axis=0)
elif shap_values.ndim == 3:
    # Newer SHAP: one (samples, features, classes) array; average over samples
    # and classes.
    mean_shap = np.abs(shap_values).mean(axis=(0, 2))
else:
    # 2-D array (binary case): average over samples only.
    mean_shap = np.abs(shap_values).mean(axis=0)

# Sanity output: the vector length should equal the number of features.
print("Mean SHAP shape:", mean_shap.shape)
print("Feature count:", X_test.shape[1])

# Rank features from most to least influential.
shap_importance = pd.DataFrame(
    {"feature": X_test.columns, "mean_abs_shap": mean_shap}
).sort_values(by="mean_abs_shap", ascending=False)

top_k = 20
top_features = shap_importance["feature"].head(top_k).tolist()

print("\nTop SHAP Features:")
print(top_features)

In [None]:
# Retrain on the SHAP-selected columns only, with the same hyper-parameters as
# the full-feature model so the two runs are comparable.
X_train_top = X_train_res[top_features]
X_test_top = X_test[top_features]

# The task is multi-class (mlogloss metric, one-vs-rest ROC-AUC downstream), so
# the objective is declared as multi-class instead of "binary:logistic", which
# contradicted the rest of the configuration.
model_top = XGBClassifier(
    objective="multi:softprob",
    n_estimators=2000,
    max_depth=8,
    learning_rate=0.03,
    reg_lambda=2,
    reg_alpha=1,
    gamma=0.1,
    max_bin=512,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="mlogloss",
    random_state=42,
    tree_method="hist",
    device="cuda"
)

model_top.fit(X_train_top, y_train_res)

In [None]:
# Evaluate the reduced-feature model on the held-out split.
y_pred = model_top.predict(X_test_top)
y_prob = model_top.predict_proba(X_test_top)

accuracy = accuracy_score(y_test, y_pred)
weighted_f1, macro_f1 = (
    f1_score(y_test, y_pred, average=averaging)
    for averaging in ("weighted", "macro")
)
roc_auc = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")

# Headline metrics, rounded to four decimals for display.
print("\n===== Final Model Performance (Top Features) =====")
summary_rows = (
    ("Accuracy:", accuracy),
    ("Weighted F1:", weighted_f1),
    ("Macro F1:", macro_f1),
    ("ROC-AUC:", roc_auc),
)
for heading, score in summary_rows:
    print(heading, round(score, 4))

# Per-class precision / recall / F1.
print("\n===== Class-wise Performance =====")
class_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(class_report)

In [None]:
# Bar summary of the SHAP values computed for the full-feature model on the
# held-out rows.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)