In [1]:
# ---------------------------
# STACKED NDAE (greedy layer-wise AE pretraining) + RandomForest
# ---------------------------
# Run in notebook cell. Change paths if file names differ.
# ---------------------------

# Install required packages (uncomment if needed)
# %pip install tensorflow-cpu scikit-learn imbalanced-learn pandas numpy --quiet

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings("ignore")





In [2]:
# ---------------------------
# Reproducibility: a single seed for NumPy and TensorFlow; the same value is
# passed as random_state to SMOTE and the RandomForest later in the notebook.
# ---------------------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# ---------------------------
# Input files - edit these if your copies live elsewhere
# ---------------------------
TRAIN_PATH, TEST_PATH = (
    "../data/nsl-kdd/KDDTrain+.txt",
    "../data/nsl-kdd/KDDTest+.txt",
)

In [3]:
# ---------------------------
# 1) Column names for KDD/NSL-KDD
# ---------------------------
# Positional column names handed to pd.read_csv(names=...) by load_nsl_kdd.
# Order matters: the files are read without a header row, so each name is
# assigned to the field at the same position. The last two entries
# ('label', 'difficulty') are excluded from the model features later on.
columns = [
    'duration','protocol_type','service','flag','src_bytes','dst_bytes','land',
    'wrong_fragment','urgent','hot','num_failed_logins','logged_in',
    'num_compromised','root_shell','su_attempted','num_root',
    'num_file_creations','num_shells','num_access_files','num_outbound_cmds',
    'is_host_login','is_guest_login','count','srv_count','serror_rate',
    'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate',
    'diff_srv_rate','srv_diff_host_rate','dst_host_count',
    'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
    'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
    'dst_host_srv_rerror_rate','label','difficulty'
]


In [4]:
# ---------------------------
# 2) Load dataset function (reads the header-less, comma-separated NSL-KDD .txt files)
# ---------------------------
def load_nsl_kdd(path):
    """Read one NSL-KDD split into a DataFrame.

    The file is parsed as header-less comma-separated text; the notebook-level
    ``columns`` list supplies the column names positionally.

    Parameters
    ----------
    path : str
        Location of the split to read (e.g. ``TRAIN_PATH`` or ``TEST_PATH``).

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per entry in ``columns``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``.
    """
    if not os.path.exists(path):
        # Point the user at the path that was actually tried and at the
        # settings that control it, rather than at an unrelated directory.
        raise FileNotFoundError(
            f"{path} not found. Place the NSL-KDD file (KDDTrain+.txt / KDDTest+.txt) "
            "at that location or update TRAIN_PATH / TEST_PATH."
        )
    # header=None is what pandas already infers when names= is given; spelled
    # out so it is obvious that the first line of the file is data.
    return pd.read_csv(path, names=columns, header=None)

# Read both splits (train first, then test); the printed shapes are a quick sanity check.
print("Loading data...")
train_df, test_df = (load_nsl_kdd(split_path) for split_path in (TRAIN_PATH, TEST_PATH))
print("Train shape:", train_df.shape, "Test shape:", test_df.shape)

Loading data...
Train shape: (125973, 43) Test shape: (22544, 43)


In [5]:
# ---------------------------
# 3) Binary label: normal -> 0, others -> 1
# ---------------------------
# Vectorised form: normalise the label text once per frame, then flag
# everything that is not 'normal' as 1.
for frame in (train_df, test_df):
    is_attack = frame['label'].astype(str).str.strip().str.lower() != 'normal'
    frame['binary_label'] = is_attack.astype(int)

In [6]:
# ---------------------------
# 4) Encode categorical features (protocol_type, service, flag)
# LabelEncoder is simpler and faster here (paper used feature selection)
# ---------------------------
cat_cols = ['protocol_type','service','flag']
le_dict = {}
# NOTE: the columns are overwritten in place, so this cell is meant to run once
# per fresh load; re-running it would encode the integer codes a second time.
for col in cat_cols:
    train_text = train_df[col].astype(str)
    test_text = test_df[col].astype(str)
    # Fit on the union of both splits so transform() never meets an unseen category
    le_dict[col] = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))
    train_df[col] = le_dict[col].transform(train_text)
    test_df[col] = le_dict[col].transform(test_text)


In [7]:
# ---------------------------
# 5) Feature selection - follow typical NSL-KDD: use all numeric + encoded categorical
# Drop label & difficulty & binary_label from features
# ---------------------------
drop_cols = ['label','difficulty','binary_label']
# Keep the remaining columns in their original order
feature_cols = train_df.columns[~train_df.columns.isin(drop_cols)].tolist()
print("Using feature columns:", len(feature_cols))

# Independent copies, so later steps cannot modify train_df / test_df through them
X_train_raw = train_df[feature_cols].copy()
X_test_raw = test_df[feature_cols].copy()

y_train = train_df['binary_label'].astype(int).copy()
y_test = test_df['binary_label'].astype(int).copy()

Using feature columns: 41


In [8]:
# ---------------------------
# 6) Scale features (fit on train only)
# ---------------------------
# Statistics are learned from the training split only and then reused
# unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled  = scaler.transform(X_test_raw)

# Save scaler. This is the first artifact written by the notebook, so make
# sure the output folder exists; otherwise the dump fails on a fresh checkout.
os.makedirs("../backend/models", exist_ok=True)
joblib.dump(scaler, "../backend/models/scaler_nslkdd.joblib")

['../backend/models/scaler_nslkdd.joblib']

In [9]:

# ---------------------------
# 7) Stacked NDAE via greedy layer-wise pretraining
#    We'll train a sequence of shallow autoencoders (encoder + decoder),
#    keep the encoder part, transform data layer-by-layer to produce final code.
#    This matches the "stacked non-symmetric deep autoencoder" pretrain approach.
# ---------------------------

def train_shallow_autoencoder(X, hidden_dim, epochs=10, batch_size=2048, lr=1e-3, verbose=0):
    """Fit a one-hidden-layer autoencoder on ``X`` and return it with its encoder half.

    Parameters
    ----------
    X : 2-D array
        Training matrix; it is used both as input and as reconstruction target.
    hidden_dim : int
        Width of the ReLU hidden (code) layer.
    epochs, batch_size, verbose
        Forwarded to ``Model.fit``.
    lr : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    (autoencoder, encoder)
        ``autoencoder`` maps input -> reconstruction (MSE loss); ``encoder``
        maps input -> hidden code and is built on the same hidden layer, so it
        reflects the weights learned while fitting the autoencoder.
    """
    n_features = X.shape[1]

    # Layers are created in the order input -> hidden -> output
    source = Input(shape=(n_features,))
    code = Dense(hidden_dim, activation='relu')(source)
    reconstruction = Dense(n_features, activation='linear')(code)

    autoencoder = Model(source, reconstruction)
    encoder_half = Model(source, code)

    autoencoder.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    autoencoder.fit(
        X, X,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=verbose,
    )
    return autoencoder, encoder_half

# Network of hidden sizes (non-symmetric: gradually reduce)
# You can tune this list. Paper used same neurons as features in NDAE layers; here we pick a decreasing configuration.
hidden_layers = [64, 32, 16]  # final encoding_dim = 16
encoders = []
X_current = X_train_scaled.copy()
n_layers = len(hidden_layers)

print("Layer-wise pretraining of shallow autoencoders...")
for layer_no, hdim in enumerate(hidden_layers, start=1):
    print(f" Pretraining layer {layer_no}/{n_layers} -> hidden_dim={hdim}")
    ae, enc = train_shallow_autoencoder(
        X_current,
        hidden_dim=hdim,
        epochs=15,
        batch_size=1024,
        lr=1e-3,
        verbose=1,
    )
    # Keep the encoder half; its output becomes the training data of the next layer
    encoders.append(enc)
    X_current = enc.predict(X_current)
    # Persist this layer: the full autoencoder (saved without optimizer) and its encoder
    ae.save(f"../backend/models/ae_layer_{layer_no}.keras", include_optimizer=False)
    enc.save(f"../backend/models/encoder_layer_{layer_no}.keras")
print("Pretraining complete.")


Layer-wise pretraining of shallow autoencoders...
 Pretraining layer 1/3 -> hidden_dim=64
Epoch 1/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6830
Epoch 2/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2870
Epoch 3/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1709
Epoch 4/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1135
Epoch 5/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0799
Epoch 6/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0573
Epoch 7/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0413
Epoch 8/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0303
Epoch 9/15
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0227
E

In [10]:
# ---------------------------
# 8) Build stacked encoder (compose encoders)
# ---------------------------
# Create a model that maps from original input to final code by applying each encoder sequentially.
input_dim = X_train_scaled.shape[1]
stack_input = Input(shape=(input_dim,))

# Chain the pretrained encoders: each one is a Model, so calling it on the
# running tensor applies that layer's input -> hidden mapping.
code_tensor = stack_input
for layer_encoder in encoders:
    code_tensor = layer_encoder(code_tensor)

stacked_encoder = Model(inputs=stack_input, outputs=code_tensor)
# Persist the composed encoder
stacked_encoder.save("../backend/models/stacked_encoder.keras")
print("Stacked encoder built. Final code dim:", stacked_encoder.output_shape)


Stacked encoder built. Final code dim: (None, 16)


In [11]:
# ---------------------------
# 9) Encode train and test sets to get compact representations
# ---------------------------
# Run both scaled splits through the stacked encoder (train first, then test)
X_train_code, X_test_code = (
    stacked_encoder.predict(scaled_split)
    for scaled_split in (X_train_scaled, X_test_scaled)
)
print("Encoded shapes:", X_train_code.shape, X_test_code.shape)

[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 728us/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 889us/step
Encoded shapes: (125973, 16) (22544, 16)


In [12]:
# ---------------------------
# 10) Handle class imbalance with SMOTE on training codes
# ---------------------------
# Oversample on the encoded training features only; the test split is left as is.
smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=4)
print("Before SMOTE:", np.bincount(y_train))
X_train_res, y_train_res = smote.fit_resample(X_train_code, y_train)
print("After SMOTE:", np.bincount(y_train_res))
print("Resampled shape:", X_train_res.shape)


Before SMOTE: [67343 58630]
After SMOTE: [67343 67343]
Resampled shape: (134686, 16)


In [13]:
# ---------------------------
# 11) Train RandomForest on encoded features (use class_weight for robustness)
# ---------------------------
# Class 1 (anything not labelled 'normal') is weighted 6x relative to class 0.
rf_params = dict(
    n_estimators=400,
    class_weight={0:1, 1:6},
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train_res, y_train_res)
joblib.dump(clf, "../backend/models/rf_encoded.joblib")
print("RandomForest trained on encoded features.")


RandomForest trained on encoded features.


In [14]:
# ---------------------------
# 12) Prediction & Evaluation
# ---------------------------
# Probability of class 1 for every test record
probs_test = clf.predict_proba(X_test_code)[:,1]

# ROC-AUC is computed from the raw probabilities, so it does not depend on the
# threshold: evaluate it once here instead of once per loop iteration.
try:
    roc_auc_text = round(roc_auc_score(y_test, probs_test), 4)
except ValueError:
    # Only the error roc_auc_score uses for inputs it cannot score is caught;
    # a bare `except:` would also hide KeyboardInterrupt and genuine bugs.
    roc_auc_text = "n/a"

# Report the default 0.5 cut-off plus two lowered ones (0.3 and 0.25) that favor recall.
for threshold in (0.5, 0.3, 0.25):
    y_pred_thr = (probs_test >= threshold).astype(int)
    print("\n-----------------------------")
    print(f"THRESHOLD = {threshold}")
    print(classification_report(y_test, y_pred_thr, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_thr))
    # Repeated under each threshold so the printed report keeps its layout
    print("ROC-AUC:", roc_auc_text)

# Also print one ROC curve point (optional): the FPR interpolated at TPR = 0.9
fpr, tpr, _ = roc_curve(y_test, probs_test)
print(f"\nROC curve summary: FPR@TPR=0.9 -> approx FPR: {np.interp(0.9, tpr, fpr):.4f}")



-----------------------------
THRESHOLD = 0.5
              precision    recall  f1-score   support

           0     0.6477    0.9797    0.7798      9711
           1     0.9749    0.5967    0.7403     12833

    accuracy                         0.7617     22544
   macro avg     0.8113    0.7882    0.7600     22544
weighted avg     0.8339    0.7617    0.7573     22544

Confusion matrix:
 [[9514  197]
 [5176 7657]]
ROC-AUC: 0.9417

-----------------------------
THRESHOLD = 0.3
              precision    recall  f1-score   support

           0     0.6634    0.9731    0.7890      9711
           1     0.9686    0.6264    0.7608     12833

    accuracy                         0.7758     22544
   macro avg     0.8160    0.7998    0.7749     22544
weighted avg     0.8371    0.7758    0.7729     22544

Confusion matrix:
 [[9450  261]
 [4794 8039]]
ROC-AUC: 0.9417

-----------------------------
THRESHOLD = 0.25
              precision    recall  f1-score   support

           0     0.6775  

In [15]:

# ---------------------------
# 13) Real-time detection function (uses saved scaler, stacked_encoder, clf)
# ---------------------------
def load_models(scaler_path="../backend/models/scaler_nslkdd.joblib",
                encoder_path="../backend/models/stacked_encoder.keras",
                rf_path="../backend/models/rf_encoded.joblib"):
    """Reload the three saved artifacts from disk.

    Parameters
    ----------
    scaler_path, rf_path : str
        joblib files, read with ``joblib.load``.
    encoder_path : str
        Keras model file, read with ``tf.keras.models.load_model``.

    Returns
    -------
    tuple
        ``(scaler, encoder, random_forest)`` in that order.
    """
    # Tuple elements are evaluated left to right: scaler, encoder, forest
    return (
        joblib.load(scaler_path),
        tf.keras.models.load_model(encoder_path),
        joblib.load(rf_path),
    )

# Module-level handles used by detect_event
scaler_loaded, encoder_loaded, rf_loaded = load_models()

def detect_event(event_dict, threshold=0.3):
    """Score a single event with the reloaded scaler -> stacked encoder -> RandomForest chain.

    Parameters
    ----------
    event_dict : dict
        Maps feature names to values. Any name from ``feature_cols`` that is
        missing is filled with 0. Values must already be numeric: no label
        encoding is applied here, so categorical features have to arrive as
        their integer codes.
    threshold : float, default 0.3
        The event is flagged when its score is greater than or equal to this value.

    Returns
    -------
    dict
        ``{"anomaly_score": float, "is_anomaly": 0 or 1}`` where the score is
        the forest's probability for class 1.
    """
    # Build a one-row frame holding every feature column, in training order
    row = {c: event_dict.get(c, 0) for c in feature_cols}
    dfrow = pd.DataFrame([row])

    # NO LABEL ENCODER HERE (values already numeric)
    # Scale features
    X_scaled = scaler_loaded.transform(dfrow[feature_cols].values)

    # Encode with stacked encoder. verbose=0 keeps a per-event call from
    # printing a Keras progress bar for every single prediction.
    code = encoder_loaded.predict(X_scaled, verbose=0)

    # RF probability of class 1 for the single row
    prob = rf_loaded.predict_proba(code)[:,1][0]

    return {
        "anomaly_score": float(prob),
        "is_anomaly": int(prob >= threshold)
    }



In [16]:
# ---------------------------
# 14) Example usage
# ---------------------------
# Start from a real test record (positional row 5) ...
sample_event = {name: value for name, value in zip(feature_cols, X_test_raw.iloc[5].values)}
# ... and inflate src_bytes 4x to make it look more 'suspicious'
sample_event['src_bytes'] = 4 * float(sample_event['src_bytes'])
res = detect_event(sample_event, threshold=0.3)
print("\nExample detection result:", res)

# ---------------------------
# Done. Models saved to ../backend/models (stacked_encoder.keras, rf_encoded.joblib, scaler_nslkdd.joblib, per-layer encoder/ae .keras files)
# ---------------------------

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step

Example detection result: {'anomaly_score': 0.0, 'is_anomaly': 0}
