In [None]:
# Attach Google Drive to the Colab runtime; the dataset and model paths used
# by the following cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Imports and file paths
import os
import joblib
from datetime import datetime
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Google Drive locations used by the rest of the notebook
file_path = '/content/drive/MyDrive/UNSW_NB15_training-set.csv'   # training CSV containing 'label' and 'attack_cat'
save_dir = '/content/drive/MyDrive/UNSW_NB15_IDS_Models'                # folder that receives the fitted models/encoders
new_data_path = '/content/drive/MyDrive/UNSW_NB15_testing-set.csv'   # optional external CSV to run predictions on

# How the final prediction cell reports results:
# 'binary', 'multi', or 'binary_then_multi'
mode = 'binary_then_multi'

# Make sure the output folder exists before anything tries to write into it.
os.makedirs(save_dir, exist_ok=True)

# Echo the effective configuration so it is recorded in the cell output.
for _caption, _setting in (
    ('file_path:', file_path),
    ('save_dir:', save_dir),
    ('new_data_path:', new_data_path),
    ('prediction mode:', mode),
):
    print(_caption, _setting)


file_path: /content/drive/MyDrive/UNSW_NB15_training-set.csv
save_dir: /content/drive/MyDrive/UNSW_NB15_IDS_Models
new_data_path: /content/drive/MyDrive/UNSW_NB15_testing-set.csv
prediction mode: binary_then_multi


In [None]:
# Read the training CSV into `df` and report its basic shape.
print("Loading dataset from:", file_path)
df = pd.read_csv(file_path)

column_names = df.columns.tolist()
print("Dataset shape:", df.shape)
print("Columns preview:", column_names[:30])

# Both targets are needed downstream: 'label' for the binary task and
# 'attack_cat' for the multi-class task. Stop early if either is absent.
missing_targets = [target for target in ('label', 'attack_cat') if target not in df.columns]
if missing_targets:
    raise ValueError("Dataset must contain both 'label' (binary) and 'attack_cat' (multi-class).")


Loading dataset from: /content/drive/MyDrive/UNSW_NB15_training-set.csv
Dataset shape: (82332, 45)
Columns preview: ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth']


In [None]:
# Preprocessing: build the feature matrix plus both targets
# ('label' -> binary target, 'attack_cat' -> multi-class target).
drop_cols = [c for c in ['id'] if c in df.columns]
X_df = df.drop(drop_cols + ['label', 'attack_cat'], axis=1, errors='ignore')
y_binary = df['label'].astype(int)
y_multi_raw = df['attack_cat'].astype(str)

print("Feature dataframe shape:", X_df.shape)

# Encode every non-numeric feature column with its own LabelEncoder.
# Columns are selected by *excluding* numeric/bool dtypes rather than by
# including only 'object': a categorical or pandas string-dtype column would
# otherwise slip through unencoded and make StandardScaler fail below.
# The fitted encoders are kept so the same mapping can be reapplied later.
feature_encoders = {}
for col in X_df.select_dtypes(exclude=['number', 'bool']).columns:
    le = LabelEncoder()
    X_df[col] = X_df[col].astype(str)
    X_df[col] = le.fit_transform(X_df[col])
    feature_encoders[col] = le
    print(f"Encoded feature column: {col} -> {len(le.classes_)} classes")

# Encode multi-class attack target
attack_label_encoder = LabelEncoder()
y_multi = attack_label_encoder.fit_transform(y_multi_raw)
print("Attack classes:", list(attack_label_encoder.classes_))

# Train-test split: a single call splits features and both targets together so
# the binary and multi-class tasks are evaluated on exactly the same rows.
# Stratifying on the multi-class target keeps rare attack types in both parts.
# (train_test_split is already imported in the imports cell.)
X_train_df, X_test_df, yb_train, yb_test, ym_train, ym_test = train_test_split(
    X_df, y_binary, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)
assert len(X_train_df) + len(X_test_df) == len(X_df), "split lost or duplicated rows"

print("Train/Test shapes:", X_train_df.shape, X_test_df.shape)

# Scale features: statistics are learned on the training part only and then
# applied unchanged to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)


Feature dataframe shape: (82332, 42)
Encoded feature column: proto -> 131 classes
Encoded feature column: service -> 13 classes
Encoded feature column: state -> 7 classes
Attack classes: ['Analysis', 'Backdoor', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Normal', 'Reconnaissance', 'Shellcode', 'Worms']
Train/Test shapes: (65865, 42) (16467, 42)


In [None]:
# Train and evaluate the binary and multi-class soft-voting ensembles.

def _print_base_accuracies(named_models, X_eval, y_eval, task):
    """Print hold-out accuracy for each (display name, fitted model) pair.

    named_models: iterable of (str, fitted classifier) pairs.
    X_eval, y_eval: evaluation features and true labels.
    task: short tag ('binary' / 'multi') inserted into the printed line.
    """
    for display_name, model in named_models:
        y_pred_tmp = model.predict(X_eval)
        print(f"{display_name} ({task}) Accuracy: {accuracy_score(y_eval, y_pred_tmp):.4f}")


# Define base classifiers (same model choices).
# `use_label_encoder` is deliberately not passed to XGBClassifier: the installed
# XGBoost ignores it and prints a "Parameters: { "use_label_encoder" } are not
# used" warning on every fit.
rf_bin = RandomForestClassifier(n_estimators=150, random_state=42)
xgb_bin = XGBClassifier(eval_metric='logloss', random_state=42)
lr_bin = LogisticRegression(max_iter=500, random_state=42)

rf_multi = RandomForestClassifier(n_estimators=150, random_state=42)
xgb_multi = XGBClassifier(eval_metric='mlogloss', random_state=42)
lr_multi = LogisticRegression(max_iter=500, random_state=42)

# Train binary models.
# VotingClassifier.fit() clones and fits each base estimator itself, so fitting
# the ensemble once trains all three models. Fitting the base models separately
# beforehand (as this cell used to) trained every model twice. The base names
# are rebound to the fitted clones so they remain usable on their own.
print("Training binary models...")
voting_bin = VotingClassifier(estimators=[('rf', rf_bin), ('xgb', xgb_bin), ('lr', lr_bin)], voting='soft')
voting_bin.fit(X_train, yb_train)
rf_bin = voting_bin.named_estimators_['rf']
xgb_bin = voting_bin.named_estimators_['xgb']
lr_bin = voting_bin.named_estimators_['lr']

# Evaluate binary base models
_print_base_accuracies(
    [('RandomForest', rf_bin), ('XGBoost', xgb_bin), ('LogisticRegression', lr_bin)],
    X_test, yb_test, 'binary',
)

# Evaluate the binary voting ensemble
yb_pred = voting_bin.predict(X_test)
print("\nBinary VotingClassifier Accuracy:", accuracy_score(yb_test, yb_pred))
cr = classification_report  # alias kept in case a later cell still refers to `cr`
print("\nBinary Classification Report:")
print(classification_report(yb_test, yb_pred, target_names=['Normal','Attack']))

# Train multi-class models (single fit through the ensemble, as above)
print("\nTraining multi-class models...")
voting_multi = VotingClassifier(estimators=[('rf', rf_multi), ('xgb', xgb_multi), ('lr', lr_multi)], voting='soft')
voting_multi.fit(X_train, ym_train)
rf_multi = voting_multi.named_estimators_['rf']
xgb_multi = voting_multi.named_estimators_['xgb']
lr_multi = voting_multi.named_estimators_['lr']

_print_base_accuracies(
    [('RandomForest', rf_multi), ('XGBoost', xgb_multi), ('LogisticRegression', lr_multi)],
    X_test, ym_test, 'multi',
)

# Evaluate the multi-class voting ensemble
ym_pred = voting_multi.predict(X_test)
print("\nMulti VotingClassifier Accuracy:", accuracy_score(ym_test, ym_pred))
print("\nMulti-class Classification Report:")
# zero_division=0 reports 0.00 for classes that are never predicted without
# raising an undefined-metric warning for each affected metric.
print(classification_report(
    ym_test, ym_pred, target_names=attack_label_encoder.classes_, zero_division=0
))


Training binary models...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RandomForest (binary) Accuracy: 0.9777
XGBoost (binary) Accuracy: 0.9785
LogisticRegression (binary) Accuracy: 0.8967


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Binary VotingClassifier Accuracy: 0.978441731948746

Binary Classification Report:
              precision    recall  f1-score   support

      Normal       0.97      0.98      0.98      7400
      Attack       0.99      0.97      0.98      9067

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467


Training multi-class models...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RandomForest (multi) Accuracy: 0.8689
XGBoost (multi) Accuracy: 0.8813
LogisticRegression (multi) Accuracy: 0.8012


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Multi VotingClassifier Accuracy: 0.8760551405841988

Multi-class Classification Report:
                precision    recall  f1-score   support

      Analysis       0.91      0.07      0.14       135
      Backdoor       0.50      0.01      0.02       117
           DoS       0.43      0.48      0.46       818
      Exploits       0.70      0.70      0.70      2227
       Fuzzers       0.70      0.70      0.70      1212
       Generic       0.99      0.98      0.98      3774
        Normal       0.95      0.99      0.97      7400
Reconnaissance       0.90      0.79      0.84       699
     Shellcode       0.60      0.39      0.48        76
         Worms       0.00      0.00      0.00         9

      accuracy                           0.88     16467
     macro avg       0.67      0.51      0.53     16467
  weighted avg       0.87      0.88      0.87     16467



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist every fitted artifact to `save_dir`. All files written by one run
# share the same timestamp suffix so they can be matched up again later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

artifacts_to_save = [
    ('voting_binary_model', voting_bin),
    ('voting_multi_model', voting_multi),
    ('scaler', scaler),
    ('feature_encoders', feature_encoders),
    ('attack_label_encoder', attack_label_encoder),
]
for artifact_stem, artifact in artifacts_to_save:
    target_path = os.path.join(save_dir, f'{artifact_stem}_{timestamp}.pkl')
    joblib.dump(artifact, target_path)

print("Saved models and encoders to:", save_dir)

Saved models and encoders to: /content/drive/MyDrive/UNSW_NB15_IDS_Models


In [None]:
# Final prediction and display with attack names.
# `mode` selects the report style:
#   'binary'            -> binary ensemble only (Normal vs Attack)
#   'multi'             -> multi-class ensemble only (attack category)
#   'binary_then_multi' -> binary ensemble decides Normal/Attack; for samples
#                          flagged as attacks the multi-class ensemble names
#                          the attack type.
valid_modes = ('binary', 'multi', 'binary_then_multi')
if mode not in valid_modes:
    # Previously an unknown mode printed the header and then nothing at all.
    raise ValueError(f"Unknown prediction mode {mode!r}; expected one of {valid_modes}.")

print("--- Predictions on held-out test set ---")
if mode in ('binary', 'binary_then_multi'):
    preds_bin_test = voting_bin.predict(X_test)
if mode in ('multi', 'binary_then_multi'):
    preds_multi_test = voting_multi.predict(X_test)
    decoded_multi_test = attack_label_encoder.inverse_transform(preds_multi_test)

# Print first 50 samples from test set
n_show = min(50, X_test.shape[0])

if mode == 'binary_then_multi':
    # The two ensembles can disagree: binary says "attack" while the multi-class
    # winner is 'Normal', which used to print "Attack detected! — Normal".
    # For the displayed rows, choose the most probable *non-Normal* class
    # instead. Rows whose multi-class winner is already an attack category are
    # unaffected, because masking 'Normal' does not change their argmax.
    attack_names_shown = decoded_multi_test[:n_show]
    class_names = np.asarray(
        attack_label_encoder.inverse_transform(voting_multi.classes_)
    ).astype(str)
    is_normal_class = np.char.lower(class_names) == 'normal'
    if is_normal_class.any() and not is_normal_class.all():
        proba_shown = voting_multi.predict_proba(X_test[:n_show])
        proba_shown[:, is_normal_class] = -1.0  # probabilities are >= 0, so this never wins
        attack_names_shown = class_names[proba_shown.argmax(axis=1)]

for i in range(n_show):
    if mode == 'binary':
        p = preds_bin_test[i]
        print(f"Test sample {i+1}: {'Normal traffic' if p==0 else ' Attack detected!'}")
    elif mode == 'multi':
        label = decoded_multi_test[i]
        if str(label).lower() == 'normal':
            print(f"Test sample {i+1}: Normal traffic")
        else:
            print(f"Test sample {i+1}: Attack detected! — {label}")
    elif mode == 'binary_then_multi':
        p = preds_bin_test[i]
        if p == 0:
            print(f"Test sample {i+1}: Normal traffic")
        else:
            attack_name = attack_names_shown[i]
            print(f"Test sample {i+1}: Attack detected! — {attack_name}")

--- Predictions on held-out test set ---
Test sample 1: Attack detected! — Generic
Test sample 2: Normal traffic
Test sample 3: Attack detected! — Normal
Test sample 4: Normal traffic
Test sample 5: Normal traffic
Test sample 6: Attack detected! — Generic
Test sample 7: Attack detected! — Fuzzers
Test sample 8: Normal traffic
Test sample 9: Attack detected! — Generic
Test sample 10: Normal traffic
Test sample 11: Normal traffic
Test sample 12: Normal traffic
Test sample 13: Attack detected! — Generic
Test sample 14: Normal traffic
Test sample 15: Normal traffic
Test sample 16: Normal traffic
Test sample 17: Attack detected! — Generic
Test sample 18: Attack detected! — Generic
Test sample 19: Normal traffic
Test sample 20: Attack detected! — Generic
Test sample 21: Attack detected! — Normal
Test sample 22: Normal traffic
Test sample 23: Attack detected! — Generic
Test sample 24: Attack detected! — Exploits
Test sample 25: Normal traffic
Test sample 26: Attack detected! — Generic
Test sa