<a href="https://colab.research.google.com/github/Khalidaman9555/IDS-AI/blob/main/ML_Models_RF_DT_LG_XGboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Model-specific imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier # Make sure xgboost is installed: pip install xgboost

import os
import joblib
import pickle
import h5py

# --- Configuration ---
# Both the input CSV and the output directory live under the Google Drive mount point.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results"
RESULTS_FILE_PATH = os.path.join(OUTPUT_PATH, "classification_models_results.txt")

# Create the results directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# --- Load Dataset ---
print(f"Loading dataset: {DATASET_PATH}")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, avoiding mixed-type columns.
df = pd.read_csv(DATASET_PATH, low_memory=False)
print("Dataset loaded successfully.")
print(f"Initial dataset shape: {df.shape}")

# --- Preprocessing ---
# Map +/-inf to NaN so that dropna() removes those rows as well.
# np.nan is used rather than pd.NA on purpose: writing pd.NA into a float column can
# upcast that column to object dtype, and the numeric-dtype feature selection performed
# later would then silently discard the column.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
print(f"Dataset shape after dropping NA: {df.shape}")

if df.empty:
    print("Dataset is empty after dropping NA values. Exiting.")
    # raise SystemExit instead of calling exit(): under IPython/Jupyter `exit` is a
    # shell helper that may not stop the current cell, so the code below could still run.
    raise SystemExit(1)

# --- Feature Selection (Consistent with previous script) ---
# Column names that must never be used as model features. The list is applied later
# with a filter on the columns actually present in `df`, so it may (and does) contain
# names that this particular CSV does not have. Target/label columns are deliberately
# not listed here; they are merged in separately when the final drop list is built.
drop_cols = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4",
             "arp.dst.proto_ipv4", "http.file_data", "http.request.full_uri",
             "icmp.transmit_timestamp", "tcp.options", "tcp.payload",
             "mqtt.conack.flags", "mqtt.msg", "mqtt.protoname", "mqtt.topic",
             "mqtt.uuid", "mqtt.conflags",
             # "Attack_label", "Attack_type", "Label", # These are handled separately or are targets
             "icmp.unused", "http.request.method", "http.referer", "http.request.version",
             "dns.qry.name", "dns.resp.name", "tcp.flags", "udp.port", "tcp.port",
             "mqtt.conack.flags_tree",
             "tcp.options.mss", "tcp.window_size", "tcp.hdr_len", "tcp.seq", "tcp.ack",
             "ip.src", "ip.dst", "arp.opcode", "arp.hw.type", "arp.src.hw_mac",
             "arp.dst.hw_mac", "icmp.type", "icmp.code", "icmp.checksum",
             "icmp.ident", "icmp.seq_le", "udp.srcport", "udp.dstport", "udp.checksum",
             "dns.id", "dns.flags.response", "dns.flags.opcode", "dns.flags.authoritative",
             "dns.flags.truncated", "dns.flags.recursion_desired", "dns.flags.recursion_available",
             "dns.flags.z", "dns.flags.authenticated", "dns.flags.checking_disabled", "dns.flags.rcode",
             "dns.count.queries", "dns.count.answers", "dns.count.auth_rr", "dns.count.add_rr",
             "mqtt.clientid", "mqtt.qos", "mqtt.retain", "mqtt.dupflag", "mqtt.sessionpresent",
             "mqtt.proto_len", "mqtt.topic_len", "mqtt.ver", "mqtt.willmsg", "mqtt.willtopic",
             "mqtt.dup", "mqtt.msgtype", "mqtt.kalive", "mqtt.msgid", "mqtt.password",
             "mqtt.username", "mqtt.client_id_len",
             "mqtt.topic_val", "mqtt.msg_len",
             "mqtt.payload", "mqtt.ciphersuite", "mqtt.pk_id", "mqtt.reason_code", "mqtt.session_expiry_interval",
             "mqtt.will_flag", "mqtt.will_qos", "mqtt.will_retain", "mqtt.will_message_len", "mqtt.will_message",
             "mqtt.will_topic_len", "mqtt.will_topic", "mqtt.var_header.length", "mqtt.var_header.qos",
             "mqtt.var_header.retain", "mqtt.var_header.dup", "mqtt.var_header.message_identifier",
             "mqtt.var_header.topic_name_length", "mqtt.var_header.topic_name", "mqtt.var_header.packet_identifier",
             "mqtt.var_header.properties.message_expiry_interval", "mqtt.var_header.properties.content_type",
             "mqtt.var_header.properties.correlation_data", "mqtt.var_header.properties.payload_format_indicator",
             "mqtt.var_header.properties.request_response_information", "mqtt.var_header.properties.response_topic",
             "mqtt.var_header.properties.session_expiry_interval", "mqtt.var_header.properties.subscription_identifier",
             "mqtt.var_header.properties.topic_alias", "mqtt.var_header.properties.user_property",
             "mqtt.var_header.properties.will_delay_interval", "mqtt.var_header.properties.will_payload_format_indicator",
             "mqtt.var_header.properties.will_content_type", "mqtt.var_header.properties.will_response_topic",
             "mqtt.var_header.properties.will_correlation_data", "mqtt.var_header.properties.will_user_property",
             "mqtt.var_header.properties.will_subscription_identifier", "mqtt.var_header.properties.will_topic_alias",
             "mqtt.var_header.properties.will_retained_message", "mqtt.var_header.properties.will_message_expiry_interval",
             "mqtt.var_header.properties.will_content_type_len", "mqtt.var_header.properties.will_content_type_val",
             "mqtt.var_header.properties.will_response_topic_len", "mqtt.var_header.properties.will_response_topic_val",
             "mqtt.var_header.properties.will_correlation_data_len", "mqtt.var_header.properties.will_correlation_data_val",
             "mqtt.var_header.properties.will_user_property_len", "mqtt.var_header.properties.will_user_property_val",
             "mqtt.var_header.properties.will_subscription_identifier_len", "mqtt.var_header.properties.will_subscription_identifier_val",
             "mqtt.var_header.properties.will_topic_alias_len", "mqtt.var_header.properties.will_topic_alias_val",
             "mqtt.var_header.properties.will_retained_message_len", "mqtt.var_header.properties.will_retained_message_val",
             "mqtt.var_header.properties.will_message_expiry_interval_len", "mqtt.var_header.properties.will_message_expiry_interval_val"
            ]

# Label-like columns. They must never reach the feature matrix, otherwise the models
# would be trained on (a function of) the answer.
potential_target_cols = ["Attack_label", "Attack_type", "Label"]
# De-duplicated, deterministically ordered list of everything excluded from X.
cols_to_drop_for_X = sorted(set(drop_cols + potential_target_cols))

# The binary target is mandatory; stop immediately if the CSV does not carry it.
if "Attack_label" not in df.columns:
    print("Target variable 'Attack_label' not found. Exiting.")
    # raise SystemExit instead of calling exit(): under IPython/Jupyter `exit` is a
    # shell helper that may not stop the current cell.
    raise SystemExit(1)
y = df["Attack_label"]

# Create X by dropping specified columns and selecting numeric types.
# errors="ignore" already skips names that are absent from df, so no separate
# membership filter is needed.
X_candidate_features = df.drop(columns=cols_to_drop_for_X, errors="ignore")
X = X_candidate_features.select_dtypes(include=np.number)

if X.empty:
    print("No numeric features available after selection. Exiting.")
    raise SystemExit(1)

feature_names_list = X.columns.tolist() # Save for HDF5 attributes
print(f"Selected features for training: {feature_names_list}")
print(f"Number of selected features: {len(feature_names_list)}")

# --- Data Splitting ---
# Stratified splitting needs at least two classes in the target.
if y.empty or len(y.unique()) < 2:
    print("Target variable y is empty or has only one class. Stratified splitting not possible. Exiting.")
    # raise SystemExit instead of calling exit(): under IPython/Jupyter `exit` is a
    # shell helper that may not stop the current cell, and train_test_split below
    # would then fail with a less helpful error.
    raise SystemExit(1)

# Two-stage stratified split giving 70% train / 15% validation / 15% test:
#   1) hold out 15% of all rows as the test set;
#   2) take 0.15/0.85 of the remaining 85% (i.e. 15% of the total) as validation.
# The fixed random_state keeps the partition reproducible across runs.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=(0.15/0.85), random_state=42, stratify=y_train_val
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# --- Feature Scaling ---
# The scaler's statistics are learned from the training split only and then re-used
# unchanged on the validation and test splits, so no held-out information leaks in.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)
print("Feature scaling complete.")

# Persist the fitted scaler once; every model in this script shares it.
SCALER_SAVE_PATH_JOBLIB = os.path.join(OUTPUT_PATH, "common_scaler.joblib")
SCALER_SAVE_PATH_PKL = os.path.join(OUTPUT_PATH, "common_scaler.pkl")

# joblib copy
joblib.dump(scaler, SCALER_SAVE_PATH_JOBLIB)
print(f"Scaler saved to {SCALER_SAVE_PATH_JOBLIB}")

# plain pickle copy
with open(SCALER_SAVE_PATH_PKL, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved to {SCALER_SAVE_PATH_PKL}")

# In-memory pickle of the scaler, embedded later into each model's HDF5 file.
scaler_bytes = pickle.dumps(scaler)

# --- Master Results String ---
# Header of the consolidated text report: dataset, feature list and scaler locations.
# It stays a plain str because per-model sections are appended to it afterwards.
all_models_results_summary = "".join([
    "Classification Models Performance Results:\n",
    f"Dataset: {DATASET_PATH}\n",
    f"Number of features used: {len(feature_names_list)}\n",
    f"Features: {feature_names_list}\n\n",
    f"Scaler saved to: {SCALER_SAVE_PATH_JOBLIB}, {SCALER_SAVE_PATH_PKL}\n\n",
])

# --- Model Definitions ---
# Display name -> unfitted estimator. The name is also lower-cased to build the
# artefact file names, and dict order is the training order.
# Every estimator gets random_state=42 so repeated runs are comparable.
models_to_train = {
    # n_jobs=-1: build the forest's trees on all available cores.
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42, solver='liblinear', max_iter=1000),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost no longer
    # uses it and only emits a "Parameters: { "use_label_encoder" } are not used." warning.
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'),
}

# --- Iterate Through Models: Train, Evaluate, Save ---
# For every configured estimator: fit on the scaled training split, persist it in
# three formats, score it on the validation and test splits, and append one section
# to the consolidated report string.
for model_name, model_instance in models_to_train.items():
    print(f"\n--- Training {model_name} ---")
    model_instance.fit(X_train, y_train)
    print(f"{model_name} training complete.")

    # --- Save Model (Joblib, Pickle, HDF5) ---
    # NOTE: all three artefacts are pickle-based; only load them from trusted locations.
    file_stem = model_name.lower()  # shared prefix of every artefact written for this model
    model_joblib_path = os.path.join(OUTPUT_PATH, f"{file_stem}_model.joblib")
    model_pkl_path = os.path.join(OUTPUT_PATH, f"{file_stem}_model.pkl")
    model_h5_path = os.path.join(OUTPUT_PATH, f"{file_stem}_model_and_scaler.h5")

    # Joblib
    joblib.dump(model_instance, model_joblib_path)
    print(f"Saved {model_name} to {model_joblib_path}")
    # Pickle
    with open(model_pkl_path, 'wb') as f:
        pickle.dump(model_instance, f)
    print(f"Saved {model_name} to {model_pkl_path}")
    # HDF5: best effort. A failure here must not abort training of the remaining
    # models, but it is remembered so the report does not claim a file that was
    # never written.
    h5_error = None
    try:
        model_bytes = pickle.dumps(model_instance)
        with h5py.File(model_h5_path, 'w') as h5f:
            # Pickled objects are stored as opaque byte blobs (np.void).
            h5f.create_dataset(f'{file_stem}_model', data=np.void(model_bytes))
            h5f.create_dataset('scaler', data=np.void(scaler_bytes)) # Common scaler bytes
            h5f.attrs['feature_names'] = feature_names_list # Direct list of strings
    except Exception as e:
        h5_error = e  # keep a reference; `e` is unbound once the except block ends
        print(f"Error saving {model_name} to HDF5 ({model_h5_path}): {e}")
    else:
        print(f"Saved {model_name} and scaler to {model_h5_path}")

    # --- Evaluate Model ---
    # Validation Set
    y_val_pred = model_instance.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    # zero_division=0: report 0 instead of warning when a class is never predicted.
    val_report = classification_report(y_val, y_val_pred, zero_division=0)
    print(f"\n{model_name} Validation Accuracy: {val_accuracy:.4f}")
    print(f"{model_name} Validation Classification Report:\n{val_report}")

    # Test Set
    y_test_pred = model_instance.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred, zero_division=0)
    print(f"\n{model_name} Test Accuracy: {test_accuracy:.4f}")
    print(f"{model_name} Test Classification Report:\n{test_report}")

    # --- Append to Master Results String ---
    if h5_error is None:
        h5_status = f"Model & Scaler saved (HDF5): {model_h5_path}\n\n"
    else:
        h5_status = f"Model & Scaler HDF5 save FAILED ({model_h5_path}): {h5_error}\n\n"

    all_models_results_summary += "".join([
        f"--- {model_name} Model Results ---\n",
        f"Model saved (joblib): {model_joblib_path}\n",
        f"Model saved (pickle): {model_pkl_path}\n",
        h5_status,
        f"Validation Accuracy: {val_accuracy:.4f}\n",
        f"Validation Classification Report:\n{val_report}\n\n",
        f"Test Accuracy: {test_accuracy:.4f}\n",
        f"Test Classification Report:\n{test_report}\n",
        "---------------------------------------\n\n",
    ])

# --- Save Consolidated Results to File ---
# Writing the report is best effort (the models are already on disk by now), but the
# closing message must reflect whether the write actually succeeded.
try:
    # Explicit encoding so the report does not depend on the platform default.
    with open(RESULTS_FILE_PATH, "w", encoding="utf-8") as f:
        f.write(all_models_results_summary)
except OSError as e:
    # Only filesystem errors are expected here; anything else should surface normally.
    print(f"Error writing results to file: {e}")
    print("\nScript finished, but the results file could not be written.")
else:
    print(f"\nAll model results saved to {RESULTS_FILE_PATH}")
    print("\nScript finished successfully.")

Loading dataset: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Dataset loaded successfully.
Initial dataset shape: (157800, 63)
Dataset shape after dropping NA: (157800, 63)
Selected features for training: ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Number of selected features: 27
Training set size: 110460
Validation set size: 23670
Test set size: 23670
Feature scaling complete.
Scaler saved to /content/drive/MyDrive/Colab Notebooks/results/common_scaler.joblib
Scaler saved to /content/drive/MyDrive/Colab Note

Parameters: { "use_label_encoder" } are not used.



XGBoost training complete.
Saved XGBoost to /content/drive/MyDrive/Colab Notebooks/results/xgboost_model.joblib
Saved XGBoost to /content/drive/MyDrive/Colab Notebooks/results/xgboost_model.pkl
Saved XGBoost and scaler to /content/drive/MyDrive/Colab Notebooks/results/xgboost_model_and_scaler.h5

XGBoost Validation Accuracy: 0.9743
XGBoost Validation Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3645
           1       0.98      0.99      0.98     20025

    accuracy                           0.97     23670
   macro avg       0.95      0.95      0.95     23670
weighted avg       0.97      0.97      0.97     23670


XGBoost Test Accuracy: 0.9775
XGBoost Test Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3645
           1       0.98      0.99      0.99     20025

    accuracy                           0.98     23670
   macro avg

In [None]:
# Mount Google Drive at /content/drive.
# The training cell reads its dataset from, and writes all of its artefacts to, paths
# under /content/drive/MyDrive, so this mount must be in place before that cell is run.
from google.colab import drive
drive.mount('/content/drive')