Setup, Load Data, and Clean

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- 1. Load the Cleaned Data ---
df_sampled = pd.read_csv("Dataset.csv")

# --- 2. Separate Features (X) and Target (y) ---
X = df_sampled.drop('Label', axis=1)
y = df_sampled['Label']

# --- 3. Re-apply the Feature Cleanup (CRITICAL!) ---
leaky_cols = [col for col in X.columns if 'Attack Category' in col]
leaky_cols.append('FTP Command Count')
X = X.drop(columns=leaky_cols, errors='ignore')

# --- 4. Critical Fix: Remove NaN in Target (y) ---
nan_mask = y.isnull()
if nan_mask.any():
    X = X[~nan_mask]
    y = y[~nan_mask]

# --- 5. Split the Data (70% Train, 30% Test, stratified) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# --- 6. Prepare Unsupervised Training Data (Normal only) ---
X_train_normal = X_train[y_train == 0]

print(f"Data prepared. Unsupervised training size (Normal only): {X_train_normal.shape[0]} rows")

  df_sampled = pd.read_csv("Dataset.csv")


Data prepared. Unsupervised training size (Normal only): 15148 rows


Train and Evaluate

In [2]:
# Calculate the contamination (Outlier Fraction) for Isolation Forest
contamination_rate = y_train.value_counts()[1] / y_train.shape[0]

# Initialize and train the Isolation Forest model
if_model = IsolationForest(
    n_estimators=100,
    contamination=contamination_rate, # Tells the model the expected ratio of anomalies
    random_state=42,
    n_jobs=-1
)

# IF is trained on the entire training set (X_train), it doesn't need to be normal-only,
# but we will train it on normal only for a direct comparison with OCSVM and AE.
if_model.fit(X_train_normal)
print("\nIsolation Forest training complete (on Normal data).")

# Predict on the Test Set (1 for Normal, -1 for Anomaly). We convert to 0 and 1.
y_pred_if = if_model.predict(X_test)
y_pred_if = np.where(y_pred_if == 1, 0, 1) # Convert 1 (Normal) to 0, and -1 (Anomaly) to 1

# Note: IF doesn't easily provide probabilities for ROC-AUC, so we skip it for now.

# Calculate Metrics
accuracy_if = accuracy_score(y_test, y_pred_if)
precision_if = precision_score(y_test, y_pred_if)
recall_if = recall_score(y_test, y_pred_if)
f1_if = f1_score(y_test, y_pred_if)

print("\n--- Isolation Forest Centralized Baseline Results ---")
print(f"Accuracy:  {accuracy_if:.4f}")
print(f"Precision: {precision_if:.4f}")
print(f"Recall:    {recall_if:.4f}")
print(f"F1 Score:  {f1_if:.4f}")


Isolation Forest training complete (on Normal data).

--- Isolation Forest Centralized Baseline Results ---
Accuracy:  0.8977
Precision: 0.5502
Recall:    0.9978
F1 Score:  0.7093
