Setup, Load Data, and Clean

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM  # The missing import!

# -------------------------------------------------------------------
# 0. Setup, Load Data, and Clean
# -------------------------------------------------------------------

# --- 1. Read the dataset from disk ---
df_sampled = pd.read_csv("Dataset.csv", low_memory=False)

# --- 2. Split the frame into the target column (y) and the features (X) ---
# df_sampled itself is left unmodified; drop() returns a new frame.
y = df_sampled['Label']
X = df_sampled.drop(columns=['Label'])

# --- 3. Remove the features treated as leaky (CRITICAL!) ---
# Every column whose name contains 'Attack Category' (the one-hot encoded
# attack category) plus 'FTP Command Count' is dropped. errors='ignore'
# means a column that is already absent does not raise.
leaky_cols = [name for name in X.columns if 'Attack Category' in name] + ['FTP Command Count']
X = X.drop(columns=leaky_cols, errors='ignore')

# --- 4. Discard rows whose target value is missing ---
# X and y are filtered with the same boolean mask so they stay row-aligned.
nan_mask = y.isnull()
n_missing_labels = nan_mask.sum()
if n_missing_labels:
    print(f"Warning: Found {n_missing_labels} rows with NaN in the 'Label' column. Dropping these rows.")
    has_label = ~nan_mask
    X, y = X[has_label], y[has_label]

# -------------------------------------------------------------------
# 5. CRITICAL FIX: Ensure ALL remaining features are numeric
# This fixes the "could not convert string to float: 'False'" error.
# -------------------------------------------------------------------
# Boolean flags stored as text ('True' / 'False') must keep their meaning.
# Plain coercion would turn BOTH spellings into NaN, and the zero-fill below
# would then collapse them to the same value, erasing the feature.
_BOOL_TEXT_TO_INT = {'true': 1, 'false': 0}


def _coerce_feature_to_numeric(column):
    """Return *column* converted to a numeric Series.

    Columns that are already numeric are passed straight to
    ``pd.to_numeric``. For any other column, values whose text form is
    'true' or 'false' (case-insensitive, surrounding whitespace ignored)
    are first replaced by 1 and 0. Whatever still cannot be parsed as a
    number becomes NaN.
    """
    if not pd.api.types.is_numeric_dtype(column):
        # NaN wherever the value is not boolean text, 1/0 where it is.
        flags = column.astype(str).str.strip().str.lower().map(_BOOL_TEXT_TO_INT)
        # Keep the original value where no flag was recognised.
        column = column.astype(object).where(flags.isna(), flags)
    return pd.to_numeric(column, errors='coerce')


# apply() builds a fresh DataFrame, so nothing is assigned into a slice of
# an earlier frame (avoids pandas' SettingWithCopyWarning).
X = X.apply(_coerce_feature_to_numeric)

# Values that were neither numbers nor boolean text are NaN now; fill with 0.
X = X.fillna(0)
print("SUCCESS: Final feature matrix X is completely numerical.")
# -------------------------------------------------------------------

# --- 6. Hold out 30% of the rows for testing (70% remain for training) ---
# stratify=y asks for the same label proportions in both partitions;
# the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- 7. Build the unsupervised training set: rows labelled 0 (Normal) only ---
is_normal = y_train == 0
X_train_normal = X_train.loc[is_normal]

# --- 8. Contamination rate: share of training rows labelled 1 ---
# This value is later passed to the One-Class SVM as its nu parameter.
y_train_counts = Counter(y_train)
n_labelled_one = y_train_counts[1]
contamination_rate = n_labelled_one / len(y_train)

n_normal_rows = X_train_normal.shape[0]
print(f"Data prepared. Unsupervised training size (Normal only): {n_normal_rows} rows")
print(f"Calculated Contamination Rate (nu): {contamination_rate:.4f}")

# -------------------------------------------------------------------
# 9. Train and Evaluate One-Class SVM (OCSVM)
# -------------------------------------------------------------------

# Kernel SVM training time grows steeply (at least quadratically) with the
# number of rows, so fitting on several hundred thousand rows may never
# finish. Cap the training set with a reproducible random subsample.
OCSVM_MAX_TRAIN_ROWS = 50_000
if len(X_train_normal) > OCSVM_MAX_TRAIN_ROWS:
    X_train_ocsvm = X_train_normal.sample(n=OCSVM_MAX_TRAIN_ROWS, random_state=42)
    print(f"Subsampled Normal training data to {OCSVM_MAX_TRAIN_ROWS} rows for OCSVM.")
else:
    X_train_ocsvm = X_train_normal

# SVMs are not scale invariant: with an RBF kernel, features with large
# numeric ranges dominate the distance computation. Standardise inside a
# pipeline so the scaler is fitted on the training rows only and the very
# same transform is applied at prediction time. The pipeline keeps the
# fit()/predict() interface on raw features; the bare estimator is
# available as ocsvm_model[-1].
ocsvm_model = make_pipeline(
    StandardScaler(),
    OneClassSVM(kernel='rbf', nu=contamination_rate),
)

print("\nStarting One-Class SVM training...")
# Note: This step is computationally heavy and will take time.
ocsvm_model.fit(X_train_ocsvm)
print("One-Class SVM training complete (on Normal data).")

# Predict on the Test Set
y_pred_ocsvm = ocsvm_model.predict(X_test)
# Convert prediction outputs: 1 (Normal) -> 0, -1 (Anomaly) -> 1
y_pred_ocsvm = np.where(y_pred_ocsvm == 1, 0, 1)

# Score the converted predictions against the held-out labels.
# The four metrics are evaluated in the same order as they are reported.
accuracy_ocsvm, precision_ocsvm, recall_ocsvm, f1_ocsvm = (
    metric(y_test, y_pred_ocsvm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n--- One-Class SVM Centralized Baseline Results ---")
# Each prefix carries its own padding so the values line up in one column.
for prefix, value in (
    ("Accuracy:  ", accuracy_ocsvm),
    ("Precision: ", precision_ocsvm),
    ("Recall:    ", recall_ocsvm),
    ("F1 Score:  ", f1_ocsvm),
):
    print(f"{prefix}{value:.4f}")

SUCCESS: Final feature matrix X is completely numerical.
Data prepared. Unsupervised training size (Normal only): 305729 rows
Calculated Contamination Rate (nu): 0.1265

Starting One-Class SVM training...
