
Setup, Load Data, and Clean

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- 1. Load the Cleaned Data ---
# low_memory=False makes pandas parse each column in a single pass, so one dtype
# is inferred per column from the whole file. With the default chunked parsing a
# column can end up holding mixed types, which pandas reports as a DtypeWarning
# pointing at this line.
# NOTE(review): the saved cell output echoes this read_csv line the way a Python
# warning does -- confirm it was that DtypeWarning. The trade-off is higher peak
# memory while the file is being read.
df_sampled = pd.read_csv("Dataset.csv", low_memory=False)

# --- 2. Separate Features (X) and Target (y) ---
X = df_sampled.drop('Label', axis=1)
y = df_sampled['Label']

# --- 3. Re-apply the Feature Cleanup (CRITICAL!) ---
# Drop the leaky features (Attack Category OHE columns and FTP Command Count)
leaky_cols = [col for col in X.columns if 'Attack Category' in col]
leaky_cols.append('FTP Command Count')
# errors='ignore' turns the drop into a no-op for any listed column that is absent.
X = X.drop(columns=leaky_cols, errors='ignore')

# --- 4. Critical Fix: Remove NaN in Target (y) ---
# Ensure y is clean before stratification.
# X and y still share df_sampled's index here, so one boolean mask filters both
# and keeps rows aligned.
nan_mask = y.isnull()
if nan_mask.any():
    print(f"Warning: Found {nan_mask.sum()} rows with NaN in the 'Label' column. Dropping these rows.")
    X = X[~nan_mask]
    y = y[~nan_mask]

# --- 5. Split the Data (70% Train, 30% Test, stratified) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y  # Ensures the attack ratio is maintained
)

print(f"Data prepared. Training features shape: {X_train.shape}")

  df_sampled = pd.read_csv("Dataset.csv")


Data prepared. Training features shape: (69278, 199)


Train and Evaluate Random Forest

In [2]:
# Configure the centralized Random Forest baseline.
# class_weight='balanced' is the imbalance handling; n_jobs=-1 uses all cores.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, announcing start and end.
print("\nStarting Random Forest training...")
rf_model.fit(X_train, y_train)
print("Random Forest training complete.")

# Hard labels for the threshold metrics; column 1 of predict_proba for ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# The four label-based metrics share one call signature, so compute them together.
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is scored from the probabilities rather than the hard labels.
roc_auc_rf = roc_auc_score(y_test, y_proba_rf)

# Emit the baseline report in one write; the text matches line-by-line printing.
report_lines = [
    "\n--- Random Forest Centralized Baseline Results ---",
    f"Accuracy:  {accuracy_rf:.4f}",
    f"Precision: {precision_rf:.4f}",
    f"Recall:    {recall_rf:.4f} (Crucial for security!)",
    f"F1 Score:  {f1_rf:.4f}",
    f"ROC-AUC:   {roc_auc_rf:.4f} (RF Benchmark!)",
]
print("\n".join(report_lines))


Starting Random Forest training...
Random Forest training complete.

--- Random Forest Centralized Baseline Results ---
Accuracy:  0.9925
Precision: 0.9701
Recall:    0.9706 (Crucial for security!)
F1 Score:  0.9704
ROC-AUC:   0.9996 (RF Benchmark!)
