Supervised Baseline (Multi-Layer Perceptron)

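This model matters because Federated Averaging (FedAvg) works by averaging model parameters across clients, which makes it a natural fit for neural networks, whose weights can be averaged directly. A centrally trained MLP with the same architecture is therefore the most direct baseline against which to compare a federated classifier.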

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from collections import Counter

# --- Load the prepared dataset ---
df_sampled = pd.read_csv("Dataset.csv", low_memory=False)

# --- Split the frame into the target column and the candidate features ---
y = df_sampled['Label']
X = df_sampled.drop(columns='Label')

# --- Drop columns that would leak the answer to the model ---
# Every 'Attack Category' column plus 'FTP Command Count' is removed;
# errors='ignore' tolerates columns that are not present in the file.
leaky_cols = [name for name in X.columns if 'Attack Category' in name] + ['FTP Command Count']
X = X.drop(columns=leaky_cols, errors='ignore')

# --- Keep only the rows that actually have a label ---
nan_mask = y.isnull()
X = X.loc[~nan_mask]
y = y.loc[~nan_mask]

# --- Force every feature to be numeric ---
# Values that cannot be parsed as numbers become NaN and are then filled with 0.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# --- Stratified 70/30 train/test split (fixed seed for repeatability) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Data prepared. Training features shape: {X_train.shape}")

Data prepared. Training features shape: (173191, 199)


In [3]:
# --- 0. Reproducibility ---
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable on Restart & Run All (same value as the
# random_state used for the train/test split).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# --- 1. Calculate Class Weights for Imbalance Handling ---
# 'balanced' weights are inversely proportional to class frequency, so errors
# on the rarer class contribute more to the loss.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
# Key each weight by its actual class label rather than by its position in the
# array, and store plain Python numbers so the mapping is exactly what Keras
# expects for `class_weight` (and prints cleanly).
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}
print(f"Class Weights calculated: {class_weight_dict}") # Should show weight for 1 (Attack) is much higher

# --- 2. Define the MLP Model ---
# NOTE(review): no feature scaling is applied in the visible cells; confirm
# that Dataset.csv is already scaled, since MLPs are sensitive to feature scale.
input_dim = X_train.shape[1]

# An explicit Input object replaces the `input_shape=` argument on the first
# Dense layer, which newer Keras versions warn about.
mlp_model = Sequential([
    tf.keras.Input(shape=(input_dim,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # Single probability output for binary classification (0 or 1)
])

# Compile the model
mlp_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# --- 3. Train the Model ---
print("\nStarting Supervised MLP training...")
history = mlp_model.fit(
    X_train, y_train,
    epochs=15, # Use a low number of epochs for a quick baseline
    batch_size=256,
    shuffle=True,
    class_weight=class_weight_dict, # CRITICAL: Apply weights to handle imbalance
    verbose=0
)
print("Supervised MLP training complete.")

Class Weights calculated: {0: np.float64(0.5728238508199216), 1: np.float64(3.9329412299028066)}

Starting Supervised MLP training...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Supervised MLP training complete.


In [4]:
# Score the held-out test set: predicted probabilities first, then hard labels.
y_proba_mlp = mlp_model.predict(X_test, verbose=0)
y_pred_mlp = (y_proba_mlp > 0.5).astype("int32")  # probabilities above 0.5 become class 1

# Label-based metrics are computed from the thresholded predictions ...
accuracy_mlp, precision_mlp, recall_mlp, f1_mlp = [
    metric(y_test, y_pred_mlp)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
# ... while ROC-AUC is computed from the raw probabilities.
roc_auc_mlp = roc_auc_score(y_test, y_proba_mlp)

# Report the centralized MLP baseline
print("\n--- Supervised MLP Centralized Baseline Results ---")
for line in (
    f"Accuracy:  {accuracy_mlp:.4f}",
    f"Precision: {precision_mlp:.4f}",
    f"Recall:    {recall_mlp:.4f}",
    f"F1 Score:  {f1_mlp:.4f}",
    f"ROC-AUC:   {roc_auc_mlp:.4f} (MLP Benchmark!)",
):
    print(line)


--- Supervised MLP Centralized Baseline Results ---
Accuracy:  0.9885
Precision: 0.9170
Recall:    0.9997
F1 Score:  0.9566
ROC-AUC:   0.9995 (MLP Benchmark!)
