Load Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- Load the Cleaned Data ---
# low_memory=False makes pandas infer every column's dtype from the whole file.
# With the default chunked parsing, a column can be typed differently from one
# chunk to the next, which pandas reports as a mixed-type warning; the warning
# echoed for this read_csv line in the saved output is presumably that one.
df_sampled = pd.read_csv("Dataset.csv", low_memory=False)

# --- Separate Features (X) and Target (y) ---
X = df_sampled.drop('Label', axis=1)
y = df_sampled['Label']

# --- Re-apply the Feature Cleanup (CRITICAL!) ---
# Drop every column whose name contains 'Attack Category', plus
# 'FTP Command Count'. The notebook treats these as label-leaking features.
# errors='ignore' keeps the cell re-runnable when a listed column is absent.
leaky_cols = [col for col in X.columns if 'Attack Category' in col]
leaky_cols.append('FTP Command Count')
X = X.drop(columns=leaky_cols, errors='ignore')

# -------------------------------------------------------------------
# CRITICAL FIX: Remove rows where the target (y) is NaN
#
# An earlier run failed because of null values in the 'Label' column. This can
# still happen after data cleaning, for example when several datasets have
# been merged, so the rows are filtered here before the stratified split.
# -------------------------------------------------------------------
nan_mask = y.isnull()
if nan_mask.any():
    print(f"Warning: Found {nan_mask.sum()} rows with NaN in the 'Label' column. Dropping these rows.")
    X = X[~nan_mask] # Keep rows where nan_mask is False
    y = y[~nan_mask] # Keep rows where nan_mask is False
else:
    print("SUCCESS: Target variable 'y' is clean (no NaNs found).")
# -------------------------------------------------------------------


# --- Re-split the Data (70% Train, 30% Test, stratified) ---
# random_state pins the shuffle so the split is the same on every run;
# stratify=y keeps the label proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y  # Stratification is now safe
)

print(f"Data re-loaded and split. Training features size: {X_train.shape}")

  df_sampled = pd.read_csv("Dataset.csv")


Data re-loaded and split. Training features size: (271502, 199)


Prepare Data for Autoencoder

In [3]:
# The autoencoder is fitted on Normal traffic only, so keep just the training
# rows whose label equals 0.
normal_rows = y_train == 0
X_train_normal = X_train.loc[normal_rows]

normal_row_count = X_train_normal.shape[0]
print(f"Autoencoder Training Data size (Normal only): {normal_row_count} rows")

Autoencoder Training Data size (Normal only): 237142 rows


Define and Train the Autoencoder

In [4]:
# Define the Autoencoder Model Architecture
#
# Seed Python's `random`, NumPy and TensorFlow before any layer is created so
# that weight initialisation and batch shuffling repeat on every
# Restart & Run All. Without this, the threshold and metrics computed from the
# trained model change from run to run. 42 matches the random_state used for
# the train/test split.
tf.keras.utils.set_random_seed(42)

input_dim = X_train_normal.shape[1] # The number of features

# Encoder: input_dim -> 64 -> 32 -> 16
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(32, activation='relu')(encoded)
bottleneck = Dense(16, activation='relu', name='bottleneck')(encoded) # Bottleneck layer

# Decoder (Mirror Image): 16 -> 32 -> 64 -> input_dim
decoded = Dense(32, activation='relu')(bottleneck)
decoded = Dense(64, activation='relu')(decoded)
# Linear output so the reconstruction is not clipped to any fixed range.
output_layer = Dense(input_dim, activation='linear')(decoded)

# Build the model; mean squared error is the reconstruction loss.
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

print("Starting Autoencoder training...")
# The same frame is passed as input and target: the network learns to
# reproduce Normal rows.
history = autoencoder.fit(
    X_train_normal, X_train_normal, # Input = Output
    epochs=50,
    batch_size=256,
    shuffle=True,
    verbose=0
)
print("Autoencoder training complete.")

Starting Autoencoder training...
Autoencoder training complete.


Evaluate the Autoencoder Baseline

In [5]:
# Step 1 - derive the anomaly threshold from the Normal training rows.
# The per-row reconstruction error is the mean squared difference between a
# row and its reconstruction; the threshold is the 99th percentile of it.
reconstructions_train = autoencoder.predict(X_train_normal)
squared_error_train = np.power(X_train_normal - reconstructions_train, 2)
mse_train = np.mean(squared_error_train, axis=1)
threshold = np.percentile(mse_train, 99) # 99th percentile threshold
print(f"\nCalculated Anomaly Threshold (99th percentile): {threshold:.6f}")

# Step 2 - score the held-out test set (Normal and Attack rows together).
reconstructions_test = autoencoder.predict(X_test)
squared_error_test = np.power(X_test - reconstructions_test, 2)
mse_test = np.mean(squared_error_test, axis=1)

# A row is flagged as Attack (1) when its error exceeds the threshold,
# otherwise it is treated as Normal (0).
exceeds_threshold = mse_test > threshold
y_pred_ae = exceeds_threshold.astype(int)

# Step 3 - metrics. The thresholded metrics use the hard predictions;
# ROC-AUC is computed from the raw reconstruction error instead.
accuracy_ae = accuracy_score(y_test, y_pred_ae)
precision_ae = precision_score(y_test, y_pred_ae)
recall_ae = recall_score(y_test, y_pred_ae)
f1_ae = f1_score(y_test, y_pred_ae)
roc_auc_ae = roc_auc_score(y_test, mse_test)

# Step 4 - report the unsupervised baseline.
print("\n--- Autoencoder Centralized Baseline Results (Unsupervised) ---")
baseline_report = (
    ("Accuracy:  ", accuracy_ae),
    ("Precision: ", precision_ae),
    ("Recall:    ", recall_ae),
    ("F1 Score:  ", f1_ae),
    ("ROC-AUC:   ", roc_auc_ae),
)
for metric_label, metric_value in baseline_report:
    print(f"{metric_label}{metric_value:.4f}")

[1m7411/7411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step

Calculated Anomaly Threshold (99th percentile): 0.000056
[1m3637/3637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step

--- Autoencoder Centralized Baseline Results (Unsupervised) ---
Accuracy:  0.9760
Precision: 0.9235
Recall:    0.8839
F1 Score:  0.9033
ROC-AUC:   0.9937


Next, to turn this into a federated setup, we need to split the data across simulated clients (one local dataset per client).

In [6]:
#Prepare Data for FL Clients
# -------------------------------------------------------------------

# We will use the cleaned X and y variables.
# Re-imported here so this cell does not depend on import order.
import numpy as np
from sklearn.model_selection import train_test_split

NUM_CLIENTS = 10
client_data_splits = []

# Split the data into NUM_CLIENTS contiguous chunks.
# np.array_split is applied to row *positions* rather than to the DataFrame /
# Series, then each chunk is taken with .iloc. This gives the same chunks
# (same sizes, same order) while avoiding the deprecated pandas `swapaxes`
# path that np.array_split uses when handed pandas objects.
# NOTE(review): chunks follow the current row order of X, with no shuffling.
# If the CSV is ordered by class or time, clients will see different label
# distributions - confirm that is intended.
# NOTE(review): X here is the full cleaned dataset, so client chunks also
# contain rows from the global X_test split made earlier - confirm that this
# overlap is acceptable for the comparison with the centralized baseline.
chunk_positions = np.array_split(np.arange(len(X)), NUM_CLIENTS)
X_chunks = [X.iloc[positions] for positions in chunk_positions]
y_chunks = [y.iloc[positions] for positions in chunk_positions]

for i in range(NUM_CLIENTS):
    # Split each client's data into their local train and test sets
    # We use a 70/30 split for each client's local data
    X_train_client, X_test_client, y_train_client, y_test_client = train_test_split(
        X_chunks[i], y_chunks[i],
        test_size=0.3,
        random_state=42,
        # Keeps the label ratio equal between this client's train and test
        # parts. It does not equalise the ratio across different clients.
        stratify=y_chunks[i]
    )

    client_data_splits.append({
        'X_train': X_train_client,
        'y_train': y_train_client,
        'X_test': X_test_client,
        'y_test': y_test_client,
    })

print(f"Data split across {NUM_CLIENTS} simulated clients.")
print(f"Example Client 1 Training Data Size: {client_data_splits[0]['X_train'].shape[0]} rows")

  return bound(*args, **kwds)
  return bound(*args, **kwds)


Data split across 10 simulated clients.
Example Client 1 Training Data Size: 27150 rows
