In [1]:
# INSTALLATION AND ENVIRONMENT SETUP
!pip install -U --force-reinstall "flwr[simulation]" scikit-learn numpy pandas

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting numpy
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flwr[simulation]
  Downloading flwr-1.22.0-py3-none-any.whl.metadata (14 kB)
Collecting click<8.2.0 (from flwr[simulation])
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting cryptography<45.0.0,>=44.0.1 (from flwr[simulation])
  Downloading cryptography-44.0.3-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting grpcio!=1.65.0

In [1]:
# -------------------------------------------------------------------
# BLOCK 1: DATA PREPARATION AND PARTITIONING
# -------------------------------------------------------------------

# 0. Setup and Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from collections import Counter
import flwr as fl
import pickle
from flwr.server.strategy import Strategy
from typing import List, Optional, Tuple, Union
from flwr.common import Parameters

print("1. Loading and Cleaning Data...")
df_sampled = pd.read_csv("Dataset.csv", low_memory=False)

# A. Separate Features and Target
X = df_sampled.drop('Label', axis=1)
y = df_sampled['Label']

# B. Cleanup: Drop leaky columns, handle NaNs
leaky_cols = [col for col in X.columns if 'Attack Category' in col]
leaky_cols.append('FTP Command Count')
X = X.drop(columns=leaky_cols, errors='ignore')

nan_mask = y.isnull()
X = X[~nan_mask]; y = y[~nan_mask]

for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')
X = X.fillna(0)

# C. Define global variables and CRITICAL type casting
input_dim = X.shape[1]
NUM_CLIENTS = 10

# Define the global test set (CRITICAL: Cast to float32 for model consistency)
X_test_global_np = X.to_numpy().astype('float32')
y_test_global_np = y.to_numpy().astype('float32')

# D. Prepare FL Data Splits
print("2. Partitioning Data for FL Clients...")
client_data_splits = []
X_chunks = np.array_split(X, NUM_CLIENTS)
y_chunks = np.array_split(y, NUM_CLIENTS)

for i in range(NUM_CLIENTS):
    X_train_client, _, y_train_client, _ = train_test_split(
        X_chunks[i], y_chunks[i],
        test_size=0.3, random_state=42, stratify=y_chunks[i]
    )
    # Isolation Forest trains on the local client's training data
    X_train_fif = X_train_client.to_numpy().astype('float32')

    client_data_splits.append({
        'X_train_fif': X_train_fif,
    })
print(f"Data split across {NUM_CLIENTS} simulated clients. Total features: {input_dim}")

1. Loading and Cleaning Data...


  return datetime.utcnow().replace(tzinfo=utc)


2. Partitioning Data for FL Clients...


  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Data split across 10 simulated clients. Total features: 199


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [2]:
# -------------------------------------------------------------------
# BLOCK 2: FIF CLIENT LOGIC
# -------------------------------------------------------------------

# Helper functions for FIF model transfer (using Python's built-in serialization)
def serialize_model(model):
    """Converts the trained IF model object into a list of NumPy arrays (bytes)."""
    return [np.array(pickle.dumps(model), dtype=np.uint8)]

def deserialize_model(parameters):
    """Converts the bytes back into a trained IF model object."""
    return pickle.loads(parameters[0].tobytes())


# --- FIF Client Definition ---
class FederatedIsolationForestClient(fl.client.NumPyClient):
    def __init__(self, client_id):
        self.client_id = client_id
        self.data = client_data_splits[client_id]
        # Initialize Isolation Forest with optimal memory settings
        self.model = IsolationForest(n_estimators=50, contamination='auto', random_state=42, bootstrap=False)

    def get_parameters(self, config):
        # Return a dummy parameter list since the full model is returned in fit
        return [np.array([0], dtype=np.float32)]

    def fit(self, parameters, config):
        # 1. Train the client's local IF model
        X_train = self.data['X_train_fif']
        self.model.fit(X_train)

        # 2. Return the serialized model as parameters
        serialized_model = serialize_model(self.model)

        # Return serialized model, data size, and empty metrics dict
        return serialized_model, len(X_train), {}

    def evaluate(self, parameters, config):
        return 0.0, 0, {}

def client_fn_if(cid: str) -> fl.client.Client:
    # Use .to_client() for compatibility
    return FederatedIsolationForestClient(int(cid)).to_client()

print("3. Federated Client Logic Defined (FIF).")

3. Federated Client Logic Defined (FIF).


In [3]:
# -------------------------------------------------------------------
# BLOCK 3: FIF SERVER AND SIMULATION
# -------------------------------------------------------------------

# --- FIF Strategy (Custom Aggregation) ---
class CustomIFStrategy(Strategy):
    """Aggregates Isolation Forest models by collecting them for ensemble prediction."""

    def __init__(self):
        super().__init__()
        self.global_models: List[IsolationForest] = []

    # --- Use simplified function signatures to avoid environment error ---
    def initialize_parameters(self, client_manager):
        # Start with a dummy parameter list
        return Parameters(tensors=[b'\x00'], tensor_type='numpy.ndarray')

    def configure_fit(self, server_round: int, parameters, client_manager):
        # Standard configuration
        config = {'server_round': server_round}
        sampled_clients = client_manager.sample(num_clients=5, min_num_clients=5)
        return [(client, config) for client in sampled_clients]

    def aggregate_fit(self, server_round, results, failures):
        # Deserialize and store models from clients for ensemble evaluation
        for serialized_params, num_examples in results:
             # The result object contains the parameters wrapped in a container
             model = deserialize_model(serialized_params.parameters.tensors)
             self.global_models.append(model)

        print(f"Server Round {server_round}: Aggregated {len(results)} models.")

        # Return the latest received model parameters (as the new 'global' model)
        if results:
            return results[-1][0].parameters, {}
        return None, {}

    def evaluate(self, server_round: int, parameters):
        # CORE OF THE FIF ENSEMBLE EVALUATION
        if not self.global_models:
             return None

        # 1. Collect all prediction scores (anomaly distances) from all models
        all_scores = []
        for model in self.global_models:
            # -decision_function gives a continuous anomaly score (positive for anomalies)
            all_scores.append(-model.decision_function(X_test_global_np))

        # 2. Average the anomaly scores from all models (ENSEMBLE AGGREGATION)
        avg_score = np.mean(all_scores, axis=0)

        # 3. Calculate ROC-AUC based on the ensemble score
        roc_auc = roc_auc_score(y_test_global_np, avg_score)

        print(f"Server Round {server_round}: Global ROC-AUC (FIF Ensemble) = {roc_auc:.4f}")
        return (1.0 - roc_auc), {"roc_auc": roc_auc}

    def configure_evaluate(self, server_round: int, parameters, client_manager):
         # Run evaluation after every round on a sample of clients
        return [(client, {}) for client in client_manager.sample(num_clients=5, min_num_clients=5)]

    def aggregate_evaluate(self, server_round: int, results, failures):
        return None, {}

# -------------------------------------------------------------------
# Simulation Start
# -------------------------------------------------------------------

print("\n4. Starting Federated Learning Simulation (FIF)...")
history = fl.simulation.start_simulation(
    client_fn=client_fn_if,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=10), # Run 10 rounds
    strategy=CustomIFStrategy(),
)
print("Federated Learning Simulation Complete.")

	Instead, use the `flwr run` CLI command to start a local simulation in your Flower app, as shown for example below:

		$ flwr new  # Create a new Flower app from a template

		$ flwr run  # Run the Flower app in Simulation Mode

	Using `start_simulation()` is deprecated.

            This is a deprecated feature. It will be removed
            entirely in future versions of Flower.
        
	Instead, use the `flwr run` CLI command to start a local simulation in your Flower app, as shown for example below:

		$ flwr new  # Create a new Flower app from a template

		$ flwr run  # Run the Flower app in Simulation Mode

	Using `start_simulation()` is deprecated.

            This is a deprecated feature. It will be removed
            entirely in future versions of Flower.
        
[92mINFO [0m:      Starting Flower simulation, config: num_rounds=10, no round_timeout



4. Starting Federated Learning Simulation (FIF)...


  return datetime.utcnow().replace(tzinfo=utc)
2025-10-03 06:17:13,012	INFO worker.py:1771 -- Started a local Ray instance.
[92mINFO [0m:      Flower VCE: Ray initialized with resources: {'node:__internal_head__': 1.0, 'CPU': 2.0, 'object_store_memory': 3922995609.0, 'node:172.28.0.12': 1.0, 'memory': 7845991220.0, 'GPU': 1.0, 'accelerator_type:T4': 1.0}
[92mINFO [0m:      Optimize your simulation with Flower VCE: https://flower.ai/docs/framework/how-to-run-simulations.html
[92mINFO [0m:      No `client_resources` specified. Using minimal resources for clients.
[92mINFO [0m:      Flower VCE: Resources for each Virtual Client: {'num_cpus': 1, 'num_gpus': 0.0}
[92mINFO [0m:      Flower VCE: Creating VirtualClientEngineActorPool with 2 actors
[92mINFO [0m:      [INIT]
[92mINFO [0m:      Using initial global parameters provided by strategy
[92mINFO [0m:      Starting evaluation of initial global parameters
[92mINFO [0m:      Evaluation returned no results (`None`)
[92mIN

Server Round 1: Aggregated 0 models.
Server Round 2: Aggregated 0 models.
Server Round 3: Aggregated 0 models.
Server Round 4: Aggregated 0 models.
Server Round 5: Aggregated 0 models.
Server Round 6: Aggregated 0 models.
Server Round 7: Aggregated 0 models.
Server Round 8: Aggregated 0 models.


[92mINFO [0m:      aggregate_evaluate: received 0 results and 5 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 9]
[92mINFO [0m:      configure_fit: strategy sampled 5 clients (out of 10)
[92mINFO [0m:      aggregate_fit: received 0 results and 5 failures
[92mINFO [0m:      configure_evaluate: strategy sampled 5 clients (out of 10)
[92mINFO [0m:      aggregate_evaluate: received 0 results and 5 failures
[92mINFO [0m:      
[92mINFO [0m:      [ROUND 10]
[92mINFO [0m:      configure_fit: strategy sampled 5 clients (out of 10)
[92mINFO [0m:      aggregate_fit: received 0 results and 5 failures
[92mINFO [0m:      configure_evaluate: strategy sampled 5 clients (out of 10)
[92mINFO [0m:      aggregate_evaluate: received 0 results and 5 failures
[92mINFO [0m:      
[92mINFO [0m:      [SUMMARY]
[92mINFO [0m:      Run finished 10 round(s) in 0.29s
[92mINFO [0m:      


Server Round 9: Aggregated 0 models.
Server Round 10: Aggregated 0 models.
Federated Learning Simulation Complete.


  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Assuming your client_data_splits, X_test_global_np, and y_test_global_np are in memory.
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
import numpy as np
from collections import Counter

# 1. Extract one client's training data (Client 0)
X_local_train = client_data_splits[0]['X_train_fif']
contamination_rate = Counter(y_test_global_np)[1] / len(y_test_global_np)

# 2. Define and train an Isolation Forest model on JUST Client 0's data
local_if_model = IsolationForest(n_estimators=50, contamination=contamination_rate, random_state=42, bootstrap=False)
print("Starting Local-Only IF training (Final Data Point)...")
local_if_model.fit(X_local_train)
print("Local-Only IF training complete.")

# 3. Evaluate the Local Model on the Global Test Set
local_if_scores = -local_if_model.decision_function(X_test_global_np)

# 4. Calculate Local-Only ROC-AUC
local_if_roc_auc = roc_auc_score(y_test_global_np, local_if_scores)

print("\n--- FINAL REQUIRED BENCHMARK ---")
print(f"Local-Only Isolation Forest ROC-AUC (L-IF): {local_if_roc_auc:.4f}")

Starting Local-Only IF training (Final Data Point)...


  return datetime.utcnow().replace(tzinfo=utc)


Local-Only IF training complete.


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)



--- FINAL REQUIRED BENCHMARK ---
Local-Only Isolation Forest ROC-AUC (L-IF): 0.9149


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
