# Flower FLML Experiments - Measured
## Description
This notebook runs federated-learning machine-learning experiments for the ISDC project. It uses the Flower framework to implement the federated learning workflow.

## Imports

In [1]:
from pyarrow import csv
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import glob

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import flwr as fl
from flwr.common import Metrics
from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth

In [4]:
import math
from typing import Dict, List, Tuple

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [6]:
from tensorflow.keras.models import load_model

In [23]:
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)

## Functions and Classes

In [8]:
# ANN model creation function
def create_ann_model(input_shape=(94,), hidden_units=None):
    """
    Build and compile a dense feedforward network for binary classification.

    Each hidden layer is a ReLU ``Dense`` layer followed by ``Dropout(0.2)``;
    the output layer is a single sigmoid unit.

    Parameters:
    - input_shape: Tuple giving the shape of one input sample.
    - hidden_units: Sequence of integers, one width per hidden layer.
                    Falls back to [64, 32] when None.

    Returns:
    - A Keras Sequential model compiled with binary cross-entropy loss,
      the 'nadam' optimizer and the accuracy metric.
    """
    layer_widths = [64, 32] if hidden_units is None else hidden_units
    first_width, later_widths = layer_widths[0], layer_widths[1:]

    # Only the first Dense layer declares the input shape.
    stack = [
        Dense(first_width, activation='relu', input_shape=input_shape),
        Dropout(0.2),
    ]

    # Remaining hidden layers, each paired with its own dropout.
    for width in later_widths:
        stack.append(Dense(width, activation='relu'))
        stack.append(Dropout(0.2))

    # Single sigmoid unit for the binary decision.
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return network

In [9]:
def model_report_list(model, df_test):
    """
    Score a trained binary classifier on a labelled test frame.

    All columns except the last one are fed to the model as features, and the
    "Class" column is used as ground truth (this presumably assumes "Class"
    is the last column - confirm against the caller). Probabilities above 0.5
    are labelled 1. The hard predictions are also written to a "PLabel"
    column of ``df_test``, so the frame passed in is modified.

    Parameters:
    - model: Object exposing ``predict(x, verbose=0)``.
    - df_test: DataFrame holding the features and a "Class" column.

    Returns:
    - List of (metric name, value) tuples: 'F1 Score' and 'AUC'.
    """
    features = df_test.iloc[:, :-1]
    truth = df_test["Class"]

    # Raw scores first, then the thresholded labels.
    probabilities = model.predict(features, verbose=0)
    hard_labels = (probabilities > 0.5).astype("int32")

    df_test["PLabel"] = hard_labels

    # Overall accuracy, AUC-PR, per-class accuracy and FP/FN counts used to be
    # reported here as well; they are currently disabled.
    return [
        ('F1 Score', f1_score(truth, hard_labels)),
        ('AUC', roc_auc_score(truth, probabilities)),
    ]

In [10]:
class FlowerClient(fl.client.NumPyClient):
    """Flower NumPy client that trains a supplied Keras model on one data partition."""

    def __init__(self, x_train, y_train,model) -> None:
        # Keep the local training data and the model to be trained.
        self.x_train = x_train
        self.y_train = y_train
        self.model = model

    def get_parameters(self, config):
        """Return the model's current weights."""
        return self.model.get_weights()

    def fit(self, parameters, config):
        """
        Train locally, starting from the received weights.

        Loads ``parameters`` into the model, fits for 3 epochs with batch
        size 32, and returns the updated weights, the number of local
        training samples and an empty metrics dictionary.
        """
        local_model = self.model
        local_model.set_weights(parameters)
        local_model.fit(
            self.x_train,
            self.y_train,
            epochs=3,
            batch_size=32,
            verbose=VERBOSE,
        )
        updated_weights = local_model.get_weights()
        sample_count = len(self.x_train)
        return updated_weights, sample_count, {}

In [11]:
def get_client_fn(dataset_partitions,model):
    """Build the client factory handed to the Flower simulation.

    The returned ``client_fn`` maps a client id string to a ``FlowerClient``
    that trains ``model`` on ``dataset_partitions[int(cid)]``.
    """

    def client_fn(cid: str) -> fl.client.Client:
        """Create the FlowerClient owning the partition with index ``int(cid)``."""
        features, labels = dataset_partitions[int(cid)]
        return FlowerClient(features, labels, model)

    return client_fn


def partition_dataset(x, y, num_clients):
    """Split ``x`` and ``y`` into ``num_clients`` consecutive, equally sized slices.

    Each slice holds ``floor(len(x) / num_clients)`` items; any trailing
    items that do not fill a whole slice are left out.

    Returns a list of ``(x_slice, y_slice)`` tuples, one per client.
    """
    chunk = math.floor(len(x) / num_clients)
    bounds = [(k * chunk, (k + 1) * chunk) for k in range(num_clients)]
    return [(x[start:stop], y[start:stop]) for start, stop in bounds]

def get_evaluate_fn(model,file_path):
    """Build a server-side hook that checkpoints the global model.

    The returned function is meant to be used as a strategy's
    ``evaluate_fn``. It computes no metrics: it only loads the received
    parameters into ``model`` and saves the model to ``file_path``.
    """

    def evaluate(
        server_round: int,
        parameters: fl.common.NDArrays,
        config: Dict[str, fl.common.Scalar],
    ):
        # Load the latest parameters, then overwrite the checkpoint on disk.
        model.set_weights(parameters)
        model.save(file_path)
        # No loss or metrics are reported back.
        return None

    return evaluate


## Data Preparation - Measured

In [15]:
def get_partitions(data_path, random_state=None):
    """
    Load one shuffled (features, labels) partition per CSV file.

    For every file matching ``data_path`` the function:
    1. reads it with pandas and strips whitespace from the column names,
    2. drops rows whose "Class" value is the string ' null',
    3. maps the textual class spellings ('-1' / 'Facet', with or without a
       leading space or surrounding double quotes) to 0 / 1,
    4. shuffles the rows,
    5. splits off all columns except the last as features and "Class" as
       labels (this presumably assumes "Class" is the last column - confirm).

    Parameters:
    - data_path: Glob pattern selecting the CSV files.
    - random_state: Optional seed forwarded to ``DataFrame.sample`` so the
                    shuffle can be reproduced. The default (None) keeps the
                    shuffle unseeded.

    Returns:
    - List of ``(x, y)`` NumPy array tuples, one per file, ordered by
      sorted file path.
    """
    # Every spelling of the two classes handled by this loader.
    label_map = {
        ' "-1"': 0, ' "Facet"': 1,
        '-1': 0, 'Facet': 1,
        ' -1': 0, ' Facet': 1,
    }

    partitions = []

    # glob returns paths in arbitrary order; sorting keeps the mapping from
    # partition index (and therefore client id) to file stable across runs.
    for f in sorted(glob.glob(data_path)):
        data = pd.read_csv(f)

        data.columns = data.columns.str.strip()
        # .copy() so the assignments below write to an independent frame
        # rather than to a filtered view of the original.
        data = data[data["Class"] != ' null'].copy()
        data["Class"] = data["Class"].replace(label_map)
        # NOTE(review): missing labels become the string 'benign', which mixes
        # text into an otherwise 0/1 column - confirm whether 0 was intended.
        data["Class"] = data["Class"].replace(np.nan, 'benign')

        data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

        x_data = data.iloc[:, :-1]
        y_data = data["Class"]

        partitions.append((x_data.to_numpy(), y_data.to_numpy()))

    return partitions

## Testing Data Preparation

In [16]:
# Load the test set from Parquet into a pandas DataFrame.
df = pq.read_table("/data/important_data_test.parquet").to_pandas()

In [17]:
# Split the test set on its binary label column:
# rows with BLabel == 0 go to df_b, rows with BLabel == 1 go to df_a.
df_b = df[df['BLabel'] == 0].reset_index(drop=True)
df_a = df[df['BLabel'] == 1].reset_index(drop=True)

In [18]:
# Downsample the BLabel == 0 rows to the number of BLabel == 1 rows, with a
# fixed seed so the same rows are drawn on every run.
# NOTE(review): presumably assumes df_b has at least len(df_a) rows - confirm.
df_b = df_b.sample(n=len(df_a),random_state=5467).reset_index(drop=True)

In [None]:
# Recombine both classes into one frame: BLabel == 1 rows first, followed by
# the sampled BLabel == 0 rows, with a fresh 0..n-1 index.
df = pd.concat([df_a,df_b],ignore_index=True)

In [None]:
# Expand each row's 'Packet Size Distribution' sequence into one feature
# column per element, then append the binary label as the last column, "Class".
test_data = pd.DataFrame(df['Packet Size Distribution'].to_list())
test_data["Class"] = df["BLabel"]

## Global Setting

In [20]:
# Value forwarded as `verbose` to the Keras `fit` call inside FlowerClient.
VERBOSE = 0
# Number of simulated clients. Client ids are used as indices into the list of
# dataset partitions.
# NOTE(review): presumably this must not exceed the number of files loaded by
# get_partitions - confirm.
NUM_CLIENTS = 18

# With a dictionary, you tell Flower's VirtualClientEngine that each
# client needs exclusive access to these many resources in order to run
# NOTE(review): the Ray log captured in this notebook reports 'GPU': 2.0, so
# requesting 2 GPUs per client presumably lets only one client run at a time -
# confirm this is intended.
client_resources = {"num_cpus":2,"num_gpus": 2}

## Model Training

In [21]:
# Enable GPU growth in the main process. The same function is also handed to
# the simulation's actors through `on_actor_init_fn` in the training loop.
enable_tf_gpu_growth()

2024-02-01 09:25:06.001458: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-01 09:25:06.001695: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-01 09:25:06.034955: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [None]:
# Per-run metric lists, one entry per training repetition.
result = []

partitions = get_partitions("./dataset/isdc/*")

# The network layout is the same for every run, so it is defined once.
hidden_units = [64, 32]

# Repeat the full single-round federated training 50 times. Each run starts
# from a freshly initialised model and saves its result under its own file name.
# A named loop variable is used instead of `_`, because the index is actually
# used and `_` is reserved by IPython for the last cell output.
for run_idx in range(50):

    output_model = f"./models/i{run_idx}.keras"

    model = create_ann_model(hidden_units=hidden_units)

    strategy = fl.server.strategy.FedAvg(
        fraction_fit=1,
        # min_fit_clients=10,
        fraction_evaluate=1,
        min_available_clients=NUM_CLIENTS,
        # The server-side hook saves the aggregated model to `output_model`.
        evaluate_fn=get_evaluate_fn(model, output_model),
    )

    # Start simulation
    history = fl.simulation.start_simulation(
        client_fn=get_client_fn(partitions, model),
        num_clients=NUM_CLIENTS,
        config=fl.server.ServerConfig(num_rounds=1),
        strategy=strategy,
        client_resources=client_resources,
        actor_kwargs={
            "on_actor_init_fn": enable_tf_gpu_growth
        },
    )

    # Drop the references held by this run before loading the saved model.
    del history
    del strategy
    del model

    # Score the model that the evaluate hook wrote to disk.
    model = load_model(output_model)
    s = model_report_list(model, test_data.copy())

    result.append(s)
    del model
    

INFO flwr 2024-02-01 09:27:22,004 | app.py:178 | Starting Flower simulation, config: ServerConfig(num_rounds=1, round_timeout=None)
INFO:flwr:Starting Flower simulation, config: ServerConfig(num_rounds=1, round_timeout=None)
2024-02-01 09:27:25,455	INFO worker.py:1621 -- Started a local Ray instance.
INFO flwr 2024-02-01 09:27:26,303 | app.py:213 | Flower VCE: Ray initialized with resources: {'node:__internal_head__': 1.0, 'node:35.16.80.216': 1.0, 'CPU': 64.0, 'GPU': 2.0, 'accelerator_type:G': 1.0, 'object_store_memory': 157559513088.0, 'memory': 357638863872.0}
INFO:flwr:Flower VCE: Ray initialized with resources: {'node:__internal_head__': 1.0, 'node:35.16.80.216': 1.0, 'CPU': 64.0, 'GPU': 2.0, 'accelerator_type:G': 1.0, 'object_store_memory': 157559513088.0, 'memory': 357638863872.0}
INFO flwr 2024-02-01 09:27:26,303 | app.py:219 | Optimize your simulation with Flower VCE: https://flower.dev/docs/framework/how-to-run-simulations.html
INFO:flwr:Optimize your simulation with Flower V

In [None]:
# One (initially empty) list per metric name, taken from the first run's report.
data = {metric: [] for metric, _value in result[0]}

# Append every run's value to the list of its metric.
for res in result:
    for metric, value in res:
        data[metric].append(value)

# Convert the dictionary into a DataFrame: one row per run, one column per metric.
result_df = pd.DataFrame(data)

In [None]:
# Column-wise mean of each metric over all runs.
result_df.mean(axis=0)

## Model Testing

In [45]:
def model_report_list(model, df_test, threshold=0.5):
    """
    Score a trained binary classifier on a labelled test frame.

    All columns except the last one are fed to the model as features, and the
    "Class" column is used as ground truth (this presumably assumes "Class"
    is the last column - confirm against the caller). The hard predictions
    are also written to a "PLabel" column of ``df_test``, so the frame passed
    in is modified.

    Parameters:
    - model: Object exposing ``predict(x, verbose=0)`` that returns one
             probability per row.
    - df_test: DataFrame holding the features and a 0/1 "Class" column.
    - threshold: Probabilities strictly above this value are labelled 1.
                 Default is 0.5.

    Returns:
    - List of (metric name, value) tuples, in this order: 'F1 Score', 'AUC',
      'Average Precision', 'Precision', 'Recall', 'False Positive Rate',
      'False Negative Rate'.
    """
    result = []

    x_test = df_test.iloc[:, :-1]
    y_test = df_test["Class"]

    # Flatten the model output so that every consumer below gets one score per row.
    y_pred_prob = np.asarray(model.predict(x_test, verbose=0)).ravel()
    y_pred = (y_pred_prob > threshold).astype("int32")

    df_test["PLabel"] = y_pred

    # Threshold-dependent and ranking-based metrics.
    result.append(('F1 Score', f1_score(y_test, y_pred)))
    result.append(('AUC', roc_auc_score(y_test, y_pred_prob)))
    result.append(('Average Precision', average_precision_score(y_test, y_pred_prob)))
    result.append(('Precision', precision_score(y_test, y_pred)))
    result.append(('Recall', recall_score(y_test, y_pred)))

    # Passing labels=[0, 1] forces a 2x2 matrix even when a class is absent from
    # both the truth and the predictions; otherwise indexing it would fail.
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0  # Avoid division by zero
    FNR = FN / (FN + TP) if (FN + TP) > 0 else 0  # Avoid division by zero

    result.append(('False Positive Rate', FPR))
    result.append(('False Negative Rate', FNR))

    return result




In [None]:
# Per-run metric lists for the saved models.
result = []

# Re-score each of the 50 saved models on a fresh copy of the test data.
# The training loop in this notebook saves its models as "./models/i{n}.keras"
# (no underscore), so the same pattern is used here.
# NOTE(review): if a separate set of "i_{n}.keras" files is meant to be
# evaluated instead, restore that prefix.
for run_idx in range(50):

    output_model = f"./models/i{run_idx}.keras"

    model = load_model(output_model)
    s = model_report_list(model, test_data.copy())

    result.append(s)
    del model

In [None]:
# One (initially empty) list per metric name, taken from the first run's report.
data = {metric: [] for metric, _value in result[0]}

# Append every run's value to the list of its metric.
for res in result:
    for metric, value in res:
        data[metric].append(value)

# Convert the dictionary into a DataFrame: one row per run, one column per metric.
result_df = pd.DataFrame(data)

# NOTE(review): `model_name` is not assigned anywhere in the visible notebook,
# so printing it directly raises NameError. Use it when defined; otherwise
# print a placeholder.
print(globals().get("model_name", "<model_name not set>"))
print("-"*100)
print(result_df.mean(axis=0))
print("-"*100)