In [1]:
from dataset_names.dataset_names import DatasetNames
from attacks.attack_names import AttackNames
from detections.detection_names import DetectionNames
from evaluation.utils import get_output_path

In [2]:
import os
import json
from statistics import fmean

def get_metrics(base_path):
    """Average the per-run metrics stored below ``base_path``.

    The directory tree is walked with ``os.walk``. Every directory that lies at
    least two levels below ``base_path`` (expected layout: ``<date>/<time>``)
    is inspected for JSON files, which are recognised by a substring of their
    file name (checked in this order):

    * ``"Precision_Recall"`` -> keys ``TP``, ``FP``, ``FN``, ``Precision``, ``Recall``
    * ``"clients"``          -> key ``malicious_clients``
    * ``"round_metrics"``    -> list of per-round dicts with ``accuracy`` and
      ``detected_FR``
    * ``"time"``             -> key ``time_per_iteration``

    A directory only contributes an effectiveness / final-accuracy sample when
    it contains its own round-metrics file. State from a previously visited run
    directory is never reused.

    Args:
        base_path: Root directory of one experiment configuration.

    Returns:
        dict with the keys ``TP_avg``, ``FP_avg``, ``FN_avg``, ``Precision_avg``,
        ``Recall_avg``, ``Effectiveness_avg``, ``Time_per_iteration_avg`` and
        ``Final_acc_avg``; each value is the ``fmean`` over all runs found.

    Raises:
        RuntimeError: If ``base_path`` does not exist, or if a run directory
            has round metrics but no clients file.
        statistics.StatisticsError: If no sample was collected for one of the
            averaged metrics (``fmean`` of an empty list).
    """
    if not os.path.exists(base_path):
        raise RuntimeError(f"Path {base_path} does not exist! The experiment has not been conducted yet!")
    TP_list = []
    FP_list = []
    FN_list = []
    Precision_list = []
    Recall_list = []
    effectivenesses_list = []
    time_per_iteration_list = []
    final_acc_list = []
    for root, _dirs, files in os.walk(base_path):
        relative_path = os.path.relpath(root, base_path)
        path_parts = relative_path.split(os.sep)

        # We expect 2 parts: date, time
        if len(path_parts) < 2:
            continue

        # Reset per directory so that a directory without these files can
        # never pick up the values of the run that was visited before it.
        round_metrics = None
        malicious_clients = None
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if "Precision_Recall" in file_name:
                precision_recall = _load_json(file_path)
                TP_list.append(precision_recall["TP"])
                FP_list.append(precision_recall["FP"])
                FN_list.append(precision_recall["FN"])
                Precision_list.append(precision_recall["Precision"])
                Recall_list.append(precision_recall["Recall"])
            elif "clients" in file_name:
                malicious_clients = _load_json(file_path)["malicious_clients"]
            elif "round_metrics" in file_name:
                round_metrics = _load_json(file_path)
            elif "time" in file_name:
                timing = _load_json(file_path)
                time_per_iteration_list.append(timing["time_per_iteration"])

        # Not a (complete) run directory: nothing to evaluate here.
        if not round_metrics:
            continue
        if malicious_clients is None:
            raise RuntimeError(f"Run directory {root} contains round metrics but no clients file!")

        final_acc_list.append(round_metrics[-1]["accuracy"])
        effectivenesses_list.append(_run_effectiveness(round_metrics, malicious_clients))

    return {
        "TP_avg": fmean(TP_list),
        "FP_avg": fmean(FP_list),
        "FN_avg": fmean(FN_list),
        "Precision_avg": fmean(Precision_list),
        "Recall_avg": fmean(Recall_list),
        "Effectiveness_avg": fmean(effectivenesses_list),
        "Time_per_iteration_avg": fmean(time_per_iteration_list),
        "Final_acc_avg": fmean(final_acc_list)
    }


def _load_json(file_path):
    """Return the parsed content of the JSON file at ``file_path``."""
    with open(file_path, 'r') as f:
        return json.load(f)


def _run_effectiveness(round_metrics, malicious_clients):
    """Return the mean detection effectiveness of a single run.

    For every malicious client that is detected for the first time in round
    ``i`` the effectiveness is

        (1 - acc_{i-1} / acc_n) / (1 - acc_0 / acc_n)

    where ``acc_0`` is the accuracy of the first round and ``acc_n`` the
    accuracy of the last round. Malicious clients that are never detected
    contribute 0.0. The value can be negative when ``acc_{i-1} > acc_n``.

    Args:
        round_metrics: Non-empty list of per-round dicts with the keys
            ``accuracy`` and ``detected_FR``.
        malicious_clients: Ids of the malicious clients of this run.

    Returns:
        Mean effectiveness over all malicious clients, or 0.0 if there are none.
    """
    acc_0 = round_metrics[0]["accuracy"]    # We use the accuracy of the first (0) round.
    acc_n = round_metrics[-1]["accuracy"]
    # The formula is undefined for acc_n == 0 or acc_0 == acc_n; such runs are
    # scored with 0.0 instead of raising ZeroDivisionError.
    denominator = (1 - (acc_0 / acc_n)) if acc_n else 0

    clients_done = []
    effectivenesses = []
    for idx, round_metric in enumerate(round_metrics):
        # Accuracy of the previous global model. The first round has no
        # predecessor, so it is clamped to round 0 rather than letting the
        # index -1 wrap around to the final round.
        acc_i_1 = round_metrics[max(idx - 1, 0)]["accuracy"]
        for det_FR in round_metric["detected_FR"] or []:
            # Only count malicious clients, and only their first detection.
            if det_FR in malicious_clients and det_FR not in clients_done:
                if denominator:
                    effectivenesses.append((1 - (acc_i_1 / acc_n)) / denominator)
                else:
                    effectivenesses.append(0.0)
                clients_done.append(det_FR)

    # Malicious clients that were never detected have an effectiveness of 0.
    effectivenesses.extend(0.0 for cid in malicious_clients if cid not in clients_done)
    return fmean(effectivenesses) if effectivenesses else 0.0

#### Note
It is possible for the effectiveness to be negative. This happens when acc_i_1 > acc_n, i.e. when the accuracy of the last round is slightly worse than the accuracy of the round preceding the detection (e.g. the penultimate round). A negative effectiveness should therefore be interpreted as an effectiveness of 0.

# MNIST experiment evaluation

In [3]:
from typing import List
def get_experiment_results(dataset: DatasetNames, n_clients: List, perc_malicious: List, attacks: List, detections: List):
    """Gather and print the averaged metrics of every experiment configuration.

    One configuration is built for each combination of detection, number of
    clients, percentage of malicious clients and attack. Its output directory
    name comes from ``get_output_path`` and its metrics from ``get_metrics``,
    which is called on ``"./outputs/" + <directory name>``.

    Args:
        dataset: Dataset the experiments were run on.
        n_clients: Client counts to evaluate.
        perc_malicious: Percentages of malicious clients to evaluate.
        attacks: Attacks to evaluate.
        detections: Detection methods to evaluate.

    Returns:
        dict mapping each output directory name to its metrics dict.
    """
    # Same iteration order as four nested loops: detection is the outermost
    # level, attack the innermost one.
    configurations = (
        (n_c, perc_m, attack, detection)
        for detection in detections
        for n_c in n_clients
        for perc_m in perc_malicious
        for attack in attacks
    )

    results = {}
    for n_c, perc_m, attack, detection in configurations:
        path_name = get_output_path(dataset, n_c, perc_m, attack, detection)
        results[path_name] = get_metrics("./outputs/" + path_name)

    # Print only after every configuration has been loaded successfully.
    for name, metrics in results.items():
        print(f"{name}:\n", metrics, "\n")
    return results

## Baseline Runs

### MNIST

In [4]:
# Baseline configuration on MNIST: 0% malicious clients, `no_attack`, `no_detection`.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.5076402187347413, 'Final_acc_avg': 0.9871} 

mnist--n=100--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.21855194568634, 'Final_acc_avg': 0.9342666666666667} 



### CIFAR10

In [5]:
# Baseline configuration on CIFAR10: 0% malicious clients, `no_attack`, `no_detection`.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 17.44803235133489, 'Final_acc_avg': 0.8077666666666667} 

cifar10--n=100--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 22.264522767066953, 'Final_acc_avg': 0.6450666666666667} 



## Benchmark Runs

### MNIST

#### Delta_DAGMM detection

In [6]:
# MNIST benchmark: all four attacks evaluated against the `delta_dagmm_detection` method.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.delta_dagmm_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.07900256557398959, 'Time_per_iteration_avg': 3.3588565786679587, 'Final_acc_avg': 0.9866999999999999} 

mnist--n=10--10%--advanced_delta_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.007285833336767071, 'Time_per_iteration_avg': 3.384461836020152, 'Final_acc_avg': 0.9860666666666665} 

mnist--n=10--10%--advanced_free_rider--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.009609518625246694, 'Time_per_iteration_avg': 3.433012835184733, 'Final_acc_avg': 0.9862000000000001} 

mnist--n=10--10%--adaptive--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.007027355970988906,

#### FGFL detection

In [7]:
# MNIST benchmark: all four attacks evaluated against the `fgfl_detection` method.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.fgfl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 4.333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.18888888888888888, 'Recall_avg': 1.0, 'Effectiveness_avg': 1.0, 'Time_per_iteration_avg': 3.054606827100118, 'Final_acc_avg': 0.9848666666666667} 

mnist--n=10--10%--advanced_delta_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 4.0, 'FN_avg': 0.0, 'Precision_avg': 0.20555555555555557, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.42154347338261533, 'Time_per_iteration_avg': 2.9509992281595867, 'Final_acc_avg': 0.9828333333333333} 

mnist--n=10--10%--advanced_free_rider--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 3.3333333333333335, 'FN_avg': 0.0, 'Precision_avg': 0.2888888888888889, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.7119522361158048, 'Time_per_iteration_avg': 3.206501507759094, 'Final_acc_avg': 0.9843000000000001} 

mnist--n=10--10%--adaptive--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 3.6666666666666665, 'FN_avg': 0.0, 'Precision_avg': 0.2333333333333333, 'Recall_avg': 1.0, 'Effectivenes

#### RFFL detection

In [8]:
# MNIST benchmark: all four attacks evaluated against the `rffl_detection` method.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.rffl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--rffl:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.059072143494653645, 'Time_per_iteration_avg': 3.4714203159014385, 'Final_acc_avg': 0.9854666666666668} 

mnist--n=10--10%--advanced_delta_weights--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.496500527858734, 'Final_acc_avg': 0.9856333333333334} 

mnist--n=10--10%--advanced_free_rider--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.435094408194224, 'Final_acc_avg': 0.9865} 

mnist--n=10--10%--adaptive--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.4393803675969443, 'Final_acc_avg': 0.9874} 

mnist--n=10--30%--random_weights--rffl:
 {'TP_

#### Viceroy detection

In [9]:
# MNIST benchmark: all four attacks evaluated against the `viceroy_detection` method.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.viceroy_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--viceroy:
 {'TP_avg': 1.0, 'FP_avg': 7.0, 'FN_avg': 0.0, 'Precision_avg': 0.125, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.20465078965241812, 'Time_per_iteration_avg': 2.4725465416908263, 'Final_acc_avg': 0.9511333333333333} 

mnist--n=10--10%--advanced_delta_weights--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.3761274973551436, 'Final_acc_avg': 0.9865333333333334} 

mnist--n=10--10%--advanced_free_rider--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.4464804569880165, 'Final_acc_avg': 0.9852333333333334} 

mnist--n=10--10%--adaptive--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.4347713351249696, 'Final_acc_avg':