In [1]:
from dataset_names.dataset_names import DatasetNames
from attacks.attack_names import AttackNames
from detections.detection_names import DetectionNames
from evaluation.utils import get_output_path

In [2]:
import os
import json
from statistics import fmean

def _run_effectiveness(round_metrics, malicious_clients):
    """Return the mean detection effectiveness of a single run.

    For every malicious client (free rider, FR) the first round in which it
    shows up in ``detected_FR`` is used. Its effectiveness is
    ``1 - (acc_prev - acc_0) / (acc_n - acc_0)``, i.e. one minus the min-max
    scaled accuracy of the global model *before* that round. Malicious clients
    that are never detected contribute ``0.0``.

    Args:
        round_metrics: Non-empty list of per-round dicts with the keys
            ``"accuracy"`` and ``"detected_FR"``.
        malicious_clients: Collection of the malicious client ids of the run.

    Returns:
        The mean effectiveness over all malicious clients, or ``0.0`` if there
        are none. The value is not clamped and may be negative when the
        accuracy before the detection round exceeds the final accuracy.
    """
    acc_0 = round_metrics[0]["accuracy"]    # We use the accuracy of the first (0) round.
    acc_n = round_metrics[-1]["accuracy"]
    acc_range = acc_n - acc_0

    clients_done = []
    effectivenesses = []
    for idx, round_metric in enumerate(round_metrics):
        # Accuracy of the previous global model. Round 0 has no predecessor, so it is
        # compared against itself; plain ``idx - 1`` would wrap around to the LAST round.
        acc_prev = round_metrics[max(idx - 1, 0)]["accuracy"]
        for det_FR in round_metric["detected_FR"] or ():
            # Only count malicious clients, and only the first time they are detected.
            if det_FR not in malicious_clients or det_FR in clients_done:
                continue
            clients_done.append(det_FR)
            if acc_range == 0:
                # Min-max scaling is undefined when the accuracy never changed.
                # NOTE(review): 0.0 is a conservative convention chosen here to avoid a
                # ZeroDivisionError -- confirm it matches the intended metric definition.
                effectivenesses.append(0.0)
            else:
                effectivenesses.append(1 - ((acc_prev - acc_0) / acc_range))

    # Every malicious client that was never detected counts with an effectiveness of 0.
    effectivenesses.extend(0.0 for cid in malicious_clients if cid not in clients_done)
    return fmean(effectivenesses) if effectivenesses else 0.0


def get_metrics(base_path):
    """Aggregate the metrics of all runs stored below ``base_path``.

    The directory tree is walked and every directory that lies at least two
    levels below ``base_path`` (expected layout: ``<date>/<time>``) is treated
    as one run. Files are recognised by a substring of their name:

    * ``Precision_Recall``: JSON object with ``TP``, ``FP``, ``FN``,
      ``Precision`` and ``Recall``.
    * ``clients``: JSON object with ``malicious_clients``.
    * ``round_metrics``: JSON list of per-round objects with ``accuracy`` and
      ``detected_FR``.
    * ``time``: JSON object with ``time_per_iteration``.

    The state of one run is never carried over into another run: a directory
    that does not provide its own ``round_metrics`` and ``clients`` files adds
    nothing to the effectiveness and final-accuracy averages.

    Args:
        base_path: Root output directory of one experiment configuration.

    Returns:
        A dict with the averaged metrics (``TP_avg``, ``FP_avg``, ``FN_avg``,
        ``Precision_avg``, ``Recall_avg``, ``Effectiveness_avg``,
        ``Time_per_iteration_avg``, ``Final_acc_avg``), or the set
        ``{"FAILED!"}`` if a run directory is reached before any
        ``Precision_Recall`` data has been found, or if none is found at all.

    Raises:
        RuntimeError: If ``base_path`` does not exist.
    """
    if not os.path.exists(base_path):
        raise RuntimeError(f"Path {base_path} does not exist! The experiment has not been conducted yet!")

    TP_list = []
    FP_list = []
    FN_list = []
    Precision_list = []
    Recall_list = []
    effectivenesses_list = []
    time_per_iteration_list = []
    final_acc_list = []

    for root, _dirs, files in os.walk(base_path):
        relative_path = os.path.relpath(root, base_path)
        # We expect 2 parts: date, time
        if len(relative_path.split(os.sep)) < 2:
            continue

        # Per-run state: reset for every directory so nothing leaks from a previous run.
        round_metrics = None
        malicious_clients = None

        for file_name in files:
            file_path = os.path.join(root, file_name)
            if "Precision_Recall" in file_name:
                with open(file_path, 'r') as f:
                    precision_recall = json.load(f)
                TP_list.append(precision_recall["TP"])
                FP_list.append(precision_recall["FP"])
                FN_list.append(precision_recall["FN"])
                Precision_list.append(precision_recall["Precision"])
                Recall_list.append(precision_recall["Recall"])
            elif "clients" in file_name:
                with open(file_path, 'r') as f:
                    malicious_clients = json.load(f)["malicious_clients"]
            elif "round_metrics" in file_name:
                with open(file_path, 'r') as f:
                    round_metrics = json.load(f)
            elif "time" in file_name:
                with open(file_path, 'r') as f:
                    timing = json.load(f)
                time_per_iteration_list.append(timing["time_per_iteration"])

        if not TP_list:
            # No data found, but directories were created -> the experiment failed because all clients were removed by the detection!
            return {"FAILED!"}

        if not round_metrics or malicious_clients is None:
            # This directory holds no complete run of its own; do not re-count another run.
            continue

        final_acc_list.append(round_metrics[-1]["accuracy"])
        effectivenesses_list.append(_run_effectiveness(round_metrics, malicious_clients))

    # Check if the experiment failed
    if not TP_list:
        # No data found, but directories were created -> the experiment failed because all clients were removed by the detection!
        return {"FAILED!"}

    return {
        "TP_avg": fmean(TP_list),
        "FP_avg": fmean(FP_list),
        "FN_avg": fmean(FN_list),
        "Precision_avg": fmean(Precision_list),
        "Recall_avg": fmean(Recall_list),
        "Effectiveness_avg": fmean(effectivenesses_list),
        "Time_per_iteration_avg": fmean(time_per_iteration_list),
        "Final_acc_avg": fmean(final_acc_list)
    }

In [3]:
from typing import List
def get_experiment_results(dataset: DatasetNames, n_clients: List, perc_malicious: List, attacks: List, detections: List):
    """Collect and print the metrics of every experiment in a parameter sweep.

    One experiment is evaluated for each combination of detection, number of
    clients, percentage of malicious clients and attack (in that nesting
    order). Its output directory name is obtained from ``get_output_path`` and
    looked up below ``./outputs/``.

    Returns:
        A dict mapping each output directory name to the value returned by
        ``get_metrics`` for that directory.
    """
    # Lazy generator: each name is produced right before its metrics are read.
    experiment_names = (
        get_output_path(dataset, n_c, perc_m, attack, detection)
        for detection in detections
        for n_c in n_clients
        for perc_m in perc_malicious
        for attack in attacks
    )

    results = {}
    for path_name in experiment_names:
        results[path_name] = get_metrics("./outputs/" + path_name)

    for name, metrics in results.items():
        print(f"{name}:\n", metrics, "\n")
    return results

#### Note
The effectiveness can be negative. This happens when acc_i_1 > acc_n, i.e. when the accuracy of the global model right before the detection round is higher than the accuracy at the last round (e.g. the last round is slightly worse than the penultimate round). A negative effectiveness should therefore be interpreted as an effectiveness of 0.

# Experiment evaluation

## Baseline Runs (no attack, no detection)

### MNIST

In [4]:
# Baseline sweep on MNIST: no attack and no detection, for 10 and 100 clients.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.5076402187347413, 'Final_acc_avg': 0.9871} 

mnist--n=100--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.21855194568634, 'Final_acc_avg': 0.9342666666666667} 



### CIFAR10

In [17]:
# Baseline sweep on CIFAR10: no attack and no detection, for 10 and 100 clients.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 17.44803235133489, 'Final_acc_avg': 0.8077666666666667} 

cifar10--n=100--0%--no_attack--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 21.992423649628957, 'Final_acc_avg': 0.7122} 



## Comparison Runs (Attacks, no detections)

### MNIST

In [6]:
# Comparison sweep on MNIST: all four attacks with 30 malicious percent, no detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [30]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--30%--random_weights--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.2915021975835166, 'Final_acc_avg': 0.9201666666666667} 

mnist--n=10--30%--advanced_delta_weights--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.331667884190877, 'Final_acc_avg': 0.9847} 

mnist--n=10--30%--advanced_free_rider--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.3529720544815063, 'Final_acc_avg': 0.9841000000000001} 

mnist--n=10--30%--adaptive--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.33696368932724, 'Final_acc_avg': 0.9843000000000001} 

mnist--n=100--30%--r

### CIFAR10

In [16]:
# Comparison sweep on CIFAR10: all four attacks with 30 malicious percent, no detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [30]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.no_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--30%--random_weights--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 13.69570397933324, 'Final_acc_avg': 0.16206666666666666} 

cifar10--n=10--30%--advanced_delta_weights--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 14.207055735588073, 'Final_acc_avg': 0.7827333333333334} 

cifar10--n=10--30%--advanced_free_rider--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 14.472739116350809, 'Final_acc_avg': 0.6746333333333334} 

cifar10--n=10--30%--adaptive--no_detection:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 3.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 14.845909496148428, 'Final_acc_avg': 0.7821666666666666}

## Control Runs (Detections, no attacks)

### MNIST

In [7]:
# Control sweep on MNIST with 10 clients: every detection, but no attack.
dataset = DatasetNames.mnist
n_clients = [10]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [
    DetectionNames.delta_dagmm_detection,
    DetectionNames.rffl_detection,
    DetectionNames.fgfl_detection,
    DetectionNames.viceroy_detection,
    DetectionNames.wef_detection,
]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--0%--no_attack--delta_dagmm:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.548964019616445, 'Final_acc_avg': 0.9870666666666666} 

mnist--n=10--0%--no_attack--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.5710649013519284, 'Final_acc_avg': 0.9859666666666667} 

mnist--n=10--0%--no_attack--fgfl:
 {'TP_avg': 0.0, 'FP_avg': 5.333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.0891584475835163, 'Final_acc_avg': 0.9835666666666666} 

mnist--n=10--0%--no_attack--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 1.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.3865855216979983, 'Final_acc_avg': 0.9852666666666666} 

mnist--n=10--0%--no_attack--we

In [8]:
# Control sweep on MNIST with 100 clients: every detection, but no attack.
dataset = DatasetNames.mnist
n_clients = [100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [
    DetectionNames.delta_dagmm_detection,
    DetectionNames.rffl_detection,
    DetectionNames.fgfl_detection,
    DetectionNames.viceroy_detection,
    DetectionNames.wef_detection,
]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=100--0%--no_attack--delta_dagmm:
 {'TP_avg': 0.0, 'FP_avg': 2.6666666666666665, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.217063069343567, 'Final_acc_avg': 0.9319333333333333} 

mnist--n=100--0%--no_attack--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.2104353388150533, 'Final_acc_avg': 0.9303666666666667} 

mnist--n=100--0%--no_attack--fgfl:
 {'TP_avg': 0.0, 'FP_avg': 24.333333333333332, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 2.8988393584887184, 'Final_acc_avg': 0.9293999999999999} 

mnist--n=100--0%--no_attack--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 34.666666666666664, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.2742070396741227, 'Final_acc_avg': 0.9293666666666667} 

mnist--n=1

### CIFAR10

In [4]:
# Control sweep on CIFAR10 with 10 clients: every detection, but no attack.
dataset = DatasetNames.cifar10
n_clients = [10]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [
    DetectionNames.delta_dagmm_detection,
    DetectionNames.rffl_detection,
    DetectionNames.fgfl_detection,
    DetectionNames.viceroy_detection,
    DetectionNames.wef_detection,
]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--0%--no_attack--delta_dagmm:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 17.355650838216146, 'Final_acc_avg': 0.8077666666666667} 

cifar10--n=10--0%--no_attack--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 17.72415406703949, 'Final_acc_avg': 0.8043666666666667} 

cifar10--n=10--0%--no_attack--fgfl:
 {'TP_avg': 0.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 17.24144670963287, 'Final_acc_avg': 0.8051} 

cifar10--n=10--0%--no_attack--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 18.287866814931235, 'Final_acc_avg': 0.8099333333333334} 

cifar10--n=10--0%--no_attack--wef:
 {'TP_avg': 0.

In [5]:
# Control sweep on CIFAR10 with 100 clients: every detection, but no attack.
dataset = DatasetNames.cifar10
n_clients = [100]
perc_malicious = [0]
attacks = [AttackNames.no_attack]
detections = [
    DetectionNames.delta_dagmm_detection,
    DetectionNames.rffl_detection,
    DetectionNames.fgfl_detection,
    DetectionNames.viceroy_detection,
    DetectionNames.wef_detection,
]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=100--0%--no_attack--delta_dagmm:
 {'TP_avg': 0.0, 'FP_avg': 1.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 21.659704389174777, 'Final_acc_avg': 0.7108666666666666} 

cifar10--n=100--0%--no_attack--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 24.081753265857696, 'Final_acc_avg': 0.7068666666666666} 

cifar10--n=100--0%--no_attack--fgfl:
 {'TP_avg': 0.0, 'FP_avg': 19.333333333333332, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 19.492140644788744, 'Final_acc_avg': 0.7034666666666666} 

cifar10--n=100--0%--no_attack--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 35.24374766548475, 'Final_acc_avg': 0.7108} 

cifar10--n=100--0%--no_attack--wef:
 {'TP_av

## Benchmark Runs

### MNIST

#### Delta_DAGMM detection

In [6]:
# Benchmark sweep on MNIST: all four attacks against the Delta_DAGMM detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.delta_dagmm_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.07900256557398959, 'Time_per_iteration_avg': 3.3588565786679587, 'Final_acc_avg': 0.9866999999999999} 

mnist--n=10--10%--advanced_delta_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.0072858333367670225, 'Time_per_iteration_avg': 3.384461836020152, 'Final_acc_avg': 0.9860666666666665} 

mnist--n=10--10%--advanced_free_rider--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.009609518625246763, 'Time_per_iteration_avg': 3.433012835184733, 'Final_acc_avg': 0.9862000000000001} 

mnist--n=10--10%--adaptive--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.007027355970988913

#### FGFL detection

In [7]:
# Benchmark sweep on MNIST: all four attacks against the FGFL detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.fgfl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 4.333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.18888888888888888, 'Recall_avg': 1.0, 'Effectiveness_avg': 1.0, 'Time_per_iteration_avg': 3.054606827100118, 'Final_acc_avg': 0.9848666666666667} 

mnist--n=10--10%--advanced_delta_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 4.0, 'FN_avg': 0.0, 'Precision_avg': 0.20555555555555557, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.4215434733826154, 'Time_per_iteration_avg': 2.9509992281595867, 'Final_acc_avg': 0.9828333333333333} 

mnist--n=10--10%--advanced_free_rider--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 3.3333333333333335, 'FN_avg': 0.0, 'Precision_avg': 0.2888888888888889, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.7119522361158049, 'Time_per_iteration_avg': 3.206501507759094, 'Final_acc_avg': 0.9843000000000001} 

mnist--n=10--10%--adaptive--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 3.6666666666666665, 'FN_avg': 0.0, 'Precision_avg': 0.2333333333333333, 'Recall_avg': 1.0, 'Effectiveness

#### RFFL detection

In [8]:
# Benchmark sweep on MNIST: all four attacks against the RFFL detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.rffl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--rffl:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.05907214349465365, 'Time_per_iteration_avg': 3.4714203159014385, 'Final_acc_avg': 0.9854666666666668} 

mnist--n=10--10%--advanced_delta_weights--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.496500527858734, 'Final_acc_avg': 0.9856333333333334} 

mnist--n=10--10%--advanced_free_rider--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.435094408194224, 'Final_acc_avg': 0.9865} 

mnist--n=10--10%--adaptive--rffl:
 {'FAILED!'} 

mnist--n=10--30%--random_weights--rffl:
 {'TP_avg': 3.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.05477267385991876, 'Time_per_iteration_avg': 3.2931492765744523, 'Fi

#### Viceroy detection

In [9]:
# Benchmark sweep on MNIST: all four attacks against the Viceroy detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.viceroy_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--viceroy:
 {'TP_avg': 1.0, 'FP_avg': 7.0, 'FN_avg': 0.0, 'Precision_avg': 0.125, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.2046507896524181, 'Time_per_iteration_avg': 2.4725465416908263, 'Final_acc_avg': 0.9511333333333333} 

mnist--n=10--10%--advanced_delta_weights--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.3761274973551436, 'Final_acc_avg': 0.9865333333333334} 

mnist--n=10--10%--advanced_free_rider--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.4464804569880165, 'Final_acc_avg': 0.9852333333333334} 

mnist--n=10--10%--adaptive--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 3.4347713351249696, 'Final_acc_avg': 

#### WEF detection

In [10]:
# Benchmark sweep on MNIST: all four attacks against the WEF detection.
dataset = DatasetNames.mnist
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.wef_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

mnist--n=10--10%--random_weights--wef:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.12792313794326268, 'Time_per_iteration_avg': 3.4385547240575156, 'Final_acc_avg': 0.9866333333333334} 

mnist--n=10--10%--advanced_delta_weights--wef:
 {'TP_avg': 1.0, 'FP_avg': 1.6666666666666667, 'FN_avg': 0.0, 'Precision_avg': 0.38888888888888884, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.11689645330916087, 'Time_per_iteration_avg': 3.4108649373054507, 'Final_acc_avg': 0.9857666666666667} 

mnist--n=10--10%--advanced_free_rider--wef:
 {'TP_avg': 1.0, 'FP_avg': 2.0, 'FN_avg': 0.0, 'Precision_avg': 0.3611111111111111, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.1108382678203917, 'Time_per_iteration_avg': 3.4112536032994587, 'Final_acc_avg': 0.9860666666666668} 

mnist--n=10--10%--adaptive--wef:
 {'TP_avg': 1.0, 'FP_avg': 1.6666666666666667, 'FN_avg': 0.0, 'Precision_avg': 0.38888888888888884, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.11342

### CIFAR10

#### Delta_DAGMM detection

In [11]:
# Benchmark sweep on CIFAR10: all four attacks against the Delta_DAGMM detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.delta_dagmm_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--10%--random_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.2582693147250092, 'Time_per_iteration_avg': 15.795902109146118, 'Final_acc_avg': 0.8075000000000001} 

cifar10--n=10--10%--advanced_delta_weights--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.23250749609497715, 'Time_per_iteration_avg': 15.885946039358776, 'Final_acc_avg': 0.8038333333333334} 

cifar10--n=10--10%--advanced_free_rider--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.23082876253795026, 'Time_per_iteration_avg': 15.949745313326519, 'Final_acc_avg': 0.8009666666666666} 

cifar10--n=10--10%--adaptive--delta_dagmm:
 {'TP_avg': 1.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 0.0, '

#### RFFL detection

In [12]:
# Benchmark sweep on CIFAR10: all four attacks against the RFFL detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.rffl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--10%--random_weights--rffl:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.18790932206546795, 'Time_per_iteration_avg': 16.416970694065096, 'Final_acc_avg': 0.8118666666666666} 

cifar10--n=10--10%--advanced_delta_weights--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 16.51779570579529, 'Final_acc_avg': 0.8029666666666667} 

cifar10--n=10--10%--advanced_free_rider--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 16.614342391490936, 'Final_acc_avg': 0.8018333333333333} 

cifar10--n=10--10%--adaptive--rffl:
 {'TP_avg': 0.0, 'FP_avg': 0.0, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 16.71630551815033, 'Final_acc_avg': 0.7999999999999999} 

cifar10--n=10--

#### FGFL detection

In [4]:
# Benchmark sweep on CIFAR10: all four attacks against the FGFL detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.fgfl_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--10%--random_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.46067792821126713, 'Time_per_iteration_avg': 16.10825476249059, 'Final_acc_avg': 0.8065666666666668} 

cifar10--n=10--10%--advanced_delta_weights--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 1.0, 'FN_avg': 0.0, 'Precision_avg': 0.611111111111111, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.5303272355136676, 'Time_per_iteration_avg': 16.259436865647633, 'Final_acc_avg': 0.7948000000000001} 

cifar10--n=10--10%--advanced_free_rider--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 0.3333333333333333, 'FN_avg': 0.0, 'Precision_avg': 0.8333333333333334, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.5282728702815435, 'Time_per_iteration_avg': 15.934076273441315, 'Final_acc_avg': 0.7995333333333333} 

cifar10--n=10--10%--adaptive--fgfl:
 {'TP_avg': 1.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 0.0, 'Precision_avg': 0.6666666666666666, 'Recall_av

#### Viceroy detection

In [5]:
# Benchmark sweep on CIFAR10: all four attacks against the Viceroy detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.viceroy_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--10%--random_weights--viceroy:
 {'FAILED!'} 

cifar10--n=10--10%--advanced_delta_weights--viceroy:
 {'TP_avg': 0.3333333333333333, 'FP_avg': 1.0, 'FN_avg': 0.6666666666666666, 'Precision_avg': 0.16666666666666666, 'Recall_avg': 0.3333333333333333, 'Effectiveness_avg': 0.06879381194102004, 'Time_per_iteration_avg': 16.623489622275034, 'Final_acc_avg': 0.7963} 

cifar10--n=10--10%--advanced_free_rider--viceroy:
 {'TP_avg': 0.0, 'FP_avg': 0.6666666666666666, 'FN_avg': 1.0, 'Precision_avg': 0.0, 'Recall_avg': 0.0, 'Effectiveness_avg': 0.0, 'Time_per_iteration_avg': 16.817859466870626, 'Final_acc_avg': 0.7957333333333333} 

cifar10--n=10--10%--adaptive--viceroy:
 {'TP_avg': 0.6666666666666666, 'FP_avg': 0.0, 'FN_avg': 0.3333333333333333, 'Precision_avg': 0.6666666666666666, 'Recall_avg': 0.6666666666666666, 'Effectiveness_avg': 0.11127515980984293, 'Time_per_iteration_avg': 17.39697668949763, 'Final_acc_avg': 0.8048666666666667} 

cifar10--n=10--30%--random_weights--viceroy:
 

#### WEF detection

In [15]:
# Benchmark sweep on CIFAR10: all four attacks against the WEF detection.
dataset = DatasetNames.cifar10
n_clients = [10, 100]
perc_malicious = [10, 30, 70]
attacks = [
    AttackNames.random_weights_attack,
    AttackNames.advanced_delta_weights_attack,
    AttackNames.advanced_free_rider_attack,
    AttackNames.adaptive_attack,
]
detections = [DetectionNames.wef_detection]

results = get_experiment_results(
    dataset=dataset,
    n_clients=n_clients,
    perc_malicious=perc_malicious,
    attacks=attacks,
    detections=detections,
)

cifar10--n=10--10%--random_weights--wef:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.48161160742419656, 'Time_per_iteration_avg': 15.993499088287352, 'Final_acc_avg': 0.8104} 

cifar10--n=10--10%--advanced_delta_weights--wef:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.5364706176798784, 'Time_per_iteration_avg': 15.999587984879811, 'Final_acc_avg': 0.8033666666666667} 

cifar10--n=10--10%--advanced_free_rider--wef:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.5259338882714611, 'Time_per_iteration_avg': 16.128350631395975, 'Final_acc_avg': 0.8042666666666666} 

cifar10--n=10--10%--adaptive--wef:
 {'TP_avg': 1.0, 'FP_avg': 0.0, 'FN_avg': 0.0, 'Precision_avg': 1.0, 'Recall_avg': 1.0, 'Effectiveness_avg': 0.5371855427152302, 'Time_per_iteration_avg': 16.235665353139243, 'Final_acc_avg': 0.8028