In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from pathlib import Path
from typing import Any
from logging import INFO, DEBUG
import json
import os
import numpy as np
from torch import nn
from torch.utils.data import DataLoader
from scipy.signal import medfilt
from flwr.common import log, ndarrays_to_parameters
import matplotlib.pyplot as plt
import math

from src.common.client_utils import (
    load_femnist_dataset,
    get_network_generator_cnn as get_network_generator,
    get_device,
    get_model_parameters,
    aggregate_weighted_average,
)


from src.flwr_core import (
    set_all_seeds,
    get_paths,
    decompress_dataset,
    get_flower_client_generator,
    sample_random_clients,
    get_federated_evaluation_function,
    create_iid_partition,
)

from src.estimate import (
    compute_critical_batch,
)

from src.experiments_simulation import (
    run_simulation,
    centralized_experiment,
)

from src.utils import get_centralized_acc_from_hist

PathType = Path | str | None

In [9]:
# Fix the random seeds first (the helper is called with its default arguments).
set_all_seeds()

# Project locations returned by get_paths(); the keys read below are the ones
# this notebook pulls out into module-level constants.
PATHS = get_paths()

HOME_DIR = PATHS["home_dir"]
DATASET_DIR = PATHS["dataset_dir"]
DATA_DIR = PATHS["data_dir"]
CENTRALIZED_PARTITION = PATHS["centralized_partition"]
CENTRALIZED_MAPPING = PATHS["centralized_mapping"]
FEDERATED_PARTITION = PATHS["federated_partition"]
# IID partition location; this is the partition the client generators below are built on.
FEDERATED_IID_PARTITION = PATHS["iid_partition"]

# extract dataset from tar.gz
decompress_dataset(PATHS)

In [10]:
# Number of clients the IID partition is split into; the same value is later
# passed as max_clients when sampling client ids.
max_clients = 1000
# Prints one "Distributing data to client N" line per client (see the captured output).
create_iid_partition(PATHS, num_clients=max_clients)

Distributing data to client 0
Distributing data to client 1
Distributing data to client 2
Distributing data to client 3
Distributing data to client 4
Distributing data to client 5
Distributing data to client 6
Distributing data to client 7
Distributing data to client 8
Distributing data to client 9
Distributing data to client 10
Distributing data to client 11
Distributing data to client 12
Distributing data to client 13
Distributing data to client 14
Distributing data to client 15
Distributing data to client 16
Distributing data to client 17
Distributing data to client 18
Distributing data to client 19
Distributing data to client 20
Distributing data to client 21
Distributing data to client 22
Distributing data to client 23
Distributing data to client 24
Distributing data to client 25
Distributing data to client 26
Distributing data to client 27
Distributing data to client 28
Distributing data to client 29
Distributing data to client 30
Distributing data to client 31
Distributing data 

In [11]:
# Zero-argument factory for the CNN (get_network_generator_cnn, aliased on import).
NETWORK_GENERATOR = get_network_generator()
# One network instance whose parameters seed every experiment below.
SEED_NET = NETWORK_GENERATOR()
SEED_MODEL_PARAMS = get_model_parameters(SEED_NET)
# Client factory bound to the IID partition; used when sampling client ids.
CID_CLIENT_GENERATOR = get_flower_client_generator(NETWORK_GENERATOR, FEDERATED_IID_PARTITION)

In [12]:
# ---------------------------------------------------------------------------
# Shared configuration for the federated experiments
# ---------------------------------------------------------------------------

# Sweep values (the experiment cells further down assign their own lists).
experiment_batch_sizes = [32, 64, 128, 256, 512]
cohort_sizes = [5, 10, 20, 50, 75, 100, 150]

# Per-client training settings handed to clients through on_fit_config_fn.
federated_train_config = dict(
    epochs=10,
    batch_size=32,
    client_learning_rate=0.01,
    weight_decay=0,
    num_workers=0,
    max_batches=100,
)

# Evaluation settings (client-side config and the centralized evaluation function).
federated_test_config: dict[str, Any] = dict(
    batch_size=32,
    num_workers=0,
    max_batches=100,
)

# Round / population sizes.
num_rounds = 10
num_total_clients = 100
num_evaluate_clients = 0
num_clients_per_round = 10

# Every experiment starts from the same seed-model weights.
initial_parameters = ndarrays_to_parameters(SEED_MODEL_PARAMS)

# Centralized evaluation; max_batches falls back to None when the key is absent.
federated_evaluation_function = get_federated_evaluation_function(
    batch_size=federated_test_config["batch_size"],
    num_workers=federated_test_config["num_workers"],
    model_generator=NETWORK_GENERATOR,
    criterion=nn.CrossEntropyLoss(),
    max_batches=federated_test_config.get("max_batches"),
)

# Server-side settings forwarded to run_simulation.
server_learning_rate = 1.0
server_momentum = 0.0
accept_failures = False

# Client factory on the IID partition, used to sample the client ids below.
CID_CLIENT_GENERATOR = get_flower_client_generator(NETWORK_GENERATOR, FEDERATED_IID_PARTITION)

# Client ids sampled for the default training batch size.
list_of_ids = sample_random_clients(
    num_total_clients,
    federated_train_config["batch_size"],
    CID_CLIENT_GENERATOR,
    max_clients=max_clients,
)

# Client factory that maps a sequential id onto one of the sampled client ids.
federated_client_generator = get_flower_client_generator(
    NETWORK_GENERATOR,
    FEDERATED_IID_PARTITION,
    lambda seq_id: list_of_ids[seq_id],
)

INFO flwr 2025-03-18 11:54:01,537 | flwr_core.py:107 | cid: 632
INFO flwr 2025-03-18 11:54:01,544 | flwr_core.py:107 | cid: 947
INFO flwr 2025-03-18 11:54:01,549 | flwr_core.py:107 | cid: 546
INFO flwr 2025-03-18 11:54:01,554 | flwr_core.py:107 | cid: 726
INFO flwr 2025-03-18 11:54:01,559 | flwr_core.py:107 | cid: 374
INFO flwr 2025-03-18 11:54:01,564 | flwr_core.py:107 | cid: 584
INFO flwr 2025-03-18 11:54:01,571 | flwr_core.py:107 | cid: 599
INFO flwr 2025-03-18 11:54:01,577 | flwr_core.py:107 | cid: 749
INFO flwr 2025-03-18 11:54:01,582 | flwr_core.py:107 | cid: 169
INFO flwr 2025-03-18 11:54:01,588 | flwr_core.py:107 | cid: 793
INFO flwr 2025-03-18 11:54:01,594 | flwr_core.py:107 | cid: 844
INFO flwr 2025-03-18 11:54:01,599 | flwr_core.py:107 | cid: 340
INFO flwr 2025-03-18 11:54:01,605 | flwr_core.py:107 | cid: 392
INFO flwr 2025-03-18 11:54:01,610 | flwr_core.py:107 | cid: 650
INFO flwr 2025-03-18 11:54:01,615 | flwr_core.py:107 | cid: 808
INFO flwr 2025-03-18 11:54:01,621 | flwr

In [None]:
"""
metric_keys = ['training_time', 'samples_processed', 'noise_scale', 'train_loss', 'actual_batches']
import gc

B_simples = []
results = []
batch_sizes = [16, 32, 64, 128, 256]
batch_times = []
for batch_size in batch_sizes:
    train_cfg = federated_train_config.copy()
    train_cfg["batch_size"] = batch_size
    ratio = np.sqrt(batch_size / 256)
    train_cfg["client_learning_rate"] = ratio * 0.01 # Same as centralized, but should be lower for FL

    test_cfg = federated_test_config.copy()
    test_cfg["batch_size"] = batch_size

    local_list_of_ids = sample_random_clients(num_total_clients, train_cfg["batch_size"], CID_CLIENT_GENERATOR, max_clients=max_clients)
    local_federated_client_generator = get_flower_client_generator(NETWORK_GENERATOR, FEDERATED_IID_PARTITION, lambda seq_id: local_list_of_ids[seq_id])

    parameters_for_each_round, hist = run_simulation(
        num_rounds = 1,
        num_total_clients = num_total_clients,
        num_clients_per_round = num_clients_per_round,
        num_evaluate_clients = num_evaluate_clients,
        min_available_clients = num_total_clients,
        min_fit_clients = num_clients_per_round,
        min_evaluate_clients = num_evaluate_clients,
        evaluate_fn = federated_evaluation_function,
        on_fit_config_fn = lambda _: train_cfg,
        on_evaluate_config_fn = lambda _: test_cfg,
        initial_parameters = initial_parameters,
        fit_metrics_aggregation_fn = aggregate_weighted_average,
        evaluate_metrics_aggregation_fn = aggregate_weighted_average,
        federated_client_generator = local_federated_client_generator,
        server_learning_rate=server_learning_rate,
        server_momentum=server_momentum,
        accept_failures=accept_failures,
        )

    times = []
    for round_idx, round_metrics in hist.metrics_distributed_fit['training_time']:
        if round_idx > num_rounds:
            break
        round_times = [t for _, t in round_metrics['all']]
        times.append(np.mean(round_times))

    cumulative_time = np.sum(times)
    batch_times.append((batch_size, cumulative_time, times))
    #n_params = len(hist.metrics_distributed_fit.keys()) - 5
    #param_keys = list(set(hist.metrics_distributed_fit.keys()) - set(metric_keys))
    #hist_metrics = {key: hist.metrics_distributed_fit[key] for key in metric_keys}
    #params = [hist.metrics_distributed_fit[key] for key in param_keys]
    #del hist
    #gc.collect()

    #res = (batch_size, parameters_for_each_round, hist_metrics, params)
    #results.append(res)
"""

In [None]:
#for b, cumulative_round_times, total_times in batch_times:
#    print("Batch size: ", b)
#    print("Total time: ", cumulative_round_times)
#    print("Times per round: ", total_times)
"""

Batch size:  16
Total time:  22.73178415029806
Times per round:  [22.73178415029806]
Batch size:  32
Total time:  13.338960419098475
Times per round:  [13.338960419098475]
Batch size:  64
Total time:  5.612612966999677
Times per round:  [5.612612966999677]
Batch size:  128
Total time:  3.1369846047005012
Times per round:  [3.1369846047005012]
Batch size:  256
Total time:  1.3977464394006347
Times per round:  [1.3977464394006347]

"""



In [None]:
import pickle

def save_experiment(save_file_name, batch_size, parameters_for_each_round, hist):
    """Persist one experiment's results to disk with pickle.

    The payload is a dict with the keys ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        save_file_name (str | Path): Destination file. Missing parent
            directories are created first, so a finished run is not lost to
            a ``FileNotFoundError`` when e.g. ``results/`` does not exist yet.
        batch_size (int): Batch size used in the experiment.
        parameters_for_each_round (list): Model parameters for each round.
        hist (History): Flower History object containing metrics.
    """
    target = Path(save_file_name)
    # Create the output directory up front; a no-op when it already exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    results_dict = {
        'batch_size': batch_size,
        'parameters_for_each_round': parameters_for_each_round,
        'history': hist,
    }

    with target.open('wb') as f:  # binary mode is required by pickle
        pickle.dump(results_dict, f)

def load_experiment(file_name):
    """Read an experiment back from a pickle file.

    The file is expected to hold a dict with the entries ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        file_name (str): Location of the results file.

    Returns:
        tuple: ``(batch_size, parameters_for_each_round, hist)``
    """
    # Unpickling can run arbitrary code: only open result files you trust.
    with open(file_name, 'rb') as handle:
        stored = pickle.load(handle)

    wanted_keys = ('batch_size', 'parameters_for_each_round', 'history')
    return tuple(stored[key] for key in wanted_keys)

# Sweep over local batch sizes on the IID partition. Each run trains until the
# target accuracy is reached or num_rounds is exhausted, then is pickled to disk.
total_batch_results = []

experiment_batch_sizes = [256, 128, 64, 32, 16]
for batch_size in experiment_batch_sizes:
    print(f"------------------------------------------------- BATCH SIZE: {batch_size} ---------------------------------------------------------------")
    train_cfg = federated_train_config.copy()
    train_cfg["batch_size"] = batch_size
    # Square-root learning-rate scaling relative to a reference batch size of 256.
    # Author's note carried over: "non-iid ratio, 32 not working, 64 not working, 128 not working".
    ratio = np.sqrt(batch_size / 256)
    learning_rate = ratio * 0.01  # Same as centralized, but should be lower for FL
    train_cfg["client_learning_rate"] = learning_rate

    test_cfg = federated_test_config.copy()
    test_cfg["batch_size"] = batch_size

    # Re-sample the client ids for this batch size and bind a client factory to them.
    local_list_of_ids = sample_random_clients(
        num_total_clients,
        train_cfg["batch_size"],
        CID_CLIENT_GENERATOR,
        max_clients=max_clients,
    )
    local_federated_client_generator = get_flower_client_generator(
        NETWORK_GENERATOR,
        FEDERATED_IID_PARTITION,
        lambda seq_id: local_list_of_ids[seq_id],
    )

    parameters_for_each_round, hist = run_simulation(
        num_rounds=num_rounds,
        num_total_clients=num_total_clients,
        num_clients_per_round=num_clients_per_round,
        num_evaluate_clients=num_evaluate_clients,
        min_available_clients=num_total_clients,
        min_fit_clients=num_clients_per_round,
        min_evaluate_clients=num_evaluate_clients,
        evaluate_fn=federated_evaluation_function,
        on_fit_config_fn=lambda _: train_cfg,
        on_evaluate_config_fn=lambda _: test_cfg,
        initial_parameters=initial_parameters,
        fit_metrics_aggregation_fn=aggregate_weighted_average,
        evaluate_metrics_aggregation_fn=aggregate_weighted_average,
        federated_client_generator=local_federated_client_generator,
        server_learning_rate=server_learning_rate,
        server_momentum=server_momentum,
        accept_failures=accept_failures,
        target_accuracy=0.60,
        use_target_accuracy=True,
    )

    total_batch_results.append((batch_size, parameters_for_each_round, hist))
    save_experiment(fr"results/IID_federated_local_batch_results_{batch_size}.pkl", batch_size, parameters_for_each_round, hist)

    # Flower's History stores centralized metrics as (round, value) pairs, and the
    # evaluation of the initial parameters is presumably recorded as round 0
    # (TODO confirm against flwr_core.py). Counting entries with len() would then
    # over-report by one, so report the last recorded round index instead.
    accuracy_history = hist.metrics_centralized['accuracy']
    rounds_taken = accuracy_history[-1][0] if accuracy_history else 0

    # Append batch size, learning rate and the number of rounds taken to reach the target accuracy.
    with open("bruh_file.txt", "a") as f:
        f.write(f"Batch size: {batch_size}, Learning rate: {learning_rate}, Number of rounds taken to reach target accuracy: {rounds_taken}\n")

In [13]:
import pickle

def save_experiment(save_file_name, batch_size, parameters_for_each_round, hist):
    """Persist one experiment's results to disk with pickle.

    The payload is a dict with the keys ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        save_file_name (str | Path): Destination file. Missing parent
            directories are created first, so a finished run is not lost to
            a ``FileNotFoundError`` when e.g. ``results/`` does not exist yet.
        batch_size (int): Value stored under ``'batch_size'``. The cohort
            sweep in this cell passes the cohort size here.
        parameters_for_each_round (list): Model parameters for each round.
        hist (History): Flower History object containing metrics.
    """
    target = Path(save_file_name)
    # Create the output directory up front; a no-op when it already exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    results_dict = {
        'batch_size': batch_size,
        'parameters_for_each_round': parameters_for_each_round,
        'history': hist,
    }

    with target.open('wb') as f:  # binary mode is required by pickle
        pickle.dump(results_dict, f)

def load_experiment(file_name):
    """Read an experiment back from a pickle file.

    The file is expected to hold a dict with the entries ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        file_name (str): Location of the results file.

    Returns:
        tuple: ``(batch_size, parameters_for_each_round, hist)``
    """
    # Unpickling can run arbitrary code: only open result files you trust.
    with open(file_name, 'rb') as handle:
        stored = pickle.load(handle)

    wanted_keys = ('batch_size', 'parameters_for_each_round', 'history')
    return tuple(stored[key] for key in wanted_keys)

# Sweep over cohort sizes (clients trained per round) on the IID partition.
total_cohort_results = []
cohort_sizes = [5, 10, 20, 50, 75, 100]
for cohort_size in cohort_sizes:
    # Square-root learning-rate scaling relative to a cohort of 100 clients.
    ratio = np.sqrt(cohort_size / 100)
    train_cfg = {**federated_train_config, "client_learning_rate": ratio * 0.01}
    test_cfg = dict(federated_test_config)

    parameters_for_each_round, hist = run_simulation(
        num_rounds=10,
        num_total_clients=num_total_clients,
        num_clients_per_round=cohort_size,
        num_evaluate_clients=num_evaluate_clients,
        min_available_clients=num_total_clients,
        min_fit_clients=cohort_size,
        min_evaluate_clients=num_evaluate_clients,
        evaluate_fn=federated_evaluation_function,
        on_fit_config_fn=lambda _: train_cfg,
        on_evaluate_config_fn=lambda _: test_cfg,
        initial_parameters=initial_parameters,
        fit_metrics_aggregation_fn=aggregate_weighted_average,
        evaluate_metrics_aggregation_fn=aggregate_weighted_average,
        federated_client_generator=federated_client_generator,
        server_learning_rate=server_learning_rate,
        server_momentum=server_momentum,
        accept_failures=accept_failures,
        target_accuracy=0.60,
        use_target_accuracy=True,
    )

    # Keep the run in memory and on disk; the cohort size is stored in the 'batch_size' slot.
    total_cohort_results.append((cohort_size, parameters_for_each_round, hist))
    save_experiment(
        f"results/IID_federated_cohort_results_{cohort_size}.pkl",
        cohort_size,
        parameters_for_each_round=parameters_for_each_round,
        hist=hist,
    )

INFO flwr 2025-03-18 11:54:20,165 | experiments_simulation.py:232 | FL will execute for 10 rounds
INFO flwr 2025-03-18 11:54:20,170 | app.py:149 | Starting Flower simulation, config: ServerConfig(num_rounds=10, round_timeout=None)
INFO flwr 2025-03-18 11:54:20,171 | flwr_core.py:264 | Initializing global parameters
INFO flwr 2025-03-18 11:54:20,172 | server_returns_parameters.py:273 | Using initial parameters provided by strategy
INFO flwr 2025-03-18 11:54:20,173 | flwr_core.py:269 | Evaluating initial parameters
 11%|█         | 100/891 [00:01<00:14, 54.06it/s]
INFO flwr 2025-03-18 11:54:22,586 | flwr_core.py:272 | initial parameters (loss, other metrics): 413.6843070983887, {'accuracy': 0.0065625}
INFO flwr 2025-03-18 11:54:22,587 | flwr_core.py:280 | FL starting - Target accuracy: 0.6
DEBUG flwr 2025-03-18 11:54:22,587 | server_returns_parameters.py:223 | fit_round 1: strategy sampled 5 clients (out of 100)
INFO flwr 2025-03-18 11:54:22,588 | flwr_core.py:107 | cid: 985
INFO flwr 20

In [14]:
import pickle

def save_experiment(save_file_name, batch_size, parameters_for_each_round, hist):
    """Persist one experiment's results to disk with pickle.

    The payload is a dict with the keys ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        save_file_name (str | Path): Destination file. Missing parent
            directories are created first, so a finished run is not lost to
            a ``FileNotFoundError`` when e.g. ``results/`` does not exist yet.
        batch_size (int): Value stored under ``'batch_size'``. The sweep in
            this cell passes the global batch size here.
        parameters_for_each_round (list): Model parameters for each round.
        hist (History): Flower History object containing metrics.
    """
    target = Path(save_file_name)
    # Create the output directory up front; a no-op when it already exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    results_dict = {
        'batch_size': batch_size,
        'parameters_for_each_round': parameters_for_each_round,
        'history': hist,
    }

    with target.open('wb') as f:  # binary mode is required by pickle
        pickle.dump(results_dict, f)

def load_experiment(file_name):
    """Read an experiment back from a pickle file.

    The file is expected to hold a dict with the entries ``'batch_size'``,
    ``'parameters_for_each_round'`` and ``'history'``.

    Args:
        file_name (str): Location of the results file.

    Returns:
        tuple: ``(batch_size, parameters_for_each_round, hist)``
    """
    # Unpickling can run arbitrary code: only open result files you trust.
    with open(file_name, 'rb') as handle:
        stored = pickle.load(handle)

    wanted_keys = ('batch_size', 'parameters_for_each_round', 'history')
    return tuple(stored[key] for key in wanted_keys)

# Sweep over (cohort size, local batch size) pairs; each run is labelled by its
# global batch size = cohort_size * batch_size.
total_global_batch_results = []
cs_bs_pairs = [(5, 20), (20, 50), (50, 200), (100, 250), (100, 1000), (100, 2000), (100, 4000), (100, 12000)]
for cohort_size, batch_size in cs_bs_pairs:
    global_batch_size = batch_size * cohort_size
    train_cfg = federated_train_config.copy()
    # Apply the local batch size of this pair. Previously it was only used for
    # the label and the learning-rate ratio, so every run silently trained with
    # the default batch size from federated_train_config.
    # NOTE(review): federated_client_generator uses client ids that were sampled
    # for the default batch size; confirm the clients hold enough samples for the
    # larger local batch sizes in cs_bs_pairs.
    train_cfg["batch_size"] = batch_size
    # Square-root learning-rate scaling relative to a global batch size of 1e6.
    ratio = np.sqrt(global_batch_size / 1e6)
    train_cfg["client_learning_rate"] = ratio * 0.01
    # Disabled override kept from the original cell:
    #train_cfg["max_batches"] = 1000

    test_cfg = federated_test_config.copy()

    parameters_for_each_round, hist = run_simulation(
        num_rounds=10,
        num_total_clients=num_total_clients,
        num_clients_per_round=cohort_size,
        num_evaluate_clients=num_evaluate_clients,
        min_available_clients=num_total_clients,
        min_fit_clients=cohort_size,
        min_evaluate_clients=num_evaluate_clients,
        evaluate_fn=federated_evaluation_function,
        on_fit_config_fn=lambda _: train_cfg,
        on_evaluate_config_fn=lambda _: test_cfg,
        initial_parameters=initial_parameters,
        fit_metrics_aggregation_fn=aggregate_weighted_average,
        evaluate_metrics_aggregation_fn=aggregate_weighted_average,
        federated_client_generator=federated_client_generator,
        server_learning_rate=server_learning_rate,
        server_momentum=server_momentum,
        accept_failures=accept_failures,
        target_accuracy=0.60,
        use_target_accuracy=True,
    )

    # Keep the run in memory and on disk, keyed by the global batch size.
    total_global_batch_results.append((global_batch_size, parameters_for_each_round, hist))
    save_experiment(f"results/IID_federated_global_batch_results_{global_batch_size}.pkl", global_batch_size, parameters_for_each_round=parameters_for_each_round, hist=hist)

INFO flwr 2025-03-18 12:31:32,695 | experiments_simulation.py:232 | FL will execute for 10 rounds
INFO flwr 2025-03-18 12:31:32,698 | app.py:149 | Starting Flower simulation, config: ServerConfig(num_rounds=10, round_timeout=None)
INFO flwr 2025-03-18 12:31:32,699 | flwr_core.py:264 | Initializing global parameters
INFO flwr 2025-03-18 12:31:32,700 | server_returns_parameters.py:273 | Using initial parameters provided by strategy
INFO flwr 2025-03-18 12:31:32,702 | flwr_core.py:269 | Evaluating initial parameters
 11%|█         | 100/891 [00:01<00:12, 64.02it/s]
INFO flwr 2025-03-18 12:31:34,453 | flwr_core.py:272 | initial parameters (loss, other metrics): 413.6843070983887, {'accuracy': 0.0065625}
INFO flwr 2025-03-18 12:31:34,454 | flwr_core.py:280 | FL starting - Target accuracy: 0.6
DEBUG flwr 2025-03-18 12:31:34,454 | server_returns_parameters.py:223 | fit_round 1: strategy sampled 5 clients (out of 100)
INFO flwr 2025-03-18 12:31:34,455 | flwr_core.py:107 | cid: 985
INFO flwr 20