# Noise/Sampling error trade-offs

We assume that the following hyper-parameters are set manually before training of the private model begins:
- Number of DPSGD iterations (`num_steps`)
- Per-sample gradients' clipping bound (`max_grad_norm`)
- Target DP parameters (`target_epsilons` and `target_delta`). 

In the federated learning (FL) scenario, the following variables are also treated as hyper-parameters:
- Number of communication rounds (`num_rounds`)
- Client-level sampling probability (`client_rate`) 

In [1]:
import importlib
import numpy as np
import os
import pandas as pd
import sys
import torch
from torch.utils.data import DataLoader
import warnings # ignore warnings for clarity
warnings.simplefilter("ignore")
sys.path.append("..")

from configs.config_utils import read_config, get_config_file_path

from myopacus.accountants.rpdp_utils import GENERATE_EPSILONS_FUNC

# Environment and experiment selection.
# CUDA_LAUNCH_BLOCKING is set before any CUDA work happens
# (presumably to get synchronous kernel launches while debugging).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
dataset = "mnist"
module_name = f"datasets.fed_{dataset}"

# Use the GPU with index 7 when CUDA is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:7")
else:
    device = torch.device("cpu")

# Resolve the dataset-specific building blocks by importing the dataset
# package by name. A missing module is only reported, not re-raised, so the
# names below stay undefined in that case.
try:
    dataset_modules = importlib.import_module(module_name)
    FedClass = dataset_modules.FedClass
    RawClass = dataset_modules.RawClass
    BaselineModel = dataset_modules.BaselineModel
    BaselineLoss = dataset_modules.BaselineLoss
    Optimizer = dataset_modules.Optimizer
    metric = dataset_modules.metric
except ModuleNotFoundError as err:
    print(f'{module_name} import failed: {err}')

# Project root: two directory levels above the current working directory.
project_abspath = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Load the experiment configuration for the selected federated dataset.
# NOTE(review): the name `dict` shadows the builtin; it is kept because the
# code further down indexes the configuration through this name.
dict = read_config(get_config_file_path(dataset_name=f"fed_{dataset}", debug=False))

# CSV file (relative to the working directory) that collects the results.
save_filename = f"results_test_sigmas_fedavg_{dataset}.csv"

# Settings from the "fedavg" section of the configuration.
_fedavg_cfg = dict["fedavg"]
NUM_CLIENTS = _fedavg_cfg["num_clients"]
NUM_STEPS = _fedavg_cfg["num_steps"]
NUM_ROUNDS = _fedavg_cfg["num_rounds"]
CLIENT_RATE = _fedavg_cfg["client_rate"]
BATCH_SIZE = _fedavg_cfg["batch_size"]
LR = _fedavg_cfg["learning_rate"]

# Settings from the "dpfedavg" section of the configuration.
_dpfedavg_cfg = dict["dpfedavg"]
LR_DP = _dpfedavg_cfg["learning_rate"]
MAX_GRAD_NORM = _dpfedavg_cfg["max_grad_norm"]
TARGET_DELTA = _dpfedavg_cfg["target_delta"]
MAX_PHYSICAL_BATCH_SIZE = _dpfedavg_cfg["max_physical_batch_size"]

# --- Prepare personalized epsilons ---
# Distribution settings and the allowed epsilon range, all read from the
# "rpdpfedavg" section of the configuration.
_rpdp_cfg = dict["rpdpfedavg"]
SETTINGS = _rpdp_cfg["settings"]
MIN_EPSILON = _rpdp_cfg["min_epsilon"]
MAX_EPSILON = _rpdp_cfg["max_epsilon"]


def BoundedFunc(values):
    """Return `values` as a NumPy array with every element clamped.

    Each element is first raised to at least MIN_EPSILON and then capped at
    MAX_EPSILON; the bounds are looked up when the function is called.
    """
    return np.array([min(max(eps, MIN_EPSILON), MAX_EPSILON) for eps in values])

# --- Prepare local datasets ---
# The dataset directory is looked up in the configuration under the key
# "iid_<NUM_CLIENTS>" and resolved relative to the project root.
data_path = os.path.join(project_abspath, dict["dataset_dir"][f"iid_{NUM_CLIENTS}"])
rawdata = RawClass(data_path=data_path)

# One entry per client in each of these lists.
test_dls, training_dls, target_epsilons = [], [], []
for i in range(NUM_CLIENTS):
    # Training split of client `i`, served as one batch holding the whole split.
    train_data = FedClass(rawdata=rawdata, center=i, train=True)
    train_dl = DataLoader(train_data, batch_size=len(train_data))
    training_dls.append(train_dl)

    # Draw one target epsilon per training sample from the first
    # "BoundedMixGauss" setting, then clamp the values with BoundedFunc.
    _generate = GENERATE_EPSILONS_FUNC["BoundedMixGauss"]
    _num_samples = len(train_dl.dataset)
    _raw_epsilons = _generate(_num_samples, SETTINGS["BoundedMixGauss"][0])
    target_epsilons.append(BoundedFunc(_raw_epsilons))

    # Test split of client `i`, batched with the configured batch size.
    test_data = FedClass(rawdata=rawdata, center=i, train=False)
    test_dl = DataLoader(test_data, batch_size=BATCH_SIZE)
    test_dls.append(test_dl)

# --- Prepare model and loss ---
# The same initial model and the same dataloaders are reused for every run.
# NOTE(review): `.to(device)` is called on `BaselineModel` directly, which
# assumes it is already a model instance rather than a class -- confirm.
global_init = BaselineModel.to(device)
criterion = BaselineLoss()

# Training arguments, grouped by concern and merged in the original key order.
_dataloader_args = {
    "training_dataloaders": training_dls,
    "test_dataloaders": test_dls,
}
_optimisation_args = {
    "loss": criterion,
    "optimizer_class": Optimizer,
    "learning_rate": LR_DP,
}
_schedule_args = {
    "num_steps": NUM_STEPS,
    "num_rounds": NUM_ROUNDS,
    "client_rate": CLIENT_RATE,
}
_runtime_args = {
    "device": device,
    "metric": metric,
}
training_args = {
    **_dataloader_args,
    **_optimisation_args,
    **_schedule_args,
    **_runtime_args,
}

ValueError: The string /home/junxu/rPDP-FL/experiments/datasets/fed_mnist/iid_10 is not a valid path.

## The noise scale (a.k.a. `noise_multiplier`) reflects how much Gaussian noise is introduced in the training process

In [None]:
# Candidate noise multipliers to evaluate, from least to most noise.
NOISE_MULTIPLIERS = [1.0, 1.2, 1.5, 2.0, 5.0, 10.0]

In [None]:
import copy
import datetime
from myopacus import PrivacyEngine
from myopacus.strategies import FedAvg

# Sweep over the candidate noise multipliers. For each value a fresh privacy
# engine is prepared, FedAvg is run from a copy of the shared initial model,
# and one result row is appended to the CSV file at `save_filename`.
for noise_multiplier in NOISE_MULTIPLIERS:
    privacy_engine = PrivacyEngine(accountant="fed_rdp", n_clients=NUM_CLIENTS)
    privacy_engine.prepare_fedrpdp(
        num_steps=NUM_STEPS,
        num_rounds=NUM_ROUNDS,
        client_rate=CLIENT_RATE,
        target_epsilons=target_epsilons,
        target_delta=TARGET_DELTA,
        noise_multiplier=noise_multiplier,
        max_grad_norm=MAX_GRAD_NORM,
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE,
    )

    # Work on deep copies so that one run cannot modify the shared arguments
    # or the initial model used by the following runs.
    current_args = copy.deepcopy(training_args)
    current_args["model"] = copy.deepcopy(global_init)
    current_args["privacy_engine"] = privacy_engine

    s = FedAvg(**current_args, log=False)
    cm, perf = s.run()

    # Summarise the run by the mean of the last three entries of `perf`.
    mean_perf = np.mean(perf[-3:])

    # One value per client accountant: the truncated sum of its `sample_rate`
    # (presumably per-sample sampling probabilities -- TODO confirm).
    expected_batch_size = [
        int(sum(acct.sample_rate))
        for acct in s.privacy_engine.accountant.accountants
    ]

    print(f"Mean performance of BoundedMixGauss, min_eps={min(target_epsilons[0]):.4f}, max_eps={max(target_epsilons[0]):.4f}, delta={TARGET_DELTA}, Perf={mean_perf:.4f}")
    results_dict = [{
        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
        "mean_perf": round(mean_perf, 4),
        "perf": perf,
        "e": "BoundedMixGauss-0-Ours",
        "d": TARGET_DELTA,
        "nm": round(s.privacy_engine.default_noise_multiplier, 2),
        "norm": MAX_GRAD_NORM,
        "bs": expected_batch_size,
        "lr": LR_DP,
        "num_clients": NUM_CLIENTS,
        "client_rate": CLIENT_RATE,
    }]
    results = pd.DataFrame.from_dict(results_dict)

    # The file is opened in append mode, so emit the column header only when
    # the file does not exist yet or is empty. Writing it on every iteration
    # would interleave header lines with the data rows.
    write_header = (
        not os.path.exists(save_filename)
        or os.path.getsize(save_filename) == 0
    )
    results.to_csv(save_filename, mode='a', header=write_header, index=False)

    # Drop the references to the per-run objects before the next iteration.
    del privacy_engine, s, cm, mean_perf