# Hyperparameter Finetuning

We want to find the best hyperparameters for the Generator Network.

To do so, we search for the best values of the following parameters:

- Number of Epochs (meaning `num_epochs` and `num_steps`)
- Learning Rate
- Batch Size
- Number of Noise Batches
- Number of Layers
- Regularization term
- Number of Neurons for each Network

In [1]:
from src.fyemu_tunable import main, evaluate
import torch
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import optuna
import random
from typing import Dict
from torchvision.models import resnet18

from src.metrics import kl_divergence_between_models

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
DEVICE

'cpu'

In [2]:
def load_models_dict(path: str = "data/new/models") -> Dict[int, torch.nn.Module]:
    """Load every checkpoint found in ``path`` into its own ResNet-18.

    Args:
        path: Directory whose entries are all state-dict files that fit
            ``resnet18(num_classes=10)``.

    Returns:
        A dict mapping a running index (0, 1, 2, ...) to a model in eval mode.
        Indices follow the lexicographically sorted file names, so the mapping
        is the same on every run (note that ``..._10.pt`` sorts before
        ``..._2.pt``).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    models: Dict[int, torch.nn.Module] = {}
    # os.listdir returns entries in arbitrary order; sort for a stable mapping.
    for index, file_name in enumerate(sorted(os.listdir(path))):
        # A fresh network per checkpoint: reusing one instance would make every
        # dict entry point at the same object holding the last-loaded weights.
        model = resnet18(num_classes=10).to(device)
        state_dict = torch.load(
            f=os.path.join(path, file_name),
            map_location=device,
            weights_only=True,
        )
        model.load_state_dict(state_dict)
        model.eval()
        models[index] = model

    return models

Check out this short tutorial to see how we persist and resume the optimization using saved states:

https://optuna.readthedocs.io/en/stable/tutorial/20_recipes/001_rdb.html

In [3]:
import logging
import sys
import pickle
import optuna

# Add stream handler of stdout to show the messages.
# NOTE: re-running this cell attaches another handler, so messages get repeated.
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "GeneratorOpti2"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)

# Restore the sampler saved by a previous session, if there is one.
# SECURITY: pickle.load executes arbitrary code; only load a sampler file you wrote yourself.
restored_sampler = None
if os.path.exists("sampler2.pkl"):
    # The context manager closes the file handle, which a bare open() call would leak.
    with open("sampler2.pkl", "rb") as fin:
        restored_sampler = pickle.load(fin)

# sampler=None makes Optuna fall back to its default sampler, so a single call
# covers both the "restored" and the "fresh" case.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
    sampler=restored_sampler,
)

[I 2025-01-18 19:35:45,406] Using an existing study with name 'GeneratorOpti2' instead of creating a new one.


Using an existing study with name 'GeneratorOpti2' instead of creating a new one.


In [4]:
def objective(trial):
    """Optuna objective: run ``main`` with sampled hyperparameters and score it.

    The score is the value returned by ``kl_divergence_between_models`` between
    a randomly chosen retrained reference model (``model1``) and the result of
    ``main`` (``model2``), evaluated on the ``test`` folder of ``data/cifar10``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The divergence value reported back to the study.
    """
    # The parameter names below are stored in the study database; keep them stable.
    opt_Epochs = trial.suggest_int('opt_Epochs', 1, 10)
    opt_Steps = trial.suggest_int('opt_Steps', 1, 20)
    opt_Learning_Rate = trial.suggest_float('opt_Learning_Rate', 0.01, 0.3)
    opt_Batch_Size = trial.suggest_int('opt_Batch_Size', 32, 512)
    opt_Number_of_Noise_Batches = trial.suggest_int('opt_Number_of_Noise_Batches', 1, 10)
    opt_Regularization_term = trial.suggest_float('opt_Regularization_term', 0.01, 0.3)
    opt_Noise_Dim = trial.suggest_int('opt_Noise_Dim', 1, 512)
    n_layers = trial.suggest_int('n_layers', 1, 8)

    # Between one and eight hidden layers, each 1024 wide.
    Layers = [1024] * n_layers

    # NOTE(review): the commented-out "Standard Parameters" cell unpacks the
    # result of main(...) into (model, history); confirm that main returns a
    # bare model for this call.
    mod = main(
        t_Epochs=opt_Epochs,
        t_Steps=opt_Steps,
        t_Learning_Rate=opt_Learning_Rate,
        t_Batch_Size=opt_Batch_Size,
        t_Number_of_Noise_Batches=opt_Number_of_Noise_Batches,
        t_Regularization_term=opt_Regularization_term,
        t_Layers=Layers,
        t_Noise_Dim=opt_Noise_Dim,
        new_baseline=False,
        logs=False,
        model_eval_logs=False,
    )

    data_dir = f'data{os.sep}cifar10'

    transform_test = tt.Compose([
        tt.ToTensor(),
        tt.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    valid_ds = ImageFolder(data_dir+f'{os.sep}test', transform_test)
    valid_dl = DataLoader(valid_ds, 256,)

    # Pick one retrained reference model at random; this assumes the directory
    # holds only files numbered 0..N-1 with the name pattern used below.
    n = random.randint(0, len(os.listdir("data/retrain/models"))-1)
    exact = resnet18(num_classes = 10)
    exact.load_state_dict(torch.load(f"data/retrain/models/ResNET18_CIFAR10_RETRAIN_CLASSES_{n}.pt", map_location=DEVICE, weights_only=True))
    # Same placement as load_models_dict; assumes kl_divergence_between_models
    # expects its models on DEVICE -- TODO confirm.
    exact = exact.to(DEVICE)
    # A freshly built module is in training mode and load_state_dict does not
    # change that; switch to eval so BatchNorm uses its stored running statistics.
    exact.eval()

    div = kl_divergence_between_models(
        model1 = exact,
        model2 = mod,
        data_loader = valid_dl,
    )

    return div

# Run 15 trials of the objective on the study.
study.optimize(objective, n_trials=15)

import pickle

# Pickle the sampler so a later session can reload it.
with open("sampler2.pkl", "wb") as sampler_file:
    sampler_file.write(pickle.dumps(study.sampler))

[I 2025-01-18 19:44:13,660] Trial 36 finished with value: 2.475285187363625 and parameters: {'opt_Epochs': 2, 'opt_Steps': 9, 'opt_Learning_Rate': 0.03828025424426709, 'opt_Batch_Size': 244, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.23029181440784674, 'opt_Noise_Dim': 183, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


Trial 36 finished with value: 2.475285187363625 and parameters: {'opt_Epochs': 2, 'opt_Steps': 9, 'opt_Learning_Rate': 0.03828025424426709, 'opt_Batch_Size': 244, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.23029181440784674, 'opt_Noise_Dim': 183, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 19:55:19,597] Trial 37 finished with value: 2.4421422153711325 and parameters: {'opt_Epochs': 3, 'opt_Steps': 13, 'opt_Learning_Rate': 0.061887100278658784, 'opt_Batch_Size': 233, 'opt_Number_of_Noise_Batches': 6, 'opt_Regularization_term': 0.25923289343932054, 'opt_Noise_Dim': 96, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


Trial 37 finished with value: 2.4421422153711325 and parameters: {'opt_Epochs': 3, 'opt_Steps': 13, 'opt_Learning_Rate': 0.061887100278658784, 'opt_Batch_Size': 233, 'opt_Number_of_Noise_Batches': 6, 'opt_Regularization_term': 0.25923289343932054, 'opt_Noise_Dim': 96, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 19:59:49,937] Trial 38 finished with value: 2.5746352642774584 and parameters: {'opt_Epochs': 1, 'opt_Steps': 8, 'opt_Learning_Rate': 0.024695075675066146, 'opt_Batch_Size': 159, 'opt_Number_of_Noise_Batches': 5, 'opt_Regularization_term': 0.28272537374548085, 'opt_Noise_Dim': 160, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


Trial 38 finished with value: 2.5746352642774584 and parameters: {'opt_Epochs': 1, 'opt_Steps': 8, 'opt_Learning_Rate': 0.024695075675066146, 'opt_Batch_Size': 159, 'opt_Number_of_Noise_Batches': 5, 'opt_Regularization_term': 0.28272537374548085, 'opt_Noise_Dim': 160, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:08:29,698] Trial 39 finished with value: 2.4010619416832917 and parameters: {'opt_Epochs': 2, 'opt_Steps': 10, 'opt_Learning_Rate': 0.05965992069552965, 'opt_Batch_Size': 203, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.20922097005021456, 'opt_Noise_Dim': 223, 'n_layers': 7}. Best is trial 17 with value: 2.2544206410646437.


Trial 39 finished with value: 2.4010619416832917 and parameters: {'opt_Epochs': 2, 'opt_Steps': 10, 'opt_Learning_Rate': 0.05965992069552965, 'opt_Batch_Size': 203, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.20922097005021456, 'opt_Noise_Dim': 223, 'n_layers': 7}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:21:42,092] Trial 40 finished with value: 2.5436935648322114 and parameters: {'opt_Epochs': 4, 'opt_Steps': 13, 'opt_Learning_Rate': 0.01001910274694122, 'opt_Batch_Size': 282, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.22134418791927576, 'opt_Noise_Dim': 103, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


Trial 40 finished with value: 2.5436935648322114 and parameters: {'opt_Epochs': 4, 'opt_Steps': 13, 'opt_Learning_Rate': 0.01001910274694122, 'opt_Batch_Size': 282, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.22134418791927576, 'opt_Noise_Dim': 103, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:34:28,089] Trial 41 finished with value: 2.5541708528995515 and parameters: {'opt_Epochs': 5, 'opt_Steps': 12, 'opt_Learning_Rate': 0.10283287299168228, 'opt_Batch_Size': 200, 'opt_Number_of_Noise_Batches': 6, 'opt_Regularization_term': 0.24277132466528273, 'opt_Noise_Dim': 19, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


Trial 41 finished with value: 2.5541708528995515 and parameters: {'opt_Epochs': 5, 'opt_Steps': 12, 'opt_Learning_Rate': 0.10283287299168228, 'opt_Batch_Size': 200, 'opt_Number_of_Noise_Batches': 6, 'opt_Regularization_term': 0.24277132466528273, 'opt_Noise_Dim': 19, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:40:05,473] Trial 42 finished with value: 2.6224380791187287 and parameters: {'opt_Epochs': 7, 'opt_Steps': 5, 'opt_Learning_Rate': 0.02999365754281076, 'opt_Batch_Size': 101, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.257221539577231, 'opt_Noise_Dim': 394, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


Trial 42 finished with value: 2.6224380791187287 and parameters: {'opt_Epochs': 7, 'opt_Steps': 5, 'opt_Learning_Rate': 0.02999365754281076, 'opt_Batch_Size': 101, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.257221539577231, 'opt_Noise_Dim': 394, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:47:09,522] Trial 43 finished with value: 2.33573172390461 and parameters: {'opt_Epochs': 3, 'opt_Steps': 7, 'opt_Learning_Rate': 0.052655277383312576, 'opt_Batch_Size': 178, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.28726073532728874, 'opt_Noise_Dim': 463, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


Trial 43 finished with value: 2.33573172390461 and parameters: {'opt_Epochs': 3, 'opt_Steps': 7, 'opt_Learning_Rate': 0.052655277383312576, 'opt_Batch_Size': 178, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.28726073532728874, 'opt_Noise_Dim': 463, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 20:55:31,797] Trial 44 finished with value: 2.3632215186953545 and parameters: {'opt_Epochs': 3, 'opt_Steps': 7, 'opt_Learning_Rate': 0.08706531911307087, 'opt_Batch_Size': 153, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2868481802280207, 'opt_Noise_Dim': 452, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


Trial 44 finished with value: 2.3632215186953545 and parameters: {'opt_Epochs': 3, 'opt_Steps': 7, 'opt_Learning_Rate': 0.08706531911307087, 'opt_Batch_Size': 153, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2868481802280207, 'opt_Noise_Dim': 452, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:03:06,076] Trial 45 finished with value: 2.5314333558082582 and parameters: {'opt_Epochs': 3, 'opt_Steps': 6, 'opt_Learning_Rate': 0.07321977746852842, 'opt_Batch_Size': 155, 'opt_Number_of_Noise_Batches': 10, 'opt_Regularization_term': 0.2876249360608091, 'opt_Noise_Dim': 455, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


Trial 45 finished with value: 2.5314333558082582 and parameters: {'opt_Epochs': 3, 'opt_Steps': 6, 'opt_Learning_Rate': 0.07321977746852842, 'opt_Batch_Size': 155, 'opt_Number_of_Noise_Batches': 10, 'opt_Regularization_term': 0.2876249360608091, 'opt_Noise_Dim': 455, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:13:11,837] Trial 46 finished with value: 2.8321766793727874 and parameters: {'opt_Epochs': 3, 'opt_Steps': 8, 'opt_Learning_Rate': 0.08406480121761428, 'opt_Batch_Size': 185, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2990682481895076, 'opt_Noise_Dim': 469, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


Trial 46 finished with value: 2.8321766793727874 and parameters: {'opt_Epochs': 3, 'opt_Steps': 8, 'opt_Learning_Rate': 0.08406480121761428, 'opt_Batch_Size': 185, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2990682481895076, 'opt_Noise_Dim': 469, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:20:24,940] Trial 47 finished with value: 2.894526633620262 and parameters: {'opt_Epochs': 6, 'opt_Steps': 7, 'opt_Learning_Rate': 0.05305614282115011, 'opt_Batch_Size': 45, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2680580606048082, 'opt_Noise_Dim': 512, 'n_layers': 7}. Best is trial 17 with value: 2.2544206410646437.


Trial 47 finished with value: 2.894526633620262 and parameters: {'opt_Epochs': 6, 'opt_Steps': 7, 'opt_Learning_Rate': 0.05305614282115011, 'opt_Batch_Size': 45, 'opt_Number_of_Noise_Batches': 9, 'opt_Regularization_term': 0.2680580606048082, 'opt_Noise_Dim': 512, 'n_layers': 7}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:25:29,493] Trial 48 finished with value: 2.6553657263517376 and parameters: {'opt_Epochs': 1, 'opt_Steps': 4, 'opt_Learning_Rate': 0.1468701433498582, 'opt_Batch_Size': 140, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.281348249846726, 'opt_Noise_Dim': 425, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


Trial 48 finished with value: 2.6553657263517376 and parameters: {'opt_Epochs': 1, 'opt_Steps': 4, 'opt_Learning_Rate': 0.1468701433498582, 'opt_Batch_Size': 140, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.281348249846726, 'opt_Noise_Dim': 425, 'n_layers': 4}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:34:05,628] Trial 49 finished with value: 2.4068859398365023 and parameters: {'opt_Epochs': 3, 'opt_Steps': 6, 'opt_Learning_Rate': 0.1782616289648098, 'opt_Batch_Size': 179, 'opt_Number_of_Noise_Batches': 10, 'opt_Regularization_term': 0.0587673532602592, 'opt_Noise_Dim': 414, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


Trial 49 finished with value: 2.4068859398365023 and parameters: {'opt_Epochs': 3, 'opt_Steps': 6, 'opt_Learning_Rate': 0.1782616289648098, 'opt_Batch_Size': 179, 'opt_Number_of_Noise_Batches': 10, 'opt_Regularization_term': 0.0587673532602592, 'opt_Noise_Dim': 414, 'n_layers': 6}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 21:39:57,725] Trial 50 finished with value: 2.4679413437843327 and parameters: {'opt_Epochs': 4, 'opt_Steps': 2, 'opt_Learning_Rate': 0.02697893912930327, 'opt_Batch_Size': 106, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.015293660285069816, 'opt_Noise_Dim': 484, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


Trial 50 finished with value: 2.4679413437843327 and parameters: {'opt_Epochs': 4, 'opt_Steps': 2, 'opt_Learning_Rate': 0.02697893912930327, 'opt_Batch_Size': 106, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.015293660285069816, 'opt_Noise_Dim': 484, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


In [7]:
# Show the hyperparameters of the best trial recorded in the study.
study.best_params

{'opt_Epochs': 1,
 'opt_Steps': 8,
 'opt_Learning_Rate': 0.011837483759601058,
 'opt_Batch_Size': 278,
 'opt_Number_of_Noise_Batches': 1,
 'opt_Regularization_term': 0.27394634876638047,
 'opt_Noise_Dim': 308,
 'n_layers': 2}

In [6]:
# Collect all trials into a DataFrame and keep the ten with the lowest objective value.
trials_df = study.trials_dataframe()
best10_df = trials_df.sort_values(by="value").iloc[:10]
best10_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_n_layers,params_opt_Batch_Size,params_opt_Epochs,params_opt_Learning_Rate,params_opt_Noise_Dim,params_opt_Number_of_Noise_Batches,params_opt_Regularization_term,params_opt_Steps,state
17,17,2.254421,2025-01-17 22:19:20.289658,2025-01-17 22:23:22.093268,0 days 00:04:01.803610,2,278,1,0.011837,308,1,0.273946,8,COMPLETE
5,5,2.305901,2025-01-17 12:31:05.669089,2025-01-17 12:31:37.217590,0 days 00:00:31.548501,3,186,10,0.206408,426,5,0.180352,3,COMPLETE
43,43,2.335732,2025-01-18 20:40:05.484886,2025-01-18 20:47:09.470360,0 days 00:07:03.985474,4,178,3,0.052655,463,7,0.287261,7,COMPLETE
21,21,2.350066,2025-01-17 23:01:46.201408,2025-01-17 23:07:35.174347,0 days 00:05:48.972939,3,234,2,0.016953,146,6,0.25571,8,COMPLETE
44,44,2.363222,2025-01-18 20:47:09.542484,2025-01-18 20:55:31.755392,0 days 00:08:22.212908,6,153,3,0.087065,452,9,0.286848,7,COMPLETE
26,26,2.36977,2025-01-18 08:53:15.729876,2025-01-18 09:01:29.863646,0 days 00:08:14.133770,3,239,2,0.039383,144,7,0.234037,11,COMPLETE
39,39,2.401062,2025-01-18 19:59:49.947392,2025-01-18 20:08:29.672904,0 days 00:08:39.725512,7,203,2,0.05966,223,9,0.209221,10,COMPLETE
49,49,2.406886,2025-01-18 21:25:29.519228,2025-01-18 21:34:05.582655,0 days 00:08:36.063427,6,179,3,0.178262,414,10,0.058767,6,COMPLETE
16,16,2.424112,2025-01-17 22:15:48.429246,2025-01-17 22:19:20.247258,0 days 00:03:31.818012,3,170,1,0.013416,316,2,0.230344,6,COMPLETE
37,37,2.442142,2025-01-18 19:44:13.672352,2025-01-18 19:55:19.560128,0 days 00:11:05.887776,4,233,3,0.061887,96,6,0.259233,13,COMPLETE


In [7]:
# Print the mean of each column over the ten best trials.
# Each entry pairs the output template (trailing spaces kept as before) with
# the DataFrame column it summarises. Using str.format avoids nesting double
# quotes inside a double-quoted f-string, which only parses on Python >= 3.12.
summary_lines = [
    ("Value {}", "value"),
    ("Number of Layers {} ", "params_n_layers"),
    ("Batch Size for Training {} ", "params_opt_Batch_Size"),
    ("Epochs for Noise Training {} ", "params_opt_Epochs"),
    ("LR for Noise Training {} ", "params_opt_Learning_Rate"),
    ("Noise Dim of Generator {}", "params_opt_Noise_Dim"),
    ("Number of Noise Batches Used {} ", "params_opt_Number_of_Noise_Batches"),
    ("Regularization Term {} ", "params_opt_Regularization_term"),
    ("Learning Steps for Noise Training {}", "params_opt_Steps"),
]
for template, column in summary_lines:
    print(template.format(float(best10_df[column].mean())))

Value 2.818575150200299
Number of Layers 4.5 
Batch Size for Training 270.5 
Epochs for Noise Training 6.0 
LR for Noise Training 0.14101033415351125 
Noise Dim of Generator 209.1
Number of Noise Batches Used 5.5 
Regularization Term 0.09653515835859165 
Learning Steps for Noise Training 12.1


^^^ These mean values are used as the default parameters ^^^

___

### Standard Parameters

In [None]:
# n0 = 5000
# n2 = 5000
# batch_size = 128

# standard_model, standard_history = main(
#     t_Epochs = 5,
#     t_Steps= int((n0 + n2)/(2 * batch_size)), # The idea is to have the same number of updates as there are samples to unlearn
#     t_Learning_Rate = 0.1,
#     t_Batch_Size = batch_size,
#     t_Number_of_Noise_Batches = 10,
#     t_Regularization_term = 0.1,
#     t_Layers = [1000],
#     t_Noise_Dim = 100,
#     new_baseline=True,
#     logs=True,
# )

___

In [None]:
from torchvision.models import resnet18
import torch

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load both model collections through the shared helper:
# train_ms from data/all/models, exact_ms from data/retrain/models.
train_ms, exact_ms = (
    load_models_dict(path=models_dir)
    for models_dir in ("data/all/models", "data/retrain/models")
)

In [None]:
from src.metrics import kl_divergence_between_models
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

batch_size = 256
data_dir = os.path.join('data', 'cifar10')

# Per-channel mean and standard deviation passed to Normalize.
channel_mean = (0.4914, 0.4822, 0.4465)
channel_std = (0.2023, 0.1994, 0.2010)
transform_test = tt.Compose([tt.ToTensor(), tt.Normalize(channel_mean, channel_std)])

# Unshuffled loader over the images in the "test" folder.
valid_ds = ImageFolder(os.path.join(data_dir, 'test'), transform_test)
valid_dl = DataLoader(valid_ds, batch_size, shuffle=False)

# First comparison: a model from train_ms against itself.
kl_divergence_between_models(model1=train_ms[0], model2=train_ms[0], data_loader=valid_dl)

In [None]:
# Compare the first model of exact_ms against itself.
kl_divergence_between_models(model1=exact_ms[0], model2=exact_ms[0], data_loader=valid_dl)

In [None]:
# Compare the first model of train_ms (model1) with the first model of exact_ms (model2).
kl_divergence_between_models(model1=train_ms[0], model2=exact_ms[0], data_loader=valid_dl)

In [None]:
# Same pair of models with the argument order swapped (exact_ms as model1, train_ms as model2).
kl_divergence_between_models(model1=exact_ms[0], model2=train_ms[0], data_loader=valid_dl)