# Hyperparameter Finetuning

We want to find the right parameters for the Generator Network.

By using the best values for the following parameters:

- Number of Epochs (meaning `num_epochs` and `num_steps`)
- Learning Rate
- Batch Size
- Number of Noise Batches
- Number of Layers
- Regularization term
- Number of Neurons for each Network

In [1]:
from src.fyemu_tunable import main, evaluate
import torch
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import optuna
import random
from typing import Dict
from torchvision.models import resnet18

from src.metrics import kl_divergence_between_models

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE

'cpu'

In [2]:
def load_models_dict(path: str="data/new/models") -> Dict[str, torch.nn.Module]:
    de = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = resnet18(num_classes = 10).to(de)
    
    # load all the models
    md = {}
    for list in os.listdir(path):
        
        model.load_state_dict(torch.load(f=os.path.join(path, list), map_location=DEVICE, weights_only=True))
        model.eval()
        md[len(md)] = model

Check out this little tutorial, to see how we handle the optimization using save states:

https://optuna.readthedocs.io/en/stable/tutorial/20_recipes/001_rdb.html

In [3]:
import logging
import sys
import pickle
import optuna

# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "GeneratorOpti2"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)

if os.path.exists("sampler2.pkl"):
    restored_sampler = pickle.load(open("sampler2.pkl", "rb"))
    study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True, sampler=restored_sampler)
else:
    study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True)

[I 2025-01-17 21:14:42,912] Using an existing study with name 'GeneratorOpti2' instead of creating a new one.


Using an existing study with name 'GeneratorOpti2' instead of creating a new one.


In [10]:
def objective(trial):

    opt_Epochs = trial.suggest_int('opt_Epochs', 1, 10)
    opt_Steps = trial.suggest_int('opt_Steps', 1, 20)
    opt_Learning_Rate = trial.suggest_float('opt_Learning_Rate', 0.01, 0.3)
    opt_Batch_Size = trial.suggest_int('opt_Batch_Size', 32, 512)
    opt_Number_of_Noise_Batches = trial.suggest_int('opt_Number_of_Noise_Batches', 1, 10)
    opt_Regularization_term = trial.suggest_float('opt_Regularization_term', 0.01, 0.3)
    opt_Noise_Dim = trial.suggest_int('opt_Noise_Dim', 1, 512)

    # print(f"Epochs: {opt_Epochs} |\nSteps: {opt_Steps} |\nLearning Rate: {opt_Learning_Rate} |\nBatch Size: {opt_Batch_Size} |\nNoise Batches: {opt_Number_of_Noise_Batches} |\nRegularization Term: {opt_Regularization_term} |\nNoise Dim: {opt_Noise_Dim}")
    n_layers = trial.suggest_int('n_layers', 1, 8)

    Layers = [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,]
    Layers = Layers[:n_layers]
    # print("Layers: ", Layers)

    mod = main(
        t_Epochs = opt_Epochs,
        t_Steps= opt_Steps,
        t_Learning_Rate = opt_Learning_Rate,
        t_Batch_Size = opt_Batch_Size,
        t_Number_of_Noise_Batches = opt_Number_of_Noise_Batches,
        t_Regularization_term = opt_Regularization_term,
        t_Layers = Layers,
        t_Noise_Dim = opt_Noise_Dim,
        new_baseline=False,
        logs=False,
        model_eval_logs=False,
    )
    
    data_dir = f'data{os.sep}cifar10'

    transform_test = tt.Compose([
        tt.ToTensor(),
        tt.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    valid_ds = ImageFolder(data_dir+f'{os.sep}test', transform_test)
    valid_dl = DataLoader(valid_ds, 256,)

    exact = resnet18(num_classes = 10)
    n = random.randint(0, len(os.listdir("data/retrain/models"))-1)
    exact.load_state_dict(torch.load(f"data/retrain/models/ResNET18_CIFAR10_RETRAIN_CLASSES_{n}.pt", map_location=DEVICE, weights_only=True))
    div = kl_divergence_between_models(
        model1 = exact,
        model2 = mod,
        data_loader = valid_dl,
    )

    return div

study.optimize(objective, n_trials=5)

import pickle

# Save the sampler with pickle to be loaded later.
with open("sampler2.pkl", "wb") as fout:
    pickle.dump(study.sampler, fout)

[I 2025-01-18 10:07:25,595] Trial 31 finished with value: 2.5626204639673227 and parameters: {'opt_Epochs': 4, 'opt_Steps': 17, 'opt_Learning_Rate': 0.13113464986409595, 'opt_Batch_Size': 330, 'opt_Number_of_Noise_Batches': 2, 'opt_Regularization_term': 0.27166762667600725, 'opt_Noise_Dim': 58, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


Trial 31 finished with value: 2.5626204639673227 and parameters: {'opt_Epochs': 4, 'opt_Steps': 17, 'opt_Learning_Rate': 0.13113464986409595, 'opt_Batch_Size': 330, 'opt_Number_of_Noise_Batches': 2, 'opt_Regularization_term': 0.27166762667600725, 'opt_Noise_Dim': 58, 'n_layers': 5}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 10:14:12,074] Trial 32 finished with value: 2.5201350718736646 and parameters: {'opt_Epochs': 1, 'opt_Steps': 11, 'opt_Learning_Rate': 0.014085645404016113, 'opt_Batch_Size': 266, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.1832003102316107, 'opt_Noise_Dim': 331, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


Trial 32 finished with value: 2.5201350718736646 and parameters: {'opt_Epochs': 1, 'opt_Steps': 11, 'opt_Learning_Rate': 0.014085645404016113, 'opt_Batch_Size': 266, 'opt_Number_of_Noise_Batches': 8, 'opt_Regularization_term': 0.1832003102316107, 'opt_Noise_Dim': 331, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 10:17:50,254] Trial 33 finished with value: 2.996022218465806 and parameters: {'opt_Epochs': 3, 'opt_Steps': 1, 'opt_Learning_Rate': 0.17235797723470592, 'opt_Batch_Size': 90, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.13947061703175456, 'opt_Noise_Dim': 240, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


Trial 33 finished with value: 2.996022218465806 and parameters: {'opt_Epochs': 3, 'opt_Steps': 1, 'opt_Learning_Rate': 0.17235797723470592, 'opt_Batch_Size': 90, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.13947061703175456, 'opt_Noise_Dim': 240, 'n_layers': 2}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 10:53:13,385] Trial 34 finished with value: 2.5245006024837497 and parameters: {'opt_Epochs': 9, 'opt_Steps': 12, 'opt_Learning_Rate': 0.11262137988854005, 'opt_Batch_Size': 399, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.10888334390174914, 'opt_Noise_Dim': 281, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


Trial 34 finished with value: 2.5245006024837497 and parameters: {'opt_Epochs': 9, 'opt_Steps': 12, 'opt_Learning_Rate': 0.11262137988854005, 'opt_Batch_Size': 399, 'opt_Number_of_Noise_Batches': 4, 'opt_Regularization_term': 0.10888334390174914, 'opt_Noise_Dim': 281, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


[I 2025-01-18 11:05:49,320] Trial 35 finished with value: 2.5003083825111387 and parameters: {'opt_Epochs': 2, 'opt_Steps': 11, 'opt_Learning_Rate': 0.04231281786048584, 'opt_Batch_Size': 511, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.2428903795536915, 'opt_Noise_Dim': 143, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


Trial 35 finished with value: 2.5003083825111387 and parameters: {'opt_Epochs': 2, 'opt_Steps': 11, 'opt_Learning_Rate': 0.04231281786048584, 'opt_Batch_Size': 511, 'opt_Number_of_Noise_Batches': 7, 'opt_Regularization_term': 0.2428903795536915, 'opt_Noise_Dim': 143, 'n_layers': 3}. Best is trial 17 with value: 2.2544206410646437.


In [5]:
study.best_params

{'opt_Epochs': 10,
 'opt_Steps': 3,
 'opt_Learning_Rate': 0.20640839953786477,
 'opt_Batch_Size': 186,
 'opt_Number_of_Noise_Batches': 5,
 'opt_Regularization_term': 0.18035246104166439,
 'opt_Noise_Dim': 426,
 'n_layers': 3}

In [6]:
trials_df = study.trials_dataframe()
best10_df = trials_df.sort_values("value").head(10)
best10_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_n_layers,params_opt_Batch_Size,params_opt_Epochs,params_opt_Learning_Rate,params_opt_Noise_Dim,params_opt_Number_of_Noise_Batches,params_opt_Regularization_term,params_opt_Steps,state
5,5,2.305901,2025-01-17 12:31:05.669089,2025-01-17 12:31:37.217590,0 days 00:00:31.548501,3,186,10,0.206408,426,5,0.180352,3,COMPLETE
2,2,2.70253,2025-01-17 12:10:48.627085,2025-01-17 12:12:47.792270,0 days 00:01:59.165185,6,423,3,0.093255,382,4,0.0431,13,COMPLETE
1,1,2.749585,2025-01-17 12:09:47.814136,2025-01-17 12:10:48.611890,0 days 00:01:00.797754,3,64,9,0.167193,225,7,0.108306,12,COMPLETE
10,10,2.812649,2025-01-17 21:14:43.036284,2025-01-17 21:42:59.640412,0 days 00:28:16.604128,4,179,6,0.158649,52,6,0.031941,20,COMPLETE
3,3,2.958736,2025-01-17 12:15:02.580302,2025-01-17 12:15:42.109564,0 days 00:00:39.529262,2,450,3,0.048239,197,4,0.086577,1,COMPLETE
4,4,3.031566,2025-01-17 12:15:42.126696,2025-01-17 12:21:24.702130,0 days 00:05:42.575434,8,471,6,0.29775,8,7,0.207947,19,COMPLETE
6,6,3.16906,2025-01-17 12:31:37.233993,2025-01-17 12:33:19.228269,0 days 00:01:41.994276,4,458,9,0.096268,448,9,0.076254,10,COMPLETE
0,0,,2025-01-17 12:08:55.005249,2025-01-17 12:09:16.458867,0 days 00:00:21.453618,7,116,2,0.025044,249,1,0.166994,3,FAIL
7,7,,2025-01-17 20:57:22.915080,2025-01-17 20:57:24.298970,0 days 00:00:01.383890,4,179,6,0.158649,52,6,0.031941,20,FAIL
8,8,,2025-01-17 21:11:11.779227,2025-01-17 21:11:13.149912,0 days 00:00:01.370685,4,179,6,0.158649,52,6,0.031941,20,FAIL


In [7]:
print(f"Value {float(best10_df["value"].mean())}")
print(f"Number of Layers {float(best10_df["params_n_layers"].mean())} ")
print(f"Batch Size for Training {float(best10_df["params_opt_Batch_Size"].mean())} ")
print(f"Epochs for Noise Training {float(best10_df["params_opt_Epochs"].mean())} ")
print(f"LR for Noise Training {float(best10_df["params_opt_Learning_Rate"].mean())} ")
print(f"Noise Dim of Generator {float(best10_df["params_opt_Noise_Dim"].mean())}")
print(f"Number of Noise Batches Used {float(best10_df["params_opt_Number_of_Noise_Batches"].mean())} ")
print(f"Regularization Term {float(best10_df["params_opt_Regularization_term"].mean())} ")
print(f"Learning Steps for Noise Training {float(best10_df["params_opt_Steps"].mean())}")

Value 2.818575150200299
Number of Layers 4.5 
Batch Size for Training 270.5 
Epochs for Noise Training 6.0 
LR for Noise Training 0.14101033415351125 
Noise Dim of Generator 209.1
Number of Noise Batches Used 5.5 
Regularization Term 0.09653515835859165 
Learning Steps for Noise Training 12.1


^^^This will represent the values used as default^^^

___

### Standard Parameters

In [None]:
# n0 = 5000
# n2 = 5000
# batch_size = 128

# standard_model, standard_history = main(
#     t_Epochs = 5,
#     t_Steps= int((n0 + n2)/(2 * batch_size)), # The Idea is to have the same amount of updates as their are samples to unlearn
#     t_Learning_Rate = 0.1,
#     t_Batch_Size = batch_size,
#     t_Number_of_Noise_Batches = 10,
#     t_Regularization_term = 0.1,
#     t_Layers = [1000],
#     t_Noise_Dim = 100,
#     new_baseline=True,
#     logs=True,
# )

___

In [None]:
from torchvision.models import resnet18
import torch

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

train_ms = load_models_dict(path="data/all/models")

exact_ms = load_models_dict(path="data/retrain/models")

In [None]:
from src.metrics import kl_divergence_between_models
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

batch_size = 256
data_dir = f'data{os.sep}cifar10'

transform_test = tt.Compose([
    tt.ToTensor(),
    tt.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

valid_ds = ImageFolder(data_dir+f'{os.sep}test', transform_test)
valid_dl = DataLoader(valid_ds, batch_size, shuffle=False)

kl_divergence_between_models(model1 = train_ms[0], model2 = train_ms[0], data_loader = valid_dl)

In [None]:
kl_divergence_between_models(model1=exact_ms[0], model2=exact_ms[0], data_loader=valid_dl)

In [None]:
kl_divergence_between_models(model1=train_ms[0], model2=exact_ms[0], data_loader=valid_dl)

In [None]:
kl_divergence_between_models(model1=exact_ms[0], model2=train_ms[0], data_loader=valid_dl)