# Hyperparameter Finetuning

We want to find the best hyperparameters for the Generator Network.

The search covers the following parameters:

- Number of Epochs (meaning `num_epochs` and `num_steps`)
- Learning Rate
- Batch Size
- Number of Noise Batches
- Number of Layers
- Regularization term
- Number of Neurons for each Network

In [None]:
from src.fyemu_tunable import main, evaluate
import torch
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import optuna
from torchvision.models import resnet18

from src.metrics import kl_divergence_between_models

# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
DEVICE

See this short tutorial for how the optimization is persisted to a database and resumed from a saved state:

https://optuna.readthedocs.io/en/stable/tutorial/20_recipes/001_rdb.html

In [None]:
import logging
import sys
import pickle
import optuna

# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "GeneratorOpti"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)

# Restore the sampler pickled by a previous session, if there is one, so the
# search continues from the same sampler state. The file is opened in a `with`
# block so the handle is closed again right after loading.
# NOTE: unpickling executes arbitrary code -- only load a `sampler.pkl` that
# this notebook wrote itself.
restored_sampler = None
if os.path.exists("sampler.pkl"):
    with open("sampler.pkl", "rb") as fin:
        restored_sampler = pickle.load(fin)

# `sampler=None` makes Optuna fall back to its default sampler, which is what
# omitting the argument does.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
    sampler=restored_sampler,
)

In [None]:
def objective(trial):
    """Optuna objective: run ``main`` with sampled hyperparameters and score the result.

    Samples the training hyperparameters and the generator layer widths from
    ``trial``, passes them to ``main``, and compares what ``main`` returns with
    the ``ResNET18_CIFAR10_RETAIN_CLASSES.pt`` reference model on the CIFAR-10
    ``test`` image folder.

    Args:
        trial: The Optuna trial used to sample the parameter values.

    Returns:
        The value computed by ``kl_divergence_between_models`` with the model
        from ``main`` as ``model1`` and the reference model as ``model2``.
    """
    opt_Epochs = trial.suggest_int('opt_Epochs', 1, 10)
    opt_Steps = trial.suggest_int('opt_Steps', 1, 20)
    opt_Learning_Rate = trial.suggest_float('opt_Learning_Rate', 0.01, 0.3)
    # The batch size is currently fixed rather than searched.
    opt_Batch_Size = 256 # trial.suggest_int('opt_Batch_Size', 32, 512)
    opt_Number_of_Noise_Batches = trial.suggest_int('opt_Number_of_Noise_Batches', 1, 10)
    opt_Regularization_term = trial.suggest_float('opt_Regularization_term', 0.01, 0.3)
    opt_Noise_Dim = trial.suggest_int('opt_Noise_Dim', 1, 512)

    # All seven widths (l1..l7) are sampled first and `n_layers` afterwards, so
    # the parameter names and the sampling order stay the same as in trials
    # already stored in the study. Only the first `n_layers` widths are used.
    layer_widths = [trial.suggest_int(f'l{i}', 32, 1024) for i in range(1, 8)]
    n_layers = trial.suggest_int('n_layers', 1, 7)
    Layers = layer_widths[:n_layers]

    # NOTE(review): a commented-out cell elsewhere in this notebook unpacks
    # `main(...)` into (model, history); confirm that `main` returns a bare
    # model when called with these flags.
    mod = main(
        t_Epochs = opt_Epochs,
        t_Steps= opt_Steps,
        t_Learning_Rate = opt_Learning_Rate,
        t_Batch_Size = opt_Batch_Size,
        t_Number_of_Noise_Batches = opt_Number_of_Noise_Batches,
        t_Regularization_term = opt_Regularization_term,
        t_Layers = Layers,
        t_Noise_Dim = opt_Noise_Dim,
        new_baseline=False,
        logs=False,
        model_eval_logs=True,
    )

    data_dir = f'data{os.sep}cifar10'

    transform_test = tt.Compose([
        tt.ToTensor(),
        tt.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    valid_ds = ImageFolder(data_dir+f'{os.sep}test', transform_test)
    valid_dl = DataLoader(valid_ds, 256,)

    exact = resnet18(num_classes = 10)
    # `map_location="cpu"` matches the freshly built (CPU) model and lets a
    # checkpoint that was saved on a GPU load on a CPU-only machine as well.
    exact.load_state_dict(
        torch.load(
            "ResNET18_CIFAR10_RETAIN_CLASSES.pt",
            map_location="cpu",
            weights_only=True,
        )
    )
    # A newly constructed module starts in training mode; switch to eval mode
    # so BatchNorm uses its stored statistics during the comparison.
    # NOTE(review): harmless if `kl_divergence_between_models` already does this.
    exact.eval()

    # NOTE(review): `exact` is left on the CPU as before; device placement is
    # presumably handled inside `kl_divergence_between_models` -- confirm.
    div = kl_divergence_between_models(
        model1 = mod,
        model2 = exact,
        data_loader = valid_dl,
    )

    return div

import pickle

# Run the search. The sampler is written out in `finally` so that its state is
# also kept when the run is interrupted or a trial raises; otherwise the next
# session could not restore it from `sampler.pkl`.
try:
    study.optimize(objective, n_trials=50)
finally:
    # Save the sampler with pickle to be loaded later.
    with open("sampler.pkl", "wb") as fout:
        pickle.dump(study.sampler, fout)

In [5]:
# Show the parameter values of the study's best trial.
# NOTE(review): the saved output lists `opt_Batch_Size`, which the current
# `objective` no longer samples (it is fixed to 256), so that output
# predates the change.
study.best_params

{'opt_Epochs': 7,
 'opt_Steps': 5,
 'opt_Learning_Rate': 0.1554530512278481,
 'opt_Batch_Size': 273,
 'opt_Number_of_Noise_Batches': 3,
 'opt_Regularization_term': 0.20414346598354305,
 'opt_Noise_Dim': 229,
 'l1': 672,
 'l2': 408,
 'l3': 266,
 'l4': 129,
 'l5': 372,
 'l6': 443,
 'l7': 528,
 'n_layers': 3}

In [23]:
# Collect every recorded trial in a DataFrame, order the rows by objective
# value (ascending) and keep the first ten.
trials_df = study.trials_dataframe()
best10_df = trials_df.sort_values(by="value", ascending=True).iloc[:10]
best10_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_l1,params_l2,params_l3,params_l4,params_l5,...,params_l7,params_n_layers,params_opt_Batch_Size,params_opt_Epochs,params_opt_Learning_Rate,params_opt_Noise_Dim,params_opt_Number_of_Noise_Batches,params_opt_Regularization_term,params_opt_Steps,state
275,275,0.768493,2025-01-14 10:31:35.793537,2025-01-14 10:33:28.905537,0 days 00:01:53.112000,672,408,266,129,372,...,528,3,273.0,7,0.155453,229,3,0.204143,5,COMPLETE
135,135,0.778053,2025-01-13 09:19:32.307524,2025-01-13 09:21:30.359889,0 days 00:01:58.052365,534,910,538,155,958,...,535,5,201.0,8,0.036812,479,8,0.147283,5,COMPLETE
570,570,0.778113,2025-01-16 10:05:56.810646,2025-01-16 10:08:11.953031,0 days 00:02:15.142385,368,336,439,781,429,...,49,1,,9,0.165617,336,8,0.061542,19,COMPLETE
261,261,0.778298,2025-01-14 10:03:38.835062,2025-01-14 10:05:32.148857,0 days 00:01:53.313795,692,314,238,210,598,...,517,3,298.0,7,0.158513,76,8,0.212575,3,COMPLETE
371,371,0.778931,2025-01-14 13:31:40.866109,2025-01-14 13:33:37.786496,0 days 00:01:56.920387,349,63,327,157,360,...,228,7,269.0,9,0.036099,427,2,0.204727,3,COMPLETE
204,204,0.779018,2025-01-13 11:32:50.460912,2025-01-13 11:34:42.504825,0 days 00:01:52.043913,475,270,208,137,625,...,204,3,227.0,8,0.013843,331,10,0.246037,3,COMPLETE
91,91,0.779788,2025-01-09 11:17:04.540813,2025-01-09 11:17:34.150099,0 days 00:00:29.609286,312,358,718,149,886,...,595,5,189.0,6,0.027585,133,7,0.257624,3,COMPLETE
395,395,0.780311,2025-01-14 14:16:19.846509,2025-01-14 14:18:16.838864,0 days 00:01:56.992355,677,59,243,106,613,...,161,3,272.0,9,0.042272,263,6,0.189973,4,COMPLETE
500,500,0.783781,2025-01-15 16:13:45.013982,2025-01-15 16:16:27.170467,0 days 00:02:42.156485,700,797,312,172,594,...,496,4,,8,0.142858,502,10,0.215919,19,COMPLETE
526,526,0.784351,2025-01-15 17:14:41.868992,2025-01-15 17:17:16.566668,0 days 00:02:34.697676,485,344,312,143,685,...,789,4,,8,0.039318,399,6,0.082118,16,COMPLETE


In [38]:
# Print the mean of each column over the ten best trials.
# Each entry is (output template, DataFrame column). The templates keep the
# exact text and spacing of the printed lines. Formatting through
# `str.format` avoids nesting double quotes inside a double-quoted f-string,
# which is a SyntaxError on Python versions before 3.12.
summary_rows = [
    ("Value {}", "value"),
    ("L1 {} ", "params_l1"),
    ("L2 {} ", "params_l2"),
    ("L3 {} ", "params_l3"),
    ("L4 {} ", "params_l4"),
    ("L5 {} ", "params_l5"),
    ("L6 {} ", "params_l6"),
    ("L7 {} ", "params_l7"),
    ("Number of Layers {} ", "params_n_layers"),
    ("Batch Size for Training {} ", "params_opt_Batch_Size"),
    ("Epochs for Noise Training {} ", "params_opt_Epochs"),
    ("LR for Noise Training {} ", "params_opt_Learning_Rate"),
    ("Noise Dim of Generator {}", "params_opt_Noise_Dim"),
    ("Number of Noise Batches Used {} ", "params_opt_Number_of_Noise_Batches"),
    ("Regularization Term {} ", "params_opt_Regularization_term"),
    ("Learning Steps for Noise Training {}", "params_opt_Steps"),
]
for template, column in summary_rows:
    print(template.format(float(best10_df[column].mean())))

Value 0.7789136573299765
L1 526.4 
L2 385.9 
L3 360.1 
L4 213.9 
L5 612.0 
L6 377.2 
L7 410.2 
Number of Layers 3.8 
Batch Size for Training 247.0 
Epochs for Noise Training 7.9 
LR for Noise Training 0.0818370551051665 
Noise Dim of Generator 317.5
Number of Noise Batches Used 6.8 
Regularization Term 0.18219432382942993 
Learning Steps for Noise Training 8.0


^^^ These averages over the ten best trials are the values used as defaults. ^^^

___

### Standard Parameters

In [None]:
# n0 = 5000
# n2 = 5000
# batch_size = 128

# standard_model, standard_history = main(
#     t_Epochs = 5,
#     t_Steps= int((n0 + n2)/(2 * batch_size)), # The idea is to have the same number of updates as there are samples to unlearn
#     t_Learning_Rate = 0.1,
#     t_Batch_Size = batch_size,
#     t_Number_of_Noise_Batches = 10,
#     t_Regularization_term = 0.1,
#     t_Layers = [1000],
#     t_Noise_Dim = 100,
#     new_baseline=True,
#     logs=True,
# )

___

In [None]:
from torchvision.models import resnet18
import torch

# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Both networks are ResNet-18 models with 10 output classes, restored from
# local checkpoints.
# - `map_location=DEVICE` lets a checkpoint saved on a GPU load on a CPU-only
#   machine and places the tensors on the device the model lives on.
# - `.eval()` (which returns the module itself) switches the freshly built
#   model from training mode to eval mode, so BatchNorm uses its stored
#   statistics when the models are compared.
#   NOTE(review): harmless if `kl_divergence_between_models` already does this.
train = resnet18(num_classes = 10).to(DEVICE).eval()
train.load_state_dict(
    torch.load("ResNET18_CIFAR10_ALL_CLASSES.pt", map_location=DEVICE, weights_only=True)
)

exact = resnet18(num_classes = 10).to(DEVICE).eval()
exact.load_state_dict(
    torch.load("ResNET18_CIFAR10_RETAIN_CLASSES.pt", map_location=DEVICE, weights_only=True)
)

In [None]:
from src.metrics import kl_divergence_between_models
import os
import torchvision.transforms as tt
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

batch_size = 256
data_dir = os.path.join('data', 'cifar10')

# Per-channel mean / standard deviation handed to `Normalize`.
# NOTE(review): these look like the commonly used CIFAR-10 statistics -- confirm.
channel_mean = (0.4914, 0.4822, 0.4465)
channel_std = (0.2023, 0.1994, 0.2010)
transform_test = tt.Compose([tt.ToTensor(), tt.Normalize(channel_mean, channel_std)])

# Images are read from the `test` sub-folder, in a fixed (unshuffled) order.
valid_ds = ImageFolder(os.path.join(data_dir, 'test'), transform=transform_test)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

# Compare the `train` model with itself.
kl_divergence_between_models(model1=train, model2=train, data_loader=valid_dl)

In [None]:
# Compare the `exact` model with itself.
# NOTE(review): presumably a sanity check -- the divergence of a model from
# itself should be zero.
kl_divergence_between_models(model1=exact, model2=exact, data_loader=valid_dl)

In [None]:
# Divergence with `train` as model1 and `exact` as model2.
# NOTE(review): KL divergence is not symmetric, so this order and the reverse
# one are presumably both evaluated on purpose.
kl_divergence_between_models(model1=train, model2=exact, data_loader=valid_dl)

In [None]:
# Same comparison with the roles swapped: `exact` as model1, `train` as model2.
kl_divergence_between_models(model1=exact, model2=train, data_loader=valid_dl)