<a href="https://colab.research.google.com/github/Jitpanu-Chai/Optuna/blob/main/Optuna_FashionMNIST_plusconvo_batchsize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq optuna

[K     |████████████████████████████████| 308 kB 7.3 MB/s 
[K     |████████████████████████████████| 210 kB 65.2 MB/s 
[K     |████████████████████████████████| 81 kB 11.6 MB/s 
[K     |████████████████████████████████| 78 kB 9.1 MB/s 
[K     |████████████████████████████████| 113 kB 83.3 MB/s 
[K     |████████████████████████████████| 146 kB 90.6 MB/s 
[K     |████████████████████████████████| 49 kB 7.7 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [2]:
import optuna
import os

# **PyTorch**

In [3]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
import math

In [None]:
DEVICE = torch.device("cuda") #torch.device("cpu")
# BATCHSIZE = 128
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
LOG_INTERVAL = 10
# N_TRAIN_EXAMPLES = BATCHSIZE * 30
# N_VALID_EXAMPLES = BATCHSIZE * 10

# **Optuna+Model construct**

In [None]:
m = nn.MaxPool2d((2,2))
# pool of non-square window
# m = nn.MaxPool2d((3, 2), stride=(2, 1))
input = torch.randn(20, 56, 2, 2)
output = m(input)
print(output.shape)

torch.Size([20, 56, 1, 1])


In [None]:
math.floor(( (28 +2*0-1*(3-1)-1  ) /1)+1)

26

In [None]:
def cal_shape_after_convo(in_shape,kernel_size=1,dilation=1,stride=1,padding='same'):  #same padd=0
    if padding != 'same':
        new_shape = math.floor(( (in_shape +2*padding-dilation*(kernel_size-1)-1  ) /stride)+1)
    else:
        new_shape = in_shape
    return new_shape


In [None]:
def define_model(trial):
    #optmize number of layer ,hidden unit drop out
    n_layers = trial.suggest_int("n_layers",1,3)
    n_blocks = trial.suggest_int("n_blocks", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

    layers=[]
    # in_features = 28*28
    in_features = 1 #init channel (fasjon MNIST is 1 due to greyscale)    
    squre_shape = 28

    #convo block
    for i in range(1,n_blocks):
        kernal_select = trial.suggest_int("kernal_units_l{}".format(i),3,7)

        out_features = i*28 # how many times of it shape (28,28)
        layers.append(nn.Conv2d(in_features,out_features,kernel_size=kernal_select,padding='same'))
        squre_shape = cal_shape_after_convo(squre_shape,kernel_size=kernal_select)
       
       
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm2d(out_features))

        in_features = out_features
        layers.append(nn.Conv2d(in_features,out_features,kernel_size=kernal_select,padding='same'))
        squre_shape = cal_shape_after_convo(squre_shape,kernel_size=kernal_select)
       
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm2d(out_features))

        layers.append(nn.MaxPool2d((2,2)))
        squre_shape = math.floor(squre_shape/2)
               
        layers.append(nn.Dropout(0.5))
            
 
    layers.append(nn.Flatten())

    in_features = in_features*squre_shape*squre_shape    

    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i),16,512)
        layers.append(nn.Linear(in_features,out_features))
        layers.append(nn.ReLU())
        p= trial.suggest_float("dropout_l{}".format(i),0.2,0.8)
        layers.append(nn.Dropout(p))
        in_features=out_features

    layers.append(nn.Linear(in_features,CLASSES))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

In [None]:
def get_mnist(BATCHSIZE):
    # Load FashionMNIST dataset.
    transform = transforms.Compose([transforms.ToTensor(),])

    train_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=True, download=True, transform=transform),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=False, transform=transform),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    return train_loader, valid_loader

# **Optuna Area**

In [None]:
def objective(trial):
    #code here
    #create model
    model = define_model(trial).to(DEVICE)
  
    #create optimize
    optimizer_name=trial.suggest_categorical("optimizers",["Adam","RMSprop","SGD"])
    lr= trial.suggest_float("lr",1e-5,1e-1,log=True)
    optimizer= getattr(optim, optimizer_name)(model.parameters(),lr=lr)

    #create batchsize select
    BATCHSIZE = trial.suggest_categorical("n_batch", [32,64,128,256,512,1024])

    N_TRAIN_EXAMPLES = BATCHSIZE * 30
    N_VALID_EXAMPLES = BATCHSIZE * 10

    #get MNIST dataset
    train_loader,valid_loader = get_mnist(BATCHSIZE)

    #train model
    for eporch in range(EPOCHS):
        #train torch code model
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.view(data.size(0), -1,28,28).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break
                data, target = data.view(data.size(0), -1,28,28).to(DEVICE), target.to(DEVICE)
                
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)

        trial.report(accuracy,eporch) #report to optuna to check status its worth or not
        #handing prun algo
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()    
    return accuracy

In [None]:
# if __name__ == 'main':

study = optuna.create_study(direction='maximize') #select own direction or default
study.optimize(objective,n_trials=100) #number of trials #timeout=600

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print('study statistic')
print('number of finished trials: ',len(study.trials))
print('number of proned trials: ',len(pruned_trials))
print('number of completed trials: ',len(complete_trials))

print("Best trials:")
trial =study.best_trial
print('Value', trial.value)

print('Params')
for key,value in trial.params.items():
    print('{}:{}'.format(key,value))


[32m[I 2022-05-02 16:32:48,856][0m A new study created in memory with name: no-name-0157fc21-fb17-4ed4-bb46-b62ae0eedf03[0m
[32m[I 2022-05-02 16:32:59,359][0m Trial 0 finished with value: 0.671875 and parameters: {'n_layers': 3, 'n_blocks': 2, 'weight_decay': 5.0613861051247375e-09, 'kernal_units_l1': 3, 'n_units_l0': 285, 'dropout_l0': 0.7523013863712733, 'n_units_l1': 244, 'dropout_l1': 0.44611666708747577, 'n_units_l2': 152, 'dropout_l2': 0.3253678146645078, 'optimizers': 'RMSprop', 'lr': 1.3281683011527026e-05, 'n_batch': 256}. Best is trial 0 with value: 0.671875.[0m
[32m[I 2022-05-02 16:33:02,945][0m Trial 1 finished with value: 0.728125 and parameters: {'n_layers': 1, 'n_blocks': 3, 'weight_decay': 7.421792212677782e-05, 'kernal_units_l1': 3, 'kernal_units_l2': 7, 'n_units_l0': 168, 'dropout_l0': 0.6956880436641028, 'optimizers': 'SGD', 'lr': 0.0023737527008802217, 'n_batch': 64}. Best is trial 1 with value: 0.728125.[0m
  self.padding, self.dilation, self.groups)
[32m

study statistic
number of finished trials:  100
number of proned trials:  57
number of completed trials:  43
Best trials:
Value 0.9189
Params
n_layers:2
n_blocks:2
weight_decay:4.7274119135084036e-05
kernal_units_l1:3
n_units_l0:362
dropout_l0:0.4181671524652524
n_units_l1:262
dropout_l1:0.3221445951766942
optimizers:Adam
lr:0.0007623074661478627
n_batch:1024


# **Tensorflow**

In [4]:
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
import numpy as np

In [23]:
# N_TRAIN_EXAMPLES = 3000
# N_VALID_EXAMPLES = 1000
# BATCHSIZE = 128
CLASSES = 10
EPOCHS = 10

In [24]:
def create_model(trial):
    # We optimize the numbers of layers, their units and weight decay parameter.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    n_blocks = trial.suggest_int("n_blocks", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

    num_filters2=28
    drop_dense2=0.5
    drop_conv2=0
    num_classes = 10
    ac2='relu'
    reg2=None

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Input((28, 28, 3)))
    for i in range(1,n_blocks):
        num_kernal = trial.suggest_int("kernal_units_l{}".format(i), 3, 7)

        model.add(tf.keras.layers.Conv2D(i*num_filters2, num_kernal, activation=ac2, kernel_regularizer=reg2,padding='same'))
        model.add(tf.keras.layers.BatchNormalization(axis=-1))
        model.add(tf.keras.layers.Conv2D(i*num_filters2, num_kernal, activation=ac2,kernel_regularizer=reg2,padding='same'))
        model.add(tf.keras.layers.BatchNormalization(axis=-1))
        model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))   # reduces to 16x16x3xnum_filters
        model.add(tf.keras.layers.Dropout(drop_conv2))

    model.add(tf.keras.layers.Flatten())
    for i in range(n_layers):
        num_hidden = trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True)
        model.add(
            tf.keras.layers.Dense(
                num_hidden,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
            )
        )
    model.add(
        tf.keras.layers.Dense(CLASSES, kernel_regularizer=tf.keras.regularizers.l2(weight_decay))
    )
    return model

In [25]:
def create_optimizer(trial):
    # We optimize the choice of optimizers as well as their parameters.
    kwargs = {}
    optimizer_options = ["RMSprop", "Adam", "SGD"]
    optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options)
    if optimizer_selected == "RMSprop":
        kwargs["learning_rate"] = trial.suggest_float(
            "rmsprop_learning_rate", 1e-5, 1e-1, log=True
        )
        kwargs["decay"] = trial.suggest_float("rmsprop_decay", 0.85, 0.99)
        kwargs["momentum"] = trial.suggest_float("rmsprop_momentum", 1e-5, 1e-1, log=True)
    elif optimizer_selected == "Adam":
        kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True)
    elif optimizer_selected == "SGD":
        kwargs["learning_rate"] = trial.suggest_float(
            "sgd_opt_learning_rate", 1e-5, 1e-1, log=True
        )
        kwargs["momentum"] = trial.suggest_float("sgd_opt_momentum", 1e-5, 1e-1, log=True)

    optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)
    return optimizer


In [26]:
def learn(model, optimizer, dataset, mode="eval"):
    accuracy = tf.metrics.Accuracy("accuracy", dtype=tf.float32)

    for batch, (images, labels) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(images, training=(mode == "train"))
            loss_value = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            )
            if mode == "eval":
                accuracy(
                    tf.argmax(logits, axis=1, output_type=tf.int64), tf.cast(labels, tf.int64)
                )
            else:
                #unconnected_gradients=tf.UnconnectedGradients.ZERO
                grads = tape.gradient(loss_value, model.trainable_variables)  # use model.variables if no bacthnormalize and not need to use unconnected_gradients
                optimizer.apply_gradients(zip(grads, model.trainable_variables)) 

    if mode == "eval":        
        return accuracy

In [27]:
def get_fashion_mnist(BATCHSIZE,N_TRAIN_EXAMPLES,N_VALID_EXAMPLES):
    (x_train, y_train), (x_valid, y_valid) = fashion_mnist.load_data()   

    x_train = x_train.astype("float32") / 255
    x_valid = x_valid.astype("float32") / 255

    y_train = y_train.astype("int32")
    y_valid = y_valid.astype("int32")

    #if use conv2d
    x_train = tf.expand_dims(x_train,axis=-1)
    x_valid = tf.expand_dims(x_valid,axis=-1)
    x_train  = tf.image.grayscale_to_rgb(x_train)
    x_valid  = tf.image.grayscale_to_rgb(x_valid)
    ##

    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_ds = train_ds.shuffle(60000).batch(BATCHSIZE).take(N_TRAIN_EXAMPLES)

    valid_ds = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    valid_ds = valid_ds.shuffle(10000).batch(BATCHSIZE).take(N_VALID_EXAMPLES)
    return train_ds, valid_ds

In [28]:
def objective(trial):
    #select batch size
    BATCHSIZE = trial.suggest_categorical("n_batch", [32,64,128,256,512,1024])

    N_TRAIN_EXAMPLES = BATCHSIZE * 30
    N_VALID_EXAMPLES = BATCHSIZE * 10

    # Get MNIST data.
    train_ds, valid_ds = get_fashion_mnist(BATCHSIZE,N_TRAIN_EXAMPLES,N_VALID_EXAMPLES)

    # Build model and optimizer.
    model = create_model(trial)
    optimizer = create_optimizer(trial)

    # Training and validating cycle.
    with tf.device("/GPU:0"):
        for eporch in range(EPOCHS):
            learn(model, optimizer, train_ds, "train")

            accuracy = learn(model, optimizer, valid_ds, "eval")

            trial.report(accuracy.result(),eporch) #report to optuna to check status its worth or not
            #handing prun algo
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()      

        # Return last validation accuracy.
        return accuracy.result()

# **Optuna Area**

In [29]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print('study statistic')
print('number of finished trials: ',len(study.trials))
print('number of proned trials: ',len(pruned_trials))
print('number of completed trials: ',len(complete_trials))

print("Best trials:")
trial =study.best_trial
print('Value', trial.value)

print('Params')
for key,value in trial.params.items():
    print('{}:{}'.format(key,value))

[32m[I 2022-05-04 13:46:29,441][0m A new study created in memory with name: no-name-4750f701-5999-4a2a-a728-0b17e78bb613[0m
[32m[I 2022-05-04 13:49:33,957][0m Trial 0 finished with value: 0.6317999958992004 and parameters: {'n_batch': 64, 'n_layers': 2, 'n_blocks': 3, 'weight_decay': 1.77174502023076e-07, 'kernal_units_l1': 6, 'kernal_units_l2': 3, 'n_units_l0': 37, 'n_units_l1': 37, 'optimizer': 'RMSprop', 'rmsprop_learning_rate': 0.00017699066735915226, 'rmsprop_decay': 0.8932610541169589, 'rmsprop_momentum': 0.006298463926655138}. Best is trial 0 with value: 0.6317999958992004.[0m
[32m[I 2022-05-04 13:52:01,814][0m Trial 1 finished with value: 0.9049000144004822 and parameters: {'n_batch': 32, 'n_layers': 3, 'n_blocks': 2, 'weight_decay': 5.6840367940709237e-05, 'kernal_units_l1': 3, 'n_units_l0': 28, 'n_units_l1': 23, 'n_units_l2': 58, 'optimizer': 'Adam', 'adam_learning_rate': 0.0002193370761202871}. Best is trial 1 with value: 0.9049000144004822.[0m
[32m[I 2022-05-04 13

study statistic
number of finished trials:  100
number of proned trials:  70
number of completed trials:  30
Best trials:
Value 0.9222999811172485
Params
n_batch:128
n_layers:1
n_blocks:5
weight_decay:2.429345622809195e-10
kernal_units_l1:4
kernal_units_l2:5
kernal_units_l3:7
kernal_units_l4:6
n_units_l0:6
optimizer:SGD
sgd_opt_learning_rate:0.09279727203690014
sgd_opt_momentum:0.09007219780292347
