<a href="https://colab.research.google.com/github/Jitpanu-Chai/Optuna/blob/main/Optuna_FashionMNIST_plusconvo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq optuna

[K     |████████████████████████████████| 308 kB 4.3 MB/s 
[K     |████████████████████████████████| 81 kB 10.9 MB/s 
[K     |████████████████████████████████| 210 kB 55.7 MB/s 
[K     |████████████████████████████████| 78 kB 9.3 MB/s 
[K     |████████████████████████████████| 49 kB 8.1 MB/s 
[K     |████████████████████████████████| 146 kB 53.9 MB/s 
[K     |████████████████████████████████| 113 kB 58.9 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [2]:
import optuna
import os

# **PyTorch**

In [41]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
import math

In [17]:
DEVICE = torch.device("cuda") #torch.device("cpu")
BATCHSIZE = 128
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
LOG_INTERVAL = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10

# **Optuna+Model construct**

In [197]:
m = nn.MaxPool2d((2,2))
# pool of non-square window
# m = nn.MaxPool2d((3, 2), stride=(2, 1))
input = torch.randn(20, 56, 2, 2)
output = m(input)
print(output.shape)

torch.Size([20, 56, 1, 1])


In [66]:
math.floor(( (28 +2*0-1*(3-1)-1  ) /1)+1)

26

In [192]:
def cal_shape_after_convo(in_shape,kernel_size=1,dilation=1,stride=1,padding='same'):  #same padd=0
    if padding != 'same':
        new_shape = math.floor(( (in_shape +2*padding-dilation*(kernel_size-1)-1  ) /stride)+1)
    else:
        new_shape = in_shape
    return new_shape


In [195]:
def define_model(trial):
    #optmize number of layer ,hidden unit drop out
    n_layers = trial.suggest_int("n_layers",1,3)
    n_blocks = trial.suggest_int("n_blocks", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

    layers=[]
    # in_features = 28*28
    in_features = 1 #init channel (fasjon MNIST is 1 due to greyscale)    
    squre_shape = 28

    #convo block
    for i in range(1,n_blocks):
      
        out_features = i*28 # how many times of it shape (28,28)
        layers.append(nn.Conv2d(in_features,out_features,kernel_size=(3,3),padding='same'))
        squre_shape = cal_shape_after_convo(squre_shape,kernel_size=3)
       
       
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm2d(out_features))

        in_features = out_features
        layers.append(nn.Conv2d(in_features,out_features,kernel_size=(3,3),padding='same'))
        squre_shape = cal_shape_after_convo(squre_shape,kernel_size=3)
       
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm2d(out_features))

        layers.append(nn.MaxPool2d((2,2)))
        squre_shape = math.floor(squre_shape/2)
               
        layers.append(nn.Dropout(0.5))
            
 
    layers.append(nn.Flatten())

    in_features = in_features*squre_shape*squre_shape    

    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i),16,512)
        layers.append(nn.Linear(in_features,out_features))
        layers.append(nn.ReLU())
        p= trial.suggest_float("dropout_l{}".format(i),0.2,0.8)
        layers.append(nn.Dropout(p))
        in_features=out_features

    layers.append(nn.Linear(in_features,CLASSES))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

In [29]:
def get_mnist():
    # Load FashionMNIST dataset.
    transform = transforms.Compose([transforms.ToTensor(),])

    train_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=True, download=True, transform=transform),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=False, transform=transform),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    return train_loader, valid_loader

# **Optuna Area**

In [170]:
def objective(trial):
    #code here
    #create model
    model = define_model(trial).to(DEVICE)
  
    #create optimize
    optimizer_name=trial.suggest_categorical("optimizers",["Adam","RMSprop","SGD"])
    lr= trial.suggest_float("lr",1e-5,1e-1,log=True)
    optimizer= getattr(optim, optimizer_name)(model.parameters(),lr=lr)

    #get MNIST dataset
    train_loader,valid_loader = get_mnist()

    #train model
    for eporch in range(EPOCHS):
        #train torch code model
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.view(data.size(0), -1,28,28).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break
                data, target = data.view(data.size(0), -1,28,28).to(DEVICE), target.to(DEVICE)
                
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)

        trial.report(accuracy,eporch) #report to optuna to check status its worth or not
        #handing prun algo
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()    
    return accuracy

In [196]:
# if __name__ == 'main':

study = optuna.create_study(direction='maximize') #select own direction or default
study.optimize(objective,n_trials=100) #number of trials #timeout=600

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print('study statistic')
print('number of finished trials: ',len(study.trials))
print('number of proned trials: ',len(pruned_trials))
print('number of completed trials: ',len(complete_trials))

print("Best trials:")
trial =study.best_trial
print('Value', trial.value)

print('Params')
for key,value in trial.params.items():
    print('{}:{}'.format(key,value))


[32m[I 2022-04-18 05:38:17,524][0m A new study created in memory with name: no-name-ad3e7294-259e-47ee-b5d4-bfb88ce3ae3c[0m
[32m[I 2022-04-18 05:38:22,260][0m Trial 0 finished with value: 0.80703125 and parameters: {'n_layers': 1, 'n_blocks': 2, 'weight_decay': 4.046120942215587e-06, 'n_units_l0': 329, 'dropout_l0': 0.7086303463289649, 'optimizers': 'SGD', 'lr': 0.0032649756716407177}. Best is trial 0 with value: 0.80703125.[0m
[32m[I 2022-04-18 05:38:27,117][0m Trial 1 finished with value: 0.8453125 and parameters: {'n_layers': 1, 'n_blocks': 2, 'weight_decay': 1.0155992484519032e-09, 'n_units_l0': 146, 'dropout_l0': 0.20156072113121778, 'optimizers': 'Adam', 'lr': 0.012982806663220384}. Best is trial 1 with value: 0.8453125.[0m
[32m[I 2022-04-18 05:38:33,311][0m Trial 2 finished with value: 0.0984375 and parameters: {'n_layers': 2, 'n_blocks': 5, 'weight_decay': 5.455216279135173e-09, 'n_units_l0': 71, 'dropout_l0': 0.7751981253446396, 'n_units_l1': 16, 'dropout_l1': 0.640

study statistic
number of finished trials:  100
number of proned trials:  87
number of completed trials:  13
Best trials:
Value 0.88671875
Params
n_layers:1
n_blocks:2
weight_decay:2.576819579267082e-09
n_units_l0:189
dropout_l0:0.32341598763057056
optimizers:Adam
lr:0.002537427225966454


# **Tensorflow**

In [3]:
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
import numpy as np

In [4]:
N_TRAIN_EXAMPLES = 3000
N_VALID_EXAMPLES = 1000
BATCHSIZE = 128
CLASSES = 10
EPOCHS = 1

In [5]:
def create_model(trial):
    # We optimize the numbers of layers, their units and weight decay parameter.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    n_blocks = trial.suggest_int("n_blocks", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

    num_filters2=28
    drop_dense2=0.5
    drop_conv2=0
    num_classes = 10
    ac2='relu'
    reg2=None

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Input((28, 28, 3)))
    for i in range(1,n_blocks):
        model.add(tf.keras.layers.Conv2D(i*num_filters2, (3, 3), activation=ac2, kernel_regularizer=reg2,padding='same'))
        model.add(tf.keras.layers.BatchNormalization(axis=-1))
        model.add(tf.keras.layers.Conv2D(i*num_filters2, (3, 3), activation=ac2,kernel_regularizer=reg2,padding='same'))
        model.add(tf.keras.layers.BatchNormalization(axis=-1))
        model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))   # reduces to 16x16x3xnum_filters
        model.add(tf.keras.layers.Dropout(drop_conv2))

    model.add(tf.keras.layers.Flatten())
    for i in range(n_layers):
        num_hidden = trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True)
        model.add(
            tf.keras.layers.Dense(
                num_hidden,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
            )
        )
    model.add(
        tf.keras.layers.Dense(CLASSES, kernel_regularizer=tf.keras.regularizers.l2(weight_decay))
    )
    return model

In [6]:
def create_optimizer(trial):
    # We optimize the choice of optimizers as well as their parameters.
    kwargs = {}
    optimizer_options = ["RMSprop", "Adam", "SGD"]
    optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options)
    if optimizer_selected == "RMSprop":
        kwargs["learning_rate"] = trial.suggest_float(
            "rmsprop_learning_rate", 1e-5, 1e-1, log=True
        )
        kwargs["decay"] = trial.suggest_float("rmsprop_decay", 0.85, 0.99)
        kwargs["momentum"] = trial.suggest_float("rmsprop_momentum", 1e-5, 1e-1, log=True)
    elif optimizer_selected == "Adam":
        kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True)
    elif optimizer_selected == "SGD":
        kwargs["learning_rate"] = trial.suggest_float(
            "sgd_opt_learning_rate", 1e-5, 1e-1, log=True
        )
        kwargs["momentum"] = trial.suggest_float("sgd_opt_momentum", 1e-5, 1e-1, log=True)

    optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)
    return optimizer


In [7]:
def learn(model, optimizer, dataset, mode="eval"):
    accuracy = tf.metrics.Accuracy("accuracy", dtype=tf.float32)

    for batch, (images, labels) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(images, training=(mode == "train"))
            loss_value = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            )
            if mode == "eval":
                accuracy(
                    tf.argmax(logits, axis=1, output_type=tf.int64), tf.cast(labels, tf.int64)
                )
            else:
                #unconnected_gradients=tf.UnconnectedGradients.ZERO
                grads = tape.gradient(loss_value, model.trainable_variables)  # use model.variables if no bacthnormalize and not need to use unconnected_gradients
                optimizer.apply_gradients(zip(grads, model.trainable_variables)) 

    if mode == "eval":        
        return accuracy

In [8]:
def get_fashion_mnist():
    (x_train, y_train), (x_valid, y_valid) = fashion_mnist.load_data()   

    x_train = x_train.astype("float32") / 255
    x_valid = x_valid.astype("float32") / 255

    y_train = y_train.astype("int32")
    y_valid = y_valid.astype("int32")

    #if use conv2d
    x_train = tf.expand_dims(x_train,axis=-1)
    x_valid = tf.expand_dims(x_valid,axis=-1)
    x_train  = tf.image.grayscale_to_rgb(x_train)
    x_valid  = tf.image.grayscale_to_rgb(x_valid)
    ##

    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_ds = train_ds.shuffle(60000).batch(BATCHSIZE).take(N_TRAIN_EXAMPLES)

    valid_ds = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    valid_ds = valid_ds.shuffle(10000).batch(BATCHSIZE).take(N_VALID_EXAMPLES)
    return train_ds, valid_ds

In [9]:
def objective(trial):
    # Get MNIST data.
    train_ds, valid_ds = get_fashion_mnist()

    # Build model and optimizer.
    model = create_model(trial)
    optimizer = create_optimizer(trial)

    # Training and validating cycle.
    with tf.device("/GPU:0"):
        for _ in range(EPOCHS):
            learn(model, optimizer, train_ds, "train")

        accuracy = learn(model, optimizer, valid_ds, "eval")    

    # Return last validation accuracy.
    return accuracy.result()

# **Optuna Area**

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-04-18 01:33:00,622][0m A new study created in memory with name: no-name-8101c920-e3f7-4191-8a6d-6cf9236d0732[0m


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


[32m[I 2022-04-18 01:33:28,687][0m Trial 0 finished with value: 0.2011999934911728 and parameters: {'n_layers': 1, 'n_blocks': 4, 'weight_decay': 1.4281756309299527e-09, 'n_units_l0': 9, 'optimizer': 'RMSprop', 'rmsprop_learning_rate': 0.0001362782370178351, 'rmsprop_decay': 0.9414219033836515, 'rmsprop_momentum': 0.008363942906356506}. Best is trial 0 with value: 0.2011999934911728.[0m
[32m[I 2022-04-18 01:33:42,671][0m Trial 1 finished with value: 0.7930999994277954 and parameters: {'n_layers': 2, 'n_blocks': 4, 'weight_decay': 2.0839350240977483e-08, 'n_units_l0': 27, 'n_units_l1': 126, 'optimizer': 'Adam', 'adam_learning_rate': 0.004658236199794476}. Best is trial 1 with value: 0.7930999994277954.[0m
[32m[I 2022-04-18 01:33:52,869][0m Trial 2 finished with value: 0.741100013256073 and parameters: {'n_layers': 1, 'n_blocks': 3, 'weight_decay': 0.0004589575270552784, 'n_units_l0': 14, 'optimizer': 'Adam', 'adam_learning_rate': 0.000516791718695325}. Best is trial 1 with value

Number of finished trials:  100
Best trial:
  Value:  0.8716999888420105
  Params: 
    n_layers: 2
    n_blocks: 3
    weight_decay: 1.3698860991102454e-09
    n_units_l0: 72
    n_units_l1: 65
    optimizer: SGD
    sgd_opt_learning_rate: 0.05213788753077835
    sgd_opt_momentum: 1.1457459164044076e-05
