In [None]:
import numpy as np
import wandb
from keras.datasets import fashion_mnist


# Neural Network Class: feed_forward_NN_4

class feed_forward_NN_4:
    """Fully connected softmax classifier implemented with NumPy.

    All hidden layers share one activation (sigmoid, tanh or relu); the output
    layer is a softmax. The training objective is the mean cross-entropy plus
    ``weight_decay / 2`` times the sum of squared weights (biases are not
    penalised). Parameters are updated by one of the optimizers listed in
    ``_SUPPORTED_OPTIMIZERS``.
    """

    # Optimizer names for which an update rule is implemented below.
    _SUPPORTED_OPTIMIZERS = ("sgd", "momentum", "nesterov", "rmsprop", "adam", "nadam")

    def __init__(self,
                 layers,
                 optimizer,
                 learning_rate,
                 momentum,
                 beta1,
                 beta2,
                 beta_rms,
                 epsilon,
                 weight_decay,
                 init_type,
                 activation
                 ):
        """Store the hyperparameters and create weights, biases and optimizer state.

        Args:
            layers: Layer widths, input first and output last (at least two).
            optimizer: One of ``_SUPPORTED_OPTIMIZERS`` (case-insensitive).
            learning_rate: Step size used by every optimizer.
            momentum: Velocity decay for "momentum" and "nesterov".
            beta1: First-moment decay for "adam" and "nadam".
            beta2: Second-moment decay for "adam" and "nadam".
            beta_rms: Squared-gradient decay for "rmsprop".
            epsilon: Constant added to the denominator of adaptive updates.
            weight_decay: L2 coefficient applied to the weights.
            init_type: "xavier" scales the normal draws by sqrt(1 / fan_in);
                any other value scales them by sqrt(2 / fan_in).
            activation: "sigmoid", "tanh" or "relu" (case-insensitive); any
                other value behaves like "sigmoid".

        Raises:
            ValueError: If fewer than two layers are given or the optimizer
                name is not supported.
        """
        if len(layers) < 2:
            raise ValueError("layers must contain at least an input and an output size")

        self.layers = layers
        self.layer_n = len(layers)
        self.optimizer = optimizer.lower()
        # An unrecognised name would otherwise train without ever updating
        # a parameter, so reject it up front.
        if self.optimizer not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer {optimizer!r}; expected one of {self._SUPPORTED_OPTIMIZERS}"
            )
        self.lr = learning_rate
        self.momentum = momentum
        self.beta1 = beta1
        self.beta2 = beta2
        self.beta_rms = beta_rms
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.init_type = init_type.lower()
        self.activation = activation.lower()

        # Initialize weights and biases
        self.weights = []
        self.biases = []
        for fan_in, fan_out in zip(layers[:-1], layers[1:]):
            if self.init_type == "xavier":
                scale = np.sqrt(1.0 / fan_in)
            else:
                # "random" initialization
                scale = np.sqrt(2.0 / fan_in)
            self.weights.append(np.random.randn(fan_in, fan_out) * scale)
            self.biases.append(np.zeros((1, fan_out)))

        # Per-parameter optimizer state
        if self.optimizer in ["momentum", "nesterov", "rmsprop", "adam", "nadam"]:
            self.v_w = [np.zeros_like(w) for w in self.weights]
            self.v_b = [np.zeros_like(b) for b in self.biases]
        if self.optimizer in ["adam", "nadam"]:
            self.m_w = [np.zeros_like(w) for w in self.weights]
            self.m_b = [np.zeros_like(b) for b in self.biases]
            self.t = 0

    # activations
    def sigmoid(self, x):
        """Return the logistic function of ``x``; inputs are clipped to [-500, 500]."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``."""
        return np.tanh(x)

    def relu(self, x):
        """Return ``x`` with negative entries replaced by zero."""
        return np.maximum(0, x)

    def activate(self, x):
        """Apply the configured hidden activation (sigmoid when unrecognised)."""
        if self.activation == "tanh":
            return self.tanh(x)
        if self.activation == "relu":
            return self.relu(x)
        return self.sigmoid(x)

    # derivatives
    def derivative(self, a):
        """Return the activation derivative, given the activation OUTPUT ``a``."""
        if self.activation == "tanh":
            return 1 - a**2
        if self.activation == "relu":
            return (a > 0).astype(float)
        return a * (1 - a)

    def softmax(self, x):
        """Return the row-wise softmax of ``x`` (row maximum subtracted first)."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    # Forward Pass
    def forward_pass(self, x):
        """Run the network on ``x`` and cache every layer's output in ``self.h``.

        Returns:
            The list ``self.h``: the input, each hidden activation, and the
            softmax output as the last element.
        """
        self.h = [x]
        # Hidden layers
        for i in range(self.layer_n - 2):
            z = np.dot(self.h[i], self.weights[i]) + self.biases[i]
            self.h.append(self.activate(z))
        # Output layer - softmax
        z_out = np.dot(self.h[-1], self.weights[-1]) + self.biases[-1]
        self.h.append(self.softmax(z_out))
        return self.h

    # Backward Pass
    def backward_prop(self, y_true):
        """Return cross-entropy gradients for the activations cached in ``self.h``.

        Args:
            y_true: Targets with the same shape as the softmax output.

        Returns:
            ``(dw, db)``: per-layer gradient lists averaged over the batch.
            The weight-decay term is not included.
        """
        m = y_true.shape[0]
        dw = [None] * (self.layer_n - 1)
        db = [None] * (self.layer_n - 1)

        # Softmax + cross-entropy gradient with respect to the output logits
        delta = self.h[-1] - y_true

        for i in reversed(range(self.layer_n - 1)):
            dw[i] = np.dot(self.h[i].T, delta) / m
            db[i] = np.sum(delta, axis=0, keepdims=True) / m
            if i > 0:
                # Push the error through layer i and the activation of layer i-1
                delta = np.dot(delta, self.weights[i].T) * self.derivative(self.h[i])
        return dw, db

    def _loss(self, out, y_true):
        """Return mean cross-entropy of ``out`` plus the L2 penalty on the weights."""
        l2_norm_weights = sum(np.sum(w ** 2) for w in self.weights)
        # 1e-10 keeps log() finite when a predicted probability is exactly 0
        cross_entropy = -np.mean(np.sum(y_true * np.log(out + 1e-10), axis=1))
        return cross_entropy + (self.weight_decay / 2) * l2_norm_weights

    # Param Updates for "Non-Nesterov"
    def _update_params(self, dw, db):
        """Apply one optimizer step for every optimizer except "nesterov".

        Args:
            dw: Per-layer weight gradients (weight decay is added here).
            db: Per-layer bias gradients.
        """
        n_layers = self.layer_n - 1
        # Add weight decay to each weight gradient without mutating the caller's arrays
        dw = [g + self.weight_decay * w for g, w in zip(dw, self.weights)]

        if self.optimizer == "sgd":
            for i in range(n_layers):
                self.weights[i] -= self.lr * dw[i]
                self.biases[i] -= self.lr * db[i]

        elif self.optimizer == "momentum":
            for i in range(n_layers):
                self.v_w[i] = self.momentum * self.v_w[i] + dw[i]
                self.v_b[i] = self.momentum * self.v_b[i] + db[i]
                self.weights[i] -= self.lr * self.v_w[i]
                self.biases[i] -= self.lr * self.v_b[i]

        elif self.optimizer == "rmsprop":
            for i in range(n_layers):
                self.v_w[i] = self.beta_rms * self.v_w[i] + (1 - self.beta_rms) * (dw[i] ** 2)
                self.v_b[i] = self.beta_rms * self.v_b[i] + (1 - self.beta_rms) * (db[i] ** 2)
                self.weights[i] -= self.lr * dw[i] / (np.sqrt(self.v_w[i]) + self.epsilon)
                self.biases[i] -= self.lr * db[i] / (np.sqrt(self.v_b[i]) + self.epsilon)

        elif self.optimizer == "adam":
            self.t += 1
            corr1 = 1 - self.beta1 ** self.t
            corr2 = 1 - self.beta2 ** self.t
            for i in range(n_layers):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / corr1
                m_b_hat = self.m_b[i] / corr1
                v_w_hat = self.v_w[i] / corr2
                v_b_hat = self.v_b[i] / corr2

                self.weights[i] -= self.lr * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i] -= self.lr * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

        elif self.optimizer == "nadam":
            self.t += 1
            # NOTE(review): every correction term uses exponent t + 1 after t
            # has already been incremented; confirm this matches the intended
            # Nadam formulation.
            corr1 = 1 - self.beta1 ** (self.t + 1)
            corr2 = 1 - self.beta2 ** (self.t + 1)
            for i in range(n_layers):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / corr1
                m_b_hat = self.m_b[i] / corr1
                v_w_hat = self.v_w[i] / corr2
                v_b_hat = self.v_b[i] / corr2

                # Blend the corrected momentum with the current gradient
                grad_term_w = self.beta1 * m_w_hat + (1 - self.beta1) * dw[i] / corr1
                grad_term_b = self.beta1 * m_b_hat + (1 - self.beta1) * db[i] / corr1

                self.weights[i] -= self.lr * grad_term_w / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i] -= self.lr * grad_term_b / (np.sqrt(v_b_hat) + self.epsilon)

    # Training step (handles "nesterov" itself, delegates the rest)
    def _train_step(self, x_batch, y_batch):
        """Run forward, backward and one parameter update on a mini-batch.

        Returns:
            The regularised loss measured before the update. For "nesterov"
            it is measured at the look-ahead parameters.
        """
        if self.optimizer != "nesterov":
            # Normal forward/back
            self.forward_pass(x_batch)
            loss = self._loss(self.h[-1], y_batch)
            dW, dB = self.backward_prop(y_batch)
            self._update_params(dW, dB)
            return loss

        n_layers = self.layer_n - 1
        # Move to the look-ahead point: w_look = w - lr * momentum * v
        for i in range(n_layers):
            self.weights[i] -= self.lr * self.momentum * self.v_w[i]
            self.biases[i] -= self.lr * self.momentum * self.v_b[i]

        # Forward and backward at the look-ahead position
        self.forward_pass(x_batch)
        loss = self._loss(self.h[-1], y_batch)
        dW, dB = self.backward_prop(y_batch)

        # Weight decay is evaluated at the look-ahead weights as well
        for i in range(n_layers):
            dW[i] += self.weight_decay * self.weights[i]

        for i in range(n_layers):
            # Undo the look-ahead shift (back to w_t)
            self.weights[i] += self.lr * self.momentum * self.v_w[i]
            self.biases[i] += self.lr * self.momentum * self.v_b[i]
            # Update velocity: u_t = momentum * u_{t-1} + dW
            self.v_w[i] = self.momentum * self.v_w[i] + dW[i]
            self.v_b[i] = self.momentum * self.v_b[i] + dB[i]
            # Final parameter update: w = w - lr * u_t
            self.weights[i] -= self.lr * self.v_w[i]
            self.biases[i] -= self.lr * self.v_b[i]

        return loss

    # Outer Training Loop
    def training(self, x_train, y_train, x_val, y_val, epochs, batch_size):
        """Train for ``epochs`` passes and log metrics to wandb after each one.

        Every epoch reshuffles the training set and visits all samples; the
        last mini-batch is shorter when the set size is not a multiple of
        ``batch_size``.

        Args:
            x_train: Training inputs, one row per sample.
            y_train: One-hot training targets.
            x_val: Validation inputs.
            y_val: One-hot validation targets.
            epochs: Number of passes over the training set.
            batch_size: Samples per mini-batch (positive).

        Raises:
            ValueError: If ``batch_size`` is not positive or ``x_train`` is empty.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        n_samples = x_train.shape[0]
        if n_samples == 0:
            raise ValueError("x_train must contain at least one sample")

        val_labels = np.argmax(y_val, axis=1)

        for ep in range(epochs):
            idx = np.random.permutation(n_samples)
            x_train_shuff = x_train[idx]
            y_train_shuff = y_train[idx]

            epoch_loss = 0.0
            n_batches = 0
            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                epoch_loss += self._train_step(x_train_shuff[start:end], y_train_shuff[start:end])
                n_batches += 1
            avg_loss = epoch_loss / n_batches

            # Validation: one forward pass provides both accuracy and loss
            val_outputs = self.forward_pass(x_val)[-1]
            val_acc = np.mean(np.argmax(val_outputs, axis=1) == val_labels)
            # Cross-entropy loss for validation (no weight-decay term)
            val_loss = -np.mean(np.sum(y_val * np.log(val_outputs + 1e-10), axis=1))

            # Log metrics to wandb
            wandb.log({"epoch": ep+1, "training_loss": avg_loss, "validation_accuracy": val_acc, "validation loss": val_loss})
            print(f"Epoch {ep+1}/{epochs} - loss={avg_loss:.4f}, val_acc={val_acc:.4f}, val_loss={val_loss}" )

    # Prediction
    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        self.forward_pass(X)
        return np.argmax(self.h[-1], axis=1)




# (x_train_full, y_train_full), (x_test, y_test) = fashion_mnist.load_data()
# x_train_full = x_train_full.reshape(x_train_full.shape[0], -1) / 255.0
# x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

# np.random.seed(42)
# idx = np.arange(x_train_full.shape[0])
# np.random.shuffle(idx)
# x_train_full = x_train_full[idx]
# y_train_full = y_train_full[idx]

# # 90% training, 10% validation 
# train_size=int(.9*len(x_train_full))

# x_train, y_train=x_train_full[:train_size],y_train_full[:train_size]
# x_val, y_val=x_train_full[train_size:], y_train_full[train_size:]

# num_classes = 10
# y_train_1h = np.eye(num_classes)[y_train]
# y_val_1h = np.eye(num_classes)[y_val]
# y_test_1h = np.eye(num_classes)[y_test]

# # model
# model = feed_forward_NN_4(
#     layers=[784] + [32] *3 + [10],
# optimizer="nesterov",
# learning_rate=0.01,
# momentum=0.9,
# beta1=0.9,
# beta2=0.999,
# beta_rms=0.9,
# epsilon=1e-4,
# weight_decay=0.0005,
# init_type="xavier",
# activation="relu")

#     # Train the model
# model.training(
#         x_train=x_train,
#         y_train=y_train_1h,
#         x_val=x_val,
#         y_val=y_val_1h,
#         epochs=10,
#         batch_size=32
#     )

#     #Evaluation on test set
# test_preds = model.predict(x_test)
# test_labels = np.argmax(y_test_1h, axis=1)
# test_acc = np.mean(test_preds == test_labels)
# print("test accuracy ",test_acc)
# #wandb.log({"test_accuracy": test_acc})




# train_sweep() function

def train_sweep():
    """Execute one sweep trial: build a model from ``wandb.config``, train it, log test accuracy.

    Fashion-MNIST images are flattened and divided by 255. The training
    portion is shuffled with NumPy seed 42 and split 90/10 into training and
    validation sets. Labels are one-hot encoded over 10 classes.
    """
    # Initialize wandb
    wandb.init()
    config = wandb.config

    # Custom run name from hyperparameters
    wandb.run.name = f"hl_{config.num_hidden_layers}_bs_{config.batch_size}_ac_{config.activation}_opt_{config.optimizer}"

    # Load Fashion-MNIST and flatten every image into one row
    (train_images, train_targets), (test_images, test_targets) = fashion_mnist.load_data()
    train_images = train_images.reshape(train_images.shape[0], -1) / 255.0
    test_images = test_images.reshape(test_images.shape[0], -1) / 255.0

    # Fixed-seed shuffle before the split
    np.random.seed(42)
    order = np.arange(train_images.shape[0])
    np.random.shuffle(order)
    train_images = train_images[order]
    train_targets = train_targets[order]

    # 90% training, 10% validation
    cut = int(.9*len(train_images))
    fit_x, held_x = train_images[:cut], train_images[cut:]
    fit_y, held_y = train_targets[:cut], train_targets[cut:]

    num_classes = 10
    fit_y_1h = np.eye(num_classes)[fit_y]
    held_y_1h = np.eye(num_classes)[held_y]
    test_y_1h = np.eye(num_classes)[test_targets]

    # Model
    layer_sizes = [784] + [config.hidden_size] * config.num_hidden_layers + [10]
    model = feed_forward_NN_4(
        layers=layer_sizes,
        optimizer=config.optimizer,
        learning_rate=config.learning_rate,
        momentum=config.momentum,
        beta1=config.beta1,
        beta2=config.beta2,
        beta_rms=config.beta_rms,
        epsilon=config.epsilon,
        weight_decay=config.weight_decay,
        init_type=config.init_type,
        activation=config.activation,
    )

    # Train the model
    model.training(
        x_train=fit_x,
        y_train=fit_y_1h,
        x_val=held_x,
        y_val=held_y_1h,
        epochs=config.epochs,
        batch_size=config.batch_size,
    )

    # Evaluation on the test set
    predicted = model.predict(test_images)
    expected = np.argmax(test_y_1h, axis=1)
    test_acc = np.mean(predicted == expected)

    wandb.log({"test_accuracy": test_acc})
    print("test accuracy ",test_acc)


# sweep configuration
# Sweep definition handed to wandb.sweep(): random search that maximizes the
# "validation_accuracy" metric. Every hyperparameter is given as an explicit
# list of candidate values.
sweep_config = {
    "method": "random",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        hyperparameter: {"values": candidates}
        for hyperparameter, candidates in (
            ("epochs", [15]),
            ("num_hidden_layers", [3]),
            ("hidden_size", [64, 128]),
            ("weight_decay", [0.0]),
            ("learning_rate", [5e-3, 1e-3]),
            ("optimizer", ["adam"]),
            ("batch_size", [32]),
            ("init_type", ["xavier"]),
            ("activation", ["relu"]),
            ("momentum", [0.9]),
            ("beta1", [0.9]),
            ("beta2", [0.999]),
            ("beta_rms", [0.9]),
            ("epsilon", [1e-8]),
        )
    },
}

# Running the sweep

if __name__ == "__main__":
    # Creating sweep
    sweep_id = wandb.sweep(sweep_config, project="q4_sweep_project")

    # A "random" sweep has no natural end, so an agent started without a
    # `count` keeps sampling (and re-running) configurations until it is
    # interrupted. Cap the number of trials at the number of distinct value
    # combinations in the search space.
    n_trials = 1
    for spec in sweep_config["parameters"].values():
        # Parameters not given as a "values" list count as a single option here
        n_trials *= max(1, len(spec.get("values", [None])))

    # Launching sweep agent
    wandb.agent(sweep_id, function=train_sweep, count=n_trials)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: tbphh202
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/q4_sweep_project/sweeps/tbphh202


[34m[1mwandb[0m: Agent Starting Run: vvg5vttr with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33med24s401[0m ([33med24s401-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/15 - loss=0.5460, val_acc=0.8470, val_loss=0.4313800753755172
Epoch 2/15 - loss=0.4277, val_acc=0.8442, val_loss=0.42233274853858394
Epoch 3/15 - loss=0.3970, val_acc=0.8567, val_loss=0.38684253322232853
Epoch 4/15 - loss=0.3729, val_acc=0.8613, val_loss=0.40310718068059503
Epoch 5/15 - loss=0.3694, val_acc=0.8658, val_loss=0.3797990675807041
Epoch 6/15 - loss=0.3591, val_acc=0.8587, val_loss=0.3971590549282658
Epoch 7/15 - loss=0.3442, val_acc=0.8653, val_loss=0.3900859654684136
Epoch 8/15 - loss=0.3400, val_acc=0.8688, val_loss=0.362453378967503
Epoch 9/15 - loss=0.3324, val_acc=0.8625, val_loss=0.3884308457926465
Epoch 10/15 - loss=0.3277, val_acc=0.8687, val_loss=0.4257646814918029
Epoch 11/15 - loss=0.3246, val_acc=0.8757, val_loss=0.3719614796062187
Epoch 12/15 - loss=0.3189, val_acc=0.8702, val_loss=0.3855816356567572
Epoch 13/15 - loss=0.3272, val_acc=0.8705, val_loss=0.4053483735611952
Epoch 14/15 - loss=0.3169, val_acc=0.8780, val_loss=0.34395362717023575
Epoch 15/15 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▃▃▂▂▂▂▂▁▁▂▁▁
validation loss,█▇▄▆▄▅▅▂▅█▃▄▆▁▂
validation_accuracy,▂▁▄▅▅▄▅▆▅▆█▆▆█▇

0,1
epoch,15.0
test_accuracy,0.8669
training_loss,0.30905
validation loss,0.35281
validation_accuracy,0.87133


[34m[1mwandb[0m: Agent Starting Run: 2tgpvh18 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.5460, val_acc=0.8470, val_loss=0.4313800753755172
Epoch 2/15 - loss=0.4277, val_acc=0.8442, val_loss=0.42233274853858394
Epoch 3/15 - loss=0.3970, val_acc=0.8567, val_loss=0.38684253322232853
Epoch 4/15 - loss=0.3729, val_acc=0.8613, val_loss=0.40310718068059503
Epoch 5/15 - loss=0.3694, val_acc=0.8658, val_loss=0.3797990675807041
Epoch 6/15 - loss=0.3591, val_acc=0.8587, val_loss=0.3971590549282658
Epoch 7/15 - loss=0.3442, val_acc=0.8653, val_loss=0.3900859654684136
Epoch 8/15 - loss=0.3400, val_acc=0.8688, val_loss=0.362453378967503
Epoch 9/15 - loss=0.3324, val_acc=0.8625, val_loss=0.3884308457926465
Epoch 10/15 - loss=0.3277, val_acc=0.8687, val_loss=0.4257646814918029
Epoch 11/15 - loss=0.3246, val_acc=0.8757, val_loss=0.3719614796062187
Epoch 12/15 - loss=0.3189, val_acc=0.8702, val_loss=0.3855816356567572
Epoch 13/15 - loss=0.3272, val_acc=0.8705, val_loss=0.4053483735611952
Epoch 14/15 - loss=0.3169, val_acc=0.8780, val_loss=0.34395362717023575
Epoch 15/15 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▃▃▂▂▂▂▂▁▁▂▁▁
validation loss,█▇▄▆▄▅▅▂▅█▃▄▆▁▂
validation_accuracy,▂▁▄▅▅▄▅▆▅▆█▆▆█▇

0,1
epoch,15.0
test_accuracy,0.8669
training_loss,0.30905
validation loss,0.35281
validation_accuracy,0.87133


[34m[1mwandb[0m: Agent Starting Run: 28640awz with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.5367, val_acc=0.8330, val_loss=0.4643269985308462
Epoch 2/15 - loss=0.4169, val_acc=0.8532, val_loss=0.40373668157989295
Epoch 3/15 - loss=0.3897, val_acc=0.8462, val_loss=0.4096803144640486
Epoch 4/15 - loss=0.3672, val_acc=0.8698, val_loss=0.38513054858439594
Epoch 5/15 - loss=0.3559, val_acc=0.8547, val_loss=0.3898804972014513
Epoch 6/15 - loss=0.3480, val_acc=0.8678, val_loss=0.3620215535867382
Epoch 7/15 - loss=0.3452, val_acc=0.8683, val_loss=0.3762228741273175
Epoch 8/15 - loss=0.3351, val_acc=0.8627, val_loss=0.3771779528756746
Epoch 9/15 - loss=0.3219, val_acc=0.8743, val_loss=0.34143927287951586
Epoch 10/15 - loss=0.3191, val_acc=0.8760, val_loss=0.3394106004457124
Epoch 11/15 - loss=0.3179, val_acc=0.8738, val_loss=0.3535693977691643
Epoch 12/15 - loss=0.3134, val_acc=0.8765, val_loss=0.3657335364763806
Epoch 13/15 - loss=0.3080, val_acc=0.8662, val_loss=0.369186363644204
Epoch 14/15 - loss=0.3046, val_acc=0.8758, val_loss=0.372315636082725
Epoch 15/15 - 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▄▃▃▂▂▂▂▂▁▁▁▁▁
validation loss,█▅▅▄▄▂▃▃▁▁▂▂▃▃▅
validation_accuracy,▁▄▃▇▄▇▇▆████▆█▅

0,1
epoch,15.0
test_accuracy,0.8511
training_loss,0.30181
validation loss,0.40191
validation_accuracy,0.86017


[34m[1mwandb[0m: Agent Starting Run: ydwfsxn1 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.5367, val_acc=0.8330, val_loss=0.4643269985308462
Epoch 2/15 - loss=0.4169, val_acc=0.8532, val_loss=0.40373668157989295
Epoch 3/15 - loss=0.3897, val_acc=0.8462, val_loss=0.4096803144640486
Epoch 4/15 - loss=0.3672, val_acc=0.8698, val_loss=0.38513054858439594
Epoch 5/15 - loss=0.3559, val_acc=0.8547, val_loss=0.3898804972014513
Epoch 6/15 - loss=0.3480, val_acc=0.8678, val_loss=0.3620215535867382
Epoch 7/15 - loss=0.3452, val_acc=0.8683, val_loss=0.3762228741273175


In [None]:
import numpy as np
import wandb
from keras.datasets import fashion_mnist


# Neural Network Class: feed_forward_NN_4adam

class feed_forward_NN_4adam:
    def __init__(self,
                 layers,
                 optimizer,
                 learning_rate,
                 momentum,
                 beta1,
                 beta2,
                 beta_rms,
                 epsilon,
                 weight_decay,
                 init_type,
                 activation
                 ):
    
        
        self.layers = layers
        self.layer_n = len(layers)
        self.optimizer = optimizer.lower()
        self.lr = learning_rate
        self.momentum = momentum
        self.beta1 = beta1
        self.beta2 = beta2
        self.beta_rms = beta_rms
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.init_type = init_type.lower()
        self.activation = activation.lower()
        

        # Initialize Weights & BiaseS
        self.weights = []
        self.biases = []
        for i in range(self.layer_n - 1):
            if self.init_type == "xavier":
                # "Xavier" initialization
                w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(1.0 / layers[i])
            else:
                # "random" initialization
                w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(2.0 / layers[i])
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

        # initialize extra Params 
        if self.optimizer in ["momentum", "nesterov", "rmsprop", "adam", "nadam"]:
            self.v_w = [np.zeros_like(w) for w in self.weights]
            self.v_b = [np.zeros_like(b) for b in self.biases]
        if self.optimizer in ["adam", "nadam"]:
            self.m_w = [np.zeros_like(w) for w in self.weights]
            self.m_b = [np.zeros_like(b) for b in self.biases]
            self.t = 0

    # activations 
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def relu(self, x):
        return np.maximum(0, x)

    def activate(self, x):
        if self.activation == "sigmoid":
            return self.sigmoid(x)
        elif self.activation == "tanh":
            return self.tanh(x)
        elif self.activation == "relu":
            return self.relu(x)
        else:
            return self.sigmoid(x) 
        
    # derivatives
    def derivative(self, a):

        if self.activation == "sigmoid":
            return a * (1 - a)
        elif self.activation == "tanh":
            return 1 - a**2
        elif self.activation == "relu":
            return (a > 0).astype(float)
        else:
            return a * (1 - a) 

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    # Forward Pass
    def forward_pass(self, x):
        self.h = [x]  
        # Hidden layers
        for i in range(self.layer_n - 2):
            z = np.dot(self.h[i], self.weights[i]) + self.biases[i]
            act = self.activate(z)
            self.h.append(act)
        # Output layer- softmax
        z_out = np.dot(self.h[-1], self.weights[-1]) + self.biases[-1]
        out = self.softmax(z_out)
        self.h.append(out)
        return self.h

    # Backward Pass
    def backward_prop(self, y_true):
        m = y_true.shape[0]
        dw = [None] * (self.layer_n - 1)
        db = [None] * (self.layer_n - 1)

        # Cross-entropy derivative for output layer
        delta = self.h[-1] - y_true  # shape: (batch_size, output_dim)

        # Propagation
        for i in reversed(range(self.layer_n - 1)):
            dw[i] = np.dot(self.h[i].T, delta) / m
            db[i] = np.sum(delta, axis=0, keepdims=True) / m
            if i > 0:
                # For hidden layers, multiply by derivative of activation
                delta = np.dot(delta, self.weights[i].T) * self.derivative(self.h[i])
        return dw, db

    # Param Updates for "Non-Nesterov" 
    def _update_params(self, dw, db):
        # Add weight decay to each gradient
        for i in range(self.layer_n - 1):
            dw[i] += self.weight_decay * self.weights[i]

        if self.optimizer == "sgd":
            for i in range(self.layer_n - 1):
                self.weights[i] -= self.lr * dw[i]
                self.biases[i] -= self.lr * db[i]

        elif self.optimizer == "momentum":
            for i in range(self.layer_n - 1):
                self.v_w[i] = self.momentum * self.v_w[i] + dw[i]
                self.v_b[i] = self.momentum * self.v_b[i] + db[i]
                self.weights[i] -= self.lr * self.v_w[i]
                self.biases[i] -= self.lr * self.v_b[i]

        elif self.optimizer == "rmsprop":
            for i in range(self.layer_n - 1):
                self.v_w[i] = self.beta_rms * self.v_w[i] + (1 - self.beta_rms) * (dw[i] ** 2)
                self.v_b[i] = self.beta_rms * self.v_b[i] + (1 - self.beta_rms) * (db[i] ** 2)
                self.weights[i] -= self.lr * dw[i] / (np.sqrt(self.v_w[i]) + self.epsilon)
                self.biases[i]  -= self.lr * db[i] / (np.sqrt(self.v_b[i]) + self.epsilon)

        elif self.optimizer == "adam":
            self.t += 1
            for i in range(self.layer_n - 1):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / (1 - self.beta1 ** self.t)
                m_b_hat = self.m_b[i] / (1 - self.beta1 ** self.t)
                v_w_hat = self.v_w[i] / (1 - self.beta2 ** self.t)
                v_b_hat = self.v_b[i] / (1 - self.beta2 ** self.t)

                self.weights[i] -= self.lr * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i]  -= self.lr * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

        elif self.optimizer == "nadam":
            self.t += 1
            for i in range(self.layer_n - 1):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / (1 - self.beta1 ** (self.t + 1))
                m_b_hat = self.m_b[i] / (1 - self.beta1 ** (self.t + 1))
                v_w_hat = self.v_w[i] / (1 - self.beta2 ** (self.t + 1))
                v_b_hat = self.v_b[i] / (1 - self.beta2 ** (self.t + 1))

                grad_term_w = self.beta1 * m_w_hat + (1 - self.beta1) * dw[i] / (1 - self.beta1 ** (self.t + 1))
                grad_term_b = self.beta1 * m_b_hat + (1 - self.beta1) * db[i] / (1 - self.beta1 ** (self.t + 1))

                self.weights[i] -= self.lr * grad_term_w / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i]  -= self.lr * grad_term_b / (np.sqrt(v_b_hat) + self.epsilon)

    # Training Step  with "Nesterov"
    def _train_step(self, x_batch, y_batch):
        """Run one optimisation step on a mini-batch and return its loss.

        With the "nesterov" optimizer the gradient is evaluated at the
        look-ahead point ``w - lr * momentum * v`` and the momentum update is
        applied here. Every other optimizer does a plain forward/backward pass
        and hands the gradients to ``_update_params``.

        Returns:
            Mean cross-entropy of the batch plus ``weight_decay / 2`` times the
            summed squares of all weight matrices (only the weights enter the
            penalty), evaluated at the parameters used for the forward pass.
        """

        def regularised_loss():
            # Activations of the last layer from the most recent forward pass.
            probs = self.h[-1]
            weight_sq_norm = sum(np.sum(w ** 2) for w in self.weights)
            # The 1e-10 offset keeps log() finite when a probability is 0.
            cross_entropy = -np.mean(np.sum(y_batch * np.log(probs + 1e-10), axis=1))
            return cross_entropy + (self.weight_decay / 2) * weight_sq_norm

        if self.optimizer != "nesterov":
            # Standard path: gradients are taken at the current parameters.
            self.forward_pass(x_batch)
            batch_loss = regularised_loss()
            grad_w, grad_b = self.backward_prop(y_batch)
            self._update_params(grad_w, grad_b)
            return batch_loss

        n_param_layers = self.layer_n - 1
        lookahead = self.lr * self.momentum

        # Move every layer to the look-ahead point before evaluating anything.
        for k in range(n_param_layers):
            self.weights[k] -= lookahead * self.v_w[k]
            self.biases[k] -= lookahead * self.v_b[k]

        # Loss and gradients are both measured at the look-ahead parameters.
        self.forward_pass(x_batch)
        batch_loss = regularised_loss()
        grad_w, grad_b = self.backward_prop(y_batch)

        for k in range(n_param_layers):
            # L2 weight decay uses the look-ahead weights, so add it before
            # the shift is undone.
            grad_w[k] += self.weight_decay * self.weights[k]

            # Undo the look-ahead shift (back to w_t).
            self.weights[k] += lookahead * self.v_w[k]
            self.biases[k] += lookahead * self.v_b[k]

            # Velocity: u_t = momentum * u_{t-1} + gradient.
            self.v_w[k] = self.momentum * self.v_w[k] + grad_w[k]
            self.v_b[k] = self.momentum * self.v_b[k] + grad_b[k]

            # Parameter step: w = w - lr * u_t.
            self.weights[k] -= self.lr * self.v_w[k]
            self.biases[k] -= self.lr * self.v_b[k]

        return batch_loss

    # Outer Training Loop 
    def training(self, x_train, y_train, x_val, y_val, epochs, batch_size):
        """Train with shuffled mini-batches and report validation metrics.

        Each epoch reshuffles the training set, runs ``_train_step`` on every
        full mini-batch, then logs the mean training loss together with the
        validation accuracy and cross-entropy to wandb and stdout.

        Args:
            x_train: Training inputs, one sample per row (indexed on axis 0).
            y_train: Training targets aligned row-for-row with ``x_train``.
            x_val: Validation inputs.
            y_val: Per-class validation targets; the argmax over axis 1 is
                taken as the true label.
            epochs: Number of passes over the training data.
            batch_size: Samples per mini-batch; must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 or ``x_train``
                contains no samples.
        """
        n_samples = x_train.shape[0]
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        if n_samples == 0:
            raise ValueError("x_train is empty")

        # Only full batches are used (the remainder of each shuffled epoch is
        # dropped). A dataset smaller than one batch is trained on as a single
        # short batch instead of producing zero batches and a division by zero.
        n_batches = max(1, n_samples // batch_size)

        # The validation labels do not change between epochs.
        val_labels = np.argmax(y_val, axis=1)

        for ep in range(epochs):
            idx = np.random.permutation(n_samples)
            x_train_shuff = x_train[idx]
            y_train_shuff = y_train[idx]

            epoch_loss = 0.0
            for start in range(0, n_batches * batch_size, batch_size):
                end = start + batch_size
                epoch_loss += self._train_step(x_train_shuff[start:end],
                                               y_train_shuff[start:end])
            avg_loss = epoch_loss / n_batches

            # Validation: one forward pass serves both metrics. predict() is
            # just forward_pass() followed by an argmax over self.h[-1], so the
            # activations left behind by this pass are reused for the accuracy.
            val_outputs = self.forward_pass(x_val)[-1]
            preds = np.argmax(self.h[-1], axis=1)
            val_acc = np.mean(preds == val_labels)

            # Cross-entropy loss for validation (no weight-decay term)
            val_loss = -np.mean(np.sum(y_val * np.log(val_outputs + 1e-10), axis=1))

            # Log metrics to wandb
            wandb.log({"epoch": ep+1, "training_loss": avg_loss, "validation_accuracy": val_acc, "validation loss": val_loss})
            print(f"Epoch {ep+1}/{epochs} - loss={avg_loss:.4f}, val_acc={val_acc:.4f}, val_loss={val_loss}" )

    #Prediction 
    def predict(self, X):
        """Return the predicted class index for every row of ``X``.

        Runs a forward pass on ``X`` and takes the argmax over axis 1 of the
        last entry in ``self.h``.
        """
        self.forward_pass(X)
        final_activations = self.h[-1]
        return np.argmax(final_activations, axis=1)




# (x_train_full, y_train_full), (x_test, y_test) = fashion_mnist.load_data()
# x_train_full = x_train_full.reshape(x_train_full.shape[0], -1) / 255.0
# x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

# np.random.seed(42)
# idx = np.arange(x_train_full.shape[0])
# np.random.shuffle(idx)
# x_train_full = x_train_full[idx]
# y_train_full = y_train_full[idx]

# # 90% training, 10% validation 
# train_size=int(.9*len(x_train_full))

# x_train, y_train=x_train_full[:train_size],y_train_full[:train_size]
# x_val, y_val=x_train_full[train_size:], y_train_full[train_size:]

# num_classes = 10
# y_train_1h = np.eye(num_classes)[y_train]
# y_val_1h = np.eye(num_classes)[y_val]
# y_test_1h = np.eye(num_classes)[y_test]

# # model
# model = feed_forward_NN_4(
#     layers=[784] + [32] *3 + [10],
# optimizer="nesterov",
# learning_rate=0.01,
# momentum=0.9,
# beta1=0.9,
# beta2=0.999,
# beta_rms=0.9,
# epsilon=1e-4,
# weight_decay=0.0005,
# init_type="xavier",
# activation="relu")

#     # Train the model
# model.training(
#         x_train=x_train,
#         y_train=y_train_1h,
#         x_val=x_val,
#         y_val=y_val_1h,
#         epochs=10,
#         batch_size=32
#     )

#     #Evaluation on test set
# test_preds = model.predict(x_test)
# test_labels = np.argmax(y_test_1h, axis=1)
# test_acc = np.mean(test_preds == test_labels)
# print("test accuracy ",test_acc)
# #wandb.log({"test_accuracy": test_acc})




# train_sweep() function

def train_sweep():
    """Run a single W&B sweep trial on Fashion-MNIST.

    Starts a wandb run, names it after the sampled hyperparameters, trains a
    ``feed_forward_NN_4adam`` network on a 90/10 train/validation split of the
    shuffled training images, and logs the accuracy on the test set.
    """
    wandb.init()
    cfg = wandb.config

    # Custom run name built from the hyperparameters of this trial.
    wandb.run.name = (
        f"hl_{cfg.num_hidden_layers}_bs_{cfg.batch_size}"
        f"_ac_{cfg.activation}_opt_{cfg.optimizer}"
    )

    # Load Fashion-MNIST, flatten each image to a vector and scale by 1/255.
    (train_images, train_targets), (test_images, test_targets) = fashion_mnist.load_data()
    train_images = train_images.reshape(train_images.shape[0], -1) / 255.0
    test_images = test_images.reshape(test_images.shape[0], -1) / 255.0

    # Fixed seed: the shuffle below (and any later np.random use) is
    # reproducible across trials.
    np.random.seed(42)
    order = np.arange(train_images.shape[0])
    np.random.shuffle(order)
    train_images, train_targets = train_images[order], train_targets[order]

    # 90% training, 10% validation
    split = int(.9 * len(train_images))
    fit_images, fit_targets = train_images[:split], train_targets[:split]
    val_images, val_targets = train_images[split:], train_targets[split:]

    # One-hot encode by indexing rows of the identity matrix.
    num_classes = 10
    identity = np.eye(num_classes)
    fit_targets_1h = identity[fit_targets]
    val_targets_1h = identity[val_targets]
    test_targets_1h = identity[test_targets]

    hidden_layers = [cfg.hidden_size] * cfg.num_hidden_layers
    model = feed_forward_NN_4adam(
        layers=[784] + hidden_layers + [10],
        optimizer=cfg.optimizer,
        learning_rate=cfg.learning_rate,
        momentum=cfg.momentum,
        beta1=cfg.beta1,
        beta2=cfg.beta2,
        beta_rms=cfg.beta_rms,
        epsilon=cfg.epsilon,
        weight_decay=cfg.weight_decay,
        init_type=cfg.init_type,
        activation=cfg.activation,
    )

    model.training(
        x_train=fit_images,
        y_train=fit_targets_1h,
        x_val=val_images,
        y_val=val_targets_1h,
        epochs=cfg.epochs,
        batch_size=cfg.batch_size,
    )

    # Evaluation on the test set
    predicted = model.predict(test_images)
    actual = np.argmax(test_targets_1h, axis=1)
    test_acc = np.mean(predicted == actual)

    wandb.log({"test_accuracy": test_acc})
    print("test accuracy ",test_acc)


# sweep configuration
# Random search maximising the logged "validation_accuracy". Every
# hyperparameter is a discrete list of candidates; single-element lists pin
# that value for the whole sweep.
sweep_config = {
    "method": "random",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        name: {"values": candidates}
        for name, candidates in (
            ("epochs", [15]),
            ("num_hidden_layers", [3, 4]),
            ("hidden_size", [64, 128]),
            ("weight_decay", [0.0]),
            ("learning_rate", [1e-3, 1e-4]),
            ("optimizer", ["adam", "nadam"]),
            ("batch_size", [32]),
            ("init_type", ["xavier", "random"]),
            ("activation", ["relu"]),
            ("momentum", [0.9]),
            ("beta1", [0.9]),
            ("beta2", [0.999]),
            ("beta_rms", [0.9]),
            ("epsilon", [1e-8]),
        )
    },
}

# Running the sweep

if __name__ == "__main__":
    # Register the sweep defined by sweep_config under the W&B project and
    # keep the id it returns.
    sweep_id = wandb.sweep(
        sweep_config,
        project="q4_sweep_project",
    )

    # Start an agent for that sweep; it calls train_sweep once per trial.
    wandb.agent(
        sweep_id,
        function=train_sweep,
    )


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: fqdtz4g7
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/q4_sweep_project/sweeps/fqdtz4g7


[34m[1mwandb[0m: Agent Starting Run: ohot1yen with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33med24s401[0m ([33med24s401-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/15 - loss=0.4769, val_acc=0.8518, val_loss=0.40060060396598857
Epoch 2/15 - loss=0.3621, val_acc=0.8680, val_loss=0.3611647742387115
Epoch 3/15 - loss=0.3289, val_acc=0.8670, val_loss=0.3534777875825798
Epoch 4/15 - loss=0.3095, val_acc=0.8732, val_loss=0.3482778639171706
Epoch 5/15 - loss=0.2909, val_acc=0.8735, val_loss=0.33800089407360445
Epoch 6/15 - loss=0.2788, val_acc=0.8852, val_loss=0.3181620776500594
Epoch 7/15 - loss=0.2641, val_acc=0.8860, val_loss=0.3173986692505926
Epoch 8/15 - loss=0.2556, val_acc=0.8862, val_loss=0.31423058026336115
Epoch 9/15 - loss=0.2419, val_acc=0.8832, val_loss=0.3406884992620306
Epoch 10/15 - loss=0.2351, val_acc=0.8875, val_loss=0.30044659833502785
Epoch 11/15 - loss=0.2274, val_acc=0.8897, val_loss=0.3239642810268094
Epoch 12/15 - loss=0.2198, val_acc=0.8857, val_loss=0.33989585383123366
Epoch 13/15 - loss=0.2124, val_acc=0.8888, val_loss=0.3058144514547397
Epoch 14/15 - loss=0.2053, val_acc=0.8937, val_loss=0.3083127319936225
Epoch 15/1

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
validation loss,█▅▅▄▄▂▂▂▄▁▃▄▁▂▃
validation_accuracy,▁▄▄▅▅▇▇▇▆▇▇▇▇█▇

0,1
epoch,15.0
test_accuracy,0.8892
training_loss,0.19835
validation loss,0.32951
validation_accuracy,0.8885


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5wyvxtmp with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.7885, val_acc=0.8087, val_loss=0.5533356808456145
Epoch 2/15 - loss=0.4806, val_acc=0.8382, val_loss=0.4575927928078974
Epoch 3/15 - loss=0.4284, val_acc=0.8500, val_loss=0.4310807654363351
Epoch 4/15 - loss=0.4017, val_acc=0.8565, val_loss=0.401306930252314
Epoch 5/15 - loss=0.3836, val_acc=0.8592, val_loss=0.3914466705850863
Epoch 6/15 - loss=0.3710, val_acc=0.8632, val_loss=0.3823828862680102
Epoch 7/15 - loss=0.3590, val_acc=0.8658, val_loss=0.3703948660501879
Epoch 8/15 - loss=0.3491, val_acc=0.8693, val_loss=0.3625794887560769
Epoch 9/15 - loss=0.3400, val_acc=0.8653, val_loss=0.3626237102509242
Epoch 10/15 - loss=0.3332, val_acc=0.8648, val_loss=0.3555114453265892
Epoch 11/15 - loss=0.3258, val_acc=0.8735, val_loss=0.3515976609162414
Epoch 12/15 - loss=0.3186, val_acc=0.8767, val_loss=0.34292758133342993
Epoch 13/15 - loss=0.3125, val_acc=0.8745, val_loss=0.3478598843153571
Epoch 14/15 - loss=0.3068, val_acc=0.8775, val_loss=0.34019948414623197
Epoch 15/15 - 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁
validation loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▅▆▆▇▇▇▇▇█████

0,1
epoch,15.0
test_accuracy,0.8706
training_loss,0.30154
validation loss,0.33879
validation_accuracy,0.87733


[34m[1mwandb[0m: Agent Starting Run: l6xwvvi5 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.7003, val_acc=0.8220, val_loss=0.4943318380748373
Epoch 2/15 - loss=0.4455, val_acc=0.8453, val_loss=0.4272136531671892
Epoch 3/15 - loss=0.4050, val_acc=0.8513, val_loss=0.4170393582911953
Epoch 4/15 - loss=0.3805, val_acc=0.8595, val_loss=0.3825536575852357
Epoch 5/15 - loss=0.3603, val_acc=0.8628, val_loss=0.37099440208518414
Epoch 6/15 - loss=0.3468, val_acc=0.8693, val_loss=0.3573359847989474
Epoch 7/15 - loss=0.3328, val_acc=0.8708, val_loss=0.3482750134588486
Epoch 8/15 - loss=0.3219, val_acc=0.8737, val_loss=0.33455455284190694
Epoch 9/15 - loss=0.3102, val_acc=0.8758, val_loss=0.3333025842760965
Epoch 10/15 - loss=0.3008, val_acc=0.8777, val_loss=0.32892516834399954
Epoch 11/15 - loss=0.2928, val_acc=0.8778, val_loss=0.32822808067028963
Epoch 12/15 - loss=0.2849, val_acc=0.8802, val_loss=0.3242672803580722
Epoch 13/15 - loss=0.2780, val_acc=0.8798, val_loss=0.31821372008848
Epoch 14/15 - loss=0.2710, val_acc=0.8833, val_loss=0.30992415812016627
Epoch 15/15 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▃▃▃▂▂▂▂▂▁▁▁▁▁
validation loss,█▅▅▄▃▃▂▂▂▂▂▂▁▁▁
validation_accuracy,▁▄▄▅▆▆▇▇▇▇▇████

0,1
epoch,15.0
test_accuracy,0.8761
training_loss,0.26401
validation loss,0.32226
validation_accuracy,0.8805


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 97on6uxu with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.4864, val_acc=0.8558, val_loss=0.39414863926527977
Epoch 2/15 - loss=0.3656, val_acc=0.8697, val_loss=0.35466168373727985
Epoch 3/15 - loss=0.3330, val_acc=0.8645, val_loss=0.3551483422911853
Epoch 4/15 - loss=0.3114, val_acc=0.8717, val_loss=0.3477499508047815
Epoch 5/15 - loss=0.2925, val_acc=0.8755, val_loss=0.33793942679718947
Epoch 6/15 - loss=0.2817, val_acc=0.8843, val_loss=0.3165049359637797
Epoch 7/15 - loss=0.2657, val_acc=0.8862, val_loss=0.3200998692929277
Epoch 8/15 - loss=0.2566, val_acc=0.8868, val_loss=0.3246274174481567
Epoch 9/15 - loss=0.2438, val_acc=0.8838, val_loss=0.31929888356391967
Epoch 10/15 - loss=0.2341, val_acc=0.8862, val_loss=0.3044654284737919
Epoch 11/15 - loss=0.2271, val_acc=0.8882, val_loss=0.32792625455069924
Epoch 12/15 - loss=0.2206, val_acc=0.8873, val_loss=0.32132468162556554
Epoch 13/15 - loss=0.2123, val_acc=0.8848, val_loss=0.334373050016081
Epoch 14/15 - loss=0.2056, val_acc=0.8922, val_loss=0.3208257262307884
Epoch 15/1

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
validation loss,█▅▅▄▄▂▂▃▂▁▃▂▃▂▄
validation_accuracy,▁▄▃▄▅▆▇▇▆▇▇▇▇█▇

0,1
epoch,15.0
test_accuracy,0.8854
training_loss,0.19786
validation loss,0.33918
validation_accuracy,0.888


[34m[1mwandb[0m: Agent Starting Run: 9l5v5mg7 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.7136, val_acc=0.8215, val_loss=0.514249439221173
Epoch 2/15 - loss=0.4450, val_acc=0.8522, val_loss=0.424631159802466
Epoch 3/15 - loss=0.4010, val_acc=0.8568, val_loss=0.4077086254771315
Epoch 4/15 - loss=0.3764, val_acc=0.8647, val_loss=0.38313558974950057
Epoch 5/15 - loss=0.3581, val_acc=0.8687, val_loss=0.36678927111228254
Epoch 6/15 - loss=0.3451, val_acc=0.8715, val_loss=0.36454694910194924
Epoch 7/15 - loss=0.3324, val_acc=0.8742, val_loss=0.3506223913021616
Epoch 8/15 - loss=0.3225, val_acc=0.8763, val_loss=0.3426140422574846
Epoch 9/15 - loss=0.3129, val_acc=0.8692, val_loss=0.34419958776581877
Epoch 10/15 - loss=0.3051, val_acc=0.8725, val_loss=0.33756536300903056
Epoch 11/15 - loss=0.2980, val_acc=0.8773, val_loss=0.33536077214787013
Epoch 12/15 - loss=0.2908, val_acc=0.8792, val_loss=0.32726795845946033
Epoch 13/15 - loss=0.2836, val_acc=0.8810, val_loss=0.332058711502308
Epoch 14/15 - loss=0.2789, val_acc=0.8793, val_loss=0.32688582932803106
Epoch 15/1

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▃▃▂▂▂▂▂▂▁▁▁▁▁
validation loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▅▅▆▇▇▇▇▇▇█████

0,1
epoch,15.0
test_accuracy,0.871
training_loss,0.27249
validation loss,0.332
validation_accuracy,0.87933


[34m[1mwandb[0m: Agent Starting Run: kuspv0cm with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.4983, val_acc=0.8520, val_loss=0.39620568058328415
Epoch 2/15 - loss=0.3740, val_acc=0.8683, val_loss=0.35397257665943666
Epoch 3/15 - loss=0.3386, val_acc=0.8603, val_loss=0.3735516517537113
Epoch 4/15 - loss=0.3171, val_acc=0.8702, val_loss=0.35227046469350487
Epoch 5/15 - loss=0.2992, val_acc=0.8717, val_loss=0.3392147276409889
Epoch 6/15 - loss=0.2862, val_acc=0.8913, val_loss=0.3076140956708525
Epoch 7/15 - loss=0.2720, val_acc=0.8900, val_loss=0.3109438713703989
Epoch 8/15 - loss=0.2614, val_acc=0.8868, val_loss=0.3136543904148377
Epoch 9/15 - loss=0.2490, val_acc=0.8853, val_loss=0.3142703619804999
Epoch 10/15 - loss=0.2408, val_acc=0.8875, val_loss=0.3045127358453091
Epoch 11/15 - loss=0.2317, val_acc=0.8908, val_loss=0.31719448859125154
Epoch 12/15 - loss=0.2256, val_acc=0.8867, val_loss=0.3221218955050082
Epoch 13/15 - loss=0.2162, val_acc=0.8900, val_loss=0.3026525728061711
Epoch 14/15 - loss=0.2084, val_acc=0.8947, val_loss=0.30650302112360034
Epoch 15/1

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
validation loss,█▅▆▅▄▁▂▂▂▁▂▂▁▁▃
validation_accuracy,▁▄▂▄▄▇▇▇▆▇▇▇▇█▇

0,1
epoch,15.0
test_accuracy,0.8767
training_loss,0.2002
validation loss,0.32889
validation_accuracy,0.88617


[34m[1mwandb[0m: Agent Starting Run: sfas1vq6 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.5227, val_acc=0.8418, val_loss=0.4190571691064101
Epoch 2/15 - loss=0.3794, val_acc=0.8588, val_loss=0.38165079268040614
Epoch 3/15 - loss=0.3448, val_acc=0.8722, val_loss=0.3394224718640618
Epoch 4/15 - loss=0.3230, val_acc=0.8742, val_loss=0.33376045545819505
Epoch 5/15 - loss=0.3045, val_acc=0.8765, val_loss=0.3328886604925862
Epoch 6/15 - loss=0.2925, val_acc=0.8813, val_loss=0.32026988218612995
Epoch 7/15 - loss=0.2806, val_acc=0.8793, val_loss=0.3319029520970732
Epoch 8/15 - loss=0.2715, val_acc=0.8825, val_loss=0.3093203109126555
Epoch 9/15 - loss=0.2600, val_acc=0.8850, val_loss=0.3084745316614946
Epoch 10/15 - loss=0.2517, val_acc=0.8825, val_loss=0.31247891546735035
Epoch 11/15 - loss=0.2457, val_acc=0.8768, val_loss=0.32989977623803163
Epoch 12/15 - loss=0.2384, val_acc=0.8865, val_loss=0.3035031063714345
Epoch 13/15 - loss=0.2320, val_acc=0.8825, val_loss=0.3233565565951363
Epoch 14/15 - loss=0.2275, val_acc=0.8913, val_loss=0.29329948867355904
Epoch 15/

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
validation loss,█▆▄▃▃▃▃▂▂▂▃▂▃▁▃
validation_accuracy,▁▃▅▆▆▇▆▇▇▇▆▇▇█▇

0,1
epoch,15.0
test_accuracy,0.8773
training_loss,0.2212
validation loss,0.32809
validation_accuracy,0.883


[34m[1mwandb[0m: Agent Starting Run: x80z2wtp with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.7933, val_acc=0.8160, val_loss=0.5245966101830507
Epoch 2/15 - loss=0.4833, val_acc=0.8368, val_loss=0.4603758925063214
Epoch 3/15 - loss=0.4372, val_acc=0.8405, val_loss=0.43951944542969096
Epoch 4/15 - loss=0.4108, val_acc=0.8535, val_loss=0.4092619256140888
Epoch 5/15 - loss=0.3916, val_acc=0.8582, val_loss=0.3980041108024879
Epoch 6/15 - loss=0.3772, val_acc=0.8598, val_loss=0.3870382981188369
Epoch 7/15 - loss=0.3651, val_acc=0.8555, val_loss=0.38731085643957164
Epoch 8/15 - loss=0.3544, val_acc=0.8698, val_loss=0.36207905602593593
Epoch 9/15 - loss=0.3441, val_acc=0.8733, val_loss=0.35626490768384306
Epoch 10/15 - loss=0.3373, val_acc=0.8743, val_loss=0.3521419088860554
Epoch 11/15 - loss=0.3278, val_acc=0.8713, val_loss=0.3560201447453623
Epoch 12/15 - loss=0.3235, val_acc=0.8750, val_loss=0.34315261245936207
Epoch 13/15 - loss=0.3160, val_acc=0.8735, val_loss=0.3422829032823996
Epoch 14/15 - loss=0.3113, val_acc=0.8725, val_loss=0.34418188888518186
Epoch 15/

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
validation loss,█▆▅▄▄▃▃▂▂▂▂▂▂▂▁
validation_accuracy,▁▃▄▅▆▆▅▇▇▇▇▇▇▇█

0,1
epoch,15.0
test_accuracy,0.8717
training_loss,0.30619
validation loss,0.32497
validation_accuracy,0.88


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rme04wsv with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.7885, val_acc=0.8087, val_loss=0.5533356808456145
Epoch 2/15 - loss=0.4806, val_acc=0.8382, val_loss=0.4575927928078974
Epoch 3/15 - loss=0.4284, val_acc=0.8500, val_loss=0.4310807654363351
Epoch 4/15 - loss=0.4017, val_acc=0.8565, val_loss=0.401306930252314
Epoch 5/15 - loss=0.3836, val_acc=0.8592, val_loss=0.3914466705850863
Epoch 6/15 - loss=0.3710, val_acc=0.8632, val_loss=0.3823828862680102
Epoch 7/15 - loss=0.3590, val_acc=0.8658, val_loss=0.3703948660501879
Epoch 8/15 - loss=0.3491, val_acc=0.8693, val_loss=0.3625794887560769
Epoch 9/15 - loss=0.3400, val_acc=0.8653, val_loss=0.3626237102509242
Epoch 10/15 - loss=0.3332, val_acc=0.8648, val_loss=0.3555114453265892
Epoch 11/15 - loss=0.3258, val_acc=0.8735, val_loss=0.3515976609162414
Epoch 12/15 - loss=0.3186, val_acc=0.8767, val_loss=0.34292758133342993
Epoch 13/15 - loss=0.3125, val_acc=0.8745, val_loss=0.3478598843153571
Epoch 14/15 - loss=0.3068, val_acc=0.8775, val_loss=0.34019948414623197
Epoch 15/15 - 

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁
validation loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
validation_accuracy,▁▄▅▆▆▇▇▇▇▇█████

0,1
epoch,15.0
test_accuracy,0.8706
training_loss,0.30154
validation loss,0.33879
validation_accuracy,0.87733


[34m[1mwandb[0m: Agent Starting Run: 428bnhlq with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.5133, val_acc=0.8485, val_loss=0.4114296920941669
Epoch 2/15 - loss=0.3796, val_acc=0.8603, val_loss=0.3794051494724149
Epoch 3/15 - loss=0.3459, val_acc=0.8763, val_loss=0.33713876249912106
Epoch 4/15 - loss=0.3222, val_acc=0.8700, val_loss=0.34271211559632137
Epoch 5/15 - loss=0.3044, val_acc=0.8753, val_loss=0.3313880690685784
Epoch 6/15 - loss=0.2903, val_acc=0.8735, val_loss=0.3363463944589117
Epoch 7/15 - loss=0.2790, val_acc=0.8783, val_loss=0.32034813919621125
Epoch 8/15 - loss=0.2700, val_acc=0.8845, val_loss=0.31434802523022465
Epoch 9/15 - loss=0.2598, val_acc=0.8873, val_loss=0.313987339211229
Epoch 10/15 - loss=0.2496, val_acc=0.8768, val_loss=0.33179118224869625
Epoch 11/15 - loss=0.2447, val_acc=0.8833, val_loss=0.323373830177318
Epoch 12/15 - loss=0.2356, val_acc=0.8862, val_loss=0.3169624628450392
Epoch 13/15 - loss=0.2317, val_acc=0.8805, val_loss=0.32292701420760844
Epoch 14/15 - loss=0.2261, val_acc=0.8872, val_loss=0.3094077637191036
Epoch 15/15

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
test_accuracy,▁
training_loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
validation loss,█▆▃▄▃▃▂▂▂▃▂▂▂▁▁
validation_accuracy,▁▃▆▅▅▅▆▇▇▆▇▇▆▇█

0,1
epoch,15.0
test_accuracy,0.8867
training_loss,0.21732
validation loss,0.30322
validation_accuracy,0.89117


[34m[1mwandb[0m: Agent Starting Run: 26dscs5d with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/15 - loss=0.4908, val_acc=0.8410, val_loss=0.41903148234283516
Epoch 2/15 - loss=0.3753, val_acc=0.8565, val_loss=0.38272879688989025
Epoch 3/15 - loss=0.3423, val_acc=0.8773, val_loss=0.33378893553610617
Epoch 4/15 - loss=0.3188, val_acc=0.8733, val_loss=0.3317447920329504
Epoch 5/15 - loss=0.3035, val_acc=0.8762, val_loss=0.33493537058426126
Epoch 6/15 - loss=0.2878, val_acc=0.8802, val_loss=0.3259554224533894
Epoch 7/15 - loss=0.2786, val_acc=0.8732, val_loss=0.3369888480798871
Epoch 8/15 - loss=0.2685, val_acc=0.8812, val_loss=0.32079680068523847
Epoch 9/15 - loss=0.2588, val_acc=0.8818, val_loss=0.32251568475432324
Epoch 10/15 - loss=0.2511, val_acc=0.8763, val_loss=0.32763735423097445
Epoch 11/15 - loss=0.2446, val_acc=0.8783, val_loss=0.33602265006410215
Epoch 12/15 - loss=0.2366, val_acc=0.8887, val_loss=0.3165670001630858
Epoch 13/15 - loss=0.2304, val_acc=0.8810, val_loss=0.3365970947740969
