In [None]:
import numpy as np
import wandb
from keras.datasets import fashion_mnist


# Neural Network Class: feed_forward_NN_4

class feed_forward_NN_4:
    """Fully connected feed-forward classifier with a softmax output layer.

    Every hidden layer uses the same activation (sigmoid, tanh or ReLU).
    Training minimises cross-entropy against one-hot targets plus an L2
    penalty on the weights, using one of the hand-written optimizers listed
    in ``_OPTIMIZERS``.
    """

    # Optimizer names the update code knows how to handle (lower-case).
    _OPTIMIZERS = ("sgd", "momentum", "nesterov", "rmsprop", "adam", "nadam")

    def __init__(self,
                 layers,
                 optimizer,
                 learning_rate,
                 momentum,
                 beta1,
                 beta2,
                 beta_rms,
                 epsilon,
                 weight_decay,
                 init_type,
                 activation
                 ):
        """Store hyperparameters and allocate weights, biases and optimizer state.

        Args:
            layers: layer widths, input first and output last.
            optimizer: one of ``_OPTIMIZERS`` (case-insensitive).
            learning_rate: step size applied to every update.
            momentum: velocity decay used by "momentum" and "nesterov".
            beta1: first-moment decay used by "adam" and "nadam".
            beta2: second-moment decay used by "adam" and "nadam".
            beta_rms: squared-gradient decay used by "rmsprop".
            epsilon: constant added to the denominator of adaptive updates.
            weight_decay: L2 coefficient applied to the weights (not biases).
            init_type: "xavier" scales weights by sqrt(1 / fan_in); any other
                value scales them by sqrt(2 / fan_in).
            activation: "sigmoid", "tanh" or "relu" (case-insensitive); any
                other value falls back to sigmoid.

        Raises:
            ValueError: if ``layers`` has fewer than two entries or
                ``optimizer`` is not a supported name.
        """
        if len(layers) < 2:
            raise ValueError(
                "layers must contain at least an input size and an output size"
            )

        self.layers = layers
        self.layer_n = len(layers)
        self.optimizer = optimizer.lower()
        if self.optimizer not in self._OPTIMIZERS:
            # Without this check an unknown name trains nothing, silently.
            raise ValueError(
                f"unknown optimizer {optimizer!r}; expected one of {self._OPTIMIZERS}"
            )
        self.lr = learning_rate
        self.momentum = momentum
        self.beta1 = beta1
        self.beta2 = beta2
        self.beta_rms = beta_rms
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.init_type = init_type.lower()
        self.activation = activation.lower()

        # Initialize weights & biases: one (fan_in, fan_out) matrix and one
        # (1, fan_out) row vector per pair of consecutive layers.
        self.weights = []
        self.biases = []
        for i in range(self.layer_n - 1):
            if self.init_type == "xavier":
                # "xavier" initialization: variance 1 / fan_in
                w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(1.0 / layers[i])
            else:
                # "random" initialization: variance 2 / fan_in
                w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(2.0 / layers[i])
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

        # Optimizer state: v_* holds the velocity (momentum / nesterov) or the
        # running squared gradient (rmsprop / adam / nadam); m_* holds the
        # running mean gradient; t counts adam / nadam steps.
        if self.optimizer in ["momentum", "nesterov", "rmsprop", "adam", "nadam"]:
            self.v_w = [np.zeros_like(w) for w in self.weights]
            self.v_b = [np.zeros_like(b) for b in self.biases]
        if self.optimizer in ["adam", "nadam"]:
            self.m_w = [np.zeros_like(w) for w in self.weights]
            self.m_b = [np.zeros_like(b) for b in self.biases]
            self.t = 0

    # activations
    def sigmoid(self, x):
        """Return the logistic function of ``x``, clipping inputs to [-500, 500]."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``."""
        return np.tanh(x)

    def relu(self, x):
        """Return ``x`` with negative entries replaced by zero."""
        return np.maximum(0, x)

    def activate(self, x):
        """Apply the configured hidden activation; unknown names use sigmoid."""
        if self.activation == "sigmoid":
            return self.sigmoid(x)
        elif self.activation == "tanh":
            return self.tanh(x)
        elif self.activation == "relu":
            return self.relu(x)
        else:
            return self.sigmoid(x)

    # derivatives
    def derivative(self, a):
        """Return the activation derivative expressed through its output ``a``.

        ``a`` is the post-activation value (not the pre-activation input), so
        sigmoid gives a * (1 - a), tanh gives 1 - a**2 and ReLU gives 1 where
        a > 0. Unknown activation names use the sigmoid form.
        """
        if self.activation == "sigmoid":
            return a * (1 - a)
        elif self.activation == "tanh":
            return 1 - a**2
        elif self.activation == "relu":
            return (a > 0).astype(float)
        else:
            return a * (1 - a)

    def softmax(self, x):
        """Return the row-wise softmax of ``x`` (row max subtracted before exp)."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    # Forward Pass
    def forward_pass(self, x):
        """Run ``x`` through the network and cache every layer's output.

        Returns:
            ``self.h``: a list whose first entry is ``x``, followed by each
            hidden activation and finally the softmax output.
        """
        self.h = [x]
        # Hidden layers
        for i in range(self.layer_n - 2):
            z = np.dot(self.h[i], self.weights[i]) + self.biases[i]
            self.h.append(self.activate(z))
        # Output layer - softmax
        z_out = np.dot(self.h[-1], self.weights[-1]) + self.biases[-1]
        self.h.append(self.softmax(z_out))
        return self.h

    # Backward Pass
    def backward_prop(self, y_true):
        """Back-propagate cross-entropy loss through the cached activations.

        Must be called after ``forward_pass`` on the matching inputs.

        Args:
            y_true: targets with the same shape as the softmax output.

        Returns:
            ``(dw, db)``: per-layer gradient lists averaged over the batch,
            without any weight-decay term.
        """
        m = y_true.shape[0]
        dw = [None] * (self.layer_n - 1)
        db = [None] * (self.layer_n - 1)

        # Softmax + cross-entropy gradient w.r.t. the output pre-activation.
        delta = self.h[-1] - y_true

        for i in reversed(range(self.layer_n - 1)):
            dw[i] = np.dot(self.h[i].T, delta) / m
            db[i] = np.sum(delta, axis=0, keepdims=True) / m
            if i > 0:
                # Push the error through layer i's weights, then through the
                # activation that produced h[i].
                delta = np.dot(delta, self.weights[i].T) * self.derivative(self.h[i])
        return dw, db

    def _loss(self, y_true, probs):
        """Return mean cross-entropy plus (weight_decay / 2) * sum of squared weights."""
        # 1e-10 keeps log() finite when a probability underflows to zero.
        data_loss = -np.mean(np.sum(y_true * np.log(probs + 1e-10), axis=1))
        l2_norm_weights = sum(np.sum(w ** 2) for w in self.weights)
        return data_loss + (self.weight_decay / 2) * l2_norm_weights

    # Param Updates for "Non-Nesterov"
    def _update_params(self, dw, db):
        """Apply one optimizer step for every optimizer except "nesterov".

        Args:
            dw: per-layer weight gradients (weight decay is added here).
            db: per-layer bias gradients.

        The caller's gradient arrays are left unmodified.
        """
        n_layers = self.layer_n - 1
        # Add weight decay into fresh arrays so the caller's gradients are
        # not changed behind its back.
        dw = [dw[i] + self.weight_decay * self.weights[i] for i in range(n_layers)]

        if self.optimizer == "sgd":
            for i in range(n_layers):
                self.weights[i] -= self.lr * dw[i]
                self.biases[i] -= self.lr * db[i]

        elif self.optimizer == "momentum":
            for i in range(n_layers):
                self.v_w[i] = self.momentum * self.v_w[i] + dw[i]
                self.v_b[i] = self.momentum * self.v_b[i] + db[i]
                self.weights[i] -= self.lr * self.v_w[i]
                self.biases[i] -= self.lr * self.v_b[i]

        elif self.optimizer == "rmsprop":
            for i in range(n_layers):
                self.v_w[i] = self.beta_rms * self.v_w[i] + (1 - self.beta_rms) * (dw[i] ** 2)
                self.v_b[i] = self.beta_rms * self.v_b[i] + (1 - self.beta_rms) * (db[i] ** 2)
                self.weights[i] -= self.lr * dw[i] / (np.sqrt(self.v_w[i]) + self.epsilon)
                self.biases[i] -= self.lr * db[i] / (np.sqrt(self.v_b[i]) + self.epsilon)

        elif self.optimizer == "adam":
            self.t += 1
            m_corr = 1 - self.beta1 ** self.t
            v_corr = 1 - self.beta2 ** self.t
            for i in range(n_layers):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / m_corr
                m_b_hat = self.m_b[i] / m_corr
                v_w_hat = self.v_w[i] / v_corr
                v_b_hat = self.v_b[i] / v_corr

                self.weights[i] -= self.lr * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i] -= self.lr * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

        elif self.optimizer == "nadam":
            self.t += 1
            # self.t is already the 1-based step count here. The look-ahead
            # momentum term is corrected with step t + 1; the current
            # gradient and the second moment are corrected with step t,
            # matching the adam branch above.
            m_corr_next = 1 - self.beta1 ** (self.t + 1)
            m_corr = 1 - self.beta1 ** self.t
            v_corr = 1 - self.beta2 ** self.t
            for i in range(n_layers):
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * dw[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * db[i]
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (dw[i] ** 2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (db[i] ** 2)

                # bias correction
                m_w_hat = self.m_w[i] / m_corr_next
                m_b_hat = self.m_b[i] / m_corr_next
                v_w_hat = self.v_w[i] / v_corr
                v_b_hat = self.v_b[i] / v_corr

                grad_term_w = self.beta1 * m_w_hat + (1 - self.beta1) * dw[i] / m_corr
                grad_term_b = self.beta1 * m_b_hat + (1 - self.beta1) * db[i] / m_corr

                self.weights[i] -= self.lr * grad_term_w / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i] -= self.lr * grad_term_b / (np.sqrt(v_b_hat) + self.epsilon)

    # Training Step  with "Nesterov"
    def _train_step(self, x_batch, y_batch):
        """Run forward, backward and one parameter update on a single batch.

        For "nesterov" the gradient is evaluated at the look-ahead point and
        the update is done here; every other optimizer delegates to
        ``_update_params``.

        Returns:
            The regularised loss measured before the update (for "nesterov",
            measured at the look-ahead parameters).
        """
        if self.optimizer != "nesterov":
            # Normal forward/back
            self.forward_pass(x_batch)
            loss = self._loss(y_batch, self.h[-1])
            dW, dB = self.backward_prop(y_batch)
            self._update_params(dW, dB)
            return loss

        n_layers = self.layer_n - 1
        lookahead = self.lr * self.momentum

        # Move to the look-ahead point: w_look = w - lr * momentum * v
        for i in range(n_layers):
            self.weights[i] -= lookahead * self.v_w[i]
            self.biases[i] -= lookahead * self.v_b[i]

        # Forward / backward at the look-ahead position
        self.forward_pass(x_batch)
        loss = self._loss(y_batch, self.h[-1])
        dW, dB = self.backward_prop(y_batch)

        # Weight decay uses the look-ahead weights, like the data gradient.
        for i in range(n_layers):
            dW[i] += self.weight_decay * self.weights[i]

        for i in range(n_layers):
            # Undo the look-ahead shift (back to w_t)
            self.weights[i] += lookahead * self.v_w[i]
            self.biases[i] += lookahead * self.v_b[i]

            # Update velocity: u_t = momentum * u_{t-1} + dW
            self.v_w[i] = self.momentum * self.v_w[i] + dW[i]
            self.v_b[i] = self.momentum * self.v_b[i] + dB[i]

            # Final param update: w = w - lr * u_t
            self.weights[i] -= self.lr * self.v_w[i]
            self.biases[i] -= self.lr * self.v_b[i]

        return loss

    # Outer Training Loop
    def training(self, x_train, y_train, x_val, y_val, epochs, batch_size):
        """Train for ``epochs`` passes, logging metrics to wandb after each one.

        Each epoch reshuffles the training set and iterates over full batches
        only (a trailing partial batch is skipped). If the training set is
        smaller than ``batch_size`` it is used as one short batch.

        Args:
            x_train, y_train: training inputs and one-hot targets.
            x_val, y_val: validation inputs and one-hot targets.
            epochs: number of passes over the training set.
            batch_size: number of samples per update.

        Raises:
            ValueError: if ``batch_size`` is not positive or ``x_train`` is empty.
        """
        n_samples = x_train.shape[0]
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if n_samples == 0:
            raise ValueError("x_train must contain at least one sample")
        # max(1, ...) avoids a zero batch count (and the division by zero
        # below) when the dataset is smaller than one batch.
        n_batches = max(1, n_samples // batch_size)

        for ep in range(epochs):
            idx = np.random.permutation(n_samples)
            x_train_shuff = x_train[idx]
            y_train_shuff = y_train[idx]
            epoch_loss = 0.0
            for b in range(n_batches):
                start = b * batch_size
                end = start + batch_size
                epoch_loss += self._train_step(x_train_shuff[start:end],
                                               y_train_shuff[start:end])
            avg_loss = epoch_loss / n_batches

            # Validation: one forward pass serves both accuracy and loss.
            val_outputs = self.forward_pass(x_val)[-1]
            preds = np.argmax(val_outputs, axis=1)
            val_labels = np.argmax(y_val, axis=1)
            val_acc = np.mean(preds == val_labels)

            # Cross-entropy loss for validation (no L2 term)
            val_loss = -np.mean(np.sum(y_val * np.log(val_outputs + 1e-10), axis=1))

            # Log metrics to wandb
            wandb.log({"epoch": ep+1, "training_loss": avg_loss, "validation_accuracy": val_acc, "validation loss": val_loss})
            print(f"Epoch {ep+1}/{epochs} - loss={avg_loss:.4f}, val_acc={val_acc:.4f}, val_loss={val_loss}" )

    # Prediction
    def predict(self, X):
        """Return the index of the highest softmax output for each row of ``X``."""
        self.forward_pass(X)
        return np.argmax(self.h[-1], axis=1)




# (x_train_full, y_train_full), (x_test, y_test) = fashion_mnist.load_data()
# x_train_full = x_train_full.reshape(x_train_full.shape[0], -1) / 255.0
# x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

# np.random.seed(42)
# idx = np.arange(x_train_full.shape[0])
# np.random.shuffle(idx)
# x_train_full = x_train_full[idx]
# y_train_full = y_train_full[idx]

# # 90% training, 10% validation 
# train_size=int(.9*len(x_train_full))

# x_train, y_train=x_train_full[:train_size],y_train_full[:train_size]
# x_val, y_val=x_train_full[train_size:], y_train_full[train_size:]

# num_classes = 10
# y_train_1h = np.eye(num_classes)[y_train]
# y_val_1h = np.eye(num_classes)[y_val]
# y_test_1h = np.eye(num_classes)[y_test]

# # model
# model = feed_forward_NN_4(
#     layers=[784] + [32] *3 + [10],
# optimizer="nesterov",
# learning_rate=0.01,
# momentum=0.9,
# beta1=0.9,
# beta2=0.999,
# beta_rms=0.9,
# epsilon=1e-4,
# weight_decay=0.0005,
# init_type="xavier",
# activation="relu")

#     # Train the model
# model.training(
#         x_train=x_train,
#         y_train=y_train_1h,
#         x_val=x_val,
#         y_val=y_val_1h,
#         epochs=10,
#         batch_size=32
#     )

#     #Evaluation on test set
# test_preds = model.predict(x_test)
# test_labels = np.argmax(y_test_1h, axis=1)
# test_acc = np.mean(test_preds == test_labels)
# print("test accuracy ",test_acc)
# #wandb.log({"test_accuracy": test_acc})




# train_sweep() function

def train_sweep():
    """Run a single sweep trial.

    Reads hyperparameters from ``wandb.config``, trains a
    ``feed_forward_NN_4`` on a 90/10 train/validation split of Fashion-MNIST,
    then logs and prints the accuracy on the test split.
    """
    wandb.init()
    cfg = wandb.config

    # Name the run after a few of its hyperparameters.
    wandb.run.name = f"hl_{cfg.num_hidden_layers}_bs_{cfg.batch_size}_ac_{cfg.activation}_opt_{cfg.optimizer}"

    # Fashion-MNIST: flatten each image to one row and divide pixels by 255.
    (images, targets), (test_images, test_targets) = fashion_mnist.load_data()
    images = images.reshape(images.shape[0], -1) / 255.0
    test_images = test_images.reshape(test_images.shape[0], -1) / 255.0

    # Fixed seed, so the shuffle below is the same on every run.
    np.random.seed(42)
    order = np.arange(images.shape[0])
    np.random.shuffle(order)
    images, targets = images[order], targets[order]

    # First 90% of the shuffled data trains, the remainder validates.
    cut = int(.9 * len(images))
    x_train, x_val = images[:cut], images[cut:]

    # Row k of the identity matrix is the one-hot vector for class k.
    num_classes = 10
    one_hot = np.eye(num_classes)
    y_train_1h = one_hot[targets[:cut]]
    y_val_1h = one_hot[targets[cut:]]
    y_test_1h = one_hot[test_targets]

    # Every remaining constructor argument is a config entry of the same name.
    hyperparameter_names = (
        "optimizer", "learning_rate", "momentum", "beta1", "beta2",
        "beta_rms", "epsilon", "weight_decay", "init_type", "activation",
    )
    model = feed_forward_NN_4(
        layers=[784, *([cfg.hidden_size] * cfg.num_hidden_layers), 10],
        **{name: getattr(cfg, name) for name in hyperparameter_names},
    )

    model.training(x_train, y_train_1h, x_val, y_val_1h, cfg.epochs, cfg.batch_size)

    # Evaluation on the test set
    test_acc = np.mean(model.predict(test_images) == np.argmax(y_test_1h, axis=1))

    wandb.log({"test_accuracy": test_acc})
    print("test accuracy ",test_acc)


# sweep configuration
# Random-search sweep that maximises the "validation_accuracy" metric.
# Each entry below lists the discrete choices for one hyperparameter.
sweep_config = {
    "method": "random",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "parameters": {
        name: {"values": choices}
        for name, choices in (
            ("epochs", [10]),
            ("num_hidden_layers", [3, 5]),
            ("hidden_size", [64, 128]),
            ("weight_decay", [0.0, 0.0005]),
            ("learning_rate", [1e-3, 1e-4]),
            ("optimizer", ["momentum", "nesterov", "rmsprop", "adam", "nadam"]),
            ("batch_size", [32]),
            ("init_type", ["random", "xavier"]),
            ("activation", ["sigmoid", "tanh", "relu"]),
            ("momentum", [0.8, 0.9]),
            ("beta1", [0.9]),
            ("beta2", [0.999]),
            ("beta_rms", [0.9]),
            ("epsilon", [1e-8]),
        )
    },
}

# Entry point: register the sweep, then let an agent call train_sweep for
# each configuration it is handed.
if __name__ == "__main__":
    # Creating sweep
    sweep_id = wandb.sweep(sweep_config, project="q4_sweep_project")
    # Launching sweep agent
    wandb.agent(sweep_id, function=train_sweep)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: p60nv46x
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/q4_sweep_project/sweeps/p60nv46x


[34m[1mwandb[0m: Agent Starting Run: 983zh0zp with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33med24s401[0m ([33med24s401-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10 - loss=0.5553, val_acc=0.8282, val_loss=0.4502716316380152
Epoch 2/10 - loss=0.4163, val_acc=0.8480, val_loss=0.42432164110251913
Epoch 3/10 - loss=0.3919, val_acc=0.8623, val_loss=0.3812924125999896
Epoch 4/10 - loss=0.3840, val_acc=0.8690, val_loss=0.4012169075061982
Epoch 5/10 - loss=0.3837, val_acc=0.8687, val_loss=0.39931507821164314
Epoch 6/10 - loss=0.3857, val_acc=0.8615, val_loss=0.40627183501358344
Epoch 7/10 - loss=0.3884, val_acc=0.8632, val_loss=0.4016074682796713
Epoch 8/10 - loss=0.3928, val_acc=0.8398, val_loss=0.5069759598151149
Epoch 9/10 - loss=0.3916, val_acc=0.8578, val_loss=0.4444773724894061
Epoch 10/10 - loss=0.3890, val_acc=0.8718, val_loss=0.42694463804726085
test accuracy  0.8656


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▂▁▁▁▁▁▁▁▁
validation loss,▅▃▁▂▂▂▂█▅▄
validation_accuracy,▁▄▆█▇▆▇▃▆█

0,1
epoch,10.0
test_accuracy,0.8656
training_loss,0.38904
validation loss,0.42694
validation_accuracy,0.87183


[34m[1mwandb[0m: Agent Starting Run: uovgr4vw with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=0.8834, val_acc=0.8267, val_loss=0.47538230296875705
Epoch 2/10 - loss=0.6570, val_acc=0.8558, val_loss=0.39825404014473625
Epoch 3/10 - loss=0.6084, val_acc=0.8575, val_loss=0.39168677438194544
Epoch 4/10 - loss=0.5783, val_acc=0.8637, val_loss=0.36085865300128966
Epoch 5/10 - loss=0.5563, val_acc=0.8707, val_loss=0.3545870458846383
Epoch 6/10 - loss=0.5377, val_acc=0.8703, val_loss=0.34473827226874093
Epoch 7/10 - loss=0.5217, val_acc=0.8723, val_loss=0.3401226195617471
Epoch 8/10 - loss=0.5085, val_acc=0.8760, val_loss=0.33950222180277234
Epoch 9/10 - loss=0.4963, val_acc=0.8757, val_loss=0.3220967424473837
Epoch 10/10 - loss=0.4843, val_acc=0.8688, val_loss=0.343127294585632
test accuracy  0.8675


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▃▂▂▂▁▁▁
validation loss,█▄▄▃▂▂▂▂▁▂
validation_accuracy,▁▅▅▆▇▇▇██▇

0,1
epoch,10.0
test_accuracy,0.8675
training_loss,0.48426
validation loss,0.34313
validation_accuracy,0.86883


[34m[1mwandb[0m: Agent Starting Run: q3c3z15h with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=1.3440, val_acc=0.7082, val_loss=0.9073662886684932
Epoch 2/10 - loss=0.7750, val_acc=0.7633, val_loss=0.7138991853294984
Epoch 3/10 - loss=0.6476, val_acc=0.7920, val_loss=0.6288609369578053
Epoch 4/10 - loss=0.5838, val_acc=0.7983, val_loss=0.5806358211738426
Epoch 5/10 - loss=0.5445, val_acc=0.8093, val_loss=0.5485756062703193
Epoch 6/10 - loss=0.5190, val_acc=0.8168, val_loss=0.5264923467970231
Epoch 7/10 - loss=0.4991, val_acc=0.8190, val_loss=0.5119154577780011
Epoch 8/10 - loss=0.4842, val_acc=0.8245, val_loss=0.4939587734489917
Epoch 9/10 - loss=0.4717, val_acc=0.8273, val_loss=0.48385537928223477
Epoch 10/10 - loss=0.4610, val_acc=0.8283, val_loss=0.477346511452069
test accuracy  0.8235


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▃▂▂▂▁▁▁▁▁
validation loss,█▅▃▃▂▂▂▁▁▁
validation_accuracy,▁▄▆▆▇▇▇███

0,1
epoch,10.0
test_accuracy,0.8235
training_loss,0.46102
validation loss,0.47735
validation_accuracy,0.82833


[34m[1mwandb[0m: Agent Starting Run: 9he8lidr with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=1.6901, val_acc=0.6608, val_loss=1.0849220550864034
Epoch 2/10 - loss=1.0849, val_acc=0.7203, val_loss=0.8209892621538978
Epoch 3/10 - loss=0.9088, val_acc=0.7500, val_loss=0.7014003415657878
Epoch 4/10 - loss=0.8141, val_acc=0.7800, val_loss=0.6265385215766032
Epoch 5/10 - loss=0.7525, val_acc=0.7970, val_loss=0.5769467214846653
Epoch 6/10 - loss=0.7115, val_acc=0.8092, val_loss=0.5440950830985022
Epoch 7/10 - loss=0.6837, val_acc=0.8172, val_loss=0.520259758321714
Epoch 8/10 - loss=0.6639, val_acc=0.8195, val_loss=0.5060898067384088
Epoch 9/10 - loss=0.6491, val_acc=0.8228, val_loss=0.4920164784575944
Epoch 10/10 - loss=0.6372, val_acc=0.8255, val_loss=0.4804385771582166
test accuracy  0.8225


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▂▂▁▁▁▁▁
validation loss,█▅▄▃▂▂▁▁▁▁
validation_accuracy,▁▄▅▆▇▇████

0,1
epoch,10.0
test_accuracy,0.8225
training_loss,0.63718
validation loss,0.48044
validation_accuracy,0.8255


[34m[1mwandb[0m: Agent Starting Run: 06fllqqi with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.6100, val_acc=0.8407, val_loss=0.44123077796022897
Epoch 2/10 - loss=0.4132, val_acc=0.8550, val_loss=0.4002993521066746
Epoch 3/10 - loss=0.3807, val_acc=0.8495, val_loss=0.40426051903768817
Epoch 4/10 - loss=0.3577, val_acc=0.8613, val_loss=0.3784133103676986
Epoch 5/10 - loss=0.3424, val_acc=0.8652, val_loss=0.35747088755349204
Epoch 6/10 - loss=0.3284, val_acc=0.8642, val_loss=0.3588343176087859
Epoch 7/10 - loss=0.3184, val_acc=0.8747, val_loss=0.33800688875213775
Epoch 8/10 - loss=0.3071, val_acc=0.8792, val_loss=0.33343892668284936
Epoch 9/10 - loss=0.3005, val_acc=0.8750, val_loss=0.33113935830790064
Epoch 10/10 - loss=0.2907, val_acc=0.8742, val_loss=0.34139463457914676
test accuracy  0.8719


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▂▂▂▂▁▁▁
validation loss,█▅▆▄▃▃▁▁▁▂
validation_accuracy,▁▄▃▅▅▅▇█▇▇

0,1
epoch,10.0
test_accuracy,0.8719
training_loss,0.29069
validation loss,0.34139
validation_accuracy,0.87417


[34m[1mwandb[0m: Agent Starting Run: c283qtl2 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.5316, val_acc=0.8408, val_loss=0.4262506810276712
Epoch 2/10 - loss=0.3910, val_acc=0.8612, val_loss=0.38029229947575316
Epoch 3/10 - loss=0.3584, val_acc=0.8673, val_loss=0.35333921797827467
Epoch 4/10 - loss=0.3346, val_acc=0.8703, val_loss=0.35265591099771987
Epoch 5/10 - loss=0.3207, val_acc=0.8672, val_loss=0.3440293440400351
Epoch 6/10 - loss=0.3072, val_acc=0.8673, val_loss=0.3479996395814301
Epoch 7/10 - loss=0.2944, val_acc=0.8773, val_loss=0.3374566103018656
Epoch 8/10 - loss=0.2859, val_acc=0.8805, val_loss=0.3280997443162233
Epoch 9/10 - loss=0.2771, val_acc=0.8798, val_loss=0.3222661801201401
Epoch 10/10 - loss=0.2679, val_acc=0.8777, val_loss=0.33938835640725173
test accuracy  0.8716


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▃▂▂▂▁▁▁
validation loss,█▅▃▃▂▃▂▁▁▂
validation_accuracy,▁▅▆▆▆▆▇██▇

0,1
epoch,10.0
test_accuracy,0.8716
training_loss,0.26789
validation loss,0.33939
validation_accuracy,0.87767


[34m[1mwandb[0m: Agent Starting Run: 87e7mxie with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=1.6882, val_acc=0.6713, val_loss=0.9829690042692873
Epoch 2/10 - loss=1.0144, val_acc=0.7610, val_loss=0.7127878521082422
Epoch 3/10 - loss=0.8510, val_acc=0.7903, val_loss=0.6126208839811959
Epoch 4/10 - loss=0.7720, val_acc=0.8055, val_loss=0.556748979053276
Epoch 5/10 - loss=0.7221, val_acc=0.8183, val_loss=0.5202245598817438
Epoch 6/10 - loss=0.6901, val_acc=0.8305, val_loss=0.4956543569604034
Epoch 7/10 - loss=0.6667, val_acc=0.8338, val_loss=0.4803127923235173
Epoch 8/10 - loss=0.6491, val_acc=0.8340, val_loss=0.46663277474079934
Epoch 9/10 - loss=0.6347, val_acc=0.8390, val_loss=0.45545805654147564
Epoch 10/10 - loss=0.6227, val_acc=0.8425, val_loss=0.44658549176739265
test accuracy  0.836


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▂▂▂▁▁▁▁▁
validation loss,█▄▃▂▂▂▁▁▁▁
validation_accuracy,▁▅▆▆▇█████

0,1
epoch,10.0
test_accuracy,0.836
training_loss,0.62273
validation loss,0.44659
validation_accuracy,0.8425


[34m[1mwandb[0m: Agent Starting Run: pxrx0b96 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.4830, val_acc=0.8525, val_loss=0.3915567499634833
Epoch 2/10 - loss=0.3673, val_acc=0.8642, val_loss=0.3709101547896698
Epoch 3/10 - loss=0.3353, val_acc=0.8743, val_loss=0.3340325507608215
Epoch 4/10 - loss=0.3147, val_acc=0.8802, val_loss=0.32716974497891876
Epoch 5/10 - loss=0.2985, val_acc=0.8735, val_loss=0.3279797500329235
Epoch 6/10 - loss=0.2853, val_acc=0.8817, val_loss=0.3249539008688805
Epoch 7/10 - loss=0.2745, val_acc=0.8837, val_loss=0.32202964045447513
Epoch 8/10 - loss=0.2651, val_acc=0.8832, val_loss=0.30925817110188275
Epoch 9/10 - loss=0.2554, val_acc=0.8857, val_loss=0.3108576969021752
Epoch 10/10 - loss=0.2466, val_acc=0.8797, val_loss=0.3196521121128617
test accuracy  0.8726


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▅▄▃▃▂▂▂▁▁
validation loss,█▆▃▃▃▂▂▁▁▂
validation_accuracy,▁▃▆▇▅▇█▇█▇

0,1
epoch,10.0
test_accuracy,0.8726
training_loss,0.24655
validation loss,0.31965
validation_accuracy,0.87967


[34m[1mwandb[0m: Agent Starting Run: gag8ms48 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=2.4132, val_acc=0.1768, val_loss=2.292150293765403
Epoch 2/10 - loss=2.3905, val_acc=0.2853, val_loss=2.2861204554435686
Epoch 3/10 - loss=2.3842, val_acc=0.2873, val_loss=2.2802184838898456
Epoch 4/10 - loss=2.3777, val_acc=0.4022, val_loss=2.2732270674223356
Epoch 5/10 - loss=2.3706, val_acc=0.4557, val_loss=2.2657527965506956
Epoch 6/10 - loss=2.3627, val_acc=0.4705, val_loss=2.2576536299632615
Epoch 7/10 - loss=2.3540, val_acc=0.4965, val_loss=2.2483936568820377
Epoch 8/10 - loss=2.3440, val_acc=0.4177, val_loss=2.2378449647951
Epoch 9/10 - loss=2.3327, val_acc=0.4445, val_loss=2.2254009518067543
Epoch 10/10 - loss=2.3195, val_acc=0.4220, val_loss=2.2110843586417293
test accuracy  0.4219


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▆▆▅▅▄▄▃▂▁
validation loss,█▇▇▆▆▅▄▃▂▁
validation_accuracy,▁▃▃▆▇▇█▆▇▆

0,1
epoch,10.0
test_accuracy,0.4219
training_loss,2.31946
validation loss,2.21108
validation_accuracy,0.422


[34m[1mwandb[0m: Agent Starting Run: 8np5n7m6 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.5553, val_acc=0.8282, val_loss=0.4502716316380152
Epoch 2/10 - loss=0.4163, val_acc=0.8480, val_loss=0.42432164110251913
Epoch 3/10 - loss=0.3919, val_acc=0.8623, val_loss=0.3812924125999896
Epoch 4/10 - loss=0.3840, val_acc=0.8690, val_loss=0.4012169075061982
Epoch 5/10 - loss=0.3837, val_acc=0.8687, val_loss=0.39931507821164314
Epoch 6/10 - loss=0.3857, val_acc=0.8615, val_loss=0.40627183501358344
Epoch 7/10 - loss=0.3884, val_acc=0.8632, val_loss=0.4016074682796713
Epoch 8/10 - loss=0.3928, val_acc=0.8398, val_loss=0.5069759598151149
Epoch 9/10 - loss=0.3916, val_acc=0.8578, val_loss=0.4444773724894061
Epoch 10/10 - loss=0.3890, val_acc=0.8718, val_loss=0.42694463804726085
test accuracy  0.8656


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▂▁▁▁▁▁▁▁▁
validation loss,▅▃▁▂▂▂▂█▅▄
validation_accuracy,▁▄▆█▇▆▇▃▆█

0,1
epoch,10.0
test_accuracy,0.8656
training_loss,0.38904
validation loss,0.42694
validation_accuracy,0.87183


[34m[1mwandb[0m: Agent Starting Run: d7kc72pl with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=1.8201, val_acc=0.6285, val_loss=1.4182794127681342
Epoch 2/10 - loss=1.2176, val_acc=0.6615, val_loss=1.0795495502435692
Epoch 3/10 - loss=0.9810, val_acc=0.6913, val_loss=0.916333560681336
Epoch 4/10 - loss=0.8531, val_acc=0.7242, val_loss=0.8176308782402324
Epoch 5/10 - loss=0.7706, val_acc=0.7383, val_loss=0.7493140426217068
Epoch 6/10 - loss=0.7109, val_acc=0.7557, val_loss=0.6978554089214497
Epoch 7/10 - loss=0.6642, val_acc=0.7702, val_loss=0.6565216775726654
Epoch 8/10 - loss=0.6268, val_acc=0.7792, val_loss=0.6239664273975669
Epoch 9/10 - loss=0.5962, val_acc=0.7862, val_loss=0.5977701306216459
Epoch 10/10 - loss=0.5711, val_acc=0.7950, val_loss=0.5747982165197526
test accuracy  0.7966


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▅▃▃▂▂▂▁▁▁
validation loss,█▅▄▃▂▂▂▁▁▁
validation_accuracy,▁▂▄▅▆▆▇▇██

0,1
epoch,10.0
test_accuracy,0.7966
training_loss,0.57108
validation loss,0.5748
validation_accuracy,0.795


[34m[1mwandb[0m: Agent Starting Run: b2u3g9u7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=1.4049, val_acc=0.6580, val_loss=0.8802240088826195
Epoch 2/10 - loss=0.9893, val_acc=0.6897, val_loss=0.7898220231309623
Epoch 3/10 - loss=0.9177, val_acc=0.7102, val_loss=0.7576589613292878
Epoch 4/10 - loss=0.8454, val_acc=0.7783, val_loss=0.6096843799087401
Epoch 5/10 - loss=0.7688, val_acc=0.8012, val_loss=0.5732428096530953
Epoch 6/10 - loss=0.7143, val_acc=0.8263, val_loss=0.51107325815624
Epoch 7/10 - loss=0.6963, val_acc=0.8280, val_loss=0.48082160792455714
Epoch 8/10 - loss=0.6796, val_acc=0.8390, val_loss=0.4740562301790807
Epoch 9/10 - loss=0.6668, val_acc=0.8388, val_loss=0.4716351772123521
Epoch 10/10 - loss=0.6625, val_acc=0.8365, val_loss=0.4660809467256131
test accuracy  0.8336


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▃▂▁▁▁▁▁
validation loss,█▆▆▃▃▂▁▁▁▁
validation_accuracy,▁▂▃▆▇█████

0,1
epoch,10.0
test_accuracy,0.8336
training_loss,0.66249
validation loss,0.46608
validation_accuracy,0.8365


[34m[1mwandb[0m: Agent Starting Run: rzj40vla with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=0.8189, val_acc=0.8330, val_loss=0.47288786305722436
Epoch 2/10 - loss=0.5876, val_acc=0.8490, val_loss=0.42048123166841916
Epoch 3/10 - loss=0.5446, val_acc=0.8553, val_loss=0.40074556277352696
Epoch 4/10 - loss=0.5177, val_acc=0.8617, val_loss=0.37286030160037376
Epoch 5/10 - loss=0.4973, val_acc=0.8620, val_loss=0.3725171362276269
Epoch 6/10 - loss=0.4816, val_acc=0.8667, val_loss=0.3578653649249147
Epoch 7/10 - loss=0.4685, val_acc=0.8707, val_loss=0.34814295280925867
Epoch 8/10 - loss=0.4573, val_acc=0.8785, val_loss=0.3339394323350977
Epoch 9/10 - loss=0.4464, val_acc=0.8735, val_loss=0.3362757898576429
Epoch 10/10 - loss=0.4362, val_acc=0.8832, val_loss=0.3229743905907827
test accuracy  0.876


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▂▂▂▂▁▁▁
validation loss,█▆▅▃▃▃▂▂▂▁
validation_accuracy,▁▃▄▅▅▆▆▇▇█

0,1
epoch,10.0
test_accuracy,0.876
training_loss,0.43622
validation loss,0.32297
validation_accuracy,0.88317


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1ybr11n1 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.8
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.6495, val_acc=0.8287, val_loss=0.4716594105169914
Epoch 2/10 - loss=0.4239, val_acc=0.8508, val_loss=0.4094628426307846
Epoch 3/10 - loss=0.3844, val_acc=0.8553, val_loss=0.4017706956811508
Epoch 4/10 - loss=0.3606, val_acc=0.8672, val_loss=0.36979783960650636
Epoch 5/10 - loss=0.3414, val_acc=0.8645, val_loss=0.3633434222988479
Epoch 6/10 - loss=0.3276, val_acc=0.8720, val_loss=0.3461213650769512
Epoch 7/10 - loss=0.3138, val_acc=0.8745, val_loss=0.339501928620412
Epoch 8/10 - loss=0.3028, val_acc=0.8777, val_loss=0.32613300740102114
Epoch 9/10 - loss=0.2913, val_acc=0.8803, val_loss=0.32298694665596944
Epoch 10/10 - loss=0.2815, val_acc=0.8817, val_loss=0.31899116400933514
test accuracy  0.8767


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▄▃▃▂▂▂▁▁▁
validation loss,█▅▅▃▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▆▇▇▇██

0,1
epoch,10.0
test_accuracy,0.8767
training_loss,0.28153
validation loss,0.31899
validation_accuracy,0.88167


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nglt2wjb with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 1/10 - loss=0.8667, val_acc=0.8015, val_loss=0.5542136418987664
Epoch 2/10 - loss=0.4980, val_acc=0.8322, val_loss=0.46923135082256345
Epoch 3/10 - loss=0.4471, val_acc=0.8422, val_loss=0.4511293157104777
Epoch 4/10 - loss=0.4176, val_acc=0.8525, val_loss=0.4181017991411145
Epoch 5/10 - loss=0.3947, val_acc=0.8477, val_loss=0.4115344210252111
Epoch 6/10 - loss=0.3807, val_acc=0.8617, val_loss=0.3815295543177438
Epoch 7/10 - loss=0.3656, val_acc=0.8640, val_loss=0.3686995210007295
Epoch 8/10 - loss=0.3543, val_acc=0.8678, val_loss=0.35666497829869853
Epoch 9/10 - loss=0.3424, val_acc=0.8685, val_loss=0.357329174166745
Epoch 10/10 - loss=0.3330, val_acc=0.8713, val_loss=0.34871448679526496
test accuracy  0.8641


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_accuracy,▁
training_loss,█▃▂▂▂▂▁▁▁▁
validation loss,█▅▄▃▃▂▂▁▁▁
validation_accuracy,▁▄▅▆▆▇▇███

0,1
epoch,10.0
test_accuracy,0.8641
training_loss,0.33302
validation loss,0.34871
validation_accuracy,0.87133


[34m[1mwandb[0m: Agent Starting Run: 4nee0p0z with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beta1: 0.9
[34m[1mwandb[0m: 	beta2: 0.999
[34m[1mwandb[0m: 	beta_rms: 0.9
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	epsilon: 1e-08
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	init_type: random
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0.0005


Epoch 1/10 - loss=1.6906, val_acc=0.6823, val_loss=0.9048527284554221
Epoch 2/10 - loss=0.9201, val_acc=0.7538, val_loss=0.6892007252587247
Epoch 3/10 - loss=0.7928, val_acc=0.7907, val_loss=0.6004321967752192
Epoch 4/10 - loss=0.7314, val_acc=0.8097, val_loss=0.5544426119523772
