# Digit Mnist 

In [1]:
import numpy as np
import wandb

# Optimizer Class 
class Optimizer:
    def __init__(self, method="sgd", learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.method = method
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.velocity = {}
        self.cache = {}
        self.m = {}
        self.v = {}
        self.t = 0

    def update(self, weights, grads, key):
        if self.method == "sgd":
            return weights - self.lr * grads

        elif self.method == "momentum":
            if key not in self.velocity:
                self.velocity[key] = np.zeros_like(weights)
            self.velocity[key] = self.beta1 * self.velocity[key] - self.lr * grads
            return weights + self.velocity[key]

        elif self.method == "nag":
            if key not in self.velocity:
                self.velocity[key] = np.zeros_like(weights)
            lookahead = weights + self.beta1 * self.velocity[key]
            self.velocity[key] = self.beta1 * self.velocity[key] - self.lr * grads
            return lookahead + self.velocity[key]

        elif self.method == "rmsprop":
            if key not in self.cache:
                self.cache[key] = np.zeros_like(weights)
            self.cache[key] = self.beta1 * self.cache[key] + (1 - self.beta1) * (grads ** 2)
            return weights - self.lr * grads / (np.sqrt(self.cache[key]) + self.epsilon)

        elif self.method == "adam":
            self.t += 1
            if key not in self.m:
                self.m[key] = np.zeros_like(weights)
                self.v[key] = np.zeros_like(weights)
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grads
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (grads ** 2)
            m_hat = self.m[key] / (1 - self.beta1 ** self.t)
            v_hat = self.v[key] / (1 - self.beta2 ** self.t)
            return weights - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

# Load MNIST Dataset
def load_data():
    from keras.datasets import mnist  # Using digit MNIST dataset
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train, X_test = X_train / 255.0, X_test / 255.0  # Normalize
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)
    
    # Split 10% of training data for validation
    val_size = int(0.1 * X_train.shape[0])
    X_val, y_val = X_train[:val_size], y_train[:val_size]
    X_train, y_train = X_train[val_size:], y_train[val_size:]
    
    return X_train, y_train, X_val, y_val, X_test, y_test 

def initialize_weights(layers, init_type):
    weights = {}
    for i in range(len(layers) - 1):
        if init_type == "xavier":
            weights[f'W{i+1}'] = np.random.randn(layers[i], layers[i+1]) / np.sqrt(layers[i])
        elif init_type == "he":
            weights[f'W{i+1}'] = np.random.randn(layers[i], layers[i+1]) * np.sqrt(2 / layers[i])
        else:
            weights[f'W{i+1}'] = np.random.randn(layers[i], layers[i+1]) * 0.1  
        weights[f'b{i+1}'] = np.zeros((1, layers[i+1]))
    return weights
    
def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True)) 
    return expZ / np.sum(expZ, axis=1, keepdims=True)

def forward_pass(X, weights, activation):
    Z, A = {}, {"A0": X}
    for i in range(1, len(weights) // 2):
        Z[f'Z{i}'] = np.dot(A[f'A{i-1}'], weights[f'W{i}']) + weights[f'b{i}']
        A[f'A{i}'] = np.maximum(0, Z[f'Z{i}']) if activation == "relu" else 1 / (1 + np.exp(-Z[f'Z{i}']))

    last_layer = len(weights) // 2
    Z[f'Z{last_layer}'] = np.dot(A[f'A{last_layer-1}'], weights[f'W{last_layer}']) + weights[f'b{last_layer}']
    A[f'A{last_layer}'] = softmax(Z[f'Z{last_layer}'])
    
    return Z, A

def backward_pass(X, Y, weights, A, Z, activation, weight_decay):
    grads = {}
    m = X.shape[0]
    dA = A[f'A{len(A)-1}'] - Y 

    for i in reversed(range(1, len(weights) // 2 + 1)):
        dZ = dA * (A[f'A{i}'] > 0) if activation == "relu" else dA * A[f'A{i}'] * (1 - A[f'A{i}'])
        grads[f'dW{i}'] = np.dot(A[f'A{i-1}'].T, dZ) / m + weight_decay * weights[f'W{i}']
        grads[f'db{i}'] = np.sum(dZ, axis=0, keepdims=True) / m
        dA = np.dot(dZ, weights[f'W{i}'].T)
    
    return grads

def apply_optimizer(optimizer, weights, grads):
    for i in range(1, len(weights) // 2 + 1):
        weights[f'W{i}'] = optimizer.update(weights[f'W{i}'], grads[f'dW{i}'], f'W{i}')
        weights[f'b{i}'] = optimizer.update(weights[f'b{i}'], grads[f'db{i}'], f'b{i}')

def compute_loss(Y, A):
    m = Y.shape[0]
    return -np.sum(Y * np.log(A + 1e-8)) / m  

def compute_accuracy(Y, A):
    return np.mean(np.argmax(Y, axis=1) == np.argmax(A, axis=1))

def train(config=None):
    with wandb.init(config=config, reinit=True):
        config = wandb.config
        wandb.run.name = (
            "_hl_" + str(config.num_layers) +
            "_hn_" + str(config.hidden_size) +
            "_opt_" + config.optimizer +
            "_act_" + config.activation +
            "_lr_" + str(config.learning_rate) +
            "_bs_" + str(config.batch_size) +
            "_init_" + config.weight_init +
            "_ep_" + str(config.epochs) +
            "_l2_" + str(config.weight_decay)
        )

        X_train, y_train, X_val, y_val, _, _ = load_data()
        num_classes = 10
        y_train, y_val = [np.eye(num_classes)[y] for y in [y_train, y_val]]
        
        layers = [784] + [config.hidden_size] * config.num_layers + [num_classes]
        weights = initialize_weights(layers, config.weight_init)
        
        optimizer = Optimizer(method=config.optimizer, learning_rate=config.learning_rate)
        
        for epoch in range(config.epochs):
            for i in range(0, X_train.shape[0], config.batch_size):
                X_batch = X_train[i:i + config.batch_size]
                y_batch = y_train[i:i + config.batch_size]
                
                Z, A = forward_pass(X_batch, weights, config.activation)
                grads = backward_pass(X_batch, y_batch, weights, A, Z, config.activation, config.weight_decay)
                apply_optimizer(optimizer, weights, grads)
                
            Z_val, A_val = forward_pass(X_val, weights, config.activation)
            val_loss = compute_loss(y_val, A_val[f'A{len(A_val)-1}'])
            val_acc = compute_accuracy(y_val, A_val[f'A{len(A_val)-1}'])
            
            wandb.log({"epoch": epoch + 1, "val_loss": val_loss, "val_accuracy": val_acc})

In [2]:
sweep_config = {
    "method": "bayes",  # Bayesian 
    "metric": {"name": "val_loss", "goal": "minimize"},
    "parameters": {
        "activation": {"values": ["relu", "sigmoid", "tanh"]},
        "batch_size": {"values": [32, 64, 128]},
        "epochs": {"values": [5, 10, 20]},
        "optimizer": {"values": ["sgd", "momentum", "nag", "rmsprop", "adam"]},
        "learning_rate": {"values": [0.1, 0.01, 0.001]},
        "hidden_size": {"values": [64, 128, 256]},
        "num_layers": {"values": [1, 2, 3]},
        "weight_init": {"values": ["xavier", "he", "random"]},
        "loss": {"values": ["cross_entropy", "mean_squared_error"]},
        "weight_decay": {"values": [0.0, 0.0001, 0.001]}
    }
}

sweep_id = wandb.sweep(sweep_config, project="Digit-MNIST-Data")

def sweep_function():
    wandb.agent(sweep_id, function=train, count=15)
sweep_function()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 2e6bq7lk
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Digit-MNIST-Data/sweeps/2e6bq7lk


wandb: Agent Starting Run: rm77aizi with config:
wandb: 	activation: tanh
wandb: 	batch_size: 64
wandb: 	epochs: 10
wandb: 	hidden_size: 128
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: random
wandb: Currently logged in as: ma23c014 (ma23c014-indian-institute-of-technology-madras) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


0,1
epoch,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▅▆▇▇▇███▇
val_loss,█▃▃▂▁▁▁▂▂▂

0,1
epoch,10.0
val_accuracy,0.94667
val_loss,0.28633


wandb: Agent Starting Run: 6b30gezr with config:
wandb: 	activation: relu
wandb: 	batch_size: 128
wandb: 	epochs: 5
wandb: 	hidden_size: 128
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 2
wandb: 	optimizer: sgd
wandb: 	weight_decay: 0.0001
wandb: 	weight_init: xavier


0,1
epoch,▁▃▅▆█
val_accuracy,▁▅▇██
val_loss,█▇▅▃▁

0,1
epoch,5.0
val_accuracy,0.716
val_loss,1.49929


wandb: Agent Starting Run: zb5or22y with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 32
wandb: 	epochs: 20
wandb: 	hidden_size: 64
wandb: 	learning_rate: 0.1
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: xavier


  A[f'A{i}'] = np.maximum(0, Z[f'Z{i}']) if activation == "relu" else 1 / (1 + np.exp(-Z[f'Z{i}']))


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
val_accuracy,█▁▁█▅▇▁▁▂▁▁▁▂▁▂▁▁▁▃▂
val_loss,▁▄▅▃▃▄█▄▄▅▃▅▄▄▅▅▅▅▃▃

0,1
epoch,20.0
val_accuracy,0.10133
val_loss,2.39526


wandb: Agent Starting Run: uzkh8rmn with config:
wandb: 	activation: tanh
wandb: 	batch_size: 64
wandb: 	epochs: 20
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 2
wandb: 	optimizer: sgd
wandb: 	weight_decay: 0.0001
wandb: 	weight_init: random


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
val_accuracy,▁▂▂▂▂▃▃▃▄▄▅▅▆▆▆▇▇▇██
val_loss,█▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
epoch,20.0
val_accuracy,0.27167
val_loss,2.24266


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: p2sgqw0o with config:
wandb: 	activation: sigmoid
wandb: 	batch_size: 128
wandb: 	epochs: 5
wandb: 	hidden_size: 128
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 3
wandb: 	optimizer: nag
wandb: 	weight_decay: 0.001
wandb: 	weight_init: xavier


0,1
epoch,▁▃▅▆█
val_accuracy,▁▁▁▁▁
val_loss,█▆▅▃▁

0,1
epoch,5.0
val_accuracy,0.11183
val_loss,2.29954


wandb: Agent Starting Run: wvl6xuy2 with config:
wandb: 	activation: relu
wandb: 	batch_size: 128
wandb: 	epochs: 10
wandb: 	hidden_size: 64
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 3
wandb: 	optimizer: sgd
wandb: 	weight_decay: 0.001
wandb: 	weight_init: xavier


0,1
epoch,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▂▃▄▅▆▇▇██
val_loss,██▇▇▆▅▄▃▂▁

0,1
epoch,10.0
val_accuracy,0.76917
val_loss,0.99085


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: d2k2f2rg with config:
wandb: 	activation: tanh
wandb: 	batch_size: 64
wandb: 	epochs: 5
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: he


0,1
epoch,▁▃▅▆█
val_accuracy,▁▅▆██
val_loss,█▄▂▁▁

0,1
epoch,5.0
val_accuracy,0.9565
val_loss,0.20776


wandb: Agent Starting Run: uvvic713 with config:
wandb: 	activation: tanh
wandb: 	batch_size: 128
wandb: 	epochs: 5
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0.0001
wandb: 	weight_init: he


0,1
epoch,▁▃▅▆█
val_accuracy,▁▃▃▅█
val_loss,█▄▅▃▁

0,1
epoch,5.0
val_accuracy,0.905
val_loss,0.532


wandb: Agent Starting Run: 7wrf2ouu with config:
wandb: 	activation: tanh
wandb: 	batch_size: 64
wandb: 	epochs: 10
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 3
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▄▆▇▇▇▆█▇█
val_loss,█▅▃▂▂▂▃▁▁▁

0,1
epoch,10.0
val_accuracy,0.97383
val_loss,0.0992


wandb: Agent Starting Run: u8yn21zl with config:
wandb: 	activation: tanh
wandb: 	batch_size: 64
wandb: 	epochs: 5
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0.0001
wandb: 	weight_init: random


0,1
epoch,▁▃▅▆█
val_accuracy,▂▁▅█▇
val_loss,█▃▂▂▁

0,1
epoch,5.0
val_accuracy,0.90083
val_loss,0.58093


wandb: Agent Starting Run: eyym70fv with config:
wandb: 	activation: tanh
wandb: 	batch_size: 32
wandb: 	epochs: 5
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: he


0,1
epoch,▁▃▅▆█
val_accuracy,▁▅▇██
val_loss,█▄▂▁▁

0,1
epoch,5.0
val_accuracy,0.965
val_loss,0.12151


wandb: Agent Starting Run: e73j2q3y with config:
wandb: 	activation: tanh
wandb: 	batch_size: 128
wandb: 	epochs: 5
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 1
wandb: 	optimizer: rmsprop
wandb: 	weight_decay: 0
wandb: 	weight_init: random


0,1
epoch,▁▃▅▆█
val_accuracy,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
val_accuracy,0.91617
val_loss,0.38243


wandb: Agent Starting Run: ra6z8t8n with config:
wandb: 	activation: tanh
wandb: 	batch_size: 32
wandb: 	epochs: 5
wandb: 	hidden_size: 128
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 3
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: random


0,1
epoch,▁▃▅▆█
val_accuracy,▁▄▅▇█
val_loss,█▄▃▂▁

0,1
epoch,5.0
val_accuracy,0.96983
val_loss,0.11284


wandb: Agent Starting Run: wof4hoaq with config:
wandb: 	activation: tanh
wandb: 	batch_size: 32
wandb: 	epochs: 5
wandb: 	hidden_size: 64
wandb: 	learning_rate: 0.001
wandb: 	loss: cross_entropy
wandb: 	num_layers: 1
wandb: 	optimizer: rmsprop
wandb: 	weight_decay: 0
wandb: 	weight_init: random


0,1
epoch,▁▃▅▆█
val_accuracy,▁▅▇██
val_loss,█▃▂▁▁

0,1
epoch,5.0
val_accuracy,0.935
val_loss,0.25544


wandb: Agent Starting Run: h8fg3r7b with config:
wandb: 	activation: tanh
wandb: 	batch_size: 32
wandb: 	epochs: 10
wandb: 	hidden_size: 256
wandb: 	learning_rate: 0.001
wandb: 	loss: mean_squared_error
wandb: 	num_layers: 2
wandb: 	optimizer: adam
wandb: 	weight_decay: 0
wandb: 	weight_init: random


0,1
epoch,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▃▅▆▆███▇▇
val_loss,█▅▄▂▂▁▁▁▂▂

0,1
epoch,10.0
val_accuracy,0.97133
val_loss,0.10731
