In [1]:
import numpy as np
from tqdm import tqdm
import wandb
from keras.datasets import fashion_mnist
import matplotlib.pyplot as plt
import wandb
wandb.login()

In [None]:
# Show one example image per class label 0-9, taken from the second (test)
# split returned by fashion_mnist.load_data().
# The dataset is loaded once instead of once per array.
_, (X, y) = fashion_mnist.load_data()
fig, axes = plt.subplots(2, 5)
for label, ax in enumerate(axes.flatten()):
    # Position of the first image that carries this label.
    im_idx = np.flatnonzero(y == label)[0]
    plottable_image = np.reshape(X[im_idx], (28, 28))
    ax.imshow(plottable_image, cmap='gray_r')
    # Label each panel so the figure can be read on its own.
    ax.set_title(str(label))
    ax.axis('off')

In [2]:
class Layer:
    """Fully connected layer: an affine map followed by an activation.

    Attributes:
        w: weight matrix of shape (num_neurons, num_inputs).
        b: bias vector of shape (num_neurons,).
        a: pre-activation of the last forward pass (set by `forward`).
        h: activation output of the last forward pass (set by `forward`).
        dw, db: parameter gradients of the last backward pass (set by `backward`).
    """

    def __init__(self, num_inputs, num_neurons, activation, weight_init):
        """Create the layer and draw its initial parameters.

        Args:
            num_inputs: size of each input vector.
            num_neurons: number of units (size of each output vector).
            activation: one of 'ReLU', 'softmax', 'sigmoid', 'tanh'.
            weight_init: 'Xavier' scales the standard-normal draws by
                1/sqrt(num_inputs); any other value keeps them unscaled.
        """
        self.num_inputs = num_inputs
        self.num_neurons = num_neurons
        self.activation_fn = activation
        self.weight_init = weight_init
        self.w = np.random.randn(self.num_neurons, self.num_inputs)
        self.b = np.random.randn(self.num_neurons)
        if weight_init == 'Xavier':
            scale = np.sqrt(self.num_inputs)
            self.w = self.w/scale
            self.b = self.b/scale

    def activation(self, x):
        """Apply the layer's activation to `x` (rows are samples) and return a new array.

        Raises:
            ValueError: if the configured activation name is not supported.
        """
        if self.activation_fn == 'ReLU':
            return np.maximum(0, x)
        if self.activation_fn == 'softmax':
            # Subtract the row maximum for numerical stability. This builds a new
            # array: an in-place `x -= mx` would overwrite the caller's data
            # (the stored pre-activation) and fail on integer input.
            shifted = x - np.max(x, axis=1, keepdims=True)
            exps = np.exp(shifted)
            return exps/np.sum(exps, axis=1, keepdims=True)
        if self.activation_fn == 'sigmoid':
            # Clip so np.exp cannot overflow for very negative inputs.
            x = np.clip(x, -500, 500)
            return 1/(1+np.exp(-x))
        if self.activation_fn == 'tanh':
            return np.tanh(x)
        raise ValueError("unsupported activation: {!r}".format(self.activation_fn))

    def grad_activation(self, x):
        """Return the element-wise derivative of the activation at pre-activation `x`.

        Only 'ReLU', 'sigmoid' and 'tanh' are supported; no derivative is defined
        here for 'softmax'.

        Raises:
            ValueError: if no derivative is implemented for the configured activation.
        """
        if self.activation_fn == 'ReLU':
            return 1*(x > 0)
        if self.activation_fn == 'sigmoid':
            s = self.activation(x)  # evaluate the sigmoid once
            return s*(1 - s)
        if self.activation_fn == 'tanh':
            return 1 - np.square(self.activation(x))
        raise ValueError("no derivative implemented for activation: {!r}".format(self.activation_fn))

    def forward(self, cur_input):
        """Compute and cache the layer output for a batch.

        Args:
            cur_input: array of shape (batch, num_inputs).

        Returns:
            Activation output of shape (batch, num_neurons); also stored in `self.h`,
            with the pre-activation stored in `self.a`.
        """
        self.a = np.dot(cur_input, self.w.T) + self.b
        self.h = self.activation(self.a)
        return self.h

    def backward(self, grad_a, prev_a, prev_h, grad_activation):
        """Back-propagate through this layer.

        Stores the summed (not averaged) parameter gradients in `self.dw` and `self.db`.

        Args:
            grad_a: loss gradient w.r.t. this layer's pre-activation, (batch, num_neurons).
            prev_a: pre-activation of the previous layer, (batch, num_inputs).
            prev_h: output of the previous layer, (batch, num_inputs).
            grad_activation: callable returning the previous layer's activation
                derivative evaluated at `prev_a`.

        Returns:
            Loss gradient w.r.t. the previous layer's pre-activation.
        """
        self.dw = np.dot(grad_a.T, prev_h)
        self.db = np.sum(grad_a, axis=0)
        prev_h_grad = np.dot(grad_a, self.w)
        der = grad_activation(prev_a)
        grad_prev_a = prev_h_grad*der
        return grad_prev_a

In [3]:
class NeuralNetwork:
    """Feed-forward classifier made of `Layer` objects with a softmax output layer.

    The network holds `num_hidden_layer` hidden layers of `num_neurons` units,
    all using `activation`, followed by a softmax layer of `num_classes` units.
    """

    def __init__(self, num_inputs, num_classes, num_hidden_layer, num_neurons, activation, weight_init):
        """Build the layer stack.

        Args:
            num_inputs: size of each input vector.
            num_classes: number of output classes.
            num_hidden_layer: number of hidden layers.
            num_neurons: units per hidden layer.
            activation: activation name used by every hidden layer.
            weight_init: initialisation scheme forwarded to each `Layer`.
        """
        self.num_inputs = num_inputs
        self.num_classes = num_classes
        self.num_hidden_layer = num_hidden_layer
        self.num_neurons = num_neurons
        self.activation = activation
        self.weight_init = weight_init
        self.layers = []
        self.layers.append(Layer(num_inputs, num_neurons, activation, weight_init))
        # Every hidden layer uses the requested activation. Previously all hidden
        # layers after the first were hard-coded to ReLU, so `activation` was
        # ignored for them.
        for _ in range(num_hidden_layer - 1):
            self.layers.append(Layer(num_neurons, num_neurons, activation, weight_init))
        self.layers.append(Layer(num_neurons, num_classes, 'softmax', weight_init))

    def forward(self, inputs):
        """Run a batch through all layers; caches `inputs` and the prediction `y_pred`."""
        self.inputs = inputs
        cur_in = inputs
        for i in range(self.num_hidden_layer+1):
            cur_out = self.layers[i].forward(cur_in)
            cur_in = cur_out
        self.y_pred = cur_out
        return cur_out

    def backward(self, outputs):
        """Back-propagate from the targets `outputs`, using the last forward pass.

        Fills `dw`/`db` on every layer with gradients summed over the batch.
        """
        # Gradient w.r.t. the output pre-activation: prediction minus target.
        grad_a_L = -(outputs - self.y_pred)
        for i in range(self.num_hidden_layer, 0, -1):
            grad_a_L = self.layers[i].backward(grad_a_L, self.layers[i-1].a, self.layers[i-1].h, self.layers[i-1].grad_activation)

        # The first layer has no predecessor; its input is the batch itself.
        self.layers[0].dw = np.dot(grad_a_L.T, self.inputs)
        self.layers[0].db = np.sum(grad_a_L, axis=0)

    def minibatch_sgd(self, dw, db, eta : float = 0.01, weight_decay : float = 0.0):
        """Plain gradient step with L2 weight decay, applied in place to every layer."""
        for j in range(self.num_hidden_layer+1):
            self.layers[j].w -= eta*dw[j] + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= eta*db[j] + eta*weight_decay*self.layers[j].b

    def momentum_gd(self, uw, ub, dw, db, eta : float = 0.01, weight_decay : float = 0.0, beta : float = 0.9):
        """Momentum step; `uw`/`ub` are the velocity lists, updated and returned."""
        for j in range(self.num_hidden_layer+1):
            uw[j] = beta*uw[j] + dw[j]
            ub[j] = beta*ub[j] + db[j]
            self.layers[j].w -= eta*uw[j] + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= eta*ub[j] + eta*weight_decay*self.layers[j].b
        return uw, ub

    def NAG_gd(self, mw, mb, dw, db, eta : float = 0.01, weight_decay : float = 0.0, beta : float = 0.9):
        """Nesterov-style step: the update uses beta*velocity + gradient after the
        velocity has been refreshed. `mw`/`mb` are updated and returned."""
        for j in range(self.num_hidden_layer+1):
            mw[j] = beta*mw[j] + dw[j]
            mb[j] = beta*mb[j] + db[j]
            self.layers[j].w -= eta*(beta*mw[j] + dw[j]) + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= eta*(beta*mb[j] + db[j]) + eta*weight_decay*self.layers[j].b
        return mw, mb

    def RMSProp_gd(self, uw, ub, dw, db, eta : float = 0.01, weight_decay : float = 0.0, beta : float = 0.9, epsilon : float = 1e-8):
        """RMSProp step; `uw`/`ub` hold the running mean of squared gradients."""
        for j in range(self.num_hidden_layer+1):
            uw[j] = beta*uw[j] + (1-beta)*dw[j]**2
            ub[j] = beta*ub[j] + (1-beta)*db[j]**2
            self.layers[j].w -= eta*dw[j]/(np.sqrt(uw[j])+epsilon) + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= eta*db[j]/(np.sqrt(ub[j])+epsilon) + eta*weight_decay*self.layers[j].b
        return uw, ub

    def Adam_gd(self, mw, mb, uw, ub, dw, db, t, eta : float = 0.01, weight_decay : float = 0.0, beta1 : float = 0.9, beta2 : float = 0.999, epsilon : float = 1e-8):
        """Adam step with bias correction at step count `t` (must be >= 1).

        `mw`/`mb` are first-moment and `uw`/`ub` second-moment estimates; all four
        lists are updated and returned.
        """
        for j in range(self.num_hidden_layer+1):
            mw[j] = beta1*mw[j] + (1-beta1)*dw[j]
            mb[j] = beta1*mb[j] + (1-beta1)*db[j]
            uw[j] = beta2*uw[j] + (1-beta2)*(dw[j]**2)
            ub[j] = beta2*ub[j] + (1-beta2)*(db[j]**2)
            mw_hat = mw[j]/(1-beta1**t)
            mb_hat = mb[j]/(1-beta1**t)
            uw_hat = uw[j]/(1-beta2**t)
            ub_hat = ub[j]/(1-beta2**t)
            self.layers[j].w -= eta*mw_hat/(np.sqrt(uw_hat)+epsilon) + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= eta*mb_hat/(np.sqrt(ub_hat)+epsilon) + eta*weight_decay*self.layers[j].b
        return mw, mb, uw, ub

    def NAdam_gd(self, mw, mb, uw, ub, dw, db, t, eta : float = 0.01, weight_decay : float = 0.0, beta1 : float = 0.9, beta2 : float = 0.999, epsilon : float = 1e-8):
        """NAdam step; same state layout and return value as `Adam_gd`.

        NOTE(review): bias corrections here use `t+1` whereas `Adam_gd` uses `t`,
        and `train` starts `t` at 1 - confirm which convention is intended.
        """
        for j in range(self.num_hidden_layer+1):
            mw[j] = beta1*mw[j] + (1-beta1)*dw[j]
            mb[j] = beta1*mb[j] + (1-beta1)*db[j]
            uw[j] = beta2*uw[j] + (1-beta2)*dw[j]**2
            ub[j] = beta2*ub[j] + (1-beta2)*db[j]**2
            m_w_hat = mw[j]/(1-np.power(beta1, t+1))
            m_b_hat = mb[j]/(1-np.power(beta1, t+1))
            uw_hat = uw[j]/(1-np.power(beta2, t+1))
            ub_hat = ub[j]/(1-np.power(beta2, t+1))
            self.layers[j].w -= (eta/(np.sqrt(uw_hat) + epsilon))*(beta1*m_w_hat + (1-beta1)*dw[j]/(1-np.power(beta1, t+1))) + eta*weight_decay*self.layers[j].w
            self.layers[j].b -= (eta/(np.sqrt(ub_hat) + epsilon))*(beta1*m_b_hat + (1-beta1)*db[j]/(1-np.power(beta1, t+1))) + eta*weight_decay*self.layers[j].b
        return mw, mb, uw, ub

    def train(self, X_train, y_train, X_test, y_test, batch_size, epochs, optimizer, eta : float = 0.001, weight_decay : float = 0.0, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
        """Train with mini-batches and log metrics to wandb after every epoch.

        Args:
            X_train, y_train: training inputs and one-hot targets.
            X_test, y_test: held-out inputs and one-hot targets, logged as `val_*`.
            batch_size: number of samples per update.
            epochs: number of passes over the training data.
            optimizer: one of 'minibatch_sgd', 'momentum_gd', 'NAG_gd',
                'RMSProp_gd', 'Adam_gd', 'NAdam_gd'.
            eta: learning rate.
            weight_decay: L2 coefficient applied by every optimizer.
            beta1: momentum / first-moment decay (also the RMSProp decay).
            beta2: second-moment decay for Adam and NAdam.
            epsilon: denominator stabiliser for the adaptive optimizers.

        Raises:
            ValueError: if `optimizer` is not one of the supported names.
        """
        supported = ("minibatch_sgd", "momentum_gd", "NAG_gd", "RMSProp_gd", "Adam_gd", "NAdam_gd")
        if optimizer not in supported:
            # An unknown name used to fall through every branch and train nothing.
            raise ValueError("unknown optimizer: {!r}".format(optimizer))

        num_layers = self.num_hidden_layer + 1
        # Optimizer state and the step counter persist across epochs; resetting them
        # every epoch would discard momentum and restart the bias correction.
        uw = [np.zeros_like(self.layers[j].w) for j in range(num_layers)]
        ub = [np.zeros_like(self.layers[j].b) for j in range(num_layers)]
        mw = [np.zeros_like(self.layers[j].w) for j in range(num_layers)]
        mb = [np.zeros_like(self.layers[j].b) for j in range(num_layers)]
        t = 1
        for _ in range(epochs):
            for start in tqdm(range(0, X_train.shape[0], batch_size)):
                x = X_train[start:start+batch_size]
                y = y_train[start:start+batch_size]
                self.forward(x)
                self.backward(y)
                # backward() sums over the batch, so average over the batch actually
                # processed (the last one may be shorter), not over the whole dataset.
                batch_n = x.shape[0]
                dw = [self.layers[j].dw / batch_n for j in range(num_layers)]
                db = [self.layers[j].db / batch_n for j in range(num_layers)]
                if optimizer == "minibatch_sgd":
                    self.minibatch_sgd(dw, db, eta, weight_decay)
                elif optimizer == "momentum_gd":
                    uw, ub = self.momentum_gd(uw, ub, dw, db, eta, weight_decay, beta1)
                elif optimizer == "NAG_gd":
                    mw, mb = self.NAG_gd(mw, mb, dw, db, eta, weight_decay, beta1)
                elif optimizer == "RMSProp_gd":
                    uw, ub = self.RMSProp_gd(uw, ub, dw, db, eta, weight_decay, beta1, epsilon)
                elif optimizer == "Adam_gd":
                    mw, mb, uw, ub = self.Adam_gd(mw, mb, uw, ub, dw, db, t, eta, weight_decay, beta1, beta2, epsilon)
                elif optimizer == "NAdam_gd":
                    mw, mb, uw, ub = self.NAdam_gd(mw, mb, uw, ub, dw, db, t, eta, weight_decay, beta1, beta2, epsilon)
                t += 1
            train_acc, train_loss = self.test(X_train, y_train)
            test_acc, test_loss = self.test(X_test, y_test)
            wandb.log({"train_acc": train_acc, "train_loss": train_loss, "val_acc": test_acc, "val_loss": test_loss})

    def test(self, X_test, y_test):
        """Return (accuracy, cross-entropy loss) for inputs and one-hot targets."""
        self.forward(X_test)
        y_pred = self.layers[-1].h
        loss = self.cross_entropy(y_pred, y_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_test = np.argmax(y_test, axis=1)

        return np.sum(y_pred == y_test)/y_test.shape[0], loss

    def cross_entropy(self, y_pred, y_true):
        """Mean cross-entropy per row; 1e-9 is added inside the log to avoid log(0)."""
        return -np.sum(y_true*np.log(y_pred + 1e-9))/y_pred.shape[0]

    def square_loss(self, y_pred, y_true):
        """Sum of squared errors divided by the number of rows."""
        return np.sum((y_pred - y_true)**2)/y_pred.shape[0]


In [4]:
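# Manual smoke test, kept commented out (the sweep cells below are the real entry point).
# NOTE: train() calls wandb.log, so this presumably needs an active run (wandb.init) first.
# nn = NeuralNetwork(784, 10, 1, 64, 'ReLU', 'random')
# (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# The images are uint8, so scale with a new float array instead of an in-place `/=`.
# X_train = X_train.reshape(X_train.shape[0], 784) / 255
# X_test = X_test.reshape(X_test.shape[0], 784) / 255

# y_train = np.eye(10)[y_train]
# y_test = np.eye(10)[y_test]

# nn.train(X_train, y_train, X_test, y_test, batch_size=25, epochs=1, optimizer='minibatch_sgd', eta=0.01)
# nn.test(X_test, y_test)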

In [5]:
# W&B sweep definition: random search (alternative method: grid) that maximises
# the logged `val_acc` metric. Each hyper-parameter is a discrete list of
# candidate values, wrapped as {'values': [...]}.
sweep_config = {
    'method': 'random',  # grid, random
    'metric': dict(name='val_acc', goal='maximize'),
    'parameters': {
        param: {'values': candidates}
        for param, candidates in (
            ('optimizer', ['minibatch_sgd', 'momentum_gd', 'NAG_gd', 'RMSProp_gd', 'Adam_gd', 'NAdam_gd']),
            ('eta', [0.01, 0.001, 0.0001]),
            ('beta1', [0.9]),
            ('beta2', [0.999]),
            ('epsilon', [1e-8, 1e-7, 1e-6]),
            ('batch_size', [32, 64, 128]),
            ('epochs', [5, 10, 15]),
            ('num_hidden_layer', [1, 2, 3]),
            ('num_hidden_unit', [32, 64, 128]),
            ('activation', ['ReLU', 'sigmoid', 'tanh']),
            ('initializer', ['Xavier', 'random']),
            ('weight_decay', [0.0, 0.0001, 0.5]),
        )
    },
}

In [6]:
def train_test_split(X, y, test_size=0.1, random_state=42):
    """Shuffle `X` and `y` with one shared permutation and split off a held-out part.

    The first int(len(X) * test_size) shuffled rows become the held-out set and
    the rest the training set. Note that this reseeds NumPy's global random
    generator with `random_state`.

    Returns:
        X_train, X_test, y_train, y_test
    """
    np.random.seed(random_state)
    order = np.arange(X.shape[0])
    np.random.shuffle(order)

    n_held_out = int(X.shape[0]*test_size)
    held_out_rows = order[:n_held_out]
    train_rows = order[n_held_out:]

    return X[train_rows], X[held_out_rows], y[train_rows], y[held_out_rows]

In [None]:
# Import the W&B Python Library and log into W&B
import wandb
wandb.login()

def get_name(config):
    """Build a run name of the form 'key1_value1-key2_value2-...' from `config`."""
    parts = (key + "_" + str(config[key]) for key in config.keys())
    return "-".join(parts)

# 1: Define objective/training function
def objective(config):
    """Train one network for the sweep trial described by `config`.

    Names the active wandb run after the hyper-parameters, loads Fashion-MNIST,
    holds out 10% of the training split for validation, and trains a
    `NeuralNetwork`; metrics are logged by `NeuralNetwork.train`.

    Args:
        config: object exposing the sweep hyper-parameters as attributes
            (num_hidden_layer, num_hidden_unit, activation, initializer,
            batch_size, epochs, optimizer, eta, weight_decay, beta1, beta2,
            epsilon).
    """
    wandb.run.name = get_name(config)

    # Load data. Only the training split is used here: validation data is
    # carved out of it below, so the second split is not converted at all.
    (X_train, y_train), _ = fashion_mnist.load_data()
    X_train = X_train.reshape(X_train.shape[0], -1) / 255
    y_train = np.eye(10)[y_train]

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Train model
    nn = NeuralNetwork(784, 10, config.num_hidden_layer, config.num_hidden_unit, config.activation, config.initializer)
    # Pass hyper-parameters by keyword. The earlier positional call omitted
    # weight_decay, so beta1 was used as the weight decay, beta2 as beta1 and
    # epsilon as beta2.
    nn.train(
        X_train, y_train, X_val, y_val,
        batch_size=config.batch_size,
        epochs=config.epochs,
        optimizer=config.optimizer,
        eta=config.eta,
        weight_decay=config.weight_decay,
        beta1=config.beta1,
        beta2=config.beta2,
        epsilon=config.epsilon,
    )

def main():
    """Entry point for one sweep trial: open a wandb run in project 'new-proj'
    and train with the hyper-parameters exposed as `wandb.config`."""
    wandb.init(project='new-proj')
    # objective() has no return value, so there is nothing to keep from the call.
    objective(wandb.config)

# 2: Define the search space (the sweep_config dict from the earlier cell)
sweep_configuration = sweep_config

# 3: Start the sweep: register it under the 'new-proj' project, then let the
#    agent call `main` with a run count of 200
sweep_id = wandb.sweep(project='new-proj', sweep=sweep_configuration)
wandb.agent(sweep_id, count=200, function=main)