# In [12]:
import numpy as np
class Activations:
    """Namespace of stateless activation helpers operating on NumPy arrays."""

    @staticmethod
    def relu(Z):
        """Return the element-wise maximum of ``Z`` and zero."""
        return np.maximum(0, Z)

    @staticmethod
    def relu_der(Z):
        """Return the ReLU derivative: 1.0 where ``Z > 0`` and 0.0 elsewhere.

        The derivative at exactly zero is taken to be 0.
        """
        positive_mask = Z > 0
        return positive_mask.astype(float)

    @staticmethod
    def softmax(Z):
        """Return the softmax of ``Z`` computed along axis 1 (one distribution per row)."""
        # Shifting every row by its own maximum leaves the softmax value
        # unchanged but keeps the largest exponent at 0, so exp cannot overflow.
        row_peak = np.max(Z, axis=1, keepdims=True)
        unnormalized = np.exp(Z - row_peak)
        row_total = np.sum(unnormalized, axis=1, keepdims=True)
        return unnormalized / row_total
def categorical_cross_entropy(pred, true):
    """Return the cross-entropy between ``true`` and ``pred``, averaged over rows.

    Computes ``-sum(true * log(pred + 1e-8)) / true.shape[0]``: the sum runs over
    every element and is divided by the length of the first axis of ``true``.
    The ``1e-8`` offset keeps ``log`` finite when a predicted value is exactly 0.
    """
    n_rows = true.shape[0]
    weighted_log_probs = true * np.log(pred + 1e-8)
    return -np.sum(weighted_log_probs) / n_rows
class NeuralNet:
    """Fully connected classifier with ReLU hidden layers and a softmax output.

    Parameters are stored in ``self.weights`` / ``self.biases`` under the keys
    ``"W1".."WL"`` and ``"b1".."bL"``. ``W{l}`` has shape
    ``(layers[l-1], layers[l])`` and ``b{l}`` has shape ``(1, layers[l])``, so a
    batch is multiplied from the left as ``A @ W + b``.
    """

    # Optimizer names understood by ``update_params``.
    _OPTIMIZERS = ('sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam')

    def __init__(self, input_dim, hidden_layers, output_dim, optimizer='sgd',
                 lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        """Create the network and initialise its parameters and optimizer state.

        Args:
            input_dim: Number of input features.
            hidden_layers: Sequence with the width of every hidden layer
                (may be empty).
            output_dim: Number of output classes.
            optimizer: One of 'sgd', 'momentum', 'nesterov', 'rmsprop', 'adam',
                'nadam' (case-insensitive).
            lr: Learning rate.
            beta1: Decay of the first-moment / velocity accumulator.
            beta2: Decay of the squared-gradient accumulator.
            eps: Constant added to denominators of adaptive updates.

        Raises:
            ValueError: If ``optimizer`` is not a supported name.
        """
        self.opt = optimizer.lower()
        if self.opt not in self._OPTIMIZERS:
            # Without this check an unknown name would make every update a
            # silent no-op and training would appear to run without learning.
            raise ValueError(
                f"Unknown optimizer {optimizer!r}; expected one of {self._OPTIMIZERS}"
            )

        # NOTE: this reseeds NumPy's *global* generator, so initial weights are
        # reproducible and later global draws (e.g. the shuffle in ``train``)
        # follow the same sequence on every run.
        np.random.seed(99)

        # list(...) lets callers pass a tuple or other sequence, not just a list.
        self.layers = [input_dim] + list(hidden_layers) + [output_dim]
        self.weights = {}
        self.biases = {}
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.time_step = 0  # number of update_params calls so far

        for idx, (fan_in, fan_out) in enumerate(zip(self.layers, self.layers[1:]), start=1):
            # Glorot/Xavier uniform initialisation: U(-limit, limit).
            limit = np.sqrt(6 / (fan_in + fan_out))
            self.weights[f"W{idx}"] = np.random.uniform(-limit, limit, (fan_in, fan_out))
            self.biases[f"b{idx}"] = np.zeros((1, fan_out))

        # Per-parameter optimizer state: first-moment/velocity and squared-gradient averages.
        self.velocity_w = {k: np.zeros_like(v) for k, v in self.weights.items()}
        self.velocity_b = {k: np.zeros_like(v) for k, v in self.biases.items()}
        self.sqr_w = {k: np.zeros_like(v) for k, v in self.weights.items()}
        self.sqr_b = {k: np.zeros_like(v) for k, v in self.biases.items()}

    def forward(self, X):
        """Run a forward pass.

        Args:
            X: Input batch whose last dimension matches ``input_dim``.

        Returns:
            ``(AL, store)`` where ``AL`` is the softmax output and ``store`` maps
            ``"A0"`` to ``X`` and ``"Z{l}"`` / ``"A{l}"`` to the pre-activation
            and activation of every layer ``l``.
        """
        store = {'A0': X}
        L = len(self.layers) - 1
        activation = X
        for l in range(1, L + 1):
            Z = np.dot(activation, self.weights[f"W{l}"]) + self.biases[f"b{l}"]
            # Hidden layers use ReLU; only the last layer is passed through softmax.
            activation = Activations.relu(Z) if l < L else Activations.softmax(Z)
            store[f"Z{l}"], store[f"A{l}"] = Z, activation
        return activation, store

    def backward(self, Y_pred, Y_true, store):
        """Back-propagate through the network.

        Args:
            Y_pred: Softmax output returned by ``forward``.
            Y_true: Target array with the same shape as ``Y_pred``.
            store: Cache returned by ``forward`` for the same batch.

        Returns:
            Dict with ``"dW{l}"`` and ``"db{l}"`` for every layer, each averaged
            over ``Y_true.shape[0]`` samples.
        """
        grads = {}
        L = len(self.layers) - 1
        m = Y_true.shape[0]

        # Combined softmax + cross-entropy gradient w.r.t. the output logits
        # (holds when every row of Y_true sums to 1, e.g. one-hot targets).
        dZ = Y_pred - Y_true
        dA_prev = None
        for l in range(L, 0, -1):
            if l < L:
                dZ = dA_prev * Activations.relu_der(store[f"Z{l}"])
            grads[f"dW{l}"] = np.dot(store[f"A{l-1}"].T, dZ) / m
            grads[f"db{l}"] = np.sum(dZ, axis=0, keepdims=True) / m
            if l > 1:
                # Only needed when there is another layer below this one.
                dA_prev = np.dot(dZ, self.weights[f"W{l}"].T)
        return grads

    def update_params(self, grads):
        """Apply one optimizer step to every weight and bias, in place.

        Args:
            grads: Dict of ``"dW{l}"`` / ``"db{l}"`` arrays as returned by
                ``backward``.

        Raises:
            ValueError: If ``self.opt`` is not a supported optimizer name.
        """
        self.time_step += 1
        for l in range(1, len(self.layers)):
            self._apply_update(self.weights, f"W{l}", grads[f"dW{l}"],
                               self.velocity_w, self.sqr_w)
            self._apply_update(self.biases, f"b{l}", grads[f"db{l}"],
                               self.velocity_b, self.sqr_b)

    def _apply_update(self, params, key, grad, velocity, sqr):
        """Update ``params[key]`` in place from ``grad`` using the configured optimizer."""
        lr, b1, b2, eps, t = self.lr, self.beta1, self.beta2, self.eps, self.time_step

        if self.opt == 'sgd':
            params[key] -= lr * grad
        elif self.opt == 'momentum':
            # Velocity is an exponential moving average of the gradient.
            velocity[key] = b1 * velocity[key] + (1 - b1) * grad
            params[key] -= lr * velocity[key]
        elif self.opt == 'nesterov':
            # Look-ahead reformulated so that only the gradient at the current
            # parameters is needed.
            v_prev = velocity[key]
            velocity[key] = b1 * v_prev - lr * grad
            params[key] += -b1 * v_prev + (1 + b1) * velocity[key]
        elif self.opt == 'rmsprop':
            sqr[key] = b2 * sqr[key] + (1 - b2) * (grad ** 2)
            params[key] -= lr * grad / (np.sqrt(sqr[key]) + eps)
        elif self.opt == 'adam':
            velocity[key] = b1 * velocity[key] + (1 - b1) * grad
            sqr[key] = b2 * sqr[key] + (1 - b2) * (grad ** 2)
            # Bias correction compensates for the zero-initialised averages.
            m_hat = velocity[key] / (1 - b1 ** t)
            s_hat = sqr[key] / (1 - b2 ** t)
            params[key] -= lr * m_hat / (np.sqrt(s_hat) + eps)
        elif self.opt == 'nadam':
            velocity[key] = b1 * velocity[key] + (1 - b1) * grad
            # Nesterov look-ahead on the first moment, then bias correction.
            m_hat = (b1 * velocity[key] + (1 - b1) * grad) / (1 - b1 ** t)
            sqr[key] = b2 * sqr[key] + (1 - b2) * (grad ** 2)
            s_hat = sqr[key] / (1 - b2 ** t)
            params[key] -= lr * m_hat / (np.sqrt(s_hat) + eps)
        else:
            # Reached only if self.opt was reassigned after construction.
            raise ValueError(
                f"Unknown optimizer {self.opt!r}; expected one of {self._OPTIMIZERS}"
            )

    def train(self, X_train, Y_train, X_test, Y_test, epochs=20, batch_size=64):
        """Train with shuffled mini-batches and print per-epoch metrics.

        After each epoch the loss on the full training set and the accuracy on
        the test set are printed.

        Args:
            X_train, Y_train: Training inputs and targets (indexed along axis 0).
            X_test, Y_test: Evaluation inputs and targets; class labels are
                recovered with ``argmax`` over axis 1.
            epochs: Number of passes over the training data.
            batch_size: Number of samples per update; must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make the batch loop empty and silently skip training.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        m = X_train.shape[0]
        for ep in range(epochs):
            perm = np.random.permutation(m)
            X_shuf, Y_shuf = X_train[perm], Y_train[perm]
            for start in range(0, m, batch_size):
                Xb = X_shuf[start:start + batch_size]
                Yb = Y_shuf[start:start + batch_size]
                Y_pred, store = self.forward(Xb)
                grads = self.backward(Y_pred, Yb, store)
                self.update_params(grads)

            Y_pred_train, _ = self.forward(X_train)
            loss = categorical_cross_entropy(Y_pred_train, Y_train)
            Y_pred_test, _ = self.forward(X_test)
            acc = np.mean(np.argmax(Y_pred_test, axis=1) == np.argmax(Y_test, axis=1))
            print(f"Epoch {ep+1}/{epochs}, Loss: {loss:.4f}, Test Acc: {acc*100:.2f}%")
