In [1]:
import numpy as np
from keras.datasets import mnist
def get_mnist(flatten=True):
    """Load MNIST via ``mnist.load_data()`` and return model-ready arrays.

    Args:
        flatten: When true, each image is reshaped to a vector of length
            ``28*28``; otherwise the image arrays keep their loaded shape.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where the images are
        ``float32`` divided by 255.0 and the labels are one-hot rows of
        width 10.
    """
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    def _prepare_images(images):
        # Optional flattening first, then the float32 cast and 1/255 scaling
        # (presumably maps 8-bit pixel values into [0, 1]).
        if flatten:
            images = images.reshape(images.shape[0], 28*28)
        return images.astype(np.float32) / 255.0

    def _one_hot(labels):
        # Row i of the 10x10 identity matrix is the one-hot code for class i.
        return np.eye(10)[labels]

    train_split = (_prepare_images(train_images), _one_hot(train_labels))
    test_split = (_prepare_images(test_images), _one_hot(test_labels))
    return train_split, test_split

In [2]:
import numpy as np
class Activate:
    """Stateless activation functions (leaky ReLU, its derivative, softmax)."""

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        """Return ``z`` where ``z > 0`` and ``alpha * z`` elsewhere.

        Args:
            z: Array of pre-activations.
            alpha: Slope applied to the non-positive entries.
        """
        return np.where(z > 0, z, alpha*z)

    @staticmethod
    def leaky_relu_der(z, alpha=0.01):
        """Element-wise derivative of the leaky ReLU.

        Args:
            z: Array of pre-activations.
            alpha: Slope used for strictly negative entries.

        Returns:
            Array shaped like ``z`` holding ``alpha`` where ``z < 0`` and 1
            elsewhere (including ``z == 0``). Floating-point inputs keep
            their dtype; any other dtype is promoted to ``float64``.
        """
        z = np.asarray(z)
        # An integer result array would silently truncate `alpha` to 0, so
        # non-float inputs get a float64 result buffer.
        out_dtype = z.dtype if np.issubdtype(z.dtype, np.floating) else np.float64
        dz = np.ones_like(z, dtype=out_dtype)
        dz[z < 0] = alpha
        return dz

    @staticmethod
    def soft(z):
        """Softmax along axis 1 (one distribution per row).

        The row maximum is subtracted before exponentiating so large logits
        do not overflow; this leaves the result unchanged.
        """
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [3]:
class FNN:
    """Fully connected classifier: leaky-ReLU hidden layers, softmax output.

    Parameters are stored per layer in ``self.weights['W<l>']`` with shape
    ``(layers[l-1], layers[l])`` and ``self.biases['b<l>']`` with shape
    ``(1, layers[l])``, for ``l = 1 .. len(layers) - 1``.
    """

    def __init__(self, input_size, hidden_layers, output_size):
        """Build the layer list and initialise the parameters.

        Args:
            input_size: Number of input features.
            hidden_layers: Sequence of hidden-layer widths (may be empty).
            output_size: Number of output units.

        Note:
            Seeds NumPy's global RNG with 42, so every instance starts from
            the same weights and the global random state is reset.
        """
        np.random.seed(42)
        self.layers = [input_size] + list(hidden_layers) + [output_size]
        self.weights = {}
        self.biases = {}
        layer_pairs = zip(self.layers[:-1], self.layers[1:])
        for idx, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
            # Glorot/Xavier uniform bound; biases start at zero.
            limit = np.sqrt(6 / (fan_in + fan_out))
            self.weights['W'+str(idx)] = np.random.uniform(-limit, limit, (fan_in, fan_out))
            self.biases['b'+str(idx)] = np.zeros((1, fan_out))

    def forw(self, X):
        """Run a forward pass and cache every layer's ``Z`` and ``A``.

        Args:
            X: Input batch; it is multiplied by ``W1`` so it must have
                ``layers[0]`` columns.

        Returns:
            The softmax output of the last layer. ``self.cache`` is replaced
            with the activations of this pass (``'A0'`` is ``X`` itself).
        """
        self.cache = {'A0': X}
        L = len(self.layers) - 1
        A = X
        for l in range(1, L + 1):
            Z = np.dot(A, self.weights['W'+str(l)]) + self.biases['b'+str(l)]
            # Only the last layer uses softmax; all others use leaky ReLU.
            A = Activate.soft(Z) if l == L else Activate.leaky_relu(Z)
            self.cache['Z'+str(l)] = Z
            self.cache['A'+str(l)] = A
        return A

    def cross_ent(self, Y_pred, Y_true):
        """Mean categorical cross-entropy over the rows of the batch.

        ``1e-8`` is added inside the log so a predicted probability of
        exactly zero does not produce ``-inf``.
        """
        per_sample = -np.sum(Y_true * np.log(Y_pred + 1e-8), axis=1)
        return np.mean(per_sample)

    def backw(self, Y_true, learning_rate=0.01):
        """Backpropagate through the cached pass and take one gradient step.

        Must be called after ``forw``: it reads ``self.cache`` from the most
        recent forward pass. Weights and biases are updated in place with
        plain gradient descent.

        Args:
            Y_true: One-hot targets for the batch passed to ``forw``.
            learning_rate: Step size of the update.
        """
        grads_w = {}
        grads_b = {}
        L = len(self.layers) - 1
        m = Y_true.shape[0]

        # Softmax combined with cross-entropy: gradient w.r.t. the logits.
        dZ = self.cache['A'+str(L)] - Y_true
        grads_w['dW'+str(L)] = np.dot(self.cache['A'+str(L-1)].T, dZ)/m
        grads_b['db'+str(L)] = np.sum(dZ, axis=0, keepdims=True)/m

        for l in reversed(range(1, L)):
            # Pull the error back through layer l+1; this is skipped
            # entirely when there are no hidden layers.
            dA_prev = np.dot(dZ, self.weights['W'+str(l+1)].T)
            dZ = dA_prev * Activate.leaky_relu_der(self.cache['Z'+str(l)])
            grads_w['dW'+str(l)] = np.dot(self.cache['A'+str(l-1)].T, dZ)/m
            grads_b['db'+str(l)] = np.sum(dZ, axis=0, keepdims=True)/m

        # Apply the step only after every gradient has been computed, so the
        # backward pass never sees partially updated weights.
        for l in range(1, L + 1):
            self.weights['W'+str(l)] -= learning_rate * grads_w['dW'+str(l)]
            self.biases['b'+str(l)] -= learning_rate * grads_b['db'+str(l)]

    def train(self, X_train, Y_train, X_test, Y_test, lr=0.005, epochs=15):
        """Full-batch gradient descent, then print accuracy on the test set.

        Each epoch is one forward/backward pass over all of ``X_train``; the
        loss printed for an epoch is computed before that epoch's update.
        """
        for epoch in range(epochs):
            Y_pred = self.forw(X_train)
            loss = self.cross_ent(Y_pred, Y_train)
            self.backw(Y_train, learning_rate=lr)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}")
        Y_pred_test = self.forw(X_test)
        accuracy = np.mean(np.argmax(Y_pred_test, axis=1) == np.argmax(Y_test, axis=1))
        print(f"Test Accuracy: {accuracy*100:.2f}%")

In [5]:
import numpy as np
import wandb
# Start a Weights & Biases run named "CE_vs_SE" in project "mnist-ques_10".
# This executes as soon as the cell runs, before the classes below are defined.
# NOTE(review): nothing in this cell logs to or finishes this run, and a later
# cell calls wandb.init() again with a different project — confirm that this
# run is actually wanted.
wandb.init(
    project="mnist-ques_10",
    name="CE_vs_SE"
)
class Activations:
    """Namespace for the element-wise functions used by the network layers."""

    @staticmethod
    def relu(Z):
        """Clamp negative entries of ``Z`` to zero."""
        return np.maximum(0, Z)

    @staticmethod
    def relu_der(Z):
        """Return 1.0 where ``Z`` is strictly positive and 0.0 elsewhere."""
        is_positive = Z > 0
        return is_positive.astype(float)

    @staticmethod
    def softmax(Z):
        """Row-wise softmax (normalised along axis 1).

        Each row is shifted by its own maximum before exponentiation, which
        avoids overflow without changing the result.
        """
        row_peak = np.max(Z, axis=1, keepdims=True)
        unnormalised = np.exp(Z - row_peak)
        row_total = np.sum(unnormalised, axis=1, keepdims=True)
        return unnormalised / row_total
def categorical_cross_entropy(pred, true):
    """Cross-entropy summed over all entries, divided by the number of rows.

    Args:
        pred: Predicted probabilities, same shape as ``true``.
        true: Target array; its first dimension is the sample count.

    Returns:
        A scalar. ``1e-8`` is added to ``pred`` inside the log so that a
        zero probability does not yield ``-inf``.
    """
    n_rows = true.shape[0]
    log_pred = np.log(pred + 1e-8)
    total = np.sum(true * log_pred)
    return (-total) / n_rows
def squared_error_loss(pred, true):
    """Mean over rows of the per-row sum of squared differences.

    Args:
        pred: Predictions, two-dimensional (rows are reduced along axis 1).
        true: Targets with the same shape as ``pred``.
    """
    residual = pred - true
    per_row = np.sum(residual ** 2, axis=1)
    return np.mean(per_row)
class NeuralNet:
    """ReLU multilayer perceptron with a softmax output and several optimizers.

    Parameters live in ``self.weights['W<l>']`` and ``self.biases['b<l>']``
    for ``l = 1 .. len(self.layers) - 1``. The optimizer state is kept in
    ``velocity_*`` (first moment / momentum) and ``sqr_*`` (second moment),
    keyed by the same names as the parameters.
    """

    # Optimizer names understood by `update_params`.
    OPTIMIZERS = ('sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam')

    def __init__(self, input_dim, hidden_layers, output_dim, optimizer='sgd', lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        """Create the network and zero-initialise the optimizer state.

        Args:
            input_dim: Number of input features.
            hidden_layers: Sequence of hidden-layer widths (may be empty).
            output_dim: Number of output classes.
            optimizer: One of ``OPTIMIZERS`` (case-insensitive).
            lr: Learning rate.
            beta1: Decay of the first-moment / momentum average.
            beta2: Decay of the second-moment average.
            eps: Constant added to denominators for numerical stability.

        Raises:
            ValueError: If ``optimizer`` is not a supported name. Previously
                an unknown name was accepted and every update was silently
                skipped.

        Note:
            Seeds NumPy's global RNG with 99. That fixes the initial weights
            and also the shuffling order later drawn in ``train``.
        """
        self.opt = optimizer.lower()
        if self.opt not in self.OPTIMIZERS:
            raise ValueError(
                f"Unknown optimizer {optimizer!r}; expected one of {self.OPTIMIZERS}"
            )
        np.random.seed(99)
        self.layers = [input_dim] + list(hidden_layers) + [output_dim]
        self.weights = {}
        self.biases = {}
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.time_step = 0
        for i in range(len(self.layers)-1):
            # Glorot/Xavier uniform bound; biases start at zero.
            limit = np.sqrt(6 / (self.layers[i] + self.layers[i+1]))
            self.weights[f"W{i+1}"] = np.random.uniform(-limit, limit, (self.layers[i], self.layers[i+1]))
            self.biases[f"b{i+1}"] = np.zeros((1, self.layers[i+1]))
        self.velocity_w = {k: np.zeros_like(v) for k, v in self.weights.items()}
        self.velocity_b = {k: np.zeros_like(v) for k, v in self.biases.items()}
        self.sqr_w = {k: np.zeros_like(v) for k, v in self.weights.items()}
        self.sqr_b = {k: np.zeros_like(v) for k, v in self.biases.items()}

    def forward(self, X):
        """Forward pass.

        Returns:
            ``(AL, store)`` where ``AL`` is the softmax output and ``store``
            maps ``'A<l>'`` / ``'Z<l>'`` to each layer's activation and
            pre-activation (``'A0'`` is ``X``).
        """
        store = {'A0': X}
        L = len(self.layers) - 1
        for l in range(1, L):
            Z = np.dot(store[f"A{l-1}"], self.weights[f"W{l}"]) + self.biases[f"b{l}"]
            A = Activations.relu(Z)
            store[f"Z{l}"], store[f"A{l}"] = Z, A
        ZL = np.dot(store[f"A{L-1}"], self.weights[f"W{L}"]) + self.biases[f"b{L}"]
        AL = Activations.softmax(ZL)
        store[f"Z{L}"], store[f"A{L}"] = ZL, AL
        return AL, store

    def backward(self, Y_pred, Y_true, store, loss_type="cross_entropy"):
        """Compute parameter gradients for one batch.

        Args:
            Y_pred: Softmax output of ``forward`` for the batch.
            Y_true: One-hot targets, same shape as ``Y_pred``.
            store: Activation cache returned by ``forward``.
            loss_type: ``"cross_entropy"`` or ``"squared_error"``.

        Returns:
            Dict with ``'dW<l>'`` and ``'db<l>'`` for every layer, each
            averaged over the batch.

        Raises:
            ValueError: If ``loss_type`` is not one of the two names above.
        """
        grads = {}
        L = len(self.layers)-1
        m = Y_true.shape[0]

        if loss_type == "cross_entropy":
            # Softmax + cross-entropy collapses to this simple form.
            dZ = Y_pred - Y_true
        elif loss_type == "squared_error":
            # The loss is sum((a - y)^2) per sample, so dL/da = 2 (a - y).
            # Every softmax output depends on every logit, so the full
            # Jacobian is needed: dL/dz_k = a_k * (g_k - sum_j g_j a_j).
            # The sigmoid-style a * (1 - a) factor keeps only the diagonal.
            dA = 2.0 * (Y_pred - Y_true)
            dZ = Y_pred * (dA - np.sum(dA * Y_pred, axis=1, keepdims=True))
        else:
            raise ValueError(
                f"Unknown loss_type {loss_type!r}; expected 'cross_entropy' or 'squared_error'"
            )

        grads[f"dW{L}"] = np.dot(store[f"A{L-1}"].T, dZ)/m
        grads[f"db{L}"] = np.sum(dZ, axis=0, keepdims=True)/m
        for l in reversed(range(1, L)):
            # Pull the error back through layer l+1, then through the ReLU.
            dA_prev = np.dot(dZ, self.weights[f"W{l+1}"].T)
            dZ = dA_prev * Activations.relu_der(store[f"Z{l}"])
            grads[f"dW{l}"] = np.dot(store[f"A{l-1}"].T, dZ)/m
            grads[f"db{l}"] = np.sum(dZ, axis=0, keepdims=True)/m
        return grads

    def _apply_update(self, params, key, grad, velocity, sqr):
        """Apply one optimizer step to ``params[key]`` using ``grad``.

        ``velocity`` and ``sqr`` are the matching optimizer-state dicts; they
        are updated as a side effect. Weights and biases share this routine
        so the two can never drift apart.
        """
        lr, b1, b2, eps, t = self.lr, self.beta1, self.beta2, self.eps, self.time_step
        if self.opt == 'sgd':
            params[key] -= lr * grad
        elif self.opt == 'momentum':
            # Exponential moving average of the gradients.
            velocity[key] = b1*velocity[key] + (1-b1)*grad
            params[key] -= lr * velocity[key]
        elif self.opt == 'nesterov':
            v_prev = velocity[key]
            velocity[key] = b1 * v_prev - lr * grad
            params[key] += -b1 * v_prev + (1 + b1) * velocity[key]
        elif self.opt == 'rmsprop':
            sqr[key] = b2*sqr[key] + (1-b2)*(grad**2)
            params[key] -= lr * grad / (np.sqrt(sqr[key])+eps)
        elif self.opt == 'adam':
            velocity[key] = b1*velocity[key] + (1-b1)*grad
            sqr[key] = b2*sqr[key] + (1-b2)*(grad**2)
            v_corr = velocity[key]/(1-b1**t)
            s_corr = sqr[key]/(1-b2**t)
            params[key] -= lr * v_corr / (np.sqrt(s_corr)+eps)
        elif self.opt == 'nadam':
            velocity[key] = b1*velocity[key] + (1-b1)*grad
            # Look-ahead first moment with a single shared bias correction.
            # NOTE(review): published Nadam corrects the two terms with
            # different factors; this form is kept as-is so training
            # results do not change.
            v_hat = (velocity[key]*b1 + (1-b1)*grad) / (1-b1**t)
            sqr[key] = b2*sqr[key] + (1-b2)*(grad**2)
            s_corr = sqr[key]/(1-b2**t)
            params[key] -= lr * v_hat / (np.sqrt(s_corr)+eps)
        else:
            # Reachable only if `self.opt` was reassigned after construction.
            raise ValueError(f"Unknown optimizer {self.opt!r}")

    def update_params(self, grads):
        """Advance the step counter and update every layer in place.

        Args:
            grads: Gradient dict as returned by ``backward``.

        Raises:
            ValueError: If ``self.opt`` is not a supported optimizer name.
        """
        self.time_step += 1
        for l in range(1, len(self.layers)):
            Wk, bk = f"W{l}", f"b{l}"
            self._apply_update(self.weights, Wk, grads[f"dW{l}"], self.velocity_w, self.sqr_w)
            self._apply_update(self.biases, bk, grads[f"db{l}"], self.velocity_b, self.sqr_b)

    def train(self, X_train, Y_train, X_test, Y_test, epochs=20, batch_size=64, loss_type="cross_entropy"):
        """Mini-batch training with per-epoch logging to Weights & Biases.

        After each epoch both losses are evaluated on the whole training set
        and accuracy on the test set; they are logged with ``wandb.log`` and
        the two losses are printed. A final line-series chart comparing the
        two loss curves is logged at the end. ``wandb.log`` is called
        directly, so a run presumably has to be active — confirm in the
        calling cell.

        Args:
            X_train, Y_train: Training inputs and one-hot targets.
            X_test, Y_test: Held-out inputs and one-hot targets.
            epochs: Number of passes over the training set.
            batch_size: Rows per mini-batch (the last batch may be smaller).
            loss_type: Loss whose gradient drives the updates; passed to
                ``backward``. The default keeps the original cross-entropy
                behaviour.
        """
        m = X_train.shape[0]
        loss_ce = []
        loss_se = []
        for ep in range(epochs):
            # Reshuffle every epoch so the batches differ between epochs.
            perm = np.random.permutation(m)
            X_shuf, Y_shuf = X_train[perm], Y_train[perm]
            for i in range(0, m, batch_size):
                Xb, Yb = X_shuf[i:i+batch_size], Y_shuf[i:i+batch_size]
                Y_pred, store = self.forward(Xb)
                grads = self.backward(Y_pred, Yb, store, loss_type=loss_type)
                self.update_params(grads)
            preds, _ = self.forward(X_train)
            ce = categorical_cross_entropy(preds, Y_train)
            se = squared_error_loss(preds, Y_train)
            preds_test, _ = self.forward(X_test)
            test_acc = np.mean(np.argmax(preds_test, axis=1) == np.argmax(Y_test, axis=1))
            loss_ce.append(ce)
            loss_se.append(se)
            # The logged epoch index is 0-based; the printed one is 1-based.
            wandb.log({
                "epoch": ep,
                "cross_entropy_loss": ce,
                "squared_error_loss": se,
                "test_accuracy": test_acc
            })
            print(
                f"Epoch {ep+1}/{epochs}, "
                f"CE Loss: {ce:.4f}, "
                f"SE Loss: {se:.4f}"
            )
        wandb.log({
            "loss_comparison": wandb.plot.line_series(
                xs=list(range(len(loss_ce))),
                ys=[loss_ce, loss_se],
                keys=["Cross Entropy", "Squared Error"],
                title="Loss Function Comparison",
                xname="Epoch"
            )
        })

In [6]:
# Load MNIST once; every configuration below trains on the same arrays.
(X_train, Y_train), (X_test, Y_test) = get_mnist(flatten=True)

# Training budget shared by all configurations.
EPOCHS = 10
BATCH_SIZE = 64

# One entry per experiment: run name, hidden-layer widths, optimizer, learning rate.
recommendations = [
    {
        "name": "Rec_1_Balanced_Adam",
        "hidden": [128, 128, 128],
        "opt": "adam",
        "lr": 0.001
    },
    {
        "name": "Rec_2_Deep_Nadam",
        "hidden": [128, 128, 128, 128, 128],
        "opt": "nadam",
        "lr": 0.001
    },
    {
        "name": "Rec_3_Light_Adam",
        "hidden": [64, 64],
        "opt": "adam",
        "lr": 0.001
    }
]

for rec in recommendations:
    # Each configuration gets its own W&B run, with the config dict attached.
    wandb.init(
        project="mnist-question10-final",
        name=rec["name"],
        config=rec
    )
    try:
        net = NeuralNet(
            input_dim=784,
            hidden_layers=rec["hidden"],
            output_dim=10,
            optimizer=rec["opt"],
            lr=rec["lr"]
        )
        print(f"\n--- Running {rec['name']} ---")
        net.train(X_train, Y_train, X_test, Y_test, epochs=EPOCHS, batch_size=BATCH_SIZE)
    except BaseException:
        # Close the run as failed, so an interrupted or crashing experiment
        # does not leave a run open for the next wandb.init().
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step



--- Running Rec_1_Balanced_Adam ---
Epoch 1/10, CE Loss: 0.1185, SE Loss: 0.0553
Epoch 2/10, CE Loss: 0.0622, SE Loss: 0.0293
Epoch 3/10, CE Loss: 0.0567, SE Loss: 0.0270
Epoch 4/10, CE Loss: 0.0365, SE Loss: 0.0181
Epoch 5/10, CE Loss: 0.0424, SE Loss: 0.0213
Epoch 6/10, CE Loss: 0.0336, SE Loss: 0.0167
Epoch 7/10, CE Loss: 0.0305, SE Loss: 0.0156
Epoch 8/10, CE Loss: 0.0180, SE Loss: 0.0091
Epoch 9/10, CE Loss: 0.0145, SE Loss: 0.0075
Epoch 10/10, CE Loss: 0.0263, SE Loss: 0.0138


0,1
cross_entropy_loss,█▄▄▂▃▂▂▁▁▂
epoch,▁▂▃▃▄▅▆▆▇█
squared_error_loss,█▄▄▃▃▂▂▁▁▂
test_accuracy,▁▆▅▇▆▆▇██▆

0,1
cross_entropy_loss,0.02632
epoch,9.0
squared_error_loss,0.01378
test_accuracy,0.9757



--- Running Rec_2_Deep_Nadam ---
Epoch 1/10, CE Loss: 0.1266, SE Loss: 0.0604
Epoch 2/10, CE Loss: 0.0776, SE Loss: 0.0370
Epoch 3/10, CE Loss: 0.0645, SE Loss: 0.0302
Epoch 4/10, CE Loss: 0.0478, SE Loss: 0.0229
Epoch 5/10, CE Loss: 0.0349, SE Loss: 0.0174
Epoch 6/10, CE Loss: 0.0337, SE Loss: 0.0168
Epoch 7/10, CE Loss: 0.0327, SE Loss: 0.0161
Epoch 8/10, CE Loss: 0.0239, SE Loss: 0.0115
Epoch 9/10, CE Loss: 0.0197, SE Loss: 0.0097
Epoch 10/10, CE Loss: 0.0205, SE Loss: 0.0102


0,1
cross_entropy_loss,█▅▄▃▂▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
squared_error_loss,█▅▄▃▂▂▂▁▁▁
test_accuracy,▁▅▅▆█▆▆███

0,1
cross_entropy_loss,0.0205
epoch,9.0
squared_error_loss,0.01021
test_accuracy,0.9784



--- Running Rec_3_Light_Adam ---
Epoch 1/10, CE Loss: 0.1752, SE Loss: 0.0802
Epoch 2/10, CE Loss: 0.1044, SE Loss: 0.0474
Epoch 3/10, CE Loss: 0.0872, SE Loss: 0.0413
Epoch 4/10, CE Loss: 0.0637, SE Loss: 0.0291
Epoch 5/10, CE Loss: 0.0514, SE Loss: 0.0241
Epoch 6/10, CE Loss: 0.0451, SE Loss: 0.0217
Epoch 7/10, CE Loss: 0.0404, SE Loss: 0.0194
Epoch 8/10, CE Loss: 0.0327, SE Loss: 0.0163
Epoch 9/10, CE Loss: 0.0306, SE Loss: 0.0148
Epoch 10/10, CE Loss: 0.0234, SE Loss: 0.0113


0,1
cross_entropy_loss,█▅▄▃▂▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
squared_error_loss,█▅▄▃▂▂▂▂▁▁
test_accuracy,▁▅▆▇▇█▇███

0,1
cross_entropy_loss,0.0234
epoch,9.0
squared_error_loss,0.01134
test_accuracy,0.9756
