Name :- Heet Dhanuka

Batch :- B2

Roll No. :- 34

DL practical 4

---------

Aim :- Create a neural network from scratch for a multiclass classification
task with the following architecture:

Input layer: 4 neurons
First hidden layer: 3 neurons
Second hidden layer: 4 neurons
Output layer: 3 neurons

Generate a random dataset for a 3-class classification problem. Apply
the designed neural network to the generated dataset, using the ReLU activation
function in the hidden layers and the softmax activation function in the output
layer for multiclass classification. Use categorical cross-entropy as the loss
function for training. Additionally, implement the Gradient Descent,
Momentum-based GD, NAG, AdaGrad, RMSprop, and Adam optimizers as part of the
training process. Train the neural network using all optimizers and evaluate
their performance on the dataset using accuracy as the performance metric.

In [32]:
# import library
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [33]:
# Synthetic dataset for a 3-class problem
num_samples = 1000
num_features = 4
num_classes = 3

# Features come from np.random.rand and labels from np.random.randint.
# NOTE: the labels are sampled independently of the features, so there is
# no feature/label relationship for a model to learn from this data.
X = np.random.rand(num_samples, num_features)
y = np.random.randint(0, num_classes, num_samples)  # integer labels 0, 1, 2

# One-hot encode by comparing every label against the class indices
y_onehot = (y[:, None] == np.arange(num_classes)).astype(float)

In [34]:
# Hold out the last 20% of the rows for testing (no shuffling is done here)
train_size = int(0.8 * num_samples)

X_train = X[:train_size]
X_test = X[train_size:]

y_train = y_onehot[:train_size]
y_test = y_onehot[train_size:]

In [35]:
# Defining Activation function
# relu function
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    floor = 0
    return np.maximum(floor, z)

# relu derivative function
def relu_derivative(z):
    """Return 1.0 where ``z`` is strictly positive and 0.0 everywhere else."""
    is_positive = z > 0
    return is_positive.astype(float)

# SoftMax function
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that
    large scores do not overflow; the result of the softmax is unchanged
    by that shift. Each output row sums to 1.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    weights = np.exp(z - row_max)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser

# loss function
def cross_entropy_loss(y_true, y_pred):
    """Mean categorical cross-entropy between one-hot targets and predictions.

    A constant 1e-8 is added to ``y_pred`` before taking the logarithm so
    that a predicted probability of exactly 0 does not produce ``-inf``.
    """
    log_probs = np.log(y_pred + 1e-8)
    per_sample = np.sum(y_true * log_probs, axis=1)
    return -np.mean(per_sample)

In [36]:
# accuracy function :
def accuracy(y_true, y_pred):
    """Fraction of rows whose arg-max class agrees in ``y_true`` and ``y_pred``."""
    true_labels = np.argmax(y_true, axis=1)
    predicted_labels = np.argmax(y_pred, axis=1)
    hits = true_labels == predicted_labels
    return np.mean(hits)

In [37]:
# Initialize network parameters
def initialize_parameters(seed=42):
    """Create the weights and biases of the 4-3-4-3 network.

    Weights are standard-normal draws scaled by 0.01; biases start at zero.
    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    calling this function does not reset the process-wide NumPy random
    stream. For the default seed the returned values are the same as those
    produced by seeding the global generator with 42.

    Args:
        seed: Seed for the private random generator (default 42).

    Returns:
        dict with keys "W1" (4, 3), "b1" (1, 3), "W2" (3, 4), "b2" (1, 4),
        "W3" (4, 3) and "b3" (1, 3).
    """
    rng = np.random.RandomState(seed)
    scale = 0.01
    # Draw order W1 -> W2 -> W3 determines which random numbers each matrix
    # receives, so it must stay fixed for reproducibility.
    params = {
        "W1": rng.randn(4, 3) * scale,
        "b1": np.zeros((1, 3)),
        "W2": rng.randn(3, 4) * scale,
        "b2": np.zeros((1, 4)),
        "W3": rng.randn(4, 3) * scale,
        "b3": np.zeros((1, 3)),
    }
    return params

In [38]:
# Forward and backward propagation
def forward_propagation(X, params):
    """Run the three-layer forward pass.

    Layers 1 and 2 apply ``relu`` and layer 3 applies ``softmax`` to the
    affine transform ``input . W + b``.

    Returns:
        A tuple ``(A3, cache)`` where ``A3`` is the softmax output and
        ``cache`` maps "Z1", "A1", "Z2", "A2", "Z3", "A3" to the
        pre-activation and activation of each layer.
    """
    cache = {}
    activation = X
    for layer, nonlinearity in ((1, relu), (2, relu), (3, softmax)):
        pre_activation = (
            np.dot(activation, params[f"W{layer}"]) + params[f"b{layer}"]
        )
        activation = nonlinearity(pre_activation)
        cache[f"Z{layer}"] = pre_activation
        cache[f"A{layer}"] = activation
    return activation, cache

In [39]:
def backward_propagation(X, y, params, cache):
    """Back-propagate through the three layers and return the gradients.

    The output-layer error is ``A3 - y``. Each hidden-layer error is the
    next layer's error multiplied by that layer's transposed weights and
    masked with ``relu_derivative`` of the stored pre-activation. All
    gradients are divided by the number of rows in ``X``.

    Returns:
        dict with keys "dW1", "db1", "dW2", "db2", "dW3", "db3".
    """
    m = X.shape[0]

    def layer_gradients(layer_input, delta):
        # Weight and bias gradients of one layer, averaged over the batch.
        d_weights = np.dot(layer_input.T, delta) / m
        d_bias = np.sum(delta, axis=0, keepdims=True) / m
        return d_weights, d_bias

    # Output layer
    delta3 = cache["A3"] - y
    dW3, db3 = layer_gradients(cache["A2"], delta3)

    # Second hidden layer
    delta2 = np.dot(delta3, params["W3"].T) * relu_derivative(cache["Z2"])
    dW2, db2 = layer_gradients(cache["A1"], delta2)

    # First hidden layer
    delta1 = np.dot(delta2, params["W2"].T) * relu_derivative(cache["Z1"])
    dW1, db1 = layer_gradients(X, delta1)

    return {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2,
        "dW3": dW3,
        "db3": db3,
    }

In [40]:
# Gradient Descent Optimizer
def gradient_descent(params, grads, learning_rate):
    """Apply one plain gradient-descent step to ``params`` in place.

    For every entry ``name`` of ``params`` the gradient is looked up as
    ``grads["d" + name]``. The same (mutated) dict is returned.
    """
    for name in params:
        step = learning_rate * grads["d" + name]
        params[name] -= step
    return params

In [41]:
# Momentum based GD
def momentum_gd(params, grads, learning_rate, momentum=0.9, v=None):
    """Apply one momentum gradient-descent step to ``params`` in place.

    The velocity is updated as ``momentum * v + learning_rate * grad`` and
    then subtracted from the parameter. When ``v`` is None a zero velocity
    is created for every parameter.

    Returns:
        A tuple ``(params, v)`` with the updated parameters and velocities.
    """
    if v is None:
        v = {name: np.zeros_like(value) for name, value in params.items()}
    for name in params:
        velocity = momentum * v[name] + learning_rate * grads["d" + name]
        v[name] = velocity
        params[name] -= velocity
    return params, v

In [42]:
# NAG optimizer
def nag(params, grads, learning_rate, momentum=0.9, v=None):
    """Apply one Nesterov accelerated gradient (NAG) step to ``params`` in place.

    Uses the reformulated Nesterov update, which only needs the gradient at
    the current parameters:

        v      <- momentum * v + learning_rate * grad
        params <- params - (momentum * v + learning_rate * grad)

    The extra ``momentum * v`` look-ahead term is what separates NAG from
    classical momentum, whose parameter update is simply ``params -= v``.

    Args:
        params: dict of parameter arrays, updated in place.
        grads: dict of gradients keyed by ``"d" + name`` for each parameter.
        learning_rate: step size.
        momentum: velocity decay factor (default 0.9).
        v: velocities from the previous step, or None to start from zero.

    Returns:
        A tuple ``(params, v)`` with the updated parameters and velocities.
    """
    if v is None:
        v = {key: np.zeros_like(params[key]) for key in params}
    for key in params:
        scaled_grad = learning_rate * grads["d" + key]
        v[key] = momentum * v[key] + scaled_grad
        # Step along the already-updated velocity plus the fresh gradient
        # (look-ahead), not just the velocity as in plain momentum.
        params[key] -= momentum * v[key] + scaled_grad
    return params, v



In [43]:
# AdaGrad optimizer
def adagrad(params, grads, learning_rate, epsilon=1e-8, cache=None):
    """Apply one AdaGrad step to ``params`` in place.

    ``cache`` accumulates the squared gradients of every parameter; the
    step for each element is ``learning_rate / sqrt(cache + epsilon)``
    times its gradient (``epsilon`` is added inside the square root).
    When ``cache`` is None a zero accumulator is created.

    Returns:
        A tuple ``(params, cache)``.
    """
    if cache is None:
        cache = {name: np.zeros_like(value) for name, value in params.items()}
    for name in params:
        grad = grads["d" + name]
        cache[name] += grad**2
        scaled_rate = learning_rate / np.sqrt(cache[name] + epsilon)
        params[name] -= scaled_rate * grad
    return params, cache



In [44]:
# RMSprop optimizer
def rmsprop(params, grads, learning_rate, decay_rate=0.9, epsilon=1e-8, cache=None):
    """Apply one RMSprop step to ``params`` in place.

    ``cache`` holds an exponential moving average of the squared gradients
    (weight ``decay_rate`` on the old value); the step for each element is
    ``learning_rate / sqrt(cache + epsilon)`` times its gradient. When
    ``cache`` is None a zero average is created.

    Returns:
        A tuple ``(params, cache)``.
    """
    if cache is None:
        cache = {name: np.zeros_like(value) for name, value in params.items()}
    for name in params:
        grad = grads["d" + name]
        running_avg = decay_rate * cache[name] + (1 - decay_rate) * grad**2
        cache[name] = running_avg
        params[name] -= (learning_rate / np.sqrt(running_avg + epsilon)) * grad
    return params, cache



In [45]:
# Adam optimizer
def adam(params, grads, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8, m=None, v=None, t=0):
    """Apply one Adam step to ``params`` in place.

    ``m`` and ``v`` are exponential moving averages of the gradient and of
    the squared gradient; both are bias-corrected with the step counter
    ``t`` (incremented at the start of the call) before the update
    ``learning_rate / sqrt(v_hat + epsilon) * m_hat`` is subtracted
    (``epsilon`` is added inside the square root). Missing ``m`` / ``v``
    are initialised to zeros.

    Returns:
        A tuple ``(params, m, v, t)`` to be passed back in on the next step.
    """
    if m is None:
        m = {name: np.zeros_like(value) for name, value in params.items()}
    if v is None:
        v = {name: np.zeros_like(value) for name, value in params.items()}

    t += 1
    # The bias-correction denominators depend only on t, not on the parameter.
    first_correction = 1 - beta1**t
    second_correction = 1 - beta2**t

    for name in params:
        grad = grads["d" + name]
        m[name] = beta1 * m[name] + (1 - beta1) * grad
        v[name] = beta2 * v[name] + (1 - beta2) * grad**2

        m_hat = m[name] / first_correction
        v_hat = v[name] / second_correction

        params[name] -= (learning_rate / np.sqrt(v_hat + epsilon)) * m_hat
    return params, m, v, t

In [46]:
# Map a display name to each optimizer step function defined in this notebook.
optimizers = {
    "GD": gradient_descent,
    "Momentum": momentum_gd,
    "NAG": nag,
    "AdaGrad": adagrad,
    "RMSprop": rmsprop,
    "Adam": adam
}

# Train one network per optimizer and report its accuracy on the held-out set.
for optimizer_name, optimizer_func in optimizers.items():
    print(f"\nTraining with {optimizer_name}:")
    # NOTE(review): `train` is not defined in the visible part of the notebook;
    # presumably it runs the given optimizer for `epochs` iterations and
    # returns the fitted parameter dict - confirm against its definition.
    trained_params = train(X_train, y_train, optimizer_func, learning_rate=0.01, epochs=100)

    # Evaluate on the test split; only the output probabilities are needed,
    # so the forward-pass cache is discarded.
    A3_test, _ = forward_propagation(X_test, trained_params)
    test_accuracy = accuracy(y_test, A3_test)
    print(f"Test Accuracy ({optimizer_name}): {test_accuracy:.4f}")


Training with GD:
Epoch 10/100, Loss: 1.0986, Accuracy: 0.3450
Epoch 20/100, Loss: 1.0986, Accuracy: 0.3450
Epoch 30/100, Loss: 1.0986, Accuracy: 0.3450
Epoch 40/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 50/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 60/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 70/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 80/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 90/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 100/100, Loss: 1.0984, Accuracy: 0.3450
Test Accuracy (GD): 0.3800

Training with Momentum:
Epoch 10/100, Loss: 1.0985, Accuracy: 0.3450
Epoch 20/100, Loss: 1.0984, Accuracy: 0.3450
Epoch 30/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 40/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 50/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 60/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 70/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 80/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 90/100, Loss: 1.0983, Accuracy: 0.3450
Epoch 100/100, Loss: 1.0983, Accuracy: 0.3450
Test Accuracy (Momentum): 0