In [None]:
import numpy as np

In [None]:
class Neuron:
    """A single neuron: weighted sum of its inputs plus a bias, optionally activated."""

    def __init__(self, weights, bias, activation=None):
        """
        Store the parameters of the neuron.

        Args:
            weights: Weight vector for the neuron
            bias: Bias value added to the weighted sum
            activation: Optional callable applied to the weighted sum;
                None leaves the weighted sum unchanged
        """
        self.weights = weights
        self.bias = bias
        self.activation = activation

    def compute(self, inputs):
        """
        Evaluate the neuron on the given inputs.

        Args:
            inputs: Input vector

        Returns:
            ``activation(dot(inputs, weights) + bias)``, or the raw
            weighted sum when no activation was configured
        """
        weighted_sum = np.dot(inputs, self.weights) + self.bias

        # Guard clause: with no activation the neuron is purely linear.
        if self.activation is None:
            return weighted_sum
        return self.activation(weighted_sum)

In [None]:
# Predefined activations
def get_activation(name):
    """
    Look up a predefined activation by name.

    Args:
        name: One of 'tanh', 'sigmoid', 'relu', 'softmax', 'linear', or None
            (None is treated the same as 'linear').

    Returns:
        A tuple ``(activation, derivative)``. ``activation`` is a callable, or
        None for the linear/identity case. ``derivative`` is a callable that
        takes the pre-activation values and returns the element-wise
        derivative with the same shape.

    Raises:
        ValueError: If ``name`` is not one of the supported names.
    """
    if name == 'tanh':
        # Return np.tanh itself (not a wrapper) so callers can identify it.
        return np.tanh, lambda x: 1 - np.tanh(x) ** 2

    if name == 'sigmoid':
        def sigmoid(x):
            # Clip the argument so np.exp cannot overflow for large |x|.
            return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

        def sigmoid_derivative(x):
            # Evaluate the sigmoid once and reuse it: s * (1 - s).
            s = sigmoid(x)
            return s * (1 - s)

        return sigmoid, sigmoid_derivative

    if name == 'relu':
        def relu(x):
            return np.maximum(0, x)

        def relu_derivative(x):
            # np.asarray lets plain Python scalars and lists through as well;
            # a bare `x > 0` on a float has no .astype.
            return (np.asarray(x) > 0).astype(float)

        return relu, relu_derivative

    if name == 'softmax':
        def softmax(x):
            # Normalise over the last axis so both a single vector and a
            # (batch, classes) matrix work; for 2-D input this is axis 1.
            # Subtract the max for numerical stability.
            shifted = np.exp(x - np.max(x, axis=-1, keepdims=True))
            return shifted / np.sum(shifted, axis=-1, keepdims=True)

        def softmax_derivative(x):
            # Placeholder, NOT the true softmax Jacobian: returning ones
            # passes the incoming gradient through unchanged.
            # NOTE(review): this is only correct when the gradient fed into
            # the layer is already taken w.r.t. the logits - confirm usage.
            return np.ones_like(x)

        return softmax, softmax_derivative

    if name is None or name == 'linear':
        # Identity activation: no function to apply, derivative is 1.
        return None, lambda x: np.ones_like(x)

    raise ValueError(f"Unsupported activation function: {name}")

In [None]:
class Layer:
    """
    Fully connected layer with per-layer Adam optimizer state.

    Weights have shape (num_neurons, num_inputs) and biases have shape
    (num_neurons,). ``forward`` caches its input and pre-activation so that
    ``backward`` can compute gradients, which ``update_params_adam`` then
    applies.
    """

    def __init__(self, num_neurons, num_inputs, activation=None):
        """
        Initialize a layer with multiple neurons.

        Args:
            num_neurons: Number of neurons in the layer
            num_inputs: Number of inputs to each neuron
            activation: Activation for all neurons in the layer. Either a name
                understood by ``get_activation``, a ``(func, deriv)`` tuple
                (``func`` may be None for identity), or None for linear.

        Raises:
            ValueError: If ``activation`` is not a string, tuple, or None.
        """
        self.num_neurons = num_neurons
        self.num_inputs = num_inputs

        # Names (and None) are resolved through the lookup table; a tuple is
        # taken as an already-resolved (func, deriv) pair.
        if activation is None or isinstance(activation, str):
            self.activation, self.activation_derivative = get_activation(activation)
        elif isinstance(activation, tuple):
            self.activation, self.activation_derivative = activation
        else:
            raise ValueError("Activation function must be a string, tuple (func, deriv), or None")

        # Weight scale: sqrt(1/n) for tanh (Xavier-style), sqrt(2/n) for
        # every other activation (He-style), including sigmoid and linear.
        if self.activation == np.tanh or 'tanh' in str(self.activation):
            std = np.sqrt(1 / self.num_inputs)
        else:
            std = np.sqrt(2 / self.num_inputs)
        self.weights = np.random.randn(num_neurons, num_inputs) * std
        self.biases = np.zeros(num_neurons)

        # Values cached by forward() for use in backward()
        self.input = None
        self.z = None
        self.output = None

        # Gradients produced by backward(), consumed by update_params_adam()
        self.grad_weights = None
        self.grad_biases = None

        # Adam optimizer state (initialized to zero)
        self.m_weights = np.zeros_like(self.weights)  # 1st moment (mean)
        self.v_weights = np.zeros_like(self.weights)  # 2nd moment (uncentered variance)
        self.m_biases = np.zeros_like(self.biases)
        self.v_biases = np.zeros_like(self.biases)
        self.t = 0  # Timestep counter (for bias correction)

    def forward(self, inputs):
        """
        Compute the output of all neurons in the layer.

        Args:
            inputs: Input data (batch_size, num_inputs)

        Returns:
            Layer output (batch_size, num_neurons)
        """
        self.input = inputs

        # Vectorized pre-activation for all neurons at once
        self.z = np.dot(inputs, self.weights.T) + self.biases

        # A None activation means identity
        if self.activation is None:
            self.output = self.z
        else:
            self.output = self.activation(self.z)
        return self.output

    def backward(self, grad_output):
        """
        Compute gradients w.r.t. weights, biases, and input.

        The results are stored in ``self.grad_weights`` and
        ``self.grad_biases``.

        Args:
            grad_output: Gradient of the loss w.r.t. this layer's OUTPUT
                (dL/dy), shape (batch_size, num_neurons). Any averaging over
                the batch must already be contained in this gradient.

        Returns:
            Gradient of the loss w.r.t. this layer's INPUT (dL/dx)

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self.input is None or self.z is None:
            raise RuntimeError("forward() must be called before backward()")

        # Chain rule through the activation: dL/dz = dL/dy * dy/dz
        grad_z = grad_output * self.activation_derivative(self.z)

        # dL/dW = (dL/dz)^T @ input  -> shape (num_neurons, num_inputs)
        # dL/db = sum of dL/dz over the batch
        # No extra division by the batch size here: grad_output is already
        # the gradient of the (mean) loss, so dividing again would shrink the
        # parameter gradients by a further factor of batch_size.
        self.grad_weights = np.dot(grad_z.T, self.input)
        self.grad_biases = np.sum(grad_z, axis=0)

        # Gradient w.r.t. input for the previous layer: dL/dx = dL/dz @ W
        return np.dot(grad_z, self.weights)

    def update_params_adam(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Update weights and biases in place using the Adam optimizer.

        Args:
            learning_rate: Step size
            beta1: Decay rate of the first-moment estimate
            beta2: Decay rate of the second-moment estimate
            epsilon: Small constant added to the denominator for stability

        Raises:
            RuntimeError: If called before ``backward`` has produced gradients.
        """
        if self.grad_weights is None or self.grad_biases is None:
            raise RuntimeError("backward() must be called before update_params_adam()")

        self.t += 1  # Increment timestep

        # Update biased first moment estimate
        self.m_weights = beta1 * self.m_weights + (1 - beta1) * self.grad_weights
        self.m_biases = beta1 * self.m_biases + (1 - beta1) * self.grad_biases

        # Update biased second raw moment estimate
        self.v_weights = beta2 * self.v_weights + (1 - beta2) * (self.grad_weights ** 2)
        self.v_biases = beta2 * self.v_biases + (1 - beta2) * (self.grad_biases ** 2)

        # Bias correction: both moments start at zero, so early estimates
        # are scaled up by 1 / (1 - beta^t).
        first_correction = 1 - beta1 ** self.t
        second_correction = 1 - beta2 ** self.t
        m_weights_corr = self.m_weights / first_correction
        m_biases_corr = self.m_biases / first_correction
        v_weights_corr = self.v_weights / second_correction
        v_biases_corr = self.v_biases / second_correction

        # Update parameters
        self.weights -= learning_rate * m_weights_corr / (np.sqrt(v_weights_corr) + epsilon)
        self.biases -= learning_rate * m_biases_corr / (np.sqrt(v_biases_corr) + epsilon)

In [None]:
class NeuralNetwork:
    """
    Sequential stack of layers trained with Adam.

    ``loss_function`` is normally one of the names handled by
    ``compute_loss_and_grad`` ("mse", "binary_crossentropy",
    "categorical_crossentropy"). A callable ``f(y_pred, y_true)`` is accepted
    by ``loss`` only; training requires one of the named losses because it
    needs the gradient as well.
    """

    def __init__(self, layers, loss_function=None):
        """
        Args:
            layers: Ordered list of layer objects exposing ``forward``,
                ``backward`` and ``update_params_adam``
            loss_function: Loss name (see class docstring), a callable, or None
        """
        self.layers = layers
        self.loss_function = loss_function

    def forward(self, inputs):
        """
        Forward pass through all layers.

        Args:
            inputs: Input data (batch_size, num_inputs)

        Returns:
            Final output after passing through all layers
        """
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def loss(self, y_pred, y_true):
        """
        Evaluate the configured loss without computing gradients.

        Args:
            y_pred: Network output
            y_true: Targets with the same shape as ``y_pred``

        Returns:
            The loss value.

        Raises:
            ValueError: If no loss function is configured, or the configured
                name is not supported.
        """
        if self.loss_function is None:
            raise ValueError("No loss function provided")

        # A callable is invoked directly; a string name is resolved by the
        # same code that training uses, so both paths agree on the value.
        if callable(self.loss_function):
            return self.loss_function(y_pred, y_true)
        loss, _ = self.compute_loss_and_grad(y_pred, y_true)
        return loss

    def compute_loss_and_grad(self, y_pred, y_true):
        """
        Compute the named loss and its gradient w.r.t. ``y_pred``.

        For "categorical_crossentropy", ``y_pred`` is treated as raw logits:
        softmax is applied here and the returned gradient is w.r.t. the logits.

        Returns:
            (loss_value, grad_wrt_y_pred)

        Raises:
            ValueError: If ``self.loss_function`` is not a supported name.
        """
        if self.loss_function == "mse":
            diff = y_pred - y_true
            loss = np.mean(diff ** 2)
            grad = 2 * diff / y_pred.size  # dL/dy_pred of the mean over all elements
            return loss, grad

        if self.loss_function == "binary_crossentropy":
            # Clip predictions to prevent log(0) and division by zero
            y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
            loss = -np.mean(
                y_true * np.log(y_pred_clipped)
                + (1 - y_true) * np.log(1 - y_pred_clipped)
            )

            # The loss averages over every element, so the gradient is
            # divided by the element count (equal to the batch size when
            # there is a single output column).
            grad = (
                -(y_true / y_pred_clipped)
                + (1 - y_true) / (1 - y_pred_clipped)
            ) / y_pred.size
            return loss, grad

        if self.loss_function == "categorical_crossentropy":
            # Softmax over the class axis, shifted by the row max for stability
            exp_logits = np.exp(y_pred - np.max(y_pred, axis=1, keepdims=True))
            softmax_probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

            # Clip probabilities to prevent log(0)
            softmax_probs = np.clip(softmax_probs, 1e-7, 1 - 1e-7)

            loss = -np.mean(np.sum(y_true * np.log(softmax_probs), axis=1))

            # Combined softmax + cross-entropy gradient w.r.t. the logits:
            # (softmax(z) - y_true), averaged over the batch
            grad = (softmax_probs - y_true) / y_true.shape[0]
            return loss, grad

        raise ValueError(f"Unsupported loss function: {self.loss_function}")

    def backward(self, y_pred, y_true):
        """
        Backpropagate the loss gradient through every layer.

        Args:
            y_pred: Output of the preceding ``forward`` call
            y_true: Targets

        Returns:
            The loss value for this batch.
        """
        loss, grad_output = self.compute_loss_and_grad(y_pred, y_true)

        # Each layer turns the gradient w.r.t. its output into the gradient
        # w.r.t. its input, so layers are visited last-to-first.
        for layer in reversed(self.layers):
            grad_output = layer.backward(grad_output)

        return loss

    def update_params_adam(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Update all layers using Adam optimizer"""
        for layer in self.layers:
            layer.update_params_adam(learning_rate, beta1, beta2, epsilon)

    def train_step(self, X, y, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Run one forward/backward/update cycle on a single batch.

        Returns:
            The loss computed before the parameter update.
        """
        y_pred = self.forward(X)
        loss = self.backward(y_pred, y)
        self.update_params_adam(learning_rate, beta1, beta2, epsilon)
        return loss

    def train(self, X, y, epochs=100, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, batch_size=None):
        """
        Train the network using Adam optimizer.

        Prints the sample-weighted average loss every 10th epoch and on the
        final epoch.

        Args:
            X: Input data (n_samples, n_features)
            y: Targets (n_samples, n_outputs)
            epochs: Number of passes through the dataset
            learning_rate: Adam learning rate (default 0.001)
            beta1, beta2, epsilon: Adam hyper-parameters, passed to each layer
            batch_size: Mini-batch size; None (or 0) uses the full dataset

        Raises:
            ValueError: If the dataset is empty, X and y have different
                numbers of samples, or batch_size is negative.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot train on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}"
            )

        batch_size = batch_size or n_samples
        if batch_size < 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        for epoch in range(epochs):
            epoch_loss = 0.0

            # Reshuffle every epoch so mini-batches differ between epochs
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            for start in range(0, n_samples, batch_size):
                batch_X = X_shuffled[start:start + batch_size]
                batch_y = y_shuffled[start:start + batch_size]

                loss = self.train_step(batch_X, batch_y, learning_rate, beta1, beta2, epsilon)
                # Weight by batch length: the last batch may be shorter
                epoch_loss += loss * len(batch_X)

            avg_loss = epoch_loss / n_samples
            if epoch % 10 == 0 or epoch == epochs - 1:
                print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.6f}")

# XOR Problem

In [None]:
# XOR dataset
# Four 2-bit input rows and one target column holding the XOR of each row.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
y = np.array([[0], [1], [1], [0]], dtype=np.float32)

# Two-layer network: 2 inputs -> 8 tanh neurons -> 1 sigmoid output.
layers = [
    Layer(8, 2, activation='tanh'),
    Layer(1, 8, activation='sigmoid')
]
net = NeuralNetwork(layers, loss_function="binary_crossentropy")

# batch_size equals the number of samples, so each epoch is one full-batch step.
# NOTE(review): no random seed is set in the cells shown before this point,
# so weight initialization and the printed results may differ between runs.
net.train(X, y, epochs=500, learning_rate=0.01, batch_size=4)

# Show the network output next to the target for every input row.
print("\nPredictions:")
preds = net.forward(X)
for i in range(len(X)):
    print(f"{X[i]} → {preds[i][0]:.4f} (target: {y[i][0]})")

# Training on MNIST Dataset

In [None]:
from tensorflow.keras.datasets import mnist

# Set seed for reproducibility
# (seeds NumPy's global RNG, which is used for weight init and shuffling)
np.random.seed(42)

# Load MNIST data
# NOTE(review): presumably downloads/caches the dataset on first use and
# returns integer image arrays of shape (n, 28, 28) - confirm against Keras docs.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Normalize pixel values to [0, 1]
# Each image is flattened to a 784-element row first; dividing by 255 assumes
# the raw pixel values lie in 0..255.
X_train = X_train.reshape(-1, 28*28).astype(np.float32) / 255.0
X_test = X_test.reshape(-1, 28*28).astype(np.float32) / 255.0

# Convert labels to one-hot encoding (for cross-entropy loss)
def to_one_hot(y, num_classes=10):
    """
    Convert integer class labels to a one-hot matrix.

    Args:
        y: Integer class labels. Any array-like is accepted and flattened, so
            a list, a 1-D array, or an (n, 1) column vector all work.
        num_classes: Number of classes, i.e. the width of the result.

    Returns:
        Float array of shape (n_labels, num_classes) with a single 1 per row.

    Raises:
        IndexError: If any label lies outside ``[0, num_classes)``.
    """
    # Flatten first: indexing with an (n, 1) array would broadcast against
    # the row index and mark every listed class in every row.
    labels = np.asarray(y).ravel()

    # Reject out-of-range labels explicitly; a negative index would otherwise
    # wrap around silently and mark the wrong class.
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise IndexError(f"labels must be in the range [0, {num_classes})")

    one_hot = np.zeros((labels.size, num_classes))
    one_hot[np.arange(labels.size), labels] = 1
    return one_hot

# One-hot encode both label sets using the default of 10 classes.
y_train_onehot = to_one_hot(y_train)
y_test_onehot = to_one_hot(y_test)

# Sanity check: report the shapes of the flattened images and encoded labels.
print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train_onehot.shape}")

In [None]:
def accuracy(y_pred, y_true):
    """
    Compute classification accuracy.

    Args:
        y_pred: Predictions. A 2-D array with more than one column is treated
            as per-class scores (the predicted class is the arg-max of each
            row). A 1-D array or a single-column array is treated as binary
            scores thresholded at 0.5.
        y_true: Targets. For multi-class predictions either a one-hot matrix
            or a 1-D array of integer class labels; for binary predictions
            0/1 values of any shape with the same number of elements.

    Returns:
        Fraction of samples whose predicted class equals the true class.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    # Check ndim before shape[1] so that 1-D binary scores do not raise.
    if y_pred.ndim > 1 and y_pred.shape[1] > 1:  # Multi-class
        pred_classes = np.argmax(y_pred, axis=1)
        # 1-D targets are already class indices; 2-D targets are one-hot.
        true_classes = y_true if y_true.ndim == 1 else np.argmax(y_true, axis=1)
    else:  # Binary
        pred_classes = (y_pred > 0.5).astype(int).ravel()
        true_classes = y_true.ravel()
    return np.mean(pred_classes == true_classes)

In [None]:
# Build network: 784 → 128 → 64 → 10
# Layer arguments are (num_neurons, num_inputs), so each layer's input count
# matches the previous layer's neuron count.
layers = [
    Layer(128, 784, activation='relu'),
    Layer(64, 128, activation='relu'),
    Layer(10, 64, activation='linear')  # Output layer: logits for cross-entropy
]

# The "categorical_crossentropy" loss applies softmax to the network output
# itself, which is why the last layer is left linear.
net = NeuralNetwork(layers, loss_function="categorical_crossentropy")

# Train and Evaluate on MNIST

In [None]:
# Train on subset for faster testing (use full dataset for best results)
train_size = 10000  # Reduce for faster training
# Take the first train_size samples; images and one-hot labels stay aligned.
X_train_subset = X_train[:train_size]
y_train_subset = y_train_onehot[:train_size]

print("Starting training...")
net.train(
    X_train_subset, 
    y_train_subset, 
    epochs=50, 
    learning_rate=0.001, 
    batch_size=64
)

# Evaluate
# The network outputs raw logits; accuracy() takes the arg-max of each row,
# so no softmax is needed here.
train_preds = net.forward(X_train_subset)
train_acc = accuracy(train_preds, y_train_subset)
print(f"\nTraining Accuracy: {train_acc:.4f}")

# Accuracy on the full test set, which was not used for training.
test_preds = net.forward(X_test)
test_acc = accuracy(test_preds, y_test_onehot)
print(f"Test Accuracy: {test_acc:.4f}")