# Week 02: Neural Networks from Scratch

Building neural networks from first principles using only NumPy.

## Learning Objectives
1. Understand forward propagation mathematically
2. Implement backpropagation using the chain rule
3. Build a complete neural network class

In [None]:
import numpy as np
from typing import List, Tuple, Callable

## 1. The Neuron Model

### 1.1 Activation Functions

In [None]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Logistic sigmoid, σ(x) = 1 / (1 + e^(-x)), applied element-wise.

    The argument is clamped to [-500, 500] before exponentiation, which keeps
    np.exp inside the float64 range for arbitrarily large inputs.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def sigmoid_derivative(x: np.ndarray) -> np.ndarray:
    """Element-wise derivative of the sigmoid: σ'(x) = σ(x) * (1 - σ(x)).

    `x` is the pre-activation value; the sigmoid is evaluated once and reused.
    """
    activated = sigmoid(x)
    complement = 1 - activated
    return activated * complement

def relu(x: np.ndarray) -> np.ndarray:
    """Rectified linear unit: element-wise max(0, x)."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x: np.ndarray) -> np.ndarray:
    """Element-wise ReLU gradient: 1.0 where x > 0, otherwise 0.0 (including at x == 0)."""
    is_positive = x > 0
    return is_positive.astype(float)

def tanh(x: np.ndarray) -> np.ndarray:
    """Hyperbolic tangent activation, applied element-wise via np.tanh."""
    squashed = np.tanh(x)
    return squashed

def tanh_derivative(x: np.ndarray) -> np.ndarray:
    """Element-wise derivative of tanh: 1 - tanh²(x), with x the pre-activation."""
    t = np.tanh(x)
    return 1 - t ** 2

In [None]:
# Spot-check the activation functions by printing each one's value at x = 2.
# NOTE(review): `x` is a 100-point grid on [-5, 5] that the prints below do not
# use — presumably intended for a plot; confirm before removing it.
x = np.linspace(-5, 5, 100)
print("Activation function outputs at x=2:")
print(f"  sigmoid(2) = {sigmoid(np.array([2]))[0]:.4f}")
print(f"  relu(2) = {relu(np.array([2]))[0]:.4f}")
print(f"  tanh(2) = {tanh(np.array([2]))[0]:.4f}")

## 2. Dense Layer

In [None]:
class DenseLayer:
    """
    Fully connected layer (affine map followed by an activation) that updates
    its own parameters with plain gradient descent during `backward`.

    Shape:
    - Input: (batch_size, n_inputs)
    - Output: (batch_size, n_outputs)
    - Weights: (n_inputs, n_outputs)

    Recognised activation names are 'relu', 'sigmoid' and 'tanh'. Any other
    value makes the layer linear (the pre-activation is returned unchanged).
    """

    def __init__(self, n_inputs: int, n_outputs: int, activation: str = 'relu'):
        """Draw He-style weights (standard normal * sqrt(2 / n_inputs)) and zero biases."""
        gaussian = np.random.randn(n_inputs, n_outputs)
        self.W = gaussian * np.sqrt(2.0 / n_inputs)
        self.b = np.zeros(n_outputs)
        self.activation = activation

    def _activate(self, z: np.ndarray) -> np.ndarray:
        """Apply the configured activation element-wise to pre-activation `z`."""
        kind = self.activation
        if kind == 'relu':
            return np.maximum(0, z)
        if kind == 'sigmoid':
            # Clamp before exponentiating so np.exp stays in float64 range.
            bounded = np.clip(z, -500, 500)
            return 1 / (1 + np.exp(-bounded))
        if kind == 'tanh':
            return np.tanh(z)
        # Unrecognised name: identity.
        return z

    def _activate_derivative(self, z: np.ndarray) -> np.ndarray:
        """Return d(activation)/dz evaluated element-wise at pre-activation `z`."""
        kind = self.activation
        if kind == 'relu':
            return (z > 0).astype(float)
        if kind == 'sigmoid':
            sig = self._activate(z)
            return sig * (1 - sig)
        if kind == 'tanh':
            return 1 - np.tanh(z) ** 2
        # Identity activation has unit slope everywhere.
        return np.ones_like(z)

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Compute the layer output for `X`, caching X, z and a for `backward`."""
        self.X = X
        pre_activation = X @ self.W + self.b
        self.z = pre_activation
        self.a = self._activate(pre_activation)
        return self.a

    def backward(self, da: np.ndarray, learning_rate: float) -> np.ndarray:
        """
        Backpropagate through the layer and take one gradient-descent step.

        Args:
            da: Gradient of the loss with respect to this layer's output,
                same shape as the output of the last `forward` call.
            learning_rate: Step size applied to the batch-averaged gradients.

        Returns:
            Gradient of the loss with respect to the layer input (not
            divided by the batch size), for the preceding layer.
        """
        n_rows = self.X.shape[0]
        delta = da * self._activate_derivative(self.z)

        # Parameter gradients are averaged over the batch dimension.
        grad_W = (self.X.T @ delta) / n_rows
        grad_b = np.sum(delta, axis=0) / n_rows

        # The input gradient must be formed with the weights used in the
        # forward pass, so compute it before the in-place update below.
        grad_X = delta @ self.W.T

        self.W -= learning_rate * grad_W
        self.b -= learning_rate * grad_b
        return grad_X

In [None]:
# Test layer
# Push a random batch (5 samples, 3 features) through a 3 -> 2 ReLU layer and
# print the input and output shapes; the output is expected to be (5, 2).
layer = DenseLayer(3, 2, activation='relu')
X = np.random.randn(5, 3)
output = layer.forward(X)
print(f"Input shape: {X.shape}")
print(f"Output shape: {output.shape}")

## 3. Complete Neural Network

In [None]:
class NeuralNetwork:
    """
    Feed-forward network for binary targets, built from DenseLayer objects and
    trained with mini-batch gradient descent on binary cross-entropy.

    The last layer always uses a sigmoid activation; every earlier layer uses
    the `activation` passed to the constructor.
    """

    def __init__(self, layer_sizes: List[int], activation: str = 'relu'):
        """
        Args:
            layer_sizes: Unit counts per layer, input size first
                (e.g. [2, 5, 1]). Each consecutive pair becomes one DenseLayer.
            activation: Activation name for all layers except the last.
        """
        self.layers = []
        last_index = len(layer_sizes) - 2
        for i, (n_in, n_out) in enumerate(zip(layer_sizes, layer_sizes[1:])):
            act = 'sigmoid' if i == last_index else activation
            self.layers.append(DenseLayer(n_in, n_out, act))

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Run `X` through every layer in order and return the final activations."""
        a = X
        for layer in self.layers:
            a = layer.forward(a)
        return a

    def compute_loss(self, y_pred: np.ndarray, y_true: np.ndarray) -> float:
        """
        Mean binary cross-entropy over all elements.

        Predictions are clipped to [1e-10, 1 - 1e-10] so that log() never
        receives 0.
        """
        eps = 1e-10
        y_pred = np.clip(y_pred, eps, 1 - eps)
        loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        return loss

    def backward(self, y_pred: np.ndarray, y_true: np.ndarray, learning_rate: float):
        """
        Backpropagate the cross-entropy gradient and update every layer.

        Args:
            y_pred: Network output from the preceding `forward` call (the
                layers use the inputs/pre-activations cached by that call).
            y_true: Targets with the same shape as `y_pred`.
            learning_rate: Step size forwarded to each layer's `backward`.
        """
        eps = 1e-10
        y_pred = np.clip(y_pred, eps, 1 - eps)
        # Per-element dL/d(y_pred) of binary cross-entropy; the batch average
        # is taken inside each layer. The output layer multiplies this by its
        # sigmoid derivative, giving approximately (y_pred - y_true).
        da = (y_pred - y_true) / (y_pred * (1 - y_pred) + eps)
        for layer in reversed(self.layers):
            da = layer.backward(da, learning_rate)

    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100,
            learning_rate: float = 0.01, batch_size: int = 32, verbose: bool = True,
            log_every: int = 100):
        """
        Train with shuffled mini-batches.

        Args:
            X: Inputs, one row per sample.
            y: Targets, one row per sample (same number of rows as `X`).
            epochs: Number of passes over the data.
            learning_rate: Gradient-descent step size.
            batch_size: Samples per mini-batch; the last batch may be smaller.
            verbose: If true, print the loss every `log_every` epochs.
            log_every: Logging interval in epochs (default 100).

        Returns:
            List with one average loss per epoch. Each batch loss is computed
            before that batch's update and weighted by the batch size, so a
            short final batch is not over-represented.

        Raises:
            ValueError: If the dataset is empty, `X` and `y` disagree on the
                number of samples, or `batch_size` / `log_every` is < 1.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("fit() requires at least one training sample")
        if len(y) != n_samples:
            raise ValueError(
                f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
            )
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if log_every < 1:
            raise ValueError(f"log_every must be >= 1, got {log_every}")

        history = []
        for epoch in range(epochs):
            # Fresh sample order every epoch.
            indices = np.random.permutation(n_samples)
            weighted_loss = 0.0
            for start in range(0, n_samples, batch_size):
                batch_idx = indices[start:start + batch_size]
                X_batch = X[batch_idx]
                y_batch = y[batch_idx]
                y_pred = self.forward(X_batch)
                loss = self.compute_loss(y_pred, y_batch)
                weighted_loss += loss * len(batch_idx)
                self.backward(y_pred, y_batch, learning_rate)
            avg_loss = weighted_loss / n_samples
            history.append(avg_loss)
            if verbose and (epoch + 1) % log_every == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
        return history

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the network's sigmoid outputs for `X` (no thresholding is applied)."""
        return self.forward(X)

In [None]:
# Train neural network on XOR
# The four binary input pairs and their XOR labels (one row per sample).
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Seed NumPy's global RNG so weight initialisation and the per-epoch shuffle
# are reproducible.
np.random.seed(42)
# 2 inputs -> 5 hidden units (default 'relu') -> 1 sigmoid output.
nn = NeuralNetwork([2, 5, 1])
# batch_size=4 equals the dataset size, so every epoch is one full-batch update.
history = nn.fit(X, y, epochs=2000, learning_rate=0.5, batch_size=4, verbose=False)

# Report the last epoch's average loss and the network outputs for the four inputs.
print(f"Final loss: {history[-1]:.6f}")
print(f"Predictions: {nn.predict(X).flatten()}")

## 4. Backpropagation Visualization

In [None]:
def backprop_visualization():
    """
    Print a hand-worked forward pass and the output-layer weight gradient.

    Uses one sample and fixed weights for a bias-free 2-3-1 network with a
    ReLU hidden layer and a sigmoid output. Prints the output activation and
    dW2; returns nothing.
    """
    # One training example and hard-coded weights.
    sample = np.array([[1.0, 2.0]])
    target = np.array([[1.0]])
    hidden_weights = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
    output_weights = np.array([[0.7], [0.8], [0.9]])

    # Forward pass: ReLU hidden layer, then sigmoid output.
    hidden = np.maximum(0, sample @ hidden_weights)
    logit = hidden @ output_weights
    a2 = 1 / (1 + np.exp(-logit))

    print(f"Forward: a2={a2}")

    # Backward pass. The output delta is taken as (prediction - target),
    # presumably the combined sigmoid + cross-entropy gradient.
    output_delta = a2 - target
    dW2 = hidden.T @ output_delta
    # Gradient reaching the hidden activations; computed to show the next
    # step of the chain rule but not used or printed here.
    hidden_grad = output_delta @ output_weights.T

    print(f"Backward dW2:\n{dW2}")

# Run the demo: prints the forward output a2 and the output-layer gradient dW2.
backprop_visualization()

## 5. Exercises: Dropout and Batch Norm

In [None]:
class Dropout:
    """
    Inverted dropout.

    While `training` is True, each element is zeroed with probability `p` and
    the survivors are scaled by 1 / (1 - p). While `training` is False the
    layer is the identity in both directions.
    """

    def __init__(self, p: float = 0.5):
        """
        Args:
            p: Probability of dropping an element, in [0, 1].

        Raises:
            ValueError: If `p` lies outside [0, 1].
        """
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability must be in [0, 1], got {p}")
        self.p = p
        self.training = True
        # Mask from the most recent forward pass; None means "no mask applied".
        self.mask = None

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Apply dropout to `X` in training mode; return `X` unchanged otherwise."""
        if not self.training:
            # Discard any stale training mask so backward() matches this pass.
            self.mask = None
            return X
        keep_prob = 1.0 - self.p
        if keep_prob == 0.0:
            # p == 1 drops everything; avoid the 0/0 that would produce NaNs.
            self.mask = np.zeros(X.shape)
        else:
            self.mask = (np.random.rand(*X.shape) > self.p) / keep_prob
        return X * self.mask

    def backward(self, da: np.ndarray) -> np.ndarray:
        """
        Route the upstream gradient `da` through the mask of the last forward pass.

        If no mask is active (eval mode, or forward not yet called), the
        gradient is returned unchanged.
        """
        if self.mask is None:
            return da
        return da * self.mask

class BatchNorm:
    """
    Batch normalization over axis 0 (forward pass only in this block).

    In training mode the batch mean and (population) variance are used and
    folded into exponential running averages; otherwise the running
    statistics are used. The normalized input is then scaled by `gamma` and
    shifted by `beta`.
    """

    def __init__(self, n_features: int, eps: float = 1e-5, momentum: float = 0.1):
        """
        Args:
            n_features: Length of the per-feature parameter and statistic vectors.
            eps: Constant added to the variance before the square root.
            momentum: Weight given to the current batch statistics when
                updating the running averages.
        """
        # Hyper-parameters.
        self.eps = eps
        self.momentum = momentum
        # Scale and shift start out as the identity transform.
        self.gamma = np.ones(n_features)
        self.beta = np.zeros(n_features)
        # Running statistics start at zero mean and unit variance.
        self.running_mean = np.zeros(n_features)
        self.running_var = np.ones(n_features)

    def forward(self, X: np.ndarray, training: bool = True) -> np.ndarray:
        """Normalize `X` and apply gamma/beta; caches the normalized input in `x_norm`."""
        if not training:
            mean, var = self.running_mean, self.running_var
        else:
            mean = X.mean(axis=0)
            var = X.var(axis=0)
            retain = 1 - self.momentum
            self.running_mean = retain * self.running_mean + self.momentum * mean
            self.running_var = retain * self.running_var + self.momentum * var
        std = np.sqrt(var + self.eps)
        self.x_norm = (X - mean) / std
        return self.gamma * self.x_norm + self.beta