# In [None]:
import numpy as np

# ---------------------------
# Helper Functions
# ---------------------------
def softmax(x):
    """
    Row-wise softmax along axis 1.

    The maximum along axis 1 is subtracted before exponentiating so that
    large logits do not overflow; the result is mathematically unchanged.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    normaliser = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / normaliser

def cross_entropy_loss(probs, labels, eps=1e-12):
    """
    Computes the mean cross-entropy loss.
    Assumes labels are class indices.

    Args:
        probs: 2-D array of predicted probabilities, one row per sample.
        labels: integer class index for each row of ``probs``.
        eps: lower bound applied to the selected probabilities before the
            logarithm, so a predicted probability of exactly 0 for the true
            class yields a large finite loss instead of ``inf``.

    Returns:
        The negative log-likelihood of the true classes, averaged over rows.
    """
    N = probs.shape[0]
    # Probability assigned to the true class of each sample.
    true_class_probs = probs[np.arange(N), labels]
    # Values >= eps pass through untouched; only exact/near zeros are floored.
    safe_probs = np.maximum(true_class_probs, eps)
    loss = -np.sum(np.log(safe_probs)) / N
    return loss

def d_loss_softmax(probs, labels):
    """
    Gradient of the mean cross-entropy loss with respect to the logits,
    for a softmax output layer.

    The result is ``probs`` with 1 subtracted at each sample's true class,
    divided by the number of samples. ``probs`` itself is not modified.
    """
    n_samples = probs.shape[0]
    sample_idx = np.arange(n_samples)
    delta = probs.copy()
    # Subtract the one-hot target without materialising it.
    delta[sample_idx, labels] = delta[sample_idx, labels] - 1
    delta /= n_samples
    return delta

# ---------------------------
# Convolutional Layer Class
# ---------------------------
class Conv2D:
    """
    2-D convolution layer over (batch, channels, height, width) inputs,
    followed by a ReLU non-linearity.

    ``forward`` stores ``input`` (the un-padded input), ``conv_pre``
    (pre-activation output) and ``output`` (post-ReLU output) on the
    instance; ``backward`` reads ``input``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, learning_rate=0.01, stride=1, padding=0, activation='relu'):
        """
        Args:
            in_channels: number of channels in the input.
            out_channels: number of filters (channels in the output).
            kernel_size: side length of the square kernel.
            learning_rate: step size used by ``update_params``.
            stride: step between successive kernel positions.
            padding: number of zero rows/columns added on each spatial side.
            activation: stored on the instance only; ``forward`` always
                applies ReLU regardless of this value.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.activation = activation
        self.learning_rate = learning_rate

        # Xavier (Glorot) uniform initialization: U(-limit, limit) with
        # limit = sqrt(6 / (fan_in + fan_out)).
        fan_in = in_channels * kernel_size * kernel_size
        fan_out = out_channels * kernel_size * kernel_size
        limit = np.sqrt(6 / (fan_in + fan_out))
        self.weights = np.random.uniform(-limit, limit, (out_channels, in_channels, kernel_size, kernel_size))
        self.biases = np.zeros((out_channels, 1))

    def pad_input(self, x):
        """Zero-pad the two spatial axes of ``x`` by ``self.padding`` on each side."""
        if self.padding > 0:
            return np.pad(x, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        return x

    def forward(self, x):
        """
        Forward pass: performs convolution and applies ReLU.
        Stores the pre-activation output for backpropagation.

        Args:
            x: array of shape (batch, in_channels, height, width).

        Returns:
            The post-ReLU output of shape
            (batch, out_channels, out_height, out_width).
        """
        self.input = x
        x_padded = self.pad_input(x)
        batch_size, _, input_height, input_width = x.shape

        out_height = (input_height - self.kernel_size + 2*self.padding) // self.stride + 1
        out_width  = (input_width  - self.kernel_size + 2*self.padding) // self.stride + 1

        conv_pre = np.zeros((batch_size, self.out_channels, out_height, out_width))
        # Convolution operation: for every output position, contract the
        # receptive field with all filters at once (batch and output-channel
        # loops are handled inside tensordot).
        for i in range(out_height):
            h_start = i * self.stride
            h_end = h_start + self.kernel_size
            for j in range(out_width):
                w_start = j * self.stride
                w_end = w_start + self.kernel_size
                patch = x_padded[:, :, h_start:h_end, w_start:w_end]
                # (batch, C, k, k) x (out_channels, C, k, k) -> (batch, out_channels)
                conv_pre[:, :, i, j] = np.tensordot(
                    patch, self.weights, axes=([1, 2, 3], [1, 2, 3])
                )
        # One bias per output channel, broadcast over batch and positions.
        conv_pre += self.biases.reshape(1, -1, 1, 1)

        self.conv_pre = conv_pre  # store pre-activation values
        # Apply ReLU activation
        self.output = np.maximum(0, conv_pre)
        return self.output

    def backward(self, d_out):
        """
        Backward pass through the convolution only: computes gradients with
        respect to the input, weights, and biases.

        This method does NOT apply the ReLU derivative. ``d_out`` must
        therefore be the gradient with respect to the pre-activation output
        (``self.conv_pre``); callers holding a gradient with respect to the
        activated output should first multiply it by ``self.conv_pre > 0``.

        Args:
            d_out: array of shape (batch, out_channels, out_height, out_width).

        Returns:
            Tuple ``(d_x, d_w, d_b)`` shaped like the forward input,
            ``self.weights`` and ``self.biases`` respectively.
        """
        _, _, out_height, out_width = d_out.shape
        x_padded = self.pad_input(self.input)
        d_x = np.zeros_like(x_padded)
        d_w = np.zeros_like(self.weights)

        for i in range(out_height):
            h_start = i * self.stride
            h_end = h_start + self.kernel_size
            for j in range(out_width):
                w_start = j * self.stride
                w_end = w_start + self.kernel_size

                grad = d_out[:, :, i, j]  # (batch, out_channels)
                patch = x_padded[:, :, h_start:h_end, w_start:w_end]
                # Sum over the batch: (out_channels, C, k, k)
                d_w += np.tensordot(grad, patch, axes=([0], [0]))
                # Sum over output channels: (batch, C, k, k)
                d_x[:, :, h_start:h_end, w_start:w_end] += np.tensordot(
                    grad, self.weights, axes=([1], [0])
                )

        # Each bias contributes to every sample and every output position.
        d_b = d_out.sum(axis=(0, 2, 3), dtype=self.biases.dtype).reshape(self.biases.shape)

        # Drop the gradient that fell on the zero padding.
        if self.padding > 0:
            d_x = d_x[:, :, self.padding:-self.padding, self.padding:-self.padding]

        return d_x, d_w, d_b

    def update_params(self, d_w, d_b):
        """
        Updates the weights and biases in place using gradient descent.

        Args:
            d_w: gradient with the same shape as ``self.weights``.
            d_b: gradient with the same shape as ``self.biases``.
        """
        self.weights -= self.learning_rate * d_w
        self.biases -= self.learning_rate * d_b

# ---------------------------
# Fully Connected Layer Class
# ---------------------------
class FullyConnectedLayer:
    """
    Dense (affine) layer: ``output = X . weights + bias``.

    ``forward`` caches its input and output; ``backward`` stores the
    parameter gradients as ``dweights`` / ``dbias`` for ``update_params``.
    """

    def __init__(self, input_dim, output_dim, learning_rate=0.01):
        # Uniform initialization bounded by sqrt(6 / (input_dim + output_dim)).
        bound = np.sqrt(6 / (input_dim + output_dim))
        weight_shape = (input_dim, output_dim)
        self.learning_rate = learning_rate
        self.weights = np.random.uniform(-bound, bound, weight_shape)
        self.bias = np.zeros((1, output_dim))

    def forward(self, X):
        """
        Compute the logits for ``X`` and remember ``X`` for the backward pass.
        """
        self.input = X
        logits = np.dot(X, self.weights) + self.bias
        self.output = logits
        return self.output

    def backward(self, d_out):
        """
        Given the gradient with respect to this layer's output, store the
        weight and bias gradients and return the gradient with respect to
        the layer's input.
        """
        cached_input_t = self.input.T
        self.dweights = np.dot(cached_input_t, d_out)
        # Bias is shared across rows, so its gradient sums over axis 0.
        self.dbias = np.sum(d_out, axis=0, keepdims=True)
        return np.dot(d_out, self.weights.T)

    def update_params(self):
        """
        Apply one in-place gradient-descent step using the stored gradients.
        """
        lr = self.learning_rate
        self.weights -= lr * self.dweights
        self.bias -= lr * self.dbias

# ---------------------------
# Sample Training Step
# ---------------------------
if __name__ == "__main__":
    # Fix the RNG so the data and the initial weights are reproducible.
    np.random.seed(42)

    # Toy batch: two single-channel 5x5 images with class labels from {0, 1, 2}.
    X = np.random.randn(2, 1, 5, 5)
    labels = np.array([0, 2])

    # Build the network: a 3x3 conv with two filters, then a dense classifier.
    # A 5x5 input with a 3x3 kernel, stride 1 and no padding yields 3x3 maps,
    # so each sample flattens to 2 * 3 * 3 = 18 features.
    conv = Conv2D(
        in_channels=1,
        out_channels=2,
        kernel_size=3,
        learning_rate=0.01,
        stride=1,
        padding=0,
        activation='relu',
    )
    fc = FullyConnectedLayer(input_dim=18, output_dim=3, learning_rate=0.01)

    # ---- Forward: conv (+ReLU) -> flatten -> dense -> softmax ----
    conv_out = conv.forward(X)
    fc_input = conv_out.reshape(len(conv_out), -1)
    fc_logits = fc.forward(fc_input)
    probs = softmax(fc_logits)

    # Report the loss for this batch.
    loss = cross_entropy_loss(probs, labels)
    print("Loss:", loss)

    # ---- Backward ----
    # Combined softmax + cross-entropy gradient with respect to the logits, shape (2, 3).
    d_logits = d_loss_softmax(probs, labels)

    # Dense layer: get the gradient for its input (shape (2, 18)) and then
    # take the gradient step on its parameters.
    d_fc_input = fc.backward(d_logits)
    fc.update_params()

    # Undo the flatten so the gradient lines up with the conv output (2, 2, 3, 3).
    d_conv_out = d_fc_input.reshape(conv_out.shape)

    # conv.backward does not differentiate the ReLU, so gate the gradient
    # here: it passes where the pre-activation was positive and is zeroed
    # everywhere else.
    d_conv_pre = (conv.conv_pre > 0) * d_conv_out

    # Convolution layer: compute gradients and take the gradient step.
    d_X, d_w, d_b = conv.backward(d_conv_pre)
    conv.update_params(d_w, d_b)

    print("Backward pass complete. Parameters updated.")
