In [1]:
from fully_connected import FullyConnectedLayer
import numpy as np


def test_fully_connected_layer():
    # Set up test parameters
    np.random.seed(0)
    input_dim = 4
    output_dim = 3
    batch_size = 5
    
    # Create a FullyConnectedLayer instance
    fc = FullyConnectedLayer(input_dim, output_dim, learning_rate=0.01)
    
    # Create random input data
    X = np.random.randn(batch_size, input_dim)
    
    # -------- Forward Pass --------
    output = fc.forward(X)
    print("Forward pass output:\n", output)
    
    # -------- Compute Loss & d_out --------
    # Using squared loss: L = 0.5 * sum(output^2)
    loss = 0.5 * np.sum(output ** 2)
    # The derivative of this loss with respect to output is simply output
    d_out = output
    
    # -------- Backward Pass --------
    dx,_,_ = fc.backward(d_out)
    print("\nGradient with respect to input (dx):\n", dx)
    print("\nAnalytical gradient for weights (dweights):\n", fc.dweights)
    print("\nAnalytical gradient for bias (dbias):\n", fc.dbias)
    
    # -------- Gradient Check --------
    epsilon = 1e-5
    # Numerical gradient for weights
    num_dweights = np.zeros_like(fc.weights)
    for i in range(fc.weights.shape[0]):
        for j in range(fc.weights.shape[1]):
            original_val = fc.weights[i, j]
            # Perturb weights positively
            fc.weights[i, j] = original_val + epsilon
            output_plus = fc.forward(X)
            loss_plus = 0.5 * np.sum(output_plus ** 2)
            
            # Perturb weights negatively
            fc.weights[i, j] = original_val - epsilon
            output_minus = fc.forward(X)
            loss_minus = 0.5 * np.sum(output_minus ** 2)
            
            # Restore original value
            fc.weights[i, j] = original_val
            
            # Compute numerical gradient
            num_dweights[i, j] = (loss_plus - loss_minus) / (2 * epsilon)
    
    # Numerical gradient for bias
    num_dbias = np.zeros_like(fc.bias)
    for j in range(fc.bias.shape[1]):
        original_val = fc.bias[0, j]
        fc.bias[0, j] = original_val + epsilon
        output_plus = fc.forward(X)
        loss_plus = 0.5 * np.sum(output_plus ** 2)
        
        fc.bias[0, j] = original_val - epsilon
        output_minus = fc.forward(X)
        loss_minus = 0.5 * np.sum(output_minus ** 2)
        
        fc.bias[0, j] = original_val
        num_dbias[0, j] = (loss_plus - loss_minus) / (2 * epsilon)
    
    print("\nNumerical gradient for weights (num_dweights):\n", num_dweights)
    print("\nMax absolute difference in weights gradient:", np.max(np.abs(fc.dweights - num_dweights)))
    
    print("\nNumerical gradient for bias (num_dbias):\n", num_dbias)
    print("\nMax absolute difference in bias gradient:", np.max(np.abs(fc.dbias - num_dbias)))
    
if __name__ == '__main__':
    test_fully_connected_layer()

Forward pass output:
 [[-0.04441203  0.78826677  0.57660598]
 [ 0.2661469   0.39007234  0.45195581]
 [-0.11615365 -0.88345875  0.39322788]
 [ 0.11939388  1.0420691   0.06827389]
 [ 0.16113172  0.71970105  0.8418569 ]]

Gradient with respect to input (dx):
 [[ 0.41979034  0.04064291  1.07199642  0.46623325]
 [ 0.26547981  0.08906956  0.64023037  0.1774453 ]
 [-0.28769188  0.2214638  -0.28985703 -0.43111009]
 [ 0.43899848 -0.11894462  0.80075827  0.54077901]
 [ 0.46152035  0.13907364  1.22623105  0.39902649]]

Analytical gradient for weights (dweights):
 [[ 1.17835853e+00  6.90654483e+00  1.55551558e+00]
 [-7.28083142e-02 -1.01960974e+00  1.37215174e+00]
 [-6.36805741e-03 -1.32492681e-01  8.70915047e-01]
 [-1.17343536e-01  6.62642904e-01 -1.79875942e-01]]

Analytical gradient for bias (dbias):
 [[0.38610682 2.0566505  2.33192046]]

Numerical gradient for weights (num_dweights):
 [[ 1.17835853e+00  6.90654483e+00  1.55551558e+00]
 [-7.28083142e-02 -1.01960974e+00  1.37215174e+00]
 [-6.368

In [3]:
import numpy as np

def test_backward():
    np.random.seed(42)  # For reproducibility

    # Define layer dimensions
    batch_size = 5
    input_dim = 4
    output_dim = 3

    # Create random inputs and weights
    inputs = np.random.randn(batch_size, input_dim)
    weights = np.random.randn(input_dim, output_dim)
    bias = np.random.randn(1, output_dim)
    
    # Create a fake upstream gradient (dout) from next layer
    dout = np.random.randn(batch_size, output_dim)

    # Define FullyConnectedLayer class
    class FullyConnectedLayer:
        def __init__(self, input_dim, output_dim):
            self.input_dim = input_dim
            self.output_dim = output_dim
            self.weights = np.random.randn(input_dim, output_dim)
            self.bias = np.random.randn(1, output_dim)

        def forward(self, x):
            self.input = x
            return np.dot(x, self.weights) + self.bias
        
        def backward(self, dout):
            self.dweights = np.dot(self.input.T, dout)
            self.dbias = np.sum(dout, axis=0, keepdims=True)
            dx = np.dot(dout, self.weights.T)
            return dx, self.dweights, self.dbias
    
    fc = FullyConnectedLayer(input_dim, output_dim)
    fc.input = inputs
    fc.weights = weights
    fc.bias = bias

    # Compute gradients using backward pass
    dx, dweights, dbias = fc.backward(dout)

    # Numerical gradient checking
    epsilon = 1e-5
    num_dweights = np.zeros_like(weights)
    num_dbias = np.zeros_like(bias)

    # Compute numerical gradients for weights
    for i in range(weights.shape[0]):
        for j in range(weights.shape[1]):
            weights[i, j] += epsilon
            loss1 = np.sum((np.dot(inputs, weights) + bias) * dout)  # Forward with perturbed weight
            weights[i, j] -= 2 * epsilon
            loss2 = np.sum((np.dot(inputs, weights) + bias) * dout)  # Forward with perturbed weight
            weights[i, j] += epsilon  # Reset to original
            num_dweights[i, j] = (loss1 - loss2) / (2 * epsilon)

    # Compute numerical gradients for bias
    for j in range(bias.shape[1]):
        bias[0, j] += epsilon
        loss1 = np.sum((np.dot(inputs, weights) + bias) * dout)
        bias[0, j] -= 2 * epsilon
        loss2 = np.sum((np.dot(inputs, weights) + bias) * dout)
        bias[0, j] += epsilon  # Reset to original
        num_dbias[0, j] = (loss1 - loss2) / (2 * epsilon)

    # Print results
    print("Max absolute difference in weight gradient:", np.max(np.abs(dweights - num_dweights)))
    print("Max absolute difference in bias gradient:", np.max(np.abs(dbias - num_dbias)))

    # Check if gradients are close
    assert np.allclose(dweights, num_dweights, atol=1e-4), "Weight gradients do not match numerical gradients!"
    assert np.allclose(dbias, num_dbias, atol=1e-4), "Bias gradients do not match numerical gradients!"
    print("Backward pass gradient check PASSED!")

# Run test
test_backward()


Max absolute difference in weight gradient: 1.6426970894656279e-10
Max absolute difference in bias gradient: 9.207967721636123e-11
Backward pass gradient check PASSED!
