# In [1]:

def backward_propagation(cache, target, W1, W2, learning_rate=0.1,
                         activation_function='linear', b1=None, b2=None):
    """
    Perform one backpropagation / gradient-descent step on a 2-layer network.

    The same activation is assumed for both layers. The output-layer delta is
    ``(A2 - target)`` multiplied by the activation derivative.

    Parameters:
    cache -- dictionary containing values from forward propagation; the keys
             "X", "A1" and "A2" are read. Optional keys "b1" / "b2" are used
             as the current biases when b1 / b2 are not passed explicitly.
             The number of examples is taken from X.shape[1].
    target -- target values, subtracted elementwise from A2
    W1, W2 -- weight matrices (the inputs are not modified; new arrays are
              returned)
    learning_rate -- learning rate for gradient descent
    activation_function -- 'linear' or 'sigmoid'
    b1, b2 -- current bias vectors. When omitted, cache["b1"] / cache["b2"]
              are used if present; otherwise the biases are treated as zeros
              with the shape of their gradients.

    Returns:
    Updated weights and biases W1, b1, W2, b2
    gradients -- dictionary containing the gradients dW1, db1, dW2, db2

    Raises:
    ValueError -- if activation_function is not 'linear' or 'sigmoid'
    """
    # Fail early with a clear message instead of hitting an unbound local later.
    if activation_function not in ('linear', 'sigmoid'):
        raise ValueError(
            "activation_function must be 'linear' or 'sigmoid', got %r"
            % (activation_function,)
        )
    use_sigmoid = activation_function == 'sigmoid'

    # Retrieve values from cache
    X = cache["X"]
    A1 = cache["A1"]
    A2 = cache["A2"]

    # Number of examples
    m = X.shape[1]

    # Calculate error at output
    error = A2 - target

    # Backpropagation for the second layer
    if use_sigmoid:
        dZ2 = error * A2 * (1 - A2)  # Derivative of sigmoid: f'(x) = f(x)(1-f(x))
    else:
        dZ2 = error  # Derivative of linear function is 1

    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Backpropagation for the first layer (uses W2 *before* it is updated)
    dA1 = np.dot(W2.T, dZ2)
    if use_sigmoid:
        dZ1 = dA1 * A1 * (1 - A1)  # Derivative of sigmoid
    else:
        dZ1 = dA1  # Derivative of linear function is 1

    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    # Resolve the current biases: explicit argument > cache entry > zeros.
    if b1 is None:
        b1 = cache.get("b1")
    if b1 is None:
        b1 = np.zeros_like(db1)
    if b2 is None:
        b2 = cache.get("b2")
    if b2 is None:
        b2 = np.zeros_like(db2)

    # Update weights and biases using gradient descent
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2

    # Store gradients for inspection
    gradients = {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2,
    }

    return W1, b1, W2, b2, gradients