In [13]:
import numpy as np

# XOR truth table: one row per sample, two binary inputs per row.
datasetX = np.array([
    [0, 0],
    [1, 0],
    [0, 1],
    [1, 1],
])
# Matching targets as a column vector of shape (4, 1): 1 only when exactly one input is 1.
datasetY = np.array([0, 1, 1, 0]).reshape(-1, 1)

In the following we need a helper function:
$$\sigma (z) = \frac{1}{1+e^{-z}}= (1+e^{-z})^{-1}$$

The derivative of that helper function is also quite important:
$$\frac{d}{dz} \sigma =  \sigma (z) \cdot (1-\sigma(z))$$

This is a sigmoid activation function for the hidden and output layers.

In [14]:
def sigmoid(z):
    """Logistic sigmoid, applied element-wise: 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    Scalar or ndarray with the same shape as ``z``, with values in [0, 1].
    """
    # The input must be an explicit parameter: callers invoke sigmoid(Z1) / sigmoid(Z2).
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    """Derivative of the logistic sigmoid with respect to its input.

    Uses the identity sigma'(z) = sigma(z) * (1 - sigma(z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s) (not the already-activated output).

    Returns
    -------
    Scalar or ndarray with the same shape as ``z``.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = 1 / (1 + np.exp(-z))
    return s * (1 - s)

We also need a loss function. For simplicity the mean squared error function (MSE) is used: 
$$\mathrm{MSE} = \frac{1}{N} \sum_i (y_{\text{predicted}, \ i}-y_{\text{true},\ i })^2$$

In [15]:
def mse_loss(y_prediction, y_true):
    """Mean squared error between predictions and targets.

    Parameters
    ----------
    y_prediction : array-like
        Predicted values.
    y_true : array-like
        Target values; must be broadcast-compatible with ``y_prediction``.

    Returns
    -------
    float
        Mean of the squared element-wise differences over all entries.
    """
    # asarray lets plain Python lists be passed as well as ndarrays.
    diff = np.asarray(y_prediction) - np.asarray(y_true)
    return np.mean(diff ** 2)

For the example neural network, we can hand-select the number of hidden neurons. We will use:

2 input neurons
2 neurons in one hidden layer
1 output neuron

We also need to initialize the weights with small random numbers; the biases are set to zero (small random values would also work).

In [16]:
np.random.seed(42)  # fixed seed so the random initial weights are reproducible

# Layer sizes: 2 inputs -> 2 hidden neurons -> 1 output.
input_dim, hidden_dim, output_dim = 2, 2, 1

# Hidden layer: weights are standard-normal samples scaled by 0.1, biases start at zero.
W1 = 0.1 * np.random.randn(input_dim, hidden_dim)
b1 = np.zeros(shape=(1, hidden_dim))

# Output layer: same initialisation scheme (drawn after W1, so the draw order is fixed).
W2 = 0.1 * np.random.randn(hidden_dim, output_dim)
b2 = np.zeros(shape=(1, output_dim))

Now we need to define forward- and backward propagation.

For the forward propagation we have an input $X$ with the shape (N x 2):
1. We compute hidden layers: $$Z_1 = XW_1 + b_1$$
2. We apply activation (here the sigmoid): $$A_1 = \sigma(Z_1)$$
3. We compute the output layer: $$Z_2 = A_1 W_2 + b_2$$
4. We apply the output activation: $$\hat y = \sigma(Z_2)$$

In [17]:
def forward_propagation(X, W1, W2, b1, b2):
    """Run one forward pass through the two-layer sigmoid network.

    Note the argument order: both weight matrices come first, then both biases.

    Parameters
    ----------
    X : input samples, one per row.
    W1, b1 : weights and bias of the hidden layer.
    W2, b2 : weights and bias of the output layer.

    Returns
    -------
    (Z1, Z2, A1, y_prediction)
        Hidden pre-activation, output pre-activation, hidden activation and
        the network output, in that order.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = sigmoid(hidden_pre)

    output_pre = np.dot(hidden_act, W2) + b2
    return hidden_pre, output_pre, hidden_act, sigmoid(output_pre)

As we now have forward propagation we need a backwards one too:
1. We compute loss gradient $$\frac{\partial \mathrm{Loss}}{\partial \hat y} = 2 \cdot \frac{(\hat y - y)}{N}$$
2. We calculate the gradient at the output layer $$\delta_2 = \frac{\partial \mathrm{Loss}}{\partial \hat y}\cdot \sigma'(Z_2)$$
3. We calculate the gradient for the hidden layer: $$\delta_1 = \delta_2 W_2^T \cdot \sigma'(Z_1)$$
4. Lastly we calculate the gradients with respect to the weights and biases $$\begin{align} \frac{\partial \mathrm{Loss}}{\partial W_2} &= A_1^T \delta_2 \\ \frac{\partial \mathrm{Loss}}{\partial b_2} & = \sum_i \delta_2 \\ \frac{\partial \mathrm{Loss}}{\partial W_1} &= X^T \delta_1 \\ \frac{\partial \mathrm{Loss}}{\partial b_1} &= \sum_i \delta_1 \end{align}$$

In [18]:
def backward_propagation(X, y, Z1, Z2, A1, y_prediction, W1, W2, b1, b2, learning_rate = 0.1):
    """Backpropagate the MSE loss and apply one gradient-descent update.

    Parameters
    ----------
    X, y : input samples (one per row) and their targets.
    Z1, Z2, A1, y_prediction : intermediate values returned by the forward pass.
    W1, W2, b1, b2 : current parameters; updated with ``-=``, which modifies
        them in place when they are NumPy arrays.
    learning_rate : step size of the gradient-descent update.

    Returns
    -------
    (W1, W2, b1, b2) after the update.
    """
    n_samples = X.shape[0]

    # Output-layer error: dLoss/dy_hat = 2 * (y_hat - y) / N, chained with sigmoid'(Z2).
    delta_out = (2 * (y_prediction - y) / n_samples) * sigmoid_derivative(Z2)
    # Hidden-layer error: propagate back through W2 (still the pre-update value here).
    delta_hidden = np.dot(delta_out, W2.T) * sigmoid_derivative(Z1)

    # Gradients w.r.t. weights and biases; bias gradients sum over the sample axis.
    grad_W2 = np.dot(A1.T, delta_out)
    grad_b2 = np.sum(delta_out, axis=0, keepdims=True)
    grad_W1 = np.dot(X.T, delta_hidden)
    grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True)

    # Gradient-descent step.
    W2 -= learning_rate * grad_W2
    b2 -= learning_rate * grad_b2
    W1 -= learning_rate * grad_W1
    b1 -= learning_rate * grad_b1

    return W1, W2, b1, b2

Now the neural network should be ready to be trained, with a training loop:
1. Forward propagation
2. Compute loss
3. Backpropagation
4. Update parameters