In [3]:
import numpy as np
def stable_softmax(scores):
    """Return row-wise softmax probabilities of a 2-D score array.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the probabilities into ``nan``) when the raw
    scores become large.

    Args:
        scores: Array of shape (num_rows, num_classes) with raw class scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)


# ---------------------------------------------------------------------------
# Synthetic data: K classes of N two-dimensional points each. Every class is
# an arc whose radius grows from 0 to 1 while the angle sweeps a 4-radian
# window (offset per class) with a little uniform noise added.
# ---------------------------------------------------------------------------
np.random.seed(42)  # fixed seed so the data and the initial weights are reproducible
N = 100  # number of points per class
D = 2  # dimensionality
K = 2  # number of classes
X = np.zeros((N * K, D))  # data matrix (each row = single example)
y = np.zeros(N * K, dtype='uint8')  # class labels

r = np.linspace(0.0, 1, N)  # radius (identical for every class, so computed once)
for j in range(K):
    ix = range(N * j, N * (j + 1))  # rows of X / y that belong to class j
    t = np.linspace(j * 4, (j + 1) * 4, N) + np.random.rand(N) * 0.2  # theta
    X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y[ix] = j

# ---------------------------------------------------------------------------
# Parameters of a D -> h1 -> h2 -> K fully connected network.
# NOTE(review): np.random.rand samples uniformly from [0, 1), so every initial
# weight is non-negative. Zero-mean np.random.randn is the more common choice;
# it is left unchanged here so the training trajectory stays the same.
# ---------------------------------------------------------------------------
h1 = 50  # Size of the first hidden layer
h2 = 25  # Size of the second hidden layer
W1 = 0.01 * np.random.rand(D, h1)
b1 = np.zeros((1, h1))
W2 = 0.01 * np.random.rand(h1, h2)
b2 = np.zeros((1, h2))
W3 = 0.01 * np.random.rand(h2, K)
b3 = np.zeros((1, K))

# Some hyperparameters
step_size = 1

# ---------------------------------------------------------------------------
# Full-batch gradient descent
# ---------------------------------------------------------------------------
num_examples = X.shape[0]
rows = np.arange(num_examples)  # row index used to pick each example's true-class entry
for i in range(500):
    # Forward pass: two ReLU hidden layers followed by a linear score layer
    hidden_layer1 = np.maximum(0, np.dot(X, W1) + b1)
    hidden_layer2 = np.maximum(0, np.dot(hidden_layer1, W2) + b2)
    scores = np.dot(hidden_layer2, W3) + b3

    # Class probabilities, shape [num_examples x K]
    probs = stable_softmax(scores)

    # Loss: average cross-entropy of the true class
    correct_logprobs = -np.log(probs[rows, y])
    loss = np.sum(correct_logprobs) / num_examples
    if i % 50 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # Gradient of the loss w.r.t. the scores: (probs - one_hot(y)) / num_examples.
    # Work on a copy so `probs` keeps holding the actual probabilities.
    dscores = probs.copy()
    dscores[rows, y] -= 1
    dscores /= num_examples

    # Backpropagation for the third (output) layer
    dW3 = np.dot(hidden_layer2.T, dscores)
    db3 = np.sum(dscores, axis=0, keepdims=True)
    dhidden2 = np.dot(dscores, W3.T)

    # Backpropagation for the second hidden layer.
    # ReLU passes gradient only where its output was positive.
    dhidden2[hidden_layer2 <= 0] = 0
    dW2 = np.dot(hidden_layer1.T, dhidden2)
    db2 = np.sum(dhidden2, axis=0, keepdims=True)
    dhidden1 = np.dot(dhidden2, W2.T)

    # Backpropagation for the first hidden layer
    dhidden1[hidden_layer1 <= 0] = 0
    dW1 = np.dot(X.T, dhidden1)
    db1 = np.sum(dhidden1, axis=0, keepdims=True)

    # Vanilla gradient-descent parameter update (in place)
    W1 -= step_size * dW1
    b1 -= step_size * db1
    W2 -= step_size * dW2
    b2 -= step_size * db2
    W3 -= step_size * dW3
    b3 -= step_size * db3
    

iteration 0: loss 0.693150
iteration 50: loss 0.693102
iteration 100: loss 0.692310
iteration 150: loss 0.435163
iteration 200: loss 0.478356
iteration 250: loss 0.438333
iteration 300: loss 0.387940
iteration 350: loss 0.289622
iteration 400: loss 0.180240
iteration 450: loss 0.162727
