In [1]:
import numpy as np

# Fix the seed of NumPy's global RNG so that re-running the cell from a fresh
# kernel reproduces the same noisy data (and the same np.random.* draws that follow).
SEED = 0
np.random.seed(SEED)

N = 100 # number of points per class
D = 2 # dimensionality
K = 2 # number of classes
X = np.zeros((N*K,D)) # data matrix (each row = single example)
y = np.zeros(N*K, dtype='uint8') # class labels

# Build one spiral arm per class: the radius grows linearly from 0 to 1 while the
# angle sweeps a 4-radian window that is offset by the class index.
for j in range(K):
  ix = slice(N*j, N*(j+1)) # rows of X / entries of y that belong to class j
  r = np.linspace(0.0,1,N) # radius
  # theta: evenly spaced angles plus uniform noise drawn from [0, 0.2)
  t = np.linspace(j*4,(j+1)*4,N) + np.random.rand(N)*0.2
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)] # polar -> cartesian, stored as (sin, cos) columns
  y[ix] = j

# Randomly initialise a network with two hidden layers: D -> h1 -> h2 -> K
h1, h2 = 50, 25  # widths of the first and second hidden layers

# Weight matrices are small Gaussian draws (standard normal scaled by 0.01),
# generated in layer order W1, W2, W3; each has shape (inputs, outputs).
W1, W2, W3 = (
    0.01 * np.random.randn(n_in, n_out)
    for n_in, n_out in ((D, h1), (h1, h2), (h2, K))
)

# Biases start at zero and are kept as row vectors of shape (1, outputs).
b1, b2, b3 = (np.zeros((1, width)) for width in (h1, h2, K))

# Show the shape of each weight matrix, one per line.
print(W1.shape, W2.shape, W3.shape, sep="\n")

# Some hyperparameters
step_size = 1  # learning rate of the plain full-batch gradient descent update
num_iterations = 5000  # number of gradient descent steps

# Gradient descent loop
num_examples = X.shape[0]
example_idx = np.arange(num_examples)  # row indices, used to pick each example's true-class entry
for i in range(num_iterations):
    # Forward pass
    hidden_layer1 = np.maximum(0, np.dot(X, W1) + b1)  # ReLU activation for the first hidden layer
    hidden_layer2 = np.maximum(0, np.dot(hidden_layer1, W2) + b2)  # ReLU activation for the second hidden layer
    scores = np.dot(hidden_layer2, W3) + b3

    # Compute the class probabilities (softmax).
    # Subtracting each row's maximum leaves the softmax unchanged mathematically but
    # keeps every exponent <= 0, so np.exp cannot overflow to inf.
    shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    sum_exp_scores = np.sum(exp_scores, axis=1, keepdims=True)  # >= 1, since the max entry contributes exp(0)
    probs = exp_scores / sum_exp_scores  # [num_examples x K]

    # One-off snapshot of the predicted probabilities part-way through training
    if i == 499:
        print(probs)

    # Compute the loss: average cross-entropy over all examples.
    # The log-probabilities are taken directly from the shifted scores (log-softmax)
    # instead of np.log(probs), so a probability that underflows to 0 cannot yield inf.
    log_probs = shifted_scores - np.log(sum_exp_scores)
    correct_logprobs = -log_probs[example_idx, y]
    data_loss = np.sum(correct_logprobs) / num_examples

    # L2 regularization is currently disabled, so the total loss is the data loss only.
    # (Enabling it would add 0.5 * reg * sum(W * W) over W1, W2, W3 here and
    # reg * W to the matching weight gradient below.)
    loss = data_loss

    if i % 50 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # Backpropagation: gradient of the mean cross-entropy w.r.t. the scores is
    # (probs - one_hot(y)) / num_examples. Work on a copy so probs itself is not modified.
    dscores = probs.copy()
    dscores[example_idx, y] -= 1
    dscores /= num_examples

    # Backpropagation for the third (output) layer
    dW3 = np.dot(hidden_layer2.T, dscores)
    db3 = np.sum(dscores, axis=0, keepdims=True)
    dhidden2 = np.dot(dscores, W3.T)
    dhidden2[hidden_layer2 <= 0] = 0  # ReLU passes no gradient where the unit was inactive

    # Backpropagation for the second hidden layer
    dW2 = np.dot(hidden_layer1.T, dhidden2)
    db2 = np.sum(dhidden2, axis=0, keepdims=True)
    dhidden1 = np.dot(dhidden2, W2.T)
    dhidden1[hidden_layer1 <= 0] = 0  # ReLU passes no gradient where the unit was inactive

    # Backpropagation for the first hidden layer
    dW1 = np.dot(X.T, dhidden1)
    db1 = np.sum(dhidden1, axis=0, keepdims=True)

    # Update parameters (in place) by stepping against the gradient
    W1 += -step_size * dW1
    b1 += -step_size * db1
    W2 += -step_size * dW2
    b2 += -step_size * db2
    W3 += -step_size * dW3
    b3 += -step_size * db3



(2, 50)
(50, 25)
(25, 2)
iteration 0: loss 0.693145
iteration 50: loss 0.693094
iteration 100: loss 0.692300
iteration 150: loss 0.418811
iteration 200: loss 0.451199
iteration 250: loss 0.240887
iteration 300: loss 0.172424
iteration 350: loss 0.175255
iteration 400: loss 0.140763
iteration 450: loss 0.111915
[[9.06350390e-01 9.36496102e-02]
 [8.97374312e-01 1.02625688e-01]
 [8.96612985e-01 1.03387015e-01]
 [8.84483024e-01 1.15516976e-01]
 [8.91197096e-01 1.08802904e-01]
 [8.82319869e-01 1.17680131e-01]
 [8.91174358e-01 1.08825642e-01]
 [8.88335497e-01 1.11664503e-01]
 [9.16949194e-01 8.30508056e-02]
 [9.24229648e-01 7.57703522e-02]
 [9.11246895e-01 8.87531055e-02]
 [9.08267459e-01 9.17325412e-02]
 [9.60454577e-01 3.95454234e-02]
 [9.66803585e-01 3.31964145e-02]
 [9.49524466e-01 5.04755342e-02]
 [9.80901991e-01 1.90980090e-02]
 [9.72373911e-01 2.76260895e-02]
 [9.77116370e-01 2.28836301e-02]
 [9.88699140e-01 1.13008602e-02]
 [9.88849360e-01 1.11506403e-02]
 [9.90102378e-01 9.89762250e