In [77]:
import pandas as pd
import numpy as np

In [78]:
X = np.linspace(0, 10, 50).reshape(-1, 1)
noise = np.random.normal(0, 0.2, size=(50, 1))
y = np.log(X + 1) + noise

In [79]:
data={'X':X.flatten(),'y':y.flatten()}

In [80]:
df=pd.DataFrame(data)
df.head()

Unnamed: 0,X,y
0,0.0,-0.011801
1,0.204082,0.137718
2,0.408163,0.431927
3,0.612245,0.623751
4,0.816327,0.498844


In [81]:
df.shape

(50, 2)

In [82]:
#What input represents -
# X represents the input feature values.
# Target y increases non-linearly (log curve) with small noise.
#Why this range was chosen - So, we see both early rapid change and later saturation, which a linear model cannot fit

In [83]:
df.head()

Unnamed: 0,X,y
0,0.0,-0.011801
1,0.204082,0.137718
2,0.408163,0.431927
3,0.612245,0.623751
4,0.816327,0.498844


In [84]:
#Why target can't be fit well with a straight line - Because there is no linear relationshiop bw input and y. There is a non-linear relationship, so it can't be fit with a straight line.

In [85]:
#How many intermediate values - I chose 3
#Why more than 1? - More than 1 allows the model to combine multiple nonlinear pieces.
#why not too many? -Too many units would increase instability and over-complexity.

In [86]:
W1=np.random.normal(-1,1,size=(1,3))
b1=np.zeros((1,3))
W2=np.random.normal(-1,1,size=(3,1))
b2=np.zeros((1,1))

In [87]:
#What each parameter controls and Which ones affects shape vs position -
# W1: controls how input bends into hidden space (shape)
# b1: shifts hidden activations left/right (position)
# W2: combines hidden units to form output curve (shape)
# b2: shifts final prediction up/down (position)

In [88]:
def activation(z):
  return np.maximum(0, z)

In [89]:
def activation_slope(z):
  #return slope of activation
  return (z>0).astype(float)

In [90]:
def forward(X, W1, b1, W2, b2):
    z1 = X @ W1 + b1
    h = activation(z1)
    y_hat = h @ W2 + b2

    return z1, h, y_hat

In [91]:
def compute_loss(y_hat, y):
    error = y_hat - y
    loss = np.mean(error ** 2)
    return error, loss

In [92]:
def backward(X, y, z1, h, y_hat, W2):
    error = y_hat - y
    dL_dy = 2 * error / len(X)
    dL_dW2 = h.T @ dL_dy
    dL_db2 = np.sum(dL_dy, axis=0, keepdims=True)
    dL_dh = dL_dy @ W2.T
    dL_dz1 = dL_dh * activation_slope(z1)
    dL_dW1 = X.T @ dL_dz1
    dL_db1 = np.sum(dL_dz1, axis=0, keepdims=True)

    return dL_dW1, dL_db1, dL_dW2, dL_db2

In [93]:
def update(W1, b1, W2, b2, dW1, db1, dW2, db2, lr):

    W1 -= lr * dW1
    b1 -= lr * db1
    W2 -= lr * dW2
    b2 -= lr * db2

    return W1, b1, W2, b2

In [94]:
learning_rate = 0.01
epochs = 1000

for epoch in range(epochs):

    # forward
    z1, h, y_hat = forward(X, W1, b1, W2, b2)

    # loss
    error, loss = compute_loss(y_hat, y)

    # backward
    dW1, db1, dW2, db2 = backward(X, y, z1, h, y_hat, W2)

    # update
    W1, b1, W2, b2 = update(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

    if epoch % 100 == 0:
        print("Epoch:", epoch, "Loss:", loss)

Epoch: 0 Loss: 65.43763120729514
Epoch: 100 Loss: 0.5023449390377256
Epoch: 200 Loss: 0.4602784562432239
Epoch: 300 Loss: 0.45953859318994367
Epoch: 400 Loss: 0.4595255805180672
Epoch: 500 Loss: 0.4595253516518889
Epoch: 600 Loss: 0.4595253476266029
Epoch: 700 Loss: 0.4595253475558063
Epoch: 800 Loss: 0.4595253475545611
Epoch: 900 Loss: 0.45952534755453917
