In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the np.random.randn weight draws in later cells are reproducible.
np.random.seed(42)

In [2]:
# Toy dataset: four samples with two features each, so X has shape (4, 2).
X = np.array([
    [5.1, 1.4],
    [7.0, 4.7],
    [6.3, 6.0],
    [4.9, 1.5],
])
# Integer class label (0, 1 or 2) for each row of X.
y = np.array([0, 1, 2, 0])


In [3]:
# Echo the feature matrix (4 samples x 2 features) to check it before the forward pass.
X


array([[5.1, 1.4],
       [7. , 4.7],
       [6.3, 6. ],
       [4.9, 1.5]])

In [4]:
# Hidden layer: 2 input features -> 4 neurons.
# Weights are small Gaussian draws (scaled by 0.01); biases start at zero.
w1 = 0.01 * np.random.randn(2, 4)
b1 = np.zeros((1, 4))
print(w1,'\n',b1)


[[ 0.00496714 -0.00138264  0.00647689  0.0152303 ]
 [-0.00234153 -0.00234137  0.01579213  0.00767435]] 
 [[0. 0. 0. 0.]]


In [5]:
# Hidden-layer pre-activation: affine map of every sample; b1 broadcasts across the rows.
z1 = np.matmul(X, w1) + b1
z1

array([[ 0.02205427, -0.0103294 ,  0.05514109,  0.08841861],
       [ 0.02376478, -0.02068294,  0.1195612 ,  0.14268152],
       [ 0.01724379, -0.02275887,  0.13555715,  0.14199696],
       [ 0.02082669, -0.01028701,  0.05542493,  0.08613998]])

In [6]:
def ReLU(z):
    """Rectified linear unit: element-wise ``max(0, z)``.

    Negative entries are replaced by 0; positive entries pass through unchanged.
    """
    rectified = np.maximum(0, z)
    return rectified


In [7]:
# Hidden-layer activations: ReLU zeroes every negative pre-activation in z1.
A = ReLU(z1)
A

array([[0.02205427, 0.        , 0.05514109, 0.08841861],
       [0.02376478, 0.        , 0.1195612 , 0.14268152],
       [0.01724379, 0.        , 0.13555715, 0.14199696],
       [0.02082669, 0.        , 0.05542493, 0.08613998]])

In [8]:
# Output layer: 4 hidden units -> 3 neurons, one per output class.
# Same initialisation scheme as the hidden layer: small Gaussian weights, zero biases.
w2 = 0.01 * np.random.randn(4, 3)
b2 = np.zeros((1, 3))
print(w2,'\n',b2)

[[-0.00469474  0.0054256  -0.00463418]
 [-0.0046573   0.00241962 -0.0191328 ]
 [-0.01724918 -0.00562288 -0.01012831]
 [ 0.00314247 -0.00908024 -0.01412304]] 
 [[0. 0. 0.]]


In [9]:
# Output-layer logits: one row per sample, one column per class.
z2 = np.matmul(A, w2) + b2
z2

array([[-0.00077682, -0.00099326, -0.00190943],
       [-0.00172553, -0.00183892, -0.00333618],
       [-0.00197298, -0.00195803, -0.0034583 ],
       [-0.00078312, -0.00098082, -0.00187443]])

In [10]:
# Softmax layer: row-wise max-shift for numerical stability, then clipping to [1e-9, 1 - 1e-9]
def softmax(z):
    """Row-wise softmax of a 2-D array of logits.

    Each row has its maximum subtracted before exponentiating, so the largest
    exponent is exp(0) = 1 and large logits cannot overflow. The normalised
    probabilities are then clipped to [1e-9, 1 - 1e-9], so no entry is
    exactly 0 or 1.

    Parameters
    ----------
    z : array of shape (n_samples, n_classes)

    Returns
    -------
    Array of the same shape holding the clipped class probabilities.
    """
    exps = np.exp(z - np.max(z, axis=1, keepdims=True))
    normalised = exps / np.sum(exps, axis=1, keepdims=True)
    return np.clip(normalised, 1e-9, 1 - 1e-9)

# Turn the output logits into per-sample class probabilities.
p = softmax(z2)


In [11]:
# Each row is one sample's distribution over the 3 classes.
p

array([[0.33348322, 0.33341105, 0.33310573],
       [0.33352486, 0.33348704, 0.3329881 ],
       [0.33349667, 0.33350165, 0.33300168],
       [0.33347655, 0.33341063, 0.33311282]])

In [12]:
# Loss - categorical (multi-class) cross-entropy
def cross_entropy_loss(y_true, y_pred):
    """Mean categorical cross-entropy for integer class labels.

    Parameters
    ----------
    y_true : int array of shape (n_samples,)
        Index of the correct class for each sample.
    y_pred : array of shape (n_samples, n_classes)
        Predicted class probabilities, one row per sample.

    Returns
    -------
    The average over samples of ``-log(probability of the true class)``.
    """
    rows = np.arange(y_true.shape[0])
    # Fancy indexing picks, for each row, the probability given to its true class.
    log_likelihood = np.log(y_pred[rows, y_true])
    return -np.mean(log_likelihood)

# Loss of the untrained network. The near-uniform probabilities in p put this close to ln(3) ~= 1.0986.
loss = cross_entropy_loss(y, p)
print(loss)


1.098526115817355


# Backpropagation

In [13]:
def make_ohe(y, num_classes=None):
    """One-hot encode a 1-D array of integer class labels.

    Parameters
    ----------
    y : int array of shape (n_samples,)
        Class index of each sample.
    num_classes : int, optional
        Width of the encoding. When omitted it is inferred as ``max(y) + 1``,
        which is only correct if the highest class actually occurs in ``y``.
        Pass it explicitly when encoding a batch that may not contain every class.

    Returns
    -------
    Float array of shape (n_samples, num_classes) with a single 1 per row.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``num_classes`` is not given, because the width
        cannot be inferred.
    """
    y = np.asarray(y)
    if num_classes is None:
        if y.size == 0:
            raise ValueError(
                "cannot infer num_classes from empty labels; pass num_classes explicitly"
            )
        num_classes = int(y.max()) + 1
    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(num_classes)[y]



In [14]:
# Combined gradient of cross-entropy loss + softmax with respect to the logits

def backpropLoss_softmax(y_true, probability_return_from_softmax):
    """Gradient of softmax + cross-entropy with respect to the logits.

    For a softmax output ``p`` and one-hot target ``t`` the derivative of the
    cross-entropy with respect to the logits simplifies to ``p - t``.

    Parameters
    ----------
    y_true : array
        Either integer class labels of shape (n_samples,) or an already
        one-hot encoded matrix of shape (n_samples, n_classes).
    probability_return_from_softmax : array of shape (n_samples, n_classes)
        Class probabilities produced by the softmax layer.

    Returns
    -------
    Array of shape (n_samples, n_classes) holding ``p - t`` for each sample.

    Notes
    -----
    The result is NOT divided by the number of samples, so gradients built
    from it correspond to the summed loss, whereas ``cross_entropy_loss``
    reports the mean. Divide by ``n_samples`` before using these gradients
    in an update step if the mean loss is what should be minimised.
    """
    probs = np.asarray(probability_return_from_softmax)
    labels = np.asarray(y_true)
    if labels.ndim == 1:
        # Take the width from the probability matrix, not from the largest
        # label in the batch, so a batch missing the highest class still
        # yields a correctly shaped target instead of a mis-broadcast one.
        y_ohe = np.eye(probs.shape[1])[labels]
    else:
        # Targets are already one-hot (previously this path hit an unbound y_ohe).
        y_ohe = labels
    return probs - y_ohe


In [15]:
# Backprop through loss + softmax: gradient of the loss w.r.t. the output logits z2.
dl_dzj = backpropLoss_softmax(y,p)

In [16]:
# One row per sample: softmax probabilities with 1 subtracted at the true-class column.
dl_dzj

array([[-0.66651678,  0.33341105,  0.33310573],
       [ 0.33352486, -0.66651296,  0.3329881 ],
       [ 0.33349667,  0.33350165, -0.66699832],
       [-0.66652345,  0.33341063,  0.33311282]])

In [17]:
# Output-layer parameter gradients, accumulated (summed) over the batch.
# Despite the dz_out_* names, these are the loss gradients w.r.t. w2 and b2.
dz_out_dw2 = np.matmul(A.T, dl_dzj)               # same shape as w2
dz_out_db2 = dl_dzj.sum(axis=0, keepdims=True)    # column sums, same shape as b2

In [18]:
# Gradient w.r.t. w2. A row is all zeros when its hidden unit is inactive (column of zeros in A) for every sample.
dz_out_dw2

array([[-0.01490413,  0.00420828,  0.01069585],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.01139001,  0.00238335, -0.01377336],
       [-0.02140346,  0.01045687,  0.01094659]])

In [19]:
# Gradient w.r.t. b2: the column sums of dl_dzj, kept 2-D to match b2's shape.
dz_out_db2

array([[-0.66601871,  0.33381037,  0.33220833]])

In [20]:
# derivative of Relu

def relu_derivative(z):
    """Element-wise derivative of ReLU evaluated at ``z``.

    Returns an array shaped and typed like ``z`` that is 1 where ``z > 0``
    and 0 everywhere else (including at exactly 0).
    """
    active = np.asarray(z > 0)
    return active.astype(z.dtype)


In [21]:
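# NOTE: abandoned attempt, kept for reference only. The ReLU derivative must be
# evaluated at the hidden pre-activation z1 (as done in the next cell), not applied
# to the layer-2 weight/bias gradients.
# dl_drelu_w = relu_derivative(dz_out_dw2)
# dl_drelu_b = relu_derivative(dz_out_db2)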

In [22]:
# Push the gradient back through the output weights, then through the ReLU.
dL_dA1 = np.matmul(dl_dzj, w2.T)          # gradient w.r.t. the hidden activations A
dL_dZ1 = relu_derivative(z1) * dL_dA1     # zeroed wherever z1 <= 0


In [23]:
# Gradient w.r.t. the hidden pre-activations; columns of units with z1 <= 0 are zeroed by the ReLU mask.
dL_dZ1

array([[ 0.00339441, -0.        ,  0.00624834, -0.00982643],
       [-0.00672517, -0.        , -0.00537792,  0.00239739],
       [ 0.00333475,  0.        , -0.00087222,  0.00743977],
       [ 0.00339441, -0.        ,  0.00624839, -0.00982655]])

In [24]:
# Local derivatives of the first layer, z1 = X @ w1 + b1:
#   d(z1)/d(w1) is the input X, and d(z1)/d(b1) is 1.
# These two names only record the calculus; the cells visible below do not reference them.
dz1_dw1=X
dz1_db1 = 1

In [25]:
# Forward pass was z1 = X @ w1 + b1, so by the chain rule
# dL/dw1 = X.T @ dL/dz1 and dL/db1 is dL/dz1 summed over the samples.
dL_dW1 = np.matmul(X.T, dL_dZ1)
dL_db1 = dL_dZ1.sum(axis=0, keepdims=True)


In [26]:
# Gradient w.r.t. w1, with the same (2, 4) shape as w1.
dL_dW1

array([[ 0.00787682,  0.        ,  0.01934324, -0.03461259],
       [-0.00175601,  0.        , -0.01238925,  0.02740953]])