In [1]:
import numpy as np
import pandas as pd

In [2]:
def relu(x):
    return np.maximum(0,x)

def softmax(x):
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis= 1, keepdims = True)

In [3]:
def forward(X, W1, W2):
    z1 = np.dot(X,W1)
    a1 = relu(z1)
    z2 = np.dot(a1, W2)
    y_hat = softmax(z2)
    return y_hat, a1, z1

In [4]:
def compute_loss(y_hat, y):
    loss = -np.mean(y * np.log(y_hat))
    return loss

In [5]:
def backprop(X, y, y_hat, a1, z1, W1, W2):
    m = X.shape[0]
    dL_dz2 = (y_hat - y) / m
    dL_dW2 = np.dot(a1.T, dL_dz2)
    dL_da1 = np.dot(dL_dz2, W2.T)
    dL_dz1 = dL_da1 * (z1 > 0)
    dL_dW1 = np.dot(X.T, dL_dz1)
    return dL_dW1, dL_dW2

In [6]:
import random

input_size = 10
hidden_size = 100
output_size = 3
learning_rate = 0.01

W1 = np.random.randn(input_size, hidden_size)
W2 = np.random.randn(hidden_size, output_size)
print('Weights W1')
print(W1)
print('Weights W2')
print(W2)

X_train = np.random.randn(1000, 10)
y_train = np.random.randint(0,3, size=(1000,))
print('X_train')
print(X_train)
print('y_train')
print(y_train)

num_classes =3
y_train_one_hot = np.eye(num_classes)[y_train]
print('y_train_one_hot')
print(y_train_one_hot)

Weights W1
[[-3.38853529e-01 -1.04472832e+00 -1.32055072e+00 -3.67832114e-01
  -1.45742861e+00 -1.54058500e+00 -1.32278468e+00  7.27439227e-01
   8.02049632e-01  3.36085321e-01  8.02217292e-01  8.19502319e-01
   2.43160350e-01 -7.21816019e-01 -3.76734971e-01 -5.41142813e-01
   5.09073061e-01 -8.30618588e-01 -3.47997029e-01  1.83887664e+00
   4.25916975e-01  1.54000551e-01  2.39081305e-01 -1.57086517e-01
  -5.76419565e-01 -5.22933320e-01  8.59606551e-01 -3.83681653e-01
   9.40710578e-03  2.47354552e-01  7.29072920e-01 -6.08498935e-01
   1.23239312e+00 -8.98627161e-03  1.71623837e+00 -4.20315899e-01
   4.32617968e-01 -7.71983290e-02 -1.97259097e+00  1.29550393e-02
   7.96925236e-01  7.61499827e-01 -1.58846304e+00  1.06909611e+00
   8.08225796e-01 -5.73363073e-02 -1.34636257e+00 -6.31623132e-02
   3.35716386e-02 -9.23080202e-01 -9.47801748e-01  5.45643852e-01
   1.76432779e-01 -9.37712982e-01  6.05227113e-01  8.74568253e-01
   3.15416273e-01 -5.96867802e-01 -5.24313393e-01  1.64571336e+00

In [7]:
for epoch in range (1000):
    y_hat, a1, z1 = forward(X_train, W1, W2)
    
    loss = compute_loss(y_hat, y_train_one_hot)
    
    if epoch % 50 == 0:
        print('Epoch: ', epoch, 'Loss: ', loss)
        
    dL_dw1, dL_dw2 = backprop(X_train, y_train_one_hot, y_hat, a1, z1, W1, W2)
    
    W1 -= learning_rate * dL_dw1
    W2 -= learning_rate * dL_dw2
    
    predictions = np.argmax(y_hat, axis = 1)

Epoch:  0 Loss:  8.429285478196304
Epoch:  50 Loss:  4.119135621491082
Epoch:  100 Loss:  2.421155678738757
Epoch:  150 Loss:  1.894750127265348
Epoch:  200 Loss:  1.657120659457068
Epoch:  250 Loss:  1.4751110045467328
Epoch:  300 Loss:  1.3219337445731951
Epoch:  350 Loss:  1.192234461642787
Epoch:  400 Loss:  1.082839827801394
Epoch:  450 Loss:  0.989238025237132
Epoch:  500 Loss:  0.9096524359051346
Epoch:  550 Loss:  0.8427186584325439
Epoch:  600 Loss:  0.7873034935674147
Epoch:  650 Loss:  0.7409637099758205
Epoch:  700 Loss:  0.7018770326975837
Epoch:  750 Loss:  0.6688309094992521
Epoch:  800 Loss:  0.6404920009819802
Epoch:  850 Loss:  0.6162560884251271
Epoch:  900 Loss:  0.5954128296920281
Epoch:  950 Loss:  0.5774216089210659
