In [1]:
import numpy as np
from utils import load_cifar10
import matplotlib.pyplot as plt

In [2]:
def softmax(S):
    S -= np.max(S, axis=1, keepdims=True) #subtract max from each example for numerical stability
    S_exp   = np.exp(S)
    softmax = S_exp/(np.sum(S_exp, axis=1, keepdims=True))
    return softmax

In [3]:
X_train, y_train, X_test, y_test = load_cifar10('/home/ashwin/Resources/datasets/cifar-10-batches-py/')

In [4]:
X_train = X_train.reshape(X_train.shape[0], -1)
X_test  = X_test.reshape(X_test.shape[0], -1)

In [5]:
# normalizing image data
img_mean = X_train.mean(axis=0)
img_stdv = X_train.std(axis=0)

X_train  = (X_train - img_mean[:,])/img_stdv[:,]
X_test   = (X_test  - img_mean[:,])/img_stdv[:,]

In [6]:
print('shape of X_train:', X_train.shape, '& shape of y_train:', y_train.shape)

shape of X_train: (50000, 3072) & shape of y_train: (50000,)


In [7]:
n, m = X_train.shape # number of examples, feature dimensions

# Forward

## Hidden layer

In [8]:
h = 20 # nummber of hidden units / Output features for hidden layer

In [9]:
W1 = np.random.rand(h, m)*0.01 # (Output features, Input features)
b1 = np.zeros(h)
print('shape of W:', W1.shape, '& shape of b:', b1.shape)

shape of W: (20, 3072) & shape of b: (20,)


In [10]:
# Compute Model scores
S1 = X_train@W1.T + b1
print('shape of score matrix S:', S1.shape)

shape of score matrix S: (50000, 20)


In [11]:
# Non-Linearity - ReLU
A1 = np.maximum(S1, 0)
print('shape of activation/matrix A:', A1.shape)

shape of activation/matrix A: (50000, 20)


## Output layer

In [12]:
c = 10 # number of classes / Output features for output layer

In [13]:
W2 = np.random.rand(c, h)*0.01 # (Output features, Input features)
b2 = np.zeros(c)
print('shape of W:', W2.shape, '& shape of b:', b2.shape)

shape of W: (10, 20) & shape of b: (10,)


In [14]:
# Compute Model scores
S2 = A1@W2.T + b2
print('shape of score matrix S:', S2.shape)

shape of score matrix S: (50000, 10)


In [15]:
# Compute softmax from class scores
A2 = softmax(S2)
print('shape of activation/probability matrix A:', A2.shape)

shape of activation/probability matrix A: (50000, 10)


## Loss

In [16]:
# Calculate Cross Entropy loss
loss = -np.sum(np.log(A2[np.arange(n), y_train]))
print('loss:',loss)

loss: 115641.81329195091


# Backward

In [17]:
# d_loss/d_score

dS2 = A2 
dS2[np.arange(n), y_train] -= 1

print('shape of dL/dS:', dS2.shape)

shape of dL/dS: (50000, 10)


In [18]:
# calculate weight gradients

dA1  = dS2@W2
dW2  = dS2.T@A1
db2  = dS2.sum(axis=0)

print('shape of dL/dA:', dA1.shape)
print('shape of dL/dW:', dW2.shape)
print('shape of dL/db:', db2.shape)

shape of dL/dA: (50000, 20)
shape of dL/dW: (10, 20)
shape of dL/db: (10,)


In [19]:
# derivative for ReLU axtivation

dS1 = dA1
dS1[S1<0] = 0

print('shape of dL/dS:', dS1.shape)

shape of dL/dS: (50000, 20)


In [20]:
# calculate weight gradients

dW1  = dS1.T@X_train
db1  = dS1.sum(axis=0)

print('shape of dL/dW:', dW1.shape)
print('shape of dL/db:', db1.shape)

shape of dL/dW: (20, 3072)
shape of dL/db: (20,)


# Weight Update

In [21]:
# update weights with learning rate lr
lr = 0.01

W1 -= lr*dW1
W2 -= lr*dW2
b1 -= lr*db1
b2 -= lr*db2