In [1]:
import numpy as np
from utils import load_cifar10
import matplotlib.pyplot as plt

In [2]:
def softmax(S):
    S -= np.max(S, axis=1, keepdims=True) #subtract max from each example for numerical stability
    S_exp   = np.exp(S)
    softmax = S_exp/(np.sum(S_exp, axis=1, keepdims=True))
    return softmax

In [3]:
X_train, y_train, X_test, y_test = load_cifar10('/home/ashwin/Resources/datasets/cifar-10-batches-py/')

In [4]:
X_train = X_train.reshape(X_train.shape[0], -1)
X_test  = X_test.reshape(X_test.shape[0], -1)

In [5]:
# normalizing image data
img_mean = X_train.mean(axis=0)
img_stdv = X_train.std(axis=0)

X_train  = (X_train - img_mean[:,])/img_stdv[:,]
X_test   = (X_test  - img_mean[:,])/img_stdv[:,]

In [6]:
print('shape of X_train:', X_train.shape, '& shape of y_train:', y_train.shape)

shape of X_train: (50000, 3072) & shape of y_train: (50000,)


In [7]:
n, m = X_train.shape # number of examples, feature dimensions

# Forward

## Hidden layer

In [8]:
h = 20 # nummber of hidden units i.e. Output features for hidden layer

In [9]:
# Initialize weights
W1 = np.random.rand(h, m)*0.01 # (Output features, Input features)
b1 = np.zeros(h)
print('shape of W1:', W1.shape, '& shape of b1:', b1.shape)

shape of W1: (20, 3072) & shape of b1: (20,)


In [10]:
# hidden layer scores
S1 = X_train@W1.T + b1

# Activations (ReLU)
A1 = np.maximum(S1, 0)
print('shape of activation/matrix A1:', A1.shape)

shape of activation/matrix A1: (50000, 20)


## Output layer

In [11]:
c = 10 # number of classes i.e. Output features for output layer

In [12]:
# Initialize weights
W2 = np.random.rand(c, h)*0.01 # (Output features, Input features)
b2 = np.zeros(c)
print('shape of W2:', W2.shape, '& shape of b2:', b2.shape)

shape of W2: (10, 20) & shape of b2: (10,)


In [13]:
# Model scores
S2 = A1@W2.T + b2

# Output Activations (Softmax)
A2 = softmax(S2)
print('shape of activation/probability matrix A2:', A2.shape)

shape of activation/probability matrix A2: (50000, 10)


## Loss

In [14]:
# Calculate Cross Entropy loss
loss = -np.sum(np.log(A2[np.arange(n), y_train]))
print('loss:',loss)

loss: 115358.86242198483


# Backward

In [15]:
# d_loss/d_score

dS2 = A2 
dS2[np.arange(n), y_train] -= 1

print('shape of dL/dS2:', dS2.shape)

shape of dL/dS2: (50000, 10)


In [16]:
# calculate weight gradients

dA1  = dS2@W2
dW2  = dS2.T@A1
db2  = dS2.sum(axis=0)

print('shape of dL/dA1:', dA1.shape)
print('shape of dL/dW2:', dW2.shape)
print('shape of dL/db2:', db2.shape)

shape of dL/dA1: (50000, 20)
shape of dL/dW2: (10, 20)
shape of dL/db2: (10,)


In [17]:
# derivative for ReLU activation

dS1 = dA1
dS1[S1<0] = 0

print('shape of dL/dS1:', dS1.shape)

shape of dL/dS1: (50000, 20)


In [18]:
# calculate weight gradients

dW1  = dS1.T@X_train
db1  = dS1.sum(axis=0)

print('shape of dL/dW1:', dW1.shape)
print('shape of dL/db1:', db1.shape)

shape of dL/dW1: (20, 3072)
shape of dL/db1: (20,)


# Weight Update

In [19]:
# update weights with learning rate lr
lr = 0.01

W1 -= lr*dW1
W2 -= lr*dW2
b1 -= lr*db1
b2 -= lr*db2

# Training loop

In [20]:
# Initialize weights

n, m = X_train.shape # number of examples, feature dimensions
h = 20 # nummber of hidden units i.e. Output features for hidden layer
c = 10 # number of classes i.e. Output features for output layer

W1 = np.random.rand(h, m)*0.01 # (Output features, Input features)
b1 = np.zeros(h)

W2 = np.random.rand(c, h)*0.01 # (Output features, Input features)
b2 = np.zeros(c)

In [21]:
# Define learning rate lr and number of epochs
lr = 1e-6
epochs = 200

In [22]:
for epoch in range(epochs):


    # hidden layer activations
    S1 = X_train@W1.T + b1
    A1 = np.maximum(S1, 0)

    # Output layer activations
    S2 = A1@W2.T + b2
    A2 = softmax(S2)

    # Cross Entropy loss
    loss = -np.sum(np.log(A2[np.arange(n), y_train]))
    if ((epoch+1) % 50 == 0) or (epoch==0):
        print(f'Epoch {epoch+1}: Loss {np.round(loss, 2)}')
        y_pred = np.argmax(A2, axis=1)
        print(f'Train Accuracy: {np.round(np.mean(y_pred == y_train), 4)}\n')
    
    # d_loss/d_score
    dS2 = A2 
    dS2[np.arange(n), y_train] -= 1

    # Output layer gradients
    dA1  = dS2@W2
    dW2  = dS2.T@A1
    db2  = dS2.sum(axis=0)

    # derivative for ReLU activation
    dS1 = dA1
    dS1[S1<0] = 0

    # hidden layer gradients
    dW1  = dS1.T@X_train
    db1  = dS1.sum(axis=0)

    # update weights with learning rate lr
    W1 -= lr*dW1
    W2 -= lr*dW2
    b1 -= lr*db1
    b2 -= lr*db2

Epoch 1: Loss 115487.72
Train Accuracy: 0.0703

Epoch 50: Loss 111416.23
Train Accuracy: 0.1814

Epoch 100: Loss 102050.41
Train Accuracy: 0.2544

Epoch 150: Loss 95928.4
Train Accuracy: 0.3087

Epoch 200: Loss 91613.37
Train Accuracy: 0.3444

