In [1]:
## Load libraries
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [2]:
np.set_printoptions(precision=2)

In [3]:
import tensorflow as tf




In [4]:
tf.__version__

'2.15.0'

In [5]:
# Generate artificial data with 10 samples, 5 features per sample
# and 3 output classes
num_samples = 10 # number of samples
num_features = 5 # number of features (a.k.a. dimensionality)
num_labels = 3 # number of output labels
# Data matrix (each column = single sample)
X = np.random.choice(np.arange(3, 10), size = (num_features, num_samples), replace = True)
# Class labels
y = np.random.choice([0, 1, 2], size = num_samples, replace = True)
print(X)
print('------')
print(y)
print('------')
# One-hot encode class labels
y = tf.keras.utils.to_categorical(y)
print(y)

[[5 7 8 8 9 4 6 3 4 6]
 [3 6 7 9 4 8 4 4 4 5]
 [9 3 8 3 7 8 3 5 3 4]
 [8 8 4 9 8 8 6 3 7 9]
 [6 4 6 7 8 3 3 5 7 6]]
------
[1 1 0 2 1 2 0 1 2 2]
------
[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]]


---

A generic layer class with forward and backward methods

----

In [6]:
class Layer:
  def __init__(self):
    self.input = None
    self.output = None

  def forward(self, input):
    pass

  def backward(self, output_gradient, learning_rate):
    pass

---

The softmax classifier steps for a generic sample $\mathbf{x}$ with (one-hot encoded) true label $\mathbf{y}$ (3 possible categories) using a randomly initialized weights matrix (with bias abosrbed as its last last column):

1. Calculate raw scores vector for a generic sample $\mathbf{x}$  (bias feature added): $$\mathbf{z} = \mathbf{Wx}.$$
2. Calculate softmax probabilities (that is, softmax-activate the raw scores) $$\mathbf{a} = \text{softmax}(\mathbf{z})\Rightarrow\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}= \text{softmax}\left(\begin{bmatrix}z_0\\z_1\\z_2\end{bmatrix}\right)=\begin{bmatrix}\frac{e^{z_0}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_1}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_2}}{e^{z_0}+e^{z_1}+e^{z_2}}\end{bmatrix}$$
3. Softmax loss for this sample is (where output label $y$ is not yet one-hot encoded)
$$\begin{align*}L &=  -\log([\mathbf{a}]_y) \\&= -\log\left(\left[\text{softmax}(\mathbf{z})\right]_y\right)\\ &= -\log\left(\left[\text{softmax}(\mathbf{Wx})\right]_y\right).\end{align*}$$
4. Predicted probability vector that the sample belongs to each one of the output categories is given a new name $$\hat{\mathbf{y}} = \mathbf{a}.$$
5. One-hot encoding the output label $$\underbrace{y\rightarrow\mathbf{y}}_{\text{e.g.}\,2\,\rightarrow\begin{bmatrix}0\\0\\1\end{bmatrix}}$$ results in the following representation for the softmax loss for the sample which is also referred to as the categorical crossentropy (CCE) loss:
$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\end{align*}.$$
5. Calculate the gradient of the loss for the sample w.r.t. weights by following the computation graph from top to bottom (that is, backward):
$$\begin{align*} L\\{\color{yellow}\downarrow}\\ \hat{\mathbf{y}} &= \mathbf{a}\\{\color{yellow}\downarrow}\\\mathbf{z}\\{\color{yellow}\downarrow}\\\mathbf{W}\end{align*}$$
$$\begin{align*}\Rightarrow \nabla_\mathbf{W}(L) &= \nabla_\mathbf{W}(\mathbf{z}) \times\nabla_\mathbf{z}(\hat{\mathbf{y}})\times\nabla_{\hat{\mathbf{y}}}(L)\\&= \underbrace{\nabla_\mathbf{W}(\mathbf{z})}_\text{first term} \times\underbrace{\nabla_\mathbf{z}(\mathbf{a})}_\text{second to last term}\times\underbrace{\nabla_\hat{\mathbf{y}}(L)}_\text{last term}.\end{align*}$$
7. Now focus on the last term $\nabla_\hat{\mathbf{y}}(L)$:
$$\begin{align*}\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$
8. Now focus on the second to last term $\nabla_\mathbf{z}(\mathbf{a})$:
$$\begin{align*}\nabla_\mathbf{z}(\mathbf{a}) &= \nabla_\mathbf{z}\left(\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}\right)\\ &= \begin{bmatrix}\nabla_\mathbf{z}(a_0)&\nabla_\mathbf{z}(a_1)&\nabla_\mathbf{z}(a_2)\end{bmatrix} \\&= \begin{bmatrix}\nabla_{z_0}(a_0)&\nabla_{z_0}(a_1)&\nabla_{z_0}(a_2)\\\nabla_{z_1}(a_0)&\nabla_{z_1}(a_1)&\nabla_{z_1}(a_2)\\\nabla_{z_2}(a_0)&\nabla_{z_2}(a_1)&\nabla_{z_2}(a_2)\end{bmatrix}\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_2a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}.\end{align*}$$
9. Now focus on the last term $\nabla_\mathbf{W}(\mathbf{z}) = \nabla_\mathbf{W}(\mathbf{Wx})$:

![](https://onedrive.live.com/embed?resid=37720F927B6DDC34%21103155&authkey=%21AMH79mXBdb_raAA&width=660)

The full gradient can be written as $\nabla_\mathbf{W}(L)=$

![](https://onedrive.live.com/embed?resid=37720F927B6DDC34%21103156&authkey=%21AIdyOQ3a-er-7-A&width=660)

$$\begin{align*}=\begin{bmatrix}a_1(1-a_1)&-a_2a_1&-a_3a_1\\-a_1a_2&a_2(1-a_2)&-a_3a_2\\-a_1a_3&-a_2a_3&a_3(1-a_3)\end{bmatrix}\times\begin{bmatrix}-y_1/\hat{y}_1\\-y_2/\hat{y}_2\\-y_3/\hat{y}_3\end{bmatrix}\mathbf{x}^\mathrm{T}.\end{align*}$$


---

---

CCE loss and its gradient

$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\\\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$


---

In [7]:
## Define the loss function and its gradient
def cce(y, yhat):
  return(-np.sum(y*np.log(yhat)))

def cce_gradient(y, yhat):
  return(-y/yhat)

# TensorFlow in-built function for categorical crossentropy loss
#cce = tf.keras.losses.CategoricalCrossentropy()

---

Softmax activation layer class
$$\begin{align*}\text{forward:}\ \mathbf{a} &=\text{softmax}(\mathbf{z}),\\\text{backward:}\ \nabla_\mathbf{z}(L) &= \nabla_{\mathbf{z}}(\mathbf{a})\times\nabla_{\mathbf{a}}(L) = \nabla_{\mathbf{z}}(\mathbf{a})\times\nabla_{\hat{\mathbf{y}}}(L)\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_2a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}\begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$


---

In [8]:
## Softmax activation layer class
class Softmax(Layer):
  def forward(self, input):
    self.output = tf.nn.softmax(input).numpy()

  def backward(self, output_gradient, learning_rate = None):
    return(np.dot((np.identity(np.size(self.output))-self.output.T) * self.output, output_gradient))

---

Dense layer class

$$\begin{align*}\text{forward:}\ \mathbf{z}&=\mathbf{Wx}\\\text{backward:}\ \nabla_\mathbf{W}(L)&=\nabla_{\mathbf{W}}(\mathbf{z})\times\nabla_{\mathbf{z}}(L)\\&=\nabla_{\mathbf{z}}(L)\mathbf{x}^\mathrm{T}.\end{align*}$$

---

In [9]:
## Dense layer class
class Dense(Layer):
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(output_size, input_size+1) # bias trick
        self.weights[:, -1] = 0.01 # Set all bias values to the same nonzero constant

    def forward(self, input):
        self.input = np.append(input, 1) # bias trick
        self.output= np.dot(self.weights, self.input)

    def backward(self, output_gradient, learning_rate):
        weights_gradient = np.dot(output_gradient.reshape(-1, 1), self.input.reshape(-1, 1).T)
        input_gradient = np.dot(self.weights.T, output_gradient)
        self.weights = self.weights + learning_rate * (-weights_gradient)
        return input_gradient

In [10]:
dlayer = Dense(num_features, num_labels)
print(dlayer.weights)
dlayer.forward(X[:, 0])
print('-------')
print(dlayer.input)
print('-------')
print(dlayer.output)

[[ 1.06 -0.49  1.1  -1.21  1.83  0.01]
 [-0.36 -1.    0.64 -1.15 -0.1   0.01]
 [-0.03  0.78 -1.17 -0.62 -0.74  0.01]]
-------
[5 3 9 8 6 1]
-------
[ 15.    -8.77 -17.76]


In [11]:
softmax = Softmax() # define softmax activation
softmax.forward(dlayer.output) # Softmax activate
print(softmax.output)
print(y[0, :])

[1.00e+00 4.74e-11 5.90e-15]
[0. 1. 0.]


In [12]:
# Forward and backward propagation for the 0th sample
learning_rate = 1e-02 # learning rate
dlayer = Dense(num_features, num_labels) # define dense layer
print(dlayer.weights)
dlayer.forward(X[:, 0]) # forward prop
softmax = Softmax() # define softmax activation
softmax.forward(dlayer.output) # Softmax activate
loss = cce(y[0, :], softmax.output) # forward prop is over

grad = cce_gradient(y[0, :], softmax.output)
grad = softmax.backward(grad)
grad = dlayer.backward(grad, learning_rate)
print('-----')
print(dlayer.weights)

[[ 2.23 -0.12  1.25  0.33 -0.11  0.01]
 [ 0.74  0.53  0.4  -0.51  0.73  0.01]
 [-0.79 -0.39  0.68 -0.11 -0.48  0.01]]
-----
[[ 2.23 -0.12  1.25  0.33 -0.11  0.01]
 [ 0.79  0.56  0.49 -0.43  0.79  0.02]
 [-0.79 -0.39  0.68 -0.11 -0.48  0.01]]


In [13]:
## Train the 0-layer neural network using batch training with batch size = 1

# Steps: run over each sample, calculate loss, gradient of loss,
# and update weights.

learning_rate = 1e-02 # learning rate
loss = 0 # initialize loss
dlayer = Dense(num_features, num_labels) # define dense layer
softmax = Softmax() # define softmax activation layer

for i in range(X.shape[1]):
  dlayer.forward(X[:, i]) # forward prop
  softmax.forward(dlayer.output) # Softmax activate
  loss += cce(y[i, :], softmax.output) # calculate loss for sample
  # Backward prop starts here
  grad = cce_gradient(y[i, :], softmax.output)
  grad = softmax.backward(output_gradient = grad)
  grad = dlayer.backward(grad, learning_rate)

print(loss/X.shape[1])

7.553806535569952
