# Dev notebook

### Getting started on the feed forward neural net

In [None]:
import nnfs 
import numpy as np

import matplotlib.pyplot as plt
from nnfs.datasets import spiral_data

nnfs.init()

In [None]:
# Generate the 3-class spiral toy dataset: X holds the 2-D points, y the class labels.
X, y = spiral_data(samples=100, classes=3)

# Colour every point by its class label; the figure carries a title and axis
# labels so it can be read on its own, and the trailing ';' hides the repr.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='brg')
ax.set(title='Spiral dataset (3 classes)', xlabel='X[:, 0]', ylabel='X[:, 1]');

####  Defining layers 

Notes:
- I have chosen to use a naming convention similar to the one used by PyTorch (why reinvent the wheel?). This has the benefit that when we compare implementations, the architecture of the networks is the same.
- Since we intend to use ReLU as one of our activation functions, we will use the He weight initialization method described in https://arxiv.org/abs/1502.01852.

In [None]:
class LinearLayer():
    """Linear transformation layer of the type o = i·W + b,

    where i is the incoming batch of inputs, W is the layer's weight matrix, b is the
    bias vector and o is the dot product of i and W plus the bias.

    Weights are drawn from a normal distribution with mean 0 and standard deviation
    sqrt(2 / in_features) (He initialization); the bias starts at zero.

    Args:
        in_features (int): the size of the input features
        out_features (int): the size of the output features

    Attributes:
        weights (np_array) numpy array of in_features x out_features
        bias    (np_array) numpy array of 1 x out_features
        inputs  (np_array) numpy array of latest batch of inputs (None before forward)
        d_w     (np_array) The current gradients with respect to the weights
        d_x     (np_array) The current gradients with respect to the inputs
        d_b     (np_array) The current gradients with respect to the biases
    """

    def __init__(self, in_features, out_features) -> None:
        # initializing weights and biases
        self.weights = np.random.normal(0.0, np.sqrt(2/in_features), (in_features, out_features))
        self.bias = np.zeros((1, out_features))
        # initializing attributes needed for backwards
        self.inputs = None
        # gradients only exist after backward() has run
        self.d_w = None
        self.d_x = None
        self.d_b = None

    def forward(self, inputs):
        """Computes inputs·W + b for a batch of inputs.

        Args:
            inputs (array-like) batch of samples, one row per sample.

        Returns:
            np_array: the layer output, one row per sample.
        """
        # Saving inputs for backward step; converted to an array so that
        # backward can transpose them even when a plain list was passed in
        self.inputs = np.asarray(inputs)
        return np.dot(self.inputs, self.weights) + self.bias

    def backward(self, d_vals):
        """Backpropagation  of the linear function

        Stores the gradients in d_w, d_x and d_b.

        Args:
            d_vals (np_array) array of derivatives from the previous layer/function.

        Returns:
            np_array: d_x, the gradients with respect to the inputs, so the
            result can be handed straight to the preceding layer.

        Raises:
            RuntimeError: if called before forward().
        """
        if self.inputs is None:
            raise RuntimeError("LinearLayer.backward() called before forward()")

        d_vals = np.asarray(d_vals)
        self.d_w = np.dot(self.inputs.T, d_vals)
        self.d_x = np.dot(d_vals, self.weights.T)
        # every sample contributes its gradient to the shared bias row
        self.d_b = np.sum(d_vals, axis=0, keepdims=True)
        return self.d_x


        

#### Testing Linear Layer

In [None]:
# Smoke test: push the spiral data through a 2 -> 3 linear layer.
linear1 = LinearLayer(2, 3)
linear1_out = linear1.forward(X)

# one output row per sample, one column per output feature
assert linear1_out.shape == (len(X), 3)

# show only the first rows instead of dumping the whole array
linear1_out[:5]

#### Activation Functions - ReLu

$$y = \begin{cases}
   x & x > 0 \\
   0 & \text{otherwise}
\end{cases} $$

In [None]:
class ReLU:
    """Applies Rectified linear Unit function to vector.

    Attributes:
        inputs (np_array) the inputs of the latest forward call (None before forward)
        d_relu (np_array) the gradients produced by the latest backward call
    """
    def __init__(self) -> None:
        # initializing attributes needed for backwards
        self.inputs = None
        self.d_relu = None

    def forward(self, x):
        """Returns max(x, 0) element-wise.

        Args:
            x (array-like) the values to activate.
        """
        # storing inputs needed for backwards; kept as an array so the
        # `inputs <= 0` mask in backward also works when a list was passed
        self.inputs = np.asarray(x)
        return np.maximum(self.inputs, 0)

    def backward(self, d_vals):
        """Backpropagation of ReLU.

        Passes the incoming gradient through where the input was positive and
        zeroes it everywhere else.

        Args:
            d_vals (array-like) derivatives from the previous layer/function;
                it is copied, never modified.

        Returns:
            np_array: d_relu, the gradients with respect to the inputs.

        Raises:
            RuntimeError: if called before forward().
        """
        if self.inputs is None:
            raise RuntimeError("ReLU.backward() called before forward()")

        # np.array copies, so the caller's gradient array is left untouched
        self.d_relu = np.array(d_vals)
        self.d_relu[self.inputs <= 0] = 0
        return self.d_relu

#### Testing ReLU


In [None]:
# Sample values covering negative, zero and positive inputs
activator = ReLU()
i = [-2, 3, 4, 0, 0.1, -44]

# negatives should come back as 0, everything else unchanged
activator.forward(x=i)

#### Activation Functions -Softmax

$$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_{j}\exp(x_j)}$$

The softmax output represents the confidence score for each output class; the scores of a sample add up to 1.

In [42]:
class Softmax:
    """Applies Softmax function to input matrix.

    Attributes:
        confidence_scores (np_array) output of the latest forward call
        d_soft (np_array) gradients produced by the latest backward /
            combo_backward call
    """

    def __init__(self) -> None:
        self.confidence_scores = None
        self.d_soft = None

    def forward(self, x):
        """Returns the row-wise softmax of x.

        Args:
            x (array-like) 2-D batch of scores, one row per sample.
        """
        x = np.asarray(x)
        # exponents of each value; the row maximum is subtracted first so the
        # largest exponent is exp(0) = 1 and np.exp cannot overflow
        exp_vals = np.exp(x - np.max(x, axis=1, keepdims=True))
        exp_sum = np.sum(exp_vals, axis=1, keepdims=True)
        # Normalization to get the probabilities
        self.confidence_scores = exp_vals/exp_sum
        return self.confidence_scores

    def backward(self, d_vals):
        """Backpropagation of the softmax function.

        Args:
            d_vals (array-like) derivatives from the next function, one row per
                sample of the latest forward call.

        Returns:
            np_array: d_soft, the gradients with respect to the inputs.

        Raises:
            RuntimeError: if called before forward().
        """
        if self.confidence_scores is None:
            raise RuntimeError("Softmax.backward() called before forward()")

        d_vals = np.asarray(d_vals)
        # Initialize array for gradients wrt to inputs; the dtype is promoted so
        # integer d_vals do not truncate the (fractional) gradients
        self.d_soft = np.zeros_like(
            d_vals, dtype=np.result_type(d_vals, self.confidence_scores)
        )

        # enumerate(zip(...)) yields (index, (scores, grads)) pairs, hence the
        # nested unpacking
        for i, (conf_score, d_val) in enumerate(zip(self.confidence_scores, d_vals)):
            # Turn the confidence scores of this sample into a column vector
            cs = conf_score.reshape(-1, 1)
            # Find the Jacobian matrix of the output: diag(S) - S·S^T
            j_matrix = np.diagflat(cs) - np.dot(cs, cs.T)
            # get the gradient
            self.d_soft[i] = np.dot(j_matrix, d_val)
        return self.d_soft

    def combo_backward(self, y_pred, y_true):
        """Does the combined backward pass for CCE & Softmax as a single, faster step.

        Args:
            y_pred (array-like) matrix of softmax outputs, one row per sample.
            y_true (array-like) either a one-hot encoded matrix or a 1-D array
                of integer class indices.

        Returns:
            np_array: d_soft, (y_pred - one_hot(y_true)) / number of samples.
        """
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true)
        n = len(y_pred)

        # Getting discrete vals from one hot encoding (1-D labels are already discrete)
        if y_true.ndim == 2:
            y_true = np.argmax(y_true, axis=1)

        # work on a copy so the caller's predictions are not modified
        self.d_soft = y_pred.copy()
        self.d_soft[np.arange(n), y_true] -= 1
        self.d_soft = self.d_soft/n
        return self.d_soft

#### Testing Softmax

In [46]:
# A single row of scores dominated by its last entry:
# the last confidence score should come out as (almost) 1
softmax = Softmax()
softmax.forward(x=[[1, 2, 44]])

array([[2.11513104e-19, 5.74952226e-19, 1.00000000e+00]])

#### Calculating Loss - Categorical Cross-Entropy

$$ L_i = -\sum_j y_{i,j}\log(\hat{y}_{i,j}) $$

Taking the one-hot encoding into account, we can simplify this down to:

$$ L_i = -y_{i,k}\log(\hat{y}_{i,k}) $$

where $k$ is the index of the correct class

In [None]:
class CategoricalCrossEntropyLoss:
    """Calculates the CCE loss for a given set of predictions.
    Both methods expect a softmax output and a one-hot encoded label mask.

    They are static methods, so they can be called on the class itself or on
    an instance.

    y_pred (np_array): matrix of confidence scores of the prediction
    y_true (np_array): matrix of one-hot encoded true labels of the classes
    """

    # Confidence scores are clipped to [_CLIP, 1 - _CLIP] so that neither
    # log(0) in forward nor a division by 0 in backward can occur
    _CLIP = 1e-7

    @staticmethod
    def forward(y_pred, y_true):
        """Returns the mean CCE loss over the batch.

        Args:
            y_pred (np_array) array of predictions.
            y_true (np_array) array of correct labels (one-hot encoded).
        """
        clip = CategoricalCrossEntropyLoss._CLIP
        # Clipping and applying one hot encoded labels as mask
        # to zero out scores corresponding to incorrect classes
        # We clip to make sure that none of the remaining scores are 0 or
        # exactly 1
        corrected = np.sum(np.clip(y_pred, clip, 1 - clip)*np.asarray(y_true), axis=1)
        # Taking the -ve log of the remaining confidence scores
        negative_log = -np.log(corrected)
        return np.mean(negative_log)

    @staticmethod
    def backward(y_pred, y_true):
        """Backpropagation  of the CCE Loss

        Args:
            y_pred (np_array) array of predictions.
            y_true (np_array) array of correct labels.

        Returns:
            np_array: -y_true / y_pred, divided by the number of samples.
        """
        clip = CategoricalCrossEntropyLoss._CLIP
        # same clipping as in forward: a score of exactly 0 would otherwise
        # produce 0/0 = nan (or a division by zero) in the gradient
        y_pred = np.clip(y_pred, clip, 1 - clip)
        y_true = np.asarray(y_true)
        return (-y_true/y_pred)/len(y_pred)

#### Testing CCE Loss

In [None]:
# Three sample predictions together with their one-hot encoded targets
y_pred = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])
y_true = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
])

# the loss is used through the class itself; no instance is created
loss_function = CategoricalCrossEntropyLoss
loss_function.forward(y_pred, y_true)

#### One-hot encoding function 

In [None]:
def one_hot_encode_index(y, n):
    """One-hot encodes class indices.

    Args:
        y (np_array) integer class indices.
        n (int) the number of classes.

    Returns:
        np_array: for every index in y, the matching row of the n x n identity matrix.
    """
    # row k of the identity matrix is the one-hot vector of class k
    identity = np.eye(n)
    return identity[y]

#### Testing one hot masker

In [None]:
# Five labels drawn from three classes -> expect a 5 x 3 one-hot matrix
n = 3
y_test = np.array([0, 1, 2, 1, 2])

one_hot_encode_index(y=y_test, n=n)

#### Integration Testing 

In [None]:
# The activations keep state between forward and backward (inputs / scores),
# so they have to be instances: calling forward on the bare class would bind
# the data to `self` and fail with a missing-argument TypeError.
relu = ReLU()
softmax = Softmax()
# The loss is used through the class: cce_loss.forward(y_pred, y_true)
cce_loss = CategoricalCrossEntropyLoss

linear1 = LinearLayer(2, 3)
linear2 = LinearLayer(3, 3)

# forward pass: linear -> ReLU -> linear -> softmax
out1 = relu.forward(linear1.forward(X))
out2 = softmax.forward(linear2.forward(out1))

out2[:10]

In [None]:
# Loss of the forward pass above against the one-hot encoded labels
y_one_hot = one_hot_encode_index(y, 3)
cce_loss.forward(out2, y_one_hot)

#### Backpropagation - simplified 

Backpropagation through ReLU, based on the example in NNFS to ensure a solid understanding of the underlying math (partial diff and chain rule) and mechanisms 

In [None]:
# simulating a forward pass of a single neuron with three inputs

x = [1.0, -2.0, 3.0]
w = [-3.0, -1.0, 2.0]
b = 1.0

# Multiplying inputs by weights
xw0 = x[0] * w[0]
xw1 = x[1] * w[1]
xw2 = x[2] * w[2]

# Summing weighted inputs and bias
z = xw0 + xw1 + xw2 + b

# applying relu
# (stored as relu_out rather than y, so the dataset labels `y` loaded at the
#  top of the notebook are not overwritten and earlier cells stay re-runnable)
relu_out = max(z, 0)


If we represent the forward pass as a function we can say:

$$\text{ReLU}\left(\sum[\text{inputs}\cdotp\text{weights}]+\text{bias}\right)$$

We now need to find the partial derivatives of the function with respect to all of its parameters. For example, if we wanted to know the effect that $x_0$ had on the outcome, we would need to know:

$$ \frac{\partial}{\partial x_0}\left[\text{ReLU}\left(\sum[\text{inputs}\cdotp\text{weights}]+\text{bias}\right)\right] = \frac{d \text{ReLU()}}{d \text{sum()}}\cdot\frac{\partial\text{sum()}}{\partial mul(x_0,w_0)}\cdot\frac{\partial mul(x_0,w_0)}{\partial x_0} $$


In [None]:
# The backward pass

# derivative from previous layers
d_val = 1.0

# the derivative of relu wrt z
# (0,1)[z>0] indexes the tuple with a bool: 1 if z > 0, else 0
d_relu_dz = d_val * (0,1)[z>0]

# Recall the derivative of a sum operator is always 1
# derivative of the sum wrt x_n*w_n
d_sum_dxwn = 1
d_relu_dxw0 = d_relu_dz * d_sum_dxwn
d_relu_dxw1 = d_relu_dz * d_sum_dxwn
d_relu_dxw2 = d_relu_dz * d_sum_dxwn

# derivative of the sum wrt b (bias)
d_sum_db = 1
d_relu_db = d_relu_dz * d_sum_db

# Recall the derivative of a product wrt one factor is the other factor,
# so the derivative wrt an input is its weight ...
d_mul_dx0 = w[0]
d_mul_dx1 = w[1]
d_mul_dx2 = w[2]
d_relu_dx0 = d_mul_dx0 * d_relu_dxw0
d_relu_dx1 = d_mul_dx1 * d_relu_dxw1
d_relu_dx2 = d_mul_dx2 * d_relu_dxw2

# ... and the derivative wrt a weight is its input
d_mul_dw0 = x[0]
d_mul_dw1 = x[1]
d_mul_dw2 = x[2]
d_relu_dw0 = d_mul_dw0 * d_relu_dxw0
d_relu_dw1 = d_mul_dw1 * d_relu_dxw1
d_relu_dw2 = d_mul_dw2 * d_relu_dxw2

# Simplifying the above we can rewrite as:
d_relu_dx0 = d_val * (0,1)[z>0] * w[0]


In [None]:
# Optimized code for the backward pass
# (yes, variables are being shadowed but it is okay this section is just for learning an not final code)
d_val = 1.0

# the factor shared by every gradient below:
# the derivative of the previous layer * d of relu (1 if z > 0, else 0)
d_relu_dz = d_val * (0, 1)[z > 0]

# ... * the corresponding weight for the input
d_x = [d_relu_dz * weight for weight in w]
# ... * the corresponding input for the weight
d_w = [d_relu_dz * value for value in x]
# the derivative of the sum wrt the bias is always 1, so nothing more to multiply
d_b = d_relu_dz

#### Backpropagation - A layer of neurons 

Considering multiple neurons in a layer rather than just one 

In [None]:
# dummy gradients passed in from the previous step of backpropagation (a single row)
d_val = np.array([[1., 1., 1.]])

# transposed, so that row k holds the weights attached to input k
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# gradient for each input: its weights multiplied by the matching incoming
# gradients, summed up
d_x0, d_x1, d_x2, d_x3 = (
    sum(weight * grad for weight, grad in zip(input_weights, d_val[0]))
    for input_weights in weights
)
d_x = np.array([d_x0, d_x1, d_x2, d_x3])
d_x


In [None]:
# The four sums above are one matrix product; with one row of incoming
# gradients per sample the same product also covers a whole batch:

d_val = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.],
])

d_x = np.dot(d_val, np.transpose(weights))
d_x

In [None]:
# The gradients wrt the weights are built from the input values instead
inputs = np.array([
    [1, 2, 3, 2.5],
    [2., 5., -1., 2],
    [-1.5, 2.7, 3.3, -0.8],
])

# transposed inputs times incoming gradients -> same shape as `weights`
d_w = np.dot(np.transpose(inputs), d_val)
d_w


In [None]:
# Derivative of the bias: the incoming gradients summed over the samples
# (keepdims keeps the result as a single row)
d_b = d_val.sum(axis=0, keepdims=True)
d_b

In [None]:
# output for the linear component
z = np.array([
    [1, 2, -3, -4],
    [2, -7, -1, 3],
    [-1, 2, 5, -1],
])
d_val = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])
# Calculating the derivative of ReLU: the incoming gradient passes through
# where z > 0 and is zero everywhere else
passes = z > 0
d_relu = np.zeros(z.shape)
d_relu[passes] = d_val[passes]
d_relu

At this point I will go back and update the Linear Layer and the Relu with backward code.

#### Backpropagation of CCE Loss

We find the derivative of the CCE loss to be:

$$ \frac{\partial L_i}{\partial \hat{y}_{i,j}} = -\frac{y_{i,j}}{\hat{y}_{i,j}} $$

I will now add this directly to the function

#### Backpropagation of Softmax activation

$$ \frac{\partial S_{i,j}}{\partial Z_{i,k}} = S_{i,j} \cdot (\delta_{j,k} - S_{i,k})$$

In [None]:
# Test implementation

# Softmax output of a single sample, shaped as a column vector
so = np.array([0.7, 0.1, 0.2]).reshape(-1, 1)

# first term of the Jacobian: the scores placed on the diagonal
np.diagflat(so)

In [None]:
# second term of the Jacobian: column vector times row vector gives the
# matrix whose entry (j, k) is S_j * S_k
np.dot(so, so.T)

In [None]:
# the full Jacobian for this sample: diag(S) - S·S^T,
# i.e. entry (j, k) is S_j * (delta_jk - S_k)
np.diagflat(so) - np.dot(so, so.T)

#### CCE and Softmax combined derivatives 

Combining the derivatives of CCE and Softmax will allow us to solve them in a simpler and faster way.