# Creating a neural network from scratch

A big shout-out to the book that guides this project: "Neural Networks from Scratch in Python" by Harrison Kinsley & Daniel Kukieła.

## Imports

In [26]:
import numpy as np
from nnfs.datasets import spiral_data

## Layer Classes

In [27]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_input, n_neurons):
        """Create small random weights and zero biases.

        Weights have shape ``(n_input, n_neurons)`` and are drawn from a
        standard normal distribution scaled by 0.01; biases are a
        ``(1, n_neurons)`` row of zeros.
        """
        weight_shape = (n_input, n_neurons)
        self.weights = 0.01 * np.random.standard_normal(size=weight_shape)
        self.biases = np.zeros(shape=(1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and write the layer's result to ``self.output``."""
        # Kept for the backward pass, which needs the inputs to build dweights.
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients given ``dvalues``, the gradient of the output.

        Sets ``self.dweights`` and ``self.dbiases`` (parameter gradients)
        and ``self.dinputs`` (gradient with respect to the layer's inputs).
        """
        # Gradient handed on to whatever produced this layer's inputs.
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Parameter gradients; the bias gradient sums over the first axis
        # and keeps a leading axis of length 1.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

## Activation Functions

In [28]:
class Activation_ReLU:
    """Rectified linear unit: passes positive values, zeroes the rest."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``self.output`` to ``max(0, inputs)``."""
        self.inputs = inputs
        rectified = np.maximum(0, inputs)
        self.output = rectified

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with blocked positions zeroed.

        Positions where the stored forward input was <= 0 receive a zero
        gradient; all other positions pass ``dvalues`` through unchanged.
        """
        blocked = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

In [29]:
class Activation_Softmax:
    """Softmax activation applied independently to each row."""

    def forward(self, inputs):
        """Set ``self.output`` to the row-wise softmax of ``inputs``."""
        # Shifting each row by its maximum keeps exp() from overflowing and
        # does not change the result, because the shift cancels in the ratio.
        row_max = np.max(inputs, axis=1, keepdims=True)
        unnormalized = np.exp(inputs - row_max)
        row_total = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_total


In [30]:
class Activation_Sigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``, element-wise."""

    def forward(self, input):
        """Store ``input`` and set ``self.output`` to its element-wise sigmoid.

        The parameter keeps its original name ``input`` so existing keyword
        callers continue to work, even though it shadows the builtin.
        """
        self.inputs = input
        # sigmoid(x) == exp(-log(1 + exp(-x))); logaddexp evaluates the inner
        # log-sum without overflowing for large-magnitude inputs.
        self.output = np.exp(-np.logaddexp(0, np.negative(input)))

    def backward(self, dvalues):
        """Set ``self.dinputs`` from ``dvalues`` using the sigmoid derivative.

        The derivative of the sigmoid is ``output * (1 - output)``, so this
        must be called after ``forward``.
        """
        self.dinputs = dvalues * (1 - self.output) * self.output

## Loss Functions

In [31]:
# parent class
class Loss:
    """Base class for losses; subclasses supply ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        return np.mean(self.forward(output, y))


In [32]:
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss over predicted class probabilities.

    Targets may be given either as a 1-D array of class indices or as a
    2-D one-hot matrix with one row per sample.
    """

    # Probabilities are clipped to [_EPSILON, 1 - _EPSILON] so that neither
    # log(0) in forward() nor a division by zero in backward() can occur.
    _EPSILON = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Args:
            y_pred: predicted probabilities, one row per sample.
            y_true: class indices (1-D) or one-hot rows (2-D).

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)  # also accept plain Python lists
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, self._EPSILON, 1 - self._EPSILON)

        # Example of y_pred_clipped:
        #   [[0.7,  0.1, 0.2],
        #    [0.1,  0.5, 0.4],
        #    [0.02, 0.9, 0.08]]

        if y_true.ndim == 1:
            # Categorical labels, e.g. y_true = [1, 0, 0]:
            # pick the predicted probability of the labelled class per row.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels, e.g. y_true = [[0, 1, 0], [1, 0, 0], [1, 0, 0]]:
            # the mask zeroes every probability except the true class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError("Please use a different shape for y_true: shape = {0}".format(y_true.shape))

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the gradient of the loss w.r.t. ``dvalues``.

        Args:
            dvalues: the predicted probabilities the loss was evaluated on.
            y_true: class indices (1-D) or one-hot rows (2-D).
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        labels = len(dvalues[0])  # number of labels in every sample

        # Ensure one-hot encoding so the division below masks wrong classes.
        if y_true.ndim == 1:
            y_true = np.eye(labels)[y_true]

        # Clip exactly as forward() does: an unclipped probability of 0 would
        # yield inf for the true class and nan (-0/0) for every other class.
        clipped_dvalues = np.clip(dvalues, self._EPSILON, 1 - self._EPSILON)

        # Derivative of -log(x) is -1/x; the one-hot mask keeps only the
        # true-class entries.
        self.dinputs = -y_true / clipped_dvalues

        # Normalize by the sample count: optimizers sum the per-sample
        # gradients before multiplying by the learning rate.
        self.dinputs = self.dinputs / samples

## Testing 

In [33]:
# Smoke test: one forward pass through a 2 -> 3 -> 3 network with freshly
# initialised (untrained) weights, followed by loss and accuracy.
# NOTE(review): presumably spiral_data returns 2-feature points in X and
# integer class labels in y — confirm against the nnfs documentation.
X, y = spiral_data(samples=100, classes=3)  
# Hidden layer: 2 inputs per sample, 3 neurons, followed by ReLU.
layer1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()

# Output layer: 3 inputs (one per hidden neuron), 3 neurons, then softmax.
layer2 = Layer_Dense(3, 3)
activation2 = Activation_Softmax()

loss_function = Loss_CategoricalCrossEntropy()


# Forward pass: each stage consumes the previous stage's .output attribute.
layer1.forward(X)
activation1.forward(layer1.output) 
layer2.forward(activation1.output)
activation2.forward(layer2.output)

# Mean loss over all samples, computed from the softmax output and y.
loss = loss_function.calculate(activation2.output, y)
# Predicted class = column index of the largest softmax output in each row.
predictions = np.argmax(activation2.output, axis=1)
# Fraction of samples whose predicted class equals the label in y.
accuracy = np.mean(predictions == y)

print("Loss: ", loss)
print("Acc: ", accuracy)

[0.33333333 0.33333351 0.33333353 0.33333356 0.33333367 0.33333358
 0.33333449 0.33333468 0.33333486 0.33333457 0.33333536 0.33333545
 0.33333542 0.33333583 0.33333656 0.33333723 0.33333642 0.33333773
 0.33333681 0.33334396 0.33334464 0.33333735 0.33334537 0.33334607
 0.33334711 0.33334749 0.33334511 0.33334859 0.33334402 0.33334884
 0.33334317 0.33335008 0.33335132 0.33335149 0.33335034 0.33333106
 0.33333088 0.33333656 0.33333805 0.33334569 0.33333093 0.33335663
 0.33333085 0.33333647 0.33332999 0.33332999 0.33333004 0.33332939
 0.33332935 0.33332925 0.33332936 0.33333123 0.333329   0.33332906
 0.33332939 0.33332907 0.33333587 0.33333505 0.3333285  0.33332856
 0.33332876 0.33333468 0.33332814 0.33333391 0.33332882 0.33333559
 0.33334435 0.33334778 0.33334208 0.33334619 0.33335646 0.3333463
 0.33334615 0.33334874 0.33336135 0.33334578 0.33336157 0.33335402
 0.33337454 0.33337845 0.3333768  0.33337944 0.33338009 0.33338047
 0.33336392 0.33336314 0.33338151 0.33335012 0.33338389 0.33338

In [34]:
# Stand-in for the gradient handed back by the following layer:
# three samples (rows) with three values each, increasing per sample.
dvalues = np.array([
    [1.0, 1.0, 1.0],
    [2.0, 2.0, 2.0],
    [3.0, 3.0, 3.0],
])
# One bias per neuron, stored as a row vector of shape (1, neurons).
biases = np.array([[2, 3, 0.5]])
# The bias gradient is the column-wise sum of dvalues, i.e. a sum over
# the samples (axis 0). keepdims=True keeps the result as a (1, neurons)
# row instead of collapsing it to a 1-D array.
dbiases = dvalues.sum(axis=0, keepdims=True)
print(dbiases)

[[6. 6. 6.]]
