[[Neural Networks from Scratch]]

##### Initialise the packages

In [None]:
import micropip

await micropip.install("numpy")
await micropip.install("nnfs")
await micropip.install("matplotlib")

import matplotlib.pyplot as plt
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
from nnfs.datasets import vertical_data


##### Backpropagating the ReLU function for a single neuron, acting as if we intend to minimise the output of this single neuron (demonstration)

In [None]:
# Forward pass
x = [1.0, -2.0, 3.0]  # inputs
w = [-3.0, -1.0, 2.0]  # weights
b = 1.0  # bias

# Weighted sum of the inputs plus the bias, then ReLU
z = x[0]*w[0] + x[1]*w[1] + x[2]*w[2] + b
y = max(z, 0)

# Backward pass
# Gradient arriving from the next layer (1.0 for this demonstration)
dvalue = 1.0

# ReLU derivative is 1 for z > 0 and 0 otherwise. Chaining it with the
# upstream gradient here means dvalue is already folded into drelu_dz.
drelu_dz = dvalue * (1. if z > 0 else 0.)

# Gradients via chain rule
# The sum passes the gradient through unchanged (its partial derivatives are 1),
# d(x_i * w_i)/dx_i = w_i and d(x_i * w_i)/dw_i = x_i, so each gradient is the
# partner factor times drelu_dz. dvalue must NOT be multiplied in again:
# doing so would apply the upstream gradient twice whenever dvalue != 1.
dmul_dx0 = w[0] * drelu_dz
dmul_dx1 = w[1] * drelu_dz
dmul_dx2 = w[2] * drelu_dz

dmul_dw0 = x[0] * drelu_dz
dmul_dw1 = x[1] * drelu_dz
dmul_dw2 = x[2] * drelu_dz

print(dmul_dx0, dmul_dw0, dmul_dx1, dmul_dw1, dmul_dx2, dmul_dw2)


In the example above the **chained functions** can be loosely interpreted as:
$$
ReLU(\sum [inputs\times weights]+bias)
$$
or in the form that matches our code more precisely as:
$$
ReLU(x_0w_0 + x_1w_1 + x_2w_2 + b)
$$
or:
$$
ReLU(sum(mul(x_0, w_0), mul(x_1,w_1), mul(x_2, w_2), b))
$$
The above equation contains 3 nested functions:
1. ReLU
2. Sum of weighted inputs and a bias
3. Multiplications of the inputs and weights

##### Why do we need the partial derivatives of the sum components?
We need partial derivatives to compute gradients which show how each individual input or weight affects the total sum before activation. This allows us to measure each parameter's direct influence on the output, so we can precisely calculate the gradients needed for updating the parameters during training.

##### What is a partial derivative?
A measure of how a multivariable function (such as $f(x,y)$) changes as one variable changes, while keeping all other variables constant.

##### What is the chain rule?
The chain rule calculates the rate of change of a function that is composed of other functions.

If:
$$y=f(z), z=g(x)$$
then:
$$\frac{dy}{dx} = \frac{dy}{dz} \times \frac{dz}{dx}$$
It multiplies the rate of change of the outer function by the rate of change of the inner function.


##### Object Orientated Implementation of Backpropagation

In [None]:
# Initialise the nnfs helper package before any layer is created.
# NOTE(review): presumably this seeds NumPy's RNG and fixes the default dtype so
# that runs are repeatable — confirm against the nnfs documentation.
nnfs.init()

class Layer_Dense:
	"""Fully connected layer computing ``inputs · weights + biases``."""

	def __init__(self, n_inputs, n_neurons):
		"""Create random weights of shape (n_inputs, n_neurons) and zero biases of shape (1, n_neurons)."""
		# Scaled-down random starting weights, one column per neuron
		self.weights = np.random.randn(n_inputs, n_neurons) * 0.10
		# One bias per neuron, stored as a single row
		self.biases = np.zeros((1, n_neurons))

	def forward(self, inputs):
		"""Remember ``inputs`` and store the layer result in ``self.output``."""
		# backward() needs the inputs to compute the weight gradients
		self.inputs = inputs
		weighted_sum = np.dot(inputs, self.weights)
		self.output = weighted_sum + self.biases

	def backward(self, dvalues):
		"""Store ``dinputs``, ``dbiases`` and ``dweights`` computed from ``dvalues``.

		``dvalues`` is the gradient flowing back into this layer's output.
		"""
		# Gradient on values: passed on to whatever produced the inputs
		self.dinputs = np.dot(dvalues, self.weights.T)
		# Gradients on parameters
		self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
		self.dweights = np.dot(self.inputs.T, dvalues)

class Activation_ReLU:
	"""Rectified linear activation: keeps positive values and replaces the rest with zero."""

	def forward(self, inputs):
		"""Remember ``inputs`` and store the element-wise ``max(0, inputs)`` in ``self.output``."""
		self.inputs = inputs
		self.output = np.maximum(0, inputs)

	def backward(self, dvalues):
		"""Store in ``self.dinputs`` a copy of ``dvalues`` zeroed wherever the forward input was <= 0."""
		# Positions that did not let anything through on the forward pass
		blocked = self.inputs <= 0
		# Work on a copy so the caller's gradient array is left untouched
		gradient = dvalues.copy()
		gradient[blocked] = 0
		self.dinputs = gradient
class Activation_Softmax:
	"""Softmax activation applied to each row of a batch independently."""

	def forward(self, inputs):
		"""Store the row-wise softmax of ``inputs`` in ``self.output``."""
		# After subtracting the row maximum every exponent is <= 0,
		# so the exponentials cannot overflow
		row_max = np.max(inputs, axis=1, keepdims=True)
		exponentials = np.exp(inputs - row_max)
		# Normalise every row so that it sums to 1
		row_totals = np.sum(exponentials, axis=1, keepdims=True)
		self.output = exponentials / row_totals

class Loss:
	"""Common base for losses; subclasses provide ``forward(output, y)``."""

	def calculate(self, output, y):
		"""Return the mean of the per-sample losses produced by ``self.forward(output, y)``."""
		return np.mean(self.forward(output, y))

class Loss_CategoricalCrossentropy(Loss):
	"""Categorical cross-entropy loss (forward pass)."""

	def forward(self, y_pred, y_true):
		"""Return the negative log of the confidence predicted for each sample's true class.

		``y_true`` is either a 1-D array of class indices or a 2-D one-hot array.
		"""
		n_rows = len(y_pred)
		# Keep predictions away from exactly 0 and 1 so the log stays finite
		epsilon = 1e-7
		clipped = np.clip(y_pred, epsilon, 1-epsilon)

		label_dims = len(y_true.shape)
		if label_dims == 1:
			# Sparse labels, e.g. [0,1,1]: pick one column per row
			correct_confidences = clipped[range(n_rows), y_true]
		elif label_dims == 2:
			# One-hot labels, e.g. [[1,0,0], [0,1,0], [0,0,1]]:
			# the mask leaves only the true-class confidence in each row
			correct_confidences = np.sum(clipped*y_true, axis=1)

		return -np.log(correct_confidences)


# Recreate and run forward pass, then backward pass

# Spiral dataset: X is fed to the network, y is used as the class labels
X, y = spiral_data(samples=100, classes=3)

# First dense layer (2 inputs -> 3 neurons) followed by ReLU
dense1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()

# Second dense layer (3 inputs -> 3 neurons) followed by Softmax
dense2 = Layer_Dense(3, 3)
activation2 = Activation_Softmax()

# Forward pass
dense1.forward(X)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Softmax output for the first 5 samples
print(activation2.output[:5])

# Mean categorical cross-entropy over the whole batch
loss_function = Loss_CategoricalCrossentropy()
loss = loss_function.calculate(activation2.output, y)

print("Loss:", loss)

# Backward pass

# Combined gradient of Softmax + cross-entropy loss w.r.t. the Softmax INPUTS
# (dense2.output): (p - y), divided by the number of samples.
# The labels are first brought into one-hot form so they can be subtracted
# from the predicted probabilities element-wise.
samples = len(activation2.output)
if len(y.shape) == 1:
    y_one_hot = np.zeros_like(activation2.output)
    y_one_hot[np.arange(samples), y] = 1
else:
    y_one_hot = y

dvalues = (activation2.output - y_one_hot) / samples

# Backward through second dense layer
dense2.backward(dvalues)

# Backward through ReLU activation
activation1.backward(dense2.dinputs)

# Backward through first dense layer
dense1.backward(activation1.dinputs)

# Display gradients from backward pass (show first 5 values for clarity)
print("dweights dense2 (first 5):", dense2.dweights.flatten()[:5])
print("dbiases dense2:", dense2.dbiases)
print("dweights dense1 (first 5):", dense1.dweights.flatten()[:5])
print("dbiases dense1:", dense1.dbiases)



##### Flow of data visualising the backpropagation process above
[Loss]
   ↓ (gradient of loss with respect to softmax inputs: p - y)
[Softmax Layer]
   ↓
[Dense Layer 2]
   • Calculate dweights2 = a1^T · (p - y)
   • Calculate dbiases2 = sum(p - y)
   • Calculate dinputs2 = (p - y) · W2^T
   ↓
[Activation Layer 1 (ReLU)]
   • Backprop dinputs2, zero gradients where input ≤ 0
   ↓
[Dense Layer 1]
   • Calculate dweights1 = X^T · dinputs_relu
   • Calculate dbiases1 = sum(dinputs_relu)
   • Calculate dinputs1 = dinputs_relu · W1^T
   ↓
[Input X]
##### Why do we need derivatives of the operations in each layer?
We need derivatives in each layer to understand how changing inputs or weights changes the result. This helps us know exactly how to adjust each weight to reduce loss and improve the model.

##### Categorical Cross-Entropy Loss Derivative Code Implementation

In [None]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
	#... rest of code

	def backward(self, dvalues, y_true):
		"""Store the gradient of the loss with respect to ``dvalues`` in ``self.dinputs``.

		``dvalues`` holds one row of predicted values per sample; ``y_true`` is
		either a 1-D array of class indices or a 2-D one-hot array.
		"""
		n_samples = len(dvalues)
		# The width of the first row gives the number of labels per sample
		n_labels = len(dvalues[0])
		# Sparse labels are expanded to one-hot rows by indexing an identity matrix
		if len(y_true.shape) == 1:
			y_true = np.eye(n_labels)[y_true]
		# Gradient of -log(prediction), then normalised by the number of samples
		self.dinputs = (-y_true / dvalues) / n_samples

###### How can you convert a list of class labels like [2, 0, 1] into one-hot encoded vectors using NumPy?
Use `np.eye(n)[labels]`, where `n` is the number of classes and `labels` is your list of class indices. For example:

In [None]:
import numpy as np

# Row k of the identity matrix is the one-hot vector for class k, so indexing
# the identity matrix with the labels yields one one-hot row per label.
labels = [2, 0, 1]
n_classes = 3
one_hot = np.eye(n_classes)[labels]

produces:

In [None]:
# Row i is the one-hot vector for labels[i]
np.array([[0., 0., 1.],  # class 2
       [1., 0., 0.],  # class 0
       [0., 1., 0.]]) # class 1


**Next Step:** Complete Backpropagation Support for Softmax + Categorical Cross-Entropy

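We already have the gradient of the loss with respect to the Softmax output. Chaining it with the derivative of Softmax itself gives a much simpler combined expression for the gradient with respect to the Softmax inputs, which is both faster to compute and more numerically stable.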

##### Softmax Activation and Categorical Cross-Entropy Classes for Reference:

In [None]:
class Activation_Softmax:
	"""Softmax activation (reference copy): turns each row of scores into probabilities."""

	def forward(self, inputs):
		"""Store the row-wise softmax of ``inputs`` in ``self.output``."""
		# Shift each row by its own maximum before exponentiating
		shifted = inputs - np.max(inputs, axis=1, keepdims=True)
		exp_values = np.exp(shifted)
		# Divide by the row sums so every row adds up to 1
		self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

class Loss_CategoricalCrossentropy(Loss):
	"""Categorical cross-entropy loss (forward pass only)."""

	def forward(self, y_pred, y_true):
		"""Return the per-sample negative log-likelihood of the true class.

		y_pred: predicted values, one row per sample.
		y_true: 1-D array of class indices or 2-D one-hot array.
		"""
		samples = len(y_pred)
		epsilon = 1e-7
		# Clip so that log() never receives exactly 0
		y_pred_clipped = np.clip(y_pred, epsilon, 1-epsilon)
		# When y_true is a vector of scalar class values. E.g [0,1,1]
		if len(y_true.shape) == 1:
			correct_confidences = y_pred_clipped[range(samples), y_true]
		# When y_true is an array of vectors (one-hot coding) E.g [[1,0,0], [0,1,0], [0,0,1]]
		elif len(y_true.shape) == 2:
			correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
		# Computed after the branches (not inside the elif) so that it is
		# defined for sparse labels as well as one-hot labels
		negative_log_likelihoods = -np.log(correct_confidences)
		return negative_log_likelihoods

##### Efficiency Trick: Combine Softmax and Categorical Cross-Entropy
We create a new class to handle both the forward and backward passes together.

In [None]:
class Activation_Softmax_Loss_CategoricalCrossentropy:
	"""Softmax activation and categorical cross-entropy loss handled as a single unit."""

	def __init__(self):
		"""Create the wrapped activation and loss objects."""
		self.activation = Activation_Softmax()
		self.loss = Loss_CategoricalCrossentropy()

	def forward(self, inputs, y_true):
		"""Apply the activation to ``inputs``, keep its result in ``self.output`` and return the loss value."""
		softmax = self.activation
		softmax.forward(inputs)
		# Expose the activation output on this object as well
		self.output = softmax.output
		return self.loss.calculate(self.output, y_true)

	def backward(self, dvalues, y_true):
		"""Store the combined gradient in ``self.dinputs``.

		``dvalues`` holds one row of predicted values per sample; ``y_true`` is
		either a 1-D array of class indices or a 2-D one-hot array.
		"""
		n_samples = len(dvalues)
		# One-hot labels are reduced to the index of their largest entry
		if len(y_true.shape) == 2:
			y_true = np.argmax(y_true, axis=1)
		# Subtract 1 at the true class of each row, on a copy so that
		# the caller's array is not modified
		gradient = dvalues.copy()
		gradient[range(n_samples), y_true] -= 1
		# Normalise by the number of samples
		self.dinputs = gradient / n_samples

##### Usage:

In [None]:
# Combined Softmax activation + cross-entropy loss object
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Forward pass
# Takes the raw output of the second dense layer and the labels, returns the loss
loss = loss_activation.forward(dense2.output, y)

# Backward pass
# Start from the Softmax probabilities, then walk back through the layers in
# reverse order, each step consuming the dinputs of the step before it
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)


##### Why does this work?
When you combine Softmax with Categorical Cross-Entropy, the resulting derivative simplifies to:
$$
\frac{\partial L}{\partial z_i} = p_i - y_i
$$
This eliminates the need to build the full Softmax Jacobian matrix and multiply it by the cross-entropy gradient, which makes the backward pass both faster and more numerically stable.

##### Final Output: Gradients Flow Completely Back
At this point we have:
- Gradients of loss with respect to output layer (via softmax loss combo)
- Backprop through all hidden layers (ReLU, Dense)
- Access to all parameter gradients: `dweights`, `dbiases`, `dinputs` ready for an optimiser step.

**Next steps:**
- Implement Optimiser (e.g. SGD)
- Implement Training Loop


##### Bonus: Backward Propagation Results:

In [None]:
# Each entry pairs a heading with the value printed beneath it:
# sample model predictions (probabilities), the loss value, the gradient of
# the loss w.r.t. the softmax input, then the gradients of both dense layers.
report = [
    ("Softmax Output (first 5):", loss_activation.output[:5]),
    ("Loss:", loss),
    ("dInputs (Softmax Loss Combo) (first 5):", loss_activation.dinputs[:5]),
    ("Dense2 dWeights (first 5):", dense2.dweights.flatten()[:5]),
    ("Dense2 dBiases:", dense2.dbiases),
    ("Dense1 dWeights (first 5):", dense1.dweights.flatten()[:5]),
    ("Dense1 dBiases:", dense1.dbiases),
]

for heading, value in report:
    print(heading)
    print(value)
