[[Neural Networks from Scratch]]

##### Why Batch Training?
Instead of using the whole dataset in a single go, we split it into small fixed-size batches and train on each one sequentially.

Why?
- Fits into memory
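- Averages the gradient over several samples, so updates are less noisy than with single-sample training (though noisier than a full-dataset gradient)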
- Faster convergence to minimum loss

##### Accuracy Measurement (Classification)
We measure how many predictions match the true labels. In classification:
$$
\text{accuracy} = \frac{\text{correct predictions}}{\text{total predictions}}
$$
For softmax outputs, we:
- Take `argmax` over model outputs -> predicted class
- Compare to `y_true` (actual class indices)

##### Full Integration Example: Spiral Classification

In [None]:
# Initialisation
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

# Dense layer
class Layer_Dense:
	"""Fully connected layer computing ``inputs . weights + biases``."""

	def __init__(self, n_inputs, n_neurons):
		"""Create small random weights and zero biases.

		n_inputs  -- size of the second axis of the arrays passed to forward()
		n_neurons -- number of output columns produced by the layer
		"""
		# Weights are stored as (n_inputs, n_neurons) so forward() needs no transpose.
		random_weights = np.random.randn(n_inputs, n_neurons)
		self.weights = 0.01 * random_weights
		self.biases = np.zeros((1, n_neurons))

	def forward(self, inputs):
		"""Compute the layer output and remember ``inputs`` for backward()."""
		self.inputs = inputs
		weighted_sum = np.dot(inputs, self.weights)
		# The (1, n_neurons) bias row broadcasts over every row of the product.
		self.output = weighted_sum + self.biases

	def backward(self, dvalues):
		"""Derive parameter and input gradients from ``dvalues``.

		Sets ``self.dweights``, ``self.dbiases`` and ``self.dinputs``.
		Relies on ``self.inputs`` stored by the preceding forward() call.
		"""
		inputs_transposed = self.inputs.T
		weights_transposed = self.weights.T
		self.dweights = np.dot(inputs_transposed, dvalues)
		# Biases are shared by all rows, so their gradient is the column-wise sum.
		self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
		self.dinputs = np.dot(dvalues, weights_transposed)

# ReLU
class Activation_ReLU:
	"""Rectified linear unit: negative values become zero."""

	def forward(self, inputs):
		"""Store ``inputs`` and set ``self.output`` to ``max(0, inputs)`` element-wise."""
		self.inputs = inputs
		rectified = np.maximum(0, inputs)
		self.output = rectified

	def backward(self, dvalues):
		"""Set ``self.dinputs`` to ``dvalues`` with entries zeroed where the input was <= 0."""
		blocked = self.inputs <= 0
		# Work on a copy so the caller's gradient array is left untouched.
		gradient = dvalues.copy()
		gradient[blocked] = 0
		self.dinputs = gradient

# Softmax
class Activation_Softmax:
	"""Softmax activation applied independently to each row."""

	def forward(self, inputs):
		"""Set ``self.output`` to the row-wise exponentials normalised to sum to 1."""
		# Subtracting each row's maximum keeps the largest exponent at 0,
		# which avoids overflow in np.exp without changing the result.
		row_max = np.max(inputs, axis=1, keepdims=True)
		exp_values = np.exp(inputs - row_max)
		row_totals = np.sum(exp_values, axis=1, keepdims=True)
		self.output = exp_values / row_totals

# Combined softmax + cross-entropy loss
class Activation_Softmax_Loss_CategoricalCrossentropy:
	"""Softmax activation and categorical cross-entropy loss handled as one unit."""

	def __init__(self):
		"""Instantiate the softmax activation and the cross-entropy loss objects."""
		self.loss = Loss_CategoricalCrossentropy()
		self.activation = Activation_Softmax()

	def forward(self, inputs, y_true):
		"""Run softmax on ``inputs``, keep it in ``self.output``, and return the loss.

		The return value is whatever ``self.loss.calculate`` yields for the
		softmax output and ``y_true``.
		"""
		softmax = self.activation
		softmax.forward(inputs)
		self.output = softmax.output
		return self.loss.calculate(self.output, y_true)

	def backward(self, dvalues, y_true):
		"""Set ``self.dinputs`` to the combined softmax + cross-entropy gradient.

		dvalues -- 2-D array indexed [sample, class]; it is copied, not modified
		y_true  -- class indices (1-D) or one-hot rows (2-D)
		"""
		# One-hot targets are reduced to class indices first.
		labels = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true
		n_samples = len(dvalues)
		gradient = dvalues.copy()
		# Subtract 1 at the true-class column of every row ...
		gradient[range(n_samples), labels] -= 1
		# ... then divide by the number of samples.
		self.dinputs = gradient / n_samples

# Loss
class Loss:
	"""Base class for losses; subclasses provide ``forward(output, y)``."""

	def calculate(self, output, y):
		"""Return the mean of the per-sample losses given by ``self.forward(output, y)``."""
		return np.mean(self.forward(output, y))

class Loss_CategoricalCrossentropy(Loss):
	"""Categorical cross-entropy: negative log of the confidence given to the true class."""

	def forward(self, y_pred, y_true):
		"""Return one loss value per sample.

		y_pred -- 2-D array of predicted confidences, indexed [sample, class]
		y_true -- class indices (1-D) or one-hot rows (2-D)

		Raises ValueError when ``y_true`` is neither 1-D nor 2-D; previously
		that case failed later with an unrelated UnboundLocalError.
		"""
		samples = len(y_pred)
		epsilon = 1e-7
		# Clip away exact 0 and 1 so np.log never receives 0.
		y_pred_clipped = np.clip(y_pred, epsilon, 1 - epsilon)

		target_dims = len(y_true.shape)
		if target_dims == 1:
			# Sparse labels: pick the confidence at each row's true-class index.
			correct_confidences = y_pred_clipped[range(samples), y_true]
		elif target_dims == 2:
			# One-hot labels: multiplying masks out every other class.
			correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
		else:
			raise ValueError(
				f'y_true must be 1-D class indices or 2-D one-hot rows, got {target_dims} dimensions'
			)

		negative_log_likelihoods = -np.log(correct_confidences)
		return negative_log_likelihoods

# Optimiser
class Optimiser_Adam:
	"""Adam optimiser with an iteration-based learning-rate decay."""

	def __init__(self, learning_rate=0.02, decay=5e-5, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
		"""Store the hyper-parameters and reset the iteration counter.

		learning_rate -- initial step size
		decay         -- decay coefficient; a falsy value disables decay
		epsilon       -- added to the denominator of the update
		beta_1        -- weighting of the previous gradient average
		beta_2        -- weighting of the previous squared-gradient average
		"""
		# Fixed hyper-parameters
		self.learning_rate = learning_rate
		self.decay = decay
		self.epsilon = epsilon
		self.beta_1 = beta_1
		self.beta_2 = beta_2
		# State that changes during training
		self.current_learning_rate = learning_rate
		self.iterations = 0

	def pre_update_params(self):
		"""Refresh ``current_learning_rate`` from the decay schedule, if decay is enabled."""
		if not self.decay:
			return
		decay_factor = 1. / (1. + self.decay * self.iterations)
		self.current_learning_rate = self.learning_rate * decay_factor

	def update_params(self, layer):
		"""Apply one Adam step to ``layer.weights`` and ``layer.biases`` in place.

		Reads ``layer.dweights`` / ``layer.dbiases`` and keeps the running
		averages on the layer itself (``*_momentums`` and ``*_cache``).
		"""
		# First time this layer is seen: start all running averages at zero.
		if not hasattr(layer, 'weight_cache'):
			layer.weight_momentums = np.zeros_like(layer.weights)
			layer.weight_cache = np.zeros_like(layer.weights)
			layer.bias_momentums = np.zeros_like(layer.biases)
			layer.bias_cache = np.zeros_like(layer.biases)

		# Bias-correction divisors; iterations is 0-based, hence the +1.
		step_number = self.iterations + 1
		momentum_scale = 1 - self.beta_1 ** step_number
		cache_scale = 1 - self.beta_2 ** step_number

		# Running averages of the gradients ...
		layer.weight_momentums = self._blend(self.beta_1, layer.weight_momentums, layer.dweights)
		layer.bias_momentums = self._blend(self.beta_1, layer.bias_momentums, layer.dbiases)
		# ... and of the squared gradients.
		layer.weight_cache = self._blend(self.beta_2, layer.weight_cache, layer.dweights**2)
		layer.bias_cache = self._blend(self.beta_2, layer.bias_cache, layer.dbiases**2)

		layer.weights += self._delta(layer.weight_momentums / momentum_scale,
									 layer.weight_cache / cache_scale)
		layer.biases += self._delta(layer.bias_momentums / momentum_scale,
									layer.bias_cache / cache_scale)

	def post_update_params(self):
		"""Advance the iteration counter by one."""
		self.iterations += 1

	def _blend(self, beta, running, fresh):
		# Exponential moving average of `running` and `fresh` with weight `beta`.
		return beta * running + (1 - beta) * fresh

	def _delta(self, corrected_momentums, corrected_cache):
		# Parameter change for one step from the bias-corrected averages.
		return -self.current_learning_rate * corrected_momentums / \
			(np.sqrt(corrected_cache) + self.epsilon)



##### Training Loop with Batching and Accuracy (if testing this code block on its own, **make sure to first run** the full integration example above)

In [None]:
# Data
X, y = spiral_data(samples=1000, classes=3)
n_samples = X.shape[0]

# Model: 2 inputs -> 64 hidden units (ReLU) -> 3 class scores (softmax + cross-entropy)
dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimiser = Optimiser_Adam(learning_rate=0.05, decay=5e-7)

# Batch settings
batch_size = 128
# Ceiling division, so a final partial batch is still processed
steps = -(-n_samples // batch_size)

# Training loop
for epoch in range(10001):
	# Reshuffle every epoch. Batches are contiguous slices, so if the dataset is
	# ordered (nnfs spiral_data is believed to emit samples class by class -
	# confirm), unshuffled batches would each be dominated by a single class.
	# X and y themselves are left in their original order.
	order = np.random.permutation(n_samples)
	X_epoch = X[order]
	y_epoch = y[order]

	epoch_loss = 0
	epoch_accuracy = 0

	for step in range(steps):
		start = step * batch_size
		batch_X = X_epoch[start:start + batch_size]
		batch_y = y_epoch[start:start + batch_size]
		batch_len = len(batch_X)

		# Forward pass
		dense1.forward(batch_X)
		activation1.forward(dense1.output)
		dense2.forward(activation1.output)

		loss = loss_activation.forward(dense2.output, batch_y)
		predictions = np.argmax(loss_activation.output, axis=1)
		accuracy = np.mean(predictions == batch_y)

		# Weight the per-batch means by batch size: the last batch can be
		# smaller, and an unweighted average over steps would over-count it.
		epoch_loss += loss * batch_len
		epoch_accuracy += accuracy * batch_len

		# Backward pass
		loss_activation.backward(loss_activation.output, batch_y)
		dense2.backward(loss_activation.dinputs)
		activation1.backward(dense2.dinputs)
		dense1.backward(activation1.dinputs)

		# Parameter update (one optimiser iteration per batch)
		optimiser.pre_update_params()
		optimiser.update_params(dense1)
		optimiser.update_params(dense2)
		optimiser.post_update_params()

	# Turn the weighted sums into per-sample means for the whole epoch
	epoch_loss /= n_samples
	epoch_accuracy /= n_samples

	# Report
	if epoch % 100 == 0:
		print(f'epoch: {epoch}, acc: {epoch_accuracy:.3f}, loss: {epoch_loss:.3f}, lr: {optimiser.current_learning_rate}')

##### What do we see here?
- Efficient **batch training**
- Stable **accuracy tracking**
- Fully integrated **adaptive optimiser (Adam)**
##### Next Step
[[L1 and L2 Regularisation and Dropout]]