In [7]:
import numpy as np
from keras.datasets import mnist

In [19]:
class Math:
    """Stateless NumPy helpers: activations, a ReLU derivative and label encoding.

    `softmax` and `one_hot` use a column-per-sample layout: axis 0 indexes
    classes and axis 1 indexes samples.
    """

    @staticmethod
    def sigmoid(x):
        """Return the logistic function 1 / (1 + exp(-x)), element-wise.

        Args:
            x: scalar or array-like of real numbers.

        Returns:
            A NumPy scalar for scalar input, otherwise an array shaped like `x`.
        """
        x = np.asarray(x)
        if not np.issubdtype(x.dtype, np.floating):
            x = x.astype(float)

        # exp(-|x|) lies in (0, 1], so it cannot overflow the way exp(-x)
        # does for large negative x.
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
        result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
        # Indexing with () turns a 0-d array back into a scalar.
        return result[()]

    @staticmethod
    def reLU(Z):
        """Return max(0, Z) element-wise."""
        return np.maximum(0, Z)

    @staticmethod
    def deriv_reLU(Z):
        """Return a boolean mask that is True where Z > 0.

        The mask acts as the ReLU derivative (1 where Z > 0, else 0) when it
        is multiplied with a numeric array.
        """
        return Z > 0

    @staticmethod
    def softmax(Z):
        """Return the softmax of Z taken along axis 0 (one distribution per column)."""
        # Subtract the per-column max for numerical stability; softmax is
        # invariant to shifting every entry of a column by the same constant.
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)

    # This is hardly a math function, but okay.
    @staticmethod
    def one_hot(Y, nbr_classes):
        """One-hot encode integer labels into a (nbr_classes, n_samples) array.

        Args:
            Y: array-like of integer class labels; it is flattened, so row and
                column vectors are handled the same way as 1-D input.
            nbr_classes: number of classes (rows of the result).

        Returns:
            Float array whose column j contains a single 1 in row Y[j].

        Raises:
            IndexError: if a label lies outside [0, nbr_classes). Negative
                labels are rejected explicitly because NumPy indexing would
                otherwise wrap them around to the last classes silently.
        """
        labels = np.asarray(Y).ravel()
        if labels.size and (labels.min() < 0 or labels.max() >= nbr_classes):
            raise IndexError(
                f"labels must lie in [0, {nbr_classes}); "
                f"got min={labels.min()}, max={labels.max()}"
            )

        # Fill the transposed layout directly: row = class, column = sample.
        one_hot_Y = np.zeros((nbr_classes, labels.size))
        one_hot_Y[labels, np.arange(labels.size)] = 1
        return one_hot_Y

In [52]:
class NeuralNetwork:
    """Fully connected classifier trained with mini-batch gradient descent.

    Every layer except the last applies ReLU; the last layer applies softmax.
    Data uses one column per sample: inputs have shape (n_features, n_samples)
    and labels are a 1-D array of integer class indices.
    """

    def __init__(self, layers):
        """Create weights and biases for the given layer widths.

        Args:
            layers: sequence of layer sizes, input size first and output
                (number of classes) last.

        Raises:
            ValueError: if fewer than two sizes are given, since no weight
                matrix could be built.
        """
        if len(layers) < 2:
            raise ValueError(
                "layers must contain at least an input size and an output size"
            )

        self.layers = layers
        self.weights = []
        self.biases = []
        # Caches: As/Zs are filled by forward_prop, dWs/dbs by back_prop.
        self.As = None
        self.Zs = None
        self.dWs = None
        self.dbs = None

        for n_in, n_out in zip(layers[:-1], layers[1:]):
            # He initialization (suited to ReLU): std = sqrt(2 / fan_in).
            self.weights.append(np.random.randn(n_out, n_in) * np.sqrt(2 / n_in))
            self.biases.append(np.zeros((n_out, 1)))

    def forward_prop(self, X, cache=True):
        """Run the network on X and return the output-layer activations.

        Args:
            X: input array of shape (n_features, n_samples).
            cache: when True (default), store the pre-activations in `self.Zs`
                and the activations (starting with X) in `self.As` so that
                `back_prop` can reuse them. Pass False for pure inference to
                leave the cache untouched and avoid keeping every
                intermediate array alive.

        Returns:
            Softmax output of the last layer, one column per sample.
        """
        last = len(self.weights) - 1
        Zs = []
        As = [X]
        A = X

        for i, (W, b) in enumerate(zip(self.weights, self.biases)):
            Z = np.dot(W, A) + b
            # Softmax on the output layer, ReLU on every other layer.
            A = Math.softmax(Z) if i == last else Math.reLU(Z)
            if cache:
                Zs.append(Z)
                As.append(A)

        if cache:
            self.Zs = Zs
            self.As = As
        return A

    def back_prop(self, Y):
        """Compute gradients for the batch most recently passed to forward_prop.

        Results are stored in `self.dWs` and `self.dbs`, index-aligned with
        `self.weights` and `self.biases`.

        Args:
            Y: integer class labels for the cached batch.

        Raises:
            RuntimeError: if no forward pass has been cached yet.
            ValueError: if the number of labels differs from the number of
                samples in the cached forward pass.
        """
        if self.As is None or self.Zs is None:
            raise RuntimeError("forward_prop must be called before back_prop")

        m = Y.size
        output = self.As[-1]
        if output.shape[-1] != m:
            raise ValueError(
                f"got {m} labels but the cached forward pass has "
                f"{output.shape[-1]} samples"
            )

        one_hot_Y = Math.one_hot(Y, self.layers[-1])

        n_layers = len(self.weights)
        dWs = [None] * n_layers
        dbs = [None] * n_layers

        # Output-layer error term: predicted probabilities minus one-hot targets.
        dZ = output - one_hot_Y

        # Walk the layers from last to first.
        for i in range(n_layers - 1, -1, -1):
            # As[i] is the input that layer i received in the forward pass.
            dWs[i] = (1 / m) * dZ.dot(self.As[i].T)
            dbs[i] = (1 / m) * np.sum(dZ, axis=1, keepdims=True)

            # Propagate the error to the previous layer (not needed at the input).
            if i > 0:
                dZ = self.weights[i].T.dot(dZ) * Math.deriv_reLU(self.Zs[i - 1])

        # Publish only once all gradients are complete.
        self.dWs = dWs
        self.dbs = dbs

    def update_params(self, learning_rate):
        """Apply one gradient-descent step in place using the stored gradients."""
        for i in range(len(self.weights)):
            self.weights[i] -= learning_rate * self.dWs[i]
            self.biases[i] -= learning_rate * self.dbs[i]

    def train(self, X, Y, epochs, learning_rate, batch_size=64, log_every=1):
        """Train with mini-batch gradient descent, reshuffling every epoch.

        Args:
            X: inputs of shape (n_features, n_samples).
            Y: integer labels, one per column of X.
            epochs: number of passes over the data.
            learning_rate: step size used by `update_params`.
            batch_size: samples per mini-batch (default 64).
            log_every: print the accuracy on (X, Y) every this many epochs;
                0 or None disables logging (default 1, i.e. every epoch).

        Raises:
            ValueError: if batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        n_samples = X.shape[1]
        # Track the shuffled order as indices instead of copying the whole
        # dataset each epoch. Composing the permutations reproduces the same
        # batches as repeatedly reshuffling the data itself.
        order = np.arange(n_samples)

        for epoch in range(epochs):
            order = order[np.random.permutation(n_samples)]

            for start in range(0, n_samples, batch_size):
                batch = order[start : start + batch_size]

                self.forward_prop(X[:, batch])
                self.back_prop(Y[batch])
                self.update_params(learning_rate)

            if log_every and (epoch + 1) % log_every == 0:
                predictions = self.predict(X)
                accuracy = self.accuracy(predictions, Y)
                print(f"Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

    def predict(self, X):
        """Return the index of the highest-scoring class for each column of X."""
        # Inference only: skip caching so the stored training activations
        # are neither overwritten nor duplicated for a large evaluation set.
        return np.argmax(self.forward_prop(X, cache=False), 0)

    def accuracy(self, predictions, Y):
        """Return the fraction of entries in `predictions` that equal `Y`."""
        return np.sum(predictions == Y) / Y.size
        
        
        
    

In [None]:
# Configuration: the values a reader is most likely to tune.
SEED = 42
HIDDEN_LAYERS = [512, 512, 256]
EPOCHS = 20
LEARNING_RATE = 0.01

# Weight initialization and per-epoch shuffling both draw from NumPy's global
# RNG, so seed it to make a Restart & Run All reproduce the same numbers.
np.random.seed(SEED)

# load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Prepare training data: flatten each image into one column and scale the
# pixel values by 1/255.
m_train = X_train.shape[0]
X_train_flat = X_train.reshape(m_train, -1).T / 255.0
Y_train = y_train.astype(int)

# Prepare test data (separate, unseen data)
m_test = X_test.shape[0]
X_test_flat = X_test.reshape(m_test, -1).T / 255.0
Y_test = y_test.astype(int)

# Sanity checks: one label per sample column, same feature count in both splits.
assert X_train_flat.shape[1] == Y_train.size
assert X_test_flat.shape[1] == Y_test.size
assert X_train_flat.shape[0] == X_test_flat.shape[0]

# Input width comes from the data; output width is the number of distinct
# labels, assuming labels are 0..max.
n_classes = int(Y_train.max()) + 1
nn = NeuralNetwork([X_train_flat.shape[0], *HIDDEN_LAYERS, n_classes])

# train and test model
nn.train(X_train_flat, Y_train, epochs=EPOCHS, learning_rate=LEARNING_RATE)
predictions = nn.predict(X_test_flat)
accuracy = nn.accuracy(predictions, Y_test)
print(f"Test Accuracy = {accuracy:.4f}")

Epoch 1: Accuracy = 0.9114
Epoch 2: Accuracy = 0.9326
Epoch 3: Accuracy = 0.9434
Epoch 4: Accuracy = 0.9523
