In [2]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Seed NumPy's global random generator so that every run draws the same
# sequence of random numbers (and therefore the same initial weights).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Activation functions
def relu(x):
    """Apply the rectified linear unit element-wise.

    Every entry of ``x`` that is below zero is replaced by zero; all other
    entries pass through unchanged.
    """
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return the element-wise derivative of ReLU evaluated at ``x``.

    The result holds 1 where ``x`` is strictly positive and 0 everywhere
    else (including exactly at zero).
    """
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def softmax(x):
    """Compute softmax along axis 0, i.e. independently for every column.

    The per-column maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    column_peak = np.max(x, axis=0, keepdims=True)
    unnormalized = np.exp(x - column_peak)
    column_total = np.sum(unnormalized, axis=0, keepdims=True)
    return unnormalized / column_total

# Loss function: Categorical cross-entropy
def categorical_cross_entropy(y_true, y_pred):
    """Return the categorical cross-entropy averaged over ``y_true.shape[1]``.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` before the logarithm is
    taken so that a predicted probability of exactly 0 cannot yield ``-inf``.
    """
    eps = 1e-15
    safe_pred = np.clip(y_pred, eps, 1 - eps)
    sample_count = y_true.shape[1]
    negative_log_likelihood = -np.sum(y_true * np.log(safe_pred))
    return negative_log_likelihood / sample_count

# Derivative of categorical cross-entropy w.r.t. softmax output
def cce_derivative(y_true, y_pred):
    """Return ``y_pred - y_true``.

    This is the gradient of categorical cross-entropy with respect to the
    pre-softmax inputs when the loss is applied to a softmax output.
    """
    residual = y_pred - y_true
    return residual

# Neural Network class
class NeuralNetwork:
    """Fully-connected feed-forward classifier trained by mini-batch gradient descent.

    Every layer except the last applies ReLU; the last layer applies softmax.
    Public methods take and return data row-wise (one sample per row), while
    the cached activations ``self.A`` and pre-activations ``self.Z`` are
    stored column-wise (one sample per column).
    """

    def __init__(self, layer_sizes):
        """Build the network and print a short summary.

        Args:
            layer_sizes: Sequence of layer widths, input layer first and
                output layer last. At least two entries are required.

        Raises:
            ValueError: If fewer than two layer sizes are given.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )
        self.layer_sizes = layer_sizes
        self.weights = []
        self.biases = []
        self.initialize_parameters()

        # Report the real number of hidden layers instead of a fixed count.
        hidden_count = len(self.layer_sizes) - 2
        plural = "" if hidden_count == 1 else "s"
        print(f"Network architecture: {self.layer_sizes} ({hidden_count} hidden layer{plural})")
        print(f"Sample weights (first layer, first 5 values): {self.weights[0].flatten()[:5]}")

    def initialize_parameters(self):
        """(Re)create all weights and biases.

        Weights are drawn from a standard normal distribution scaled by
        ``sqrt(2 / fan_in)`` (He initialization); biases start at zero.
        Existing parameters are discarded first, so calling this method
        again resets the network rather than appending extra layers.
        """
        self.weights = []
        self.biases = []
        for fan_in, fan_out in zip(self.layer_sizes[:-1], self.layer_sizes[1:]):
            scale = np.sqrt(2.0 / fan_in)
            self.weights.append(np.random.randn(fan_out, fan_in) * scale)
            self.biases.append(np.zeros((fan_out, 1)))

    def forward_propagation(self, X):
        """Run a forward pass and cache the intermediate values.

        Args:
            X: Input array with one sample per row.

        Returns:
            The softmax output with one sample per row.

        Side effects:
            ``self.A`` is set to the list of activations (starting with
            ``X.T``) and ``self.Z`` to the list of pre-activations; both are
            read by ``backpropagation``.
        """
        activation = X.T
        self.A = [activation]
        self.Z = []

        last = len(self.weights) - 1
        for idx, (W, b) in enumerate(zip(self.weights, self.biases)):
            Z = np.dot(W, activation) + b
            # Softmax on the output layer, ReLU on every hidden layer.
            activation = softmax(Z) if idx == last else relu(Z)
            self.Z.append(Z)
            self.A.append(activation)

        return activation.T

    def backpropagation(self, X, y, y_pred, learning_rate):
        """Compute gradients for one batch and apply a gradient-descent step.

        Must be called after ``forward_propagation`` on the same ``X``,
        because it reads the cached ``self.A`` and ``self.Z``.

        Args:
            X: Batch inputs, one sample per row (only its row count is used).
            y: One-hot targets, one sample per row.
            y_pred: Network output for ``X``, one sample per row.
            learning_rate: Step size applied to every gradient.
        """
        m = X.shape[0]
        n_layers = len(self.weights)

        # For softmax + cross-entropy the output-layer gradient w.r.t. the
        # pre-activation is simply (prediction - target).
        dZ = cce_derivative(y.T, y_pred.T)

        dW = [None] * n_layers
        db = [None] * n_layers
        for i in reversed(range(n_layers)):
            if i < n_layers - 1:
                dZ = dA * relu_derivative(self.Z[i])

            # self.A[i] is the input that fed layer i.
            dW[i] = np.dot(dZ, self.A[i].T) / m
            db[i] = np.sum(dZ, axis=1, keepdims=True) / m
            if i > 0:
                # Propagate through the not-yet-updated weights.
                dA = np.dot(self.weights[i].T, dZ)

        # Parameters are only updated once all gradients are known.
        for i in range(n_layers):
            self.weights[i] -= learning_rate * dW[i]
            self.biases[i] -= learning_rate * db[i]

    def predict(self, X):
        """Return the index of the highest-scoring class for every row of ``X``."""
        y_pred = self.forward_propagation(X)
        return np.argmax(y_pred, axis=1)

    def train(self, X, y, epochs, learning_rate, batch_size=128):
        """Train with mini-batch gradient descent, printing metrics per epoch.

        Batches are taken in the order the data is given (no shuffling).
        After each epoch the loss and accuracy over the full ``X`` are
        printed.

        Args:
            X: Training inputs, one sample per row.
            y: One-hot training targets, one sample per row.
            epochs: Number of passes over the data.
            learning_rate: Step size for each update.
            batch_size: Number of samples per update; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        m = X.shape[0]
        print("Starting training...")
        # NOTE: any error raised during training is reported and swallowed,
        # so the caller continues with whatever parameters were learned.
        try:
            for epoch in range(epochs):
                print(f"Starting epoch {epoch}")
                # Mini-batch gradient descent
                for start in range(0, m, batch_size):
                    X_batch = X[start:start + batch_size]
                    y_batch = y[start:start + batch_size]
                    y_pred = self.forward_propagation(X_batch)
                    self.backpropagation(X_batch, y_batch, y_pred, learning_rate)

                # Compute loss and training accuracy on the full data set
                y_pred = self.forward_propagation(X)
                loss = categorical_cross_entropy(y.T, y_pred.T)
                train_accuracy = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y, axis=1))
                print(f"Epoch {epoch}, Loss: {loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        except Exception as e:
            print(f"Training interrupted: {e}")

# Fetch the MNIST train/test split.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Turn every image into a single flat row and divide by 255.0 to rescale
# the pixel values.
X_train = X_train.reshape(len(X_train), -1) / 255.0
X_test = X_test.reshape(len(X_test), -1) / 255.0

# Convert the integer labels into one-hot rows, one column per class.
NUM_CLASSES = 10
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

# Architecture: 784 inputs, four hidden layers (256, 128, 64, 32), 10 outputs.
LAYER_SIZES = [784, 256, 128, 64, 32, 10]
nn = NeuralNetwork(LAYER_SIZES)

# Run 10 epochs of mini-batch gradient descent.
nn.train(X_train, y_train, epochs=10, learning_rate=0.01, batch_size=128)

# Measure accuracy on the held-out test split.
true_labels = np.argmax(y_test, axis=1)
y_pred = nn.predict(X_test)
test_accuracy = np.mean(y_pred == true_labels)
print(f"Test Accuracy: {test_accuracy:.4f}")

Network architecture: [784, 256, 128, 64, 32, 10] (4 hidden layers)
Sample weights (first layer, first 5 values): [0.0890981  0.02021099 0.04943373 0.1131822  0.09432592]
Starting training...
Starting epoch 0
Epoch 0, Loss: 0.4567, Train Accuracy: 0.8707
Starting epoch 1
Epoch 1, Loss: 0.3238, Train Accuracy: 0.9062
Starting epoch 2
Epoch 2, Loss: 0.2745, Train Accuracy: 0.9197
Starting epoch 3
Epoch 3, Loss: 0.2440, Train Accuracy: 0.9288
Starting epoch 4
Epoch 4, Loss: 0.2215, Train Accuracy: 0.9355
Starting epoch 5
Epoch 5, Loss: 0.2036, Train Accuracy: 0.9404
Starting epoch 6
Epoch 6, Loss: 0.1882, Train Accuracy: 0.9447
Starting epoch 7
Epoch 7, Loss: 0.1743, Train Accuracy: 0.9489
Starting epoch 8
Epoch 8, Loss: 0.1626, Train Accuracy: 0.9524
Starting epoch 9
Epoch 9, Loss: 0.1524, Train Accuracy: 0.9554
Test Accuracy: 0.9507
