In [1]:
import numpy as np
from torchvision.datasets import MNIST
import time
from sklearn.metrics import accuracy_score

def download_mnist(is_train: bool):
    """Load one MNIST split as a pair of NumPy arrays.

    The dataset is stored under ``./data`` and fetched if missing
    (``download=True``). Every image is converted to an array, flattened
    to one dimension and divided by 255.0 before being collected.

    Args:
        is_train: Forwarded to ``MNIST(train=...)`` to choose the split.

    Returns:
        ``(images, labels)``: an array with one flattened, rescaled image
        per row, and an array with the matching labels in the same order.
    """
    def _flatten_and_scale(img):
        # Same per-sample transform as before: array -> 1-D -> divide by 255.0
        return np.array(img).flatten() / 255.0

    split = MNIST(root='./data',
                  transform=_flatten_and_scale,
                  download=True,
                  train=is_train)

    # Materialise the dataset once, then separate pixels from labels.
    samples = list(split)
    pixels = [sample_pixels for sample_pixels, _ in samples]
    digits = [sample_digit for _, sample_digit in samples]
    return np.array(pixels), np.array(digits)

# Load the training split (is_train=True) as flattened, rescaled arrays
train_X, train_Y = download_mnist(is_train=True)


In [2]:

# Load the non-training split (is_train=False) with the same preprocessing
test_X, test_Y = download_mnist(is_train=False)

# Convert labels to one-hot encoding
def create_one_hot(labels, num_classes=10):
    """Return a one-hot matrix for a sequence of integer class labels.

    Args:
        labels: Sequence of integer class indices, one per sample.
        num_classes: Number of columns in the result (default 10).

    Returns:
        Float array of shape ``(len(labels), num_classes)`` that is zero
        everywhere except for a 1 at ``[i, labels[i]]`` in each row ``i``.
    """
    n_rows = len(labels)
    encoded = np.zeros((n_rows, num_classes))
    row_index = np.arange(n_rows)
    # Pair every row with its label column and switch that entry on.
    encoded[row_index, labels] = 1
    return encoded

# One-hot encode the label vectors in place of the integer labels.
# The ndim guard keeps this cell idempotent: once a label array has been
# converted it is 2-D, and passing it to create_one_hot again would fail
# (the one-hot matrix is not a valid integer index). This matters when the
# cell is re-run, or re-run without re-running the cell that loads train_Y.
if train_Y.ndim == 1:
    train_Y = create_one_hot(train_Y)
if test_Y.ndim == 1:
    test_Y = create_one_hot(test_Y)

In [10]:

class MLPClassifier:
    """Two-layer perceptron trained with mini-batch gradient descent.

    Architecture: ``input -> dense -> ReLU -> dense -> softmax``.
    Weights are updated in ``backward`` using the gradient ``A2 - y`` at the
    output pre-activation plus an L2 penalty on the weight matrices (biases
    are not regularized).

    Attributes:
        W1, b1: Hidden-layer weights ``(input_size, hidden_size)`` and bias.
        W2, b2: Output-layer weights ``(hidden_size, output_size)`` and bias.
        learning_rate: Step size for every parameter update.
        batch_size: Number of samples per mini-batch in ``train``.
        l2_lambda: L2 regularization coefficient applied to ``W1`` and ``W2``.
    """

    def __init__(
        self,
        input_size: int = 784,
        hidden_size: int = 100,
        output_size: int = 10,
        learning_rate: float = 0.1,
        batch_size: int = 128,
        l2_lambda: float = 0.0001,
    ):
        """Create the network with randomly initialized weights.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            learning_rate: Gradient-descent step size.
            batch_size: Mini-batch size used by ``train``.
            l2_lambda: L2 regularization coefficient (previously a constant
                hard-coded inside ``backward``; the default is unchanged).
        """
        # Initialize weights using He initialization: N(0, 1) * sqrt(2 / fan_in)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0/input_size)
        self.b1 = np.zeros(hidden_size)
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0/hidden_size)
        self.b2 = np.zeros(output_size)

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.l2_lambda = l2_lambda

    def relu(self, x: np.ndarray) -> np.ndarray:
        """Element-wise ``max(0, x)``."""
        return np.maximum(0, x)

    def relu_derivative(self, x: np.ndarray) -> np.ndarray:
        """Element-wise ReLU derivative: 1 where ``x > 0``, otherwise 0."""
        return np.where(x > 0, 1, 0)

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted before exponentiating so that large
        inputs do not overflow; this does not change the result.
        """
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X: np.ndarray, training: bool = True) -> tuple:
        """Run the forward pass.

        Args:
            X: Batch of inputs, one sample per row.
            training: Accepted for API symmetry; currently unused, the
                forward pass is identical in both modes.

        Returns:
            ``(R1, A1, A2)``: hidden pre-activation, hidden activation
            (ReLU), and output class probabilities (softmax).
        """
        # First layer
        R1 = np.dot(X, self.W1) + self.b1
        A1 = self.relu(R1)

        # Output layer
        R2 = np.dot(A1, self.W2) + self.b2
        A2 = self.softmax(R2)

        return R1, A1, A2

    def backward(self, X: np.ndarray, y: np.ndarray, R1: np.ndarray,
                A1: np.ndarray, A2: np.ndarray) -> None:
        """Backpropagate one batch and update all parameters in place.

        Args:
            X: Batch inputs used in the forward pass.
            y: Targets with the same shape as ``A2`` (the ``A2 - y`` gradient
                corresponds to cross-entropy on softmax outputs when each
                target row sums to 1, e.g. one-hot labels).
            R1: Hidden pre-activation returned by ``forward``.
            A1: Hidden activation returned by ``forward``.
            A2: Output probabilities returned by ``forward``.
        """
        m = X.shape[0]

        # Output layer gradients
        dR2 = A2 - y  # derivative of loss with respect to pre-activation
        dW2 = (1/m) * np.dot(A1.T, dR2)
        db2 = (1/m) * np.sum(dR2, axis=0)

        # Hidden layer gradients
        dA1 = np.dot(dR2, self.W2.T)
        dR1 = dA1 * self.relu_derivative(R1)
        dW1 = (1/m) * np.dot(X.T, dR1)
        db1 = (1/m) * np.sum(dR1, axis=0)

        # Update weights with L2 regularization: the penalty contributes
        # l2_lambda * W to each weight gradient; biases get no penalty.
        self.W2 -= self.learning_rate * (dW2 + self.l2_lambda * self.W2)
        self.b2 -= self.learning_rate * db2
        self.W1 -= self.learning_rate * (dW1 + self.l2_lambda * self.W1)
        self.b1 -= self.learning_rate * db1

    def train(self, X_train: np.ndarray, y_train: np.ndarray,
              X_val: np.ndarray, y_val: np.ndarray, epochs: int = 10) -> list:
        """Train for ``epochs`` passes over the data, printing metrics per epoch.

        Args:
            X_train: Training inputs, one sample per row.
            y_train: Training targets, one row per sample; ``argmax`` over
                axis 1 is taken as the true class.
            X_val: Validation inputs.
            y_val: Validation targets in the same format as ``y_train``.
            epochs: Number of passes over the training data.

        Returns:
            List of ``(train_accuracy, val_accuracy)`` tuples, one per epoch.
        """
        start_time = time.time()
        history = []

        # Convert inputs to numpy arrays if they aren't already. asarray
        # avoids copying data that is already an ndarray; this is safe
        # because the arrays are only re-indexed below, never mutated in place.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        X_val = np.asarray(X_val)
        y_val = np.asarray(y_val)

        n_samples = X_train.shape[0]

        for epoch in range(epochs):
            # Shuffle training data (same permutation for inputs and targets)
            shuffle_idx = np.random.permutation(n_samples)
            X_train = X_train[shuffle_idx]
            y_train = y_train[shuffle_idx]

            # Mini-batch training; the last batch may be smaller
            for i in range(0, n_samples, self.batch_size):
                batch_X = X_train[i:i + self.batch_size]
                batch_y = y_train[i:i + self.batch_size]

                # Forward pass
                R1, A1, A2 = self.forward(batch_X, training=True)

                # Backward pass
                self.backward(batch_X, batch_y, R1, A1, A2)

            # Compute training metrics
            _, _, train_predictions = self.forward(X_train, training=False)
            train_accuracy = accuracy_score(np.argmax(y_train, axis=1),
                                         np.argmax(train_predictions, axis=1))

            # Compute validation metrics
            _, _, val_predictions = self.forward(X_val, training=False)
            val_accuracy = accuracy_score(np.argmax(y_val, axis=1),
                                       np.argmax(val_predictions, axis=1))

            history.append((train_accuracy, val_accuracy))

            print(f"Epoch {epoch+1}/{epochs}")
            print(f"Training Accuracy: {train_accuracy:.4f}")
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            print(f"Time elapsed: {time.time() - start_time:.2f}s")
            print("-" * 50)

        return history

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the softmax class probabilities for ``X``.

        Args:
            X: Inputs, one sample per row.

        Returns:
            Array with one row of class probabilities per sample; take
            ``argmax`` over axis 1 to obtain class indices.
        """
        # forward returns exactly three values (R1, A1, A2); only the
        # output probabilities are needed here.
        _, _, predictions = self.forward(X, training=False)
        return predictions


# Seed NumPy's global RNG so the run is reproducible: MLPClassifier draws its
# initial weights with np.random.randn and shuffles with np.random.permutation,
# both of which read this global state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Initialize the model
mlp = MLPClassifier(
    input_size=784,
    hidden_size=100,
    output_size=10,
    learning_rate=0.1,
    batch_size=128
)

# Train the model
# NOTE(review): the test split is passed as the validation set here, so the
# per-epoch "Validation Accuracy" and the final test accuracy are measured on
# the same data.
history = mlp.train(train_X, train_Y, test_X, test_Y, epochs=100)

# Make predictions: predict returns class probabilities, argmax picks the class
final_predictions = mlp.predict(test_X)
final_accuracy = accuracy_score(np.argmax(test_Y, axis=1),
                              np.argmax(final_predictions, axis=1))
print(f"Final Test Accuracy: {final_accuracy:.4f}")

Epoch 1/100
Training Accuracy: 0.9172
Validation Accuracy: 0.9205
Time elapsed: 1.72s
--------------------------------------------------
Epoch 2/100
Training Accuracy: 0.9344
Validation Accuracy: 0.9344
Time elapsed: 3.19s
--------------------------------------------------
Epoch 3/100
Training Accuracy: 0.9457
Validation Accuracy: 0.9443
Time elapsed: 4.77s
--------------------------------------------------
Epoch 4/100
Training Accuracy: 0.9527
Validation Accuracy: 0.9501
Time elapsed: 6.28s
--------------------------------------------------
Epoch 5/100
Training Accuracy: 0.9588
Validation Accuracy: 0.9547
Time elapsed: 7.91s
--------------------------------------------------
Epoch 6/100
Training Accuracy: 0.9631
Validation Accuracy: 0.9596
Time elapsed: 9.34s
--------------------------------------------------
Epoch 7/100
Training Accuracy: 0.9680
Validation Accuracy: 0.9629
Time elapsed: 10.84s
--------------------------------------------------
Epoch 8/100
Training Accuracy: 0.9702
Va

KeyboardInterrupt: 

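Gradient of the cross-entropy loss $L = -\sum_i y_i \log a_i$ with respect to the output pre-activation $z_k$, where $a = \mathrm{softmax}(z)$ and $\partial a_i / \partial z_k = a_i(\delta_{ik} - a_k)$:

$$
\begin{aligned}
\frac{\partial L}{\partial z_k}
  &= -\sum_i \frac{y_i}{a_i}\, a_i(\delta_{ik} - a_k) \\
  &= -\sum_i y_i(\delta_{ik} - a_k) \\
  &= -y_k + a_k \sum_i y_i \\
  &= a_k - y_k \quad \text{(since } \textstyle\sum_i y_i = 1 \text{ for one-hot encoded labels)}
\end{aligned}
$$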