# Multi-Layer Perceptron from Scratch

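In this notebook, we implement a multi-layer perceptron (MLP) from scratch in NumPy — forward pass, backpropagation, and mini-batch gradient descent — and train it on a 10,000-sample subset of the MNIST handwritten-digit dataset.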

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns

# Seed NumPy's global RNG so anything drawn from np.random in this notebook
# is repeatable from a fresh kernel.
SEED = 42
np.random.seed(SEED)

# Seaborn-flavoured matplotlib theme for every figure below.
plt.style.use('seaborn-v0_8')

## Load and Preprocess Data

In [None]:
# Download MNIST from OpenML as plain arrays (as_frame=False)
print("Loading MNIST dataset...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# Keep only the leading samples so training stays quick; pixel values are
# divided by 255 to land in [0, 1] and the labels are cast to int.
n_samples = 10000
X = mnist.data[:n_samples] / 255.0
y = mnist.target.astype(int)[:n_samples]

# Hold out 20% of the subset for testing (fixed random_state for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Classes: {np.unique(y)}")

In [None]:
# Show the first ten training digits in a 2x5 grid, each titled with its label
fig, axes = plt.subplots(2, 5, figsize=(12, 6))
for ax, pixels, label in zip(axes.flat, X_train, y_train):
    # Each sample is a flat 784-vector; reshape it back to a 28x28 image
    ax.imshow(pixels.reshape(28, 28), cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()

## MLP Implementation

In [None]:
class MLP:
    """Fully connected feed-forward classifier written in plain NumPy.

    Hidden layers use ReLU and the output layer uses softmax. ``backward``
    applies one gradient-descent step for the mean cross-entropy loss of the
    batch that was last passed through ``forward``.

    Attributes:
        layer_sizes: Units per layer, input size first, number of classes last.
        learning_rate: Step size applied by ``backward``.
        weights: One ``(fan_in, fan_out)`` matrix per layer.
        biases: One ``(1, fan_out)`` row vector per layer.
        activations: Layer outputs cached by the latest ``forward`` call
            (element 0 is the input batch itself).
        z_values: Pre-activation values cached by the latest ``forward`` call.
    """

    def __init__(self, layer_sizes, learning_rate=0.01):
        """Create the network and randomly initialise its parameters.

        Args:
            layer_sizes: Sequence of layer widths, e.g. ``[784, 128, 10]``.
            learning_rate: Gradient-descent step size.

        Raises:
            ValueError: If fewer than two layer sizes are given.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input size and an output size"
            )

        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.weights = []
        self.biases = []
        # Filled in by forward(); backward() reads them.
        self.activations = []
        self.z_values = []

        # Initialize weights and biases
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # He initialization: std = sqrt(2 / fan_in), suited to ReLU layers
            w = np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)
            b = np.zeros((1, fan_out))
            self.weights.append(w)
            self.biases.append(b)

    def relu(self, x):
        """Element-wise ``max(0, x)``."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Return 1.0 where ``x > 0`` and 0.0 elsewhere."""
        return (x > 0).astype(float)

    def softmax(self, x):
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted before exponentiating so large logits
        do not overflow; this does not change the result.
        """
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Run a batch through the network and cache intermediates.

        Args:
            X: Array of shape ``(batch, layer_sizes[0])``.

        Returns:
            Array of shape ``(batch, layer_sizes[-1])`` with class
            probabilities (each row sums to 1).
        """
        self.activations = [X]
        self.z_values = []

        current_input = X

        # Forward through hidden layers
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(current_input, w) + b
            self.z_values.append(z)
            current_input = self.relu(z)
            self.activations.append(current_input)

        # Output layer
        z_output = np.dot(current_input, self.weights[-1]) + self.biases[-1]
        self.z_values.append(z_output)
        output = self.softmax(z_output)
        self.activations.append(output)

        return output

    def backward(self, X, y):
        """Backpropagate and apply one gradient-descent step in place.

        Must be called right after ``forward`` on the same batch: gradients
        are computed from the cached ``activations`` / ``z_values``. ``X`` is
        used only for its batch size.

        Args:
            X: The batch that was just passed to ``forward``.
            y: Integer class labels of shape ``(batch,)``.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if not self.activations:
            raise RuntimeError("forward() must be called before backward()")

        m = X.shape[0]

        # One-hot encode labels
        y_onehot = np.eye(self.layer_sizes[-1])[y]

        # Softmax + cross-entropy: the output-layer error is (probabilities - targets)
        dz = self.activations[-1] - y_onehot

        # Backpropagate from the last layer to the first
        for i in range(len(self.weights) - 1, -1, -1):
            # Calculate gradients
            dw = np.dot(self.activations[i].T, dz) / m
            db = np.mean(dz, axis=0, keepdims=True)

            # Propagate the error through this layer's weights BEFORE they are
            # updated; using the already-stepped weights would give earlier
            # layers a gradient that does not belong to the loss just evaluated.
            dz_prev = None
            if i > 0:
                dz_prev = np.dot(dz, self.weights[i].T) * self.relu_derivative(self.z_values[i - 1])

            # Update weights and biases
            self.weights[i] -= self.learning_rate * dw
            self.biases[i] -= self.learning_rate * db

            dz = dz_prev

    def predict(self, X):
        """Return the most probable class index for each row of ``X``."""
        output = self.forward(X)
        return np.argmax(output, axis=1)

    def accuracy(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        predictions = self.predict(X)
        return np.mean(predictions == y)

## Training the MLP

In [None]:
# Create and train MLP
mlp = MLP([784, 128, 64, 10], learning_rate=0.1)

# Training parameters
epochs = 100
batch_size = 32

# Training history (one entry per epoch)
train_losses = []
train_accuracies = []
test_accuracies = []

n_train = len(X_train)

print("Training MLP...")
for epoch in range(epochs):
    # Shuffle training data so every epoch sees different mini-batches
    indices = np.random.permutation(n_train)
    X_shuffled = X_train[indices]
    y_shuffled = y_train[indices]

    # Running SUM of per-sample losses; divided by n_train below so a
    # trailing partial batch is weighted by its real size.
    epoch_loss = 0.0

    # Mini-batch training
    for i in range(0, n_train, batch_size):
        X_batch = X_shuffled[i:i+batch_size]
        y_batch = y_shuffled[i:i+batch_size]

        # Forward and backward pass
        predictions = mlp.forward(X_batch)
        mlp.backward(X_batch, y_batch)

        # Cross-entropy of the pre-update predictions: -log p(true class),
        # summed over the batch. Averaging over every element of a one-hot
        # matrix instead would shrink the value by the number of classes.
        true_class_probs = predictions[np.arange(len(y_batch)), y_batch]
        epoch_loss += -np.sum(np.log(true_class_probs + 1e-15))

    # Calculate metrics
    avg_loss = epoch_loss / n_train
    train_acc = mlp.accuracy(X_train, y_train)
    test_acc = mlp.accuracy(X_test, y_test)

    train_losses.append(avg_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss: {avg_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

print("Training completed!")

## Visualize Training Progress

In [None]:
# Training curves: loss on the left panel, accuracies on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: per-epoch training loss
loss_ax.plot(train_losses, label='Training Loss', linewidth=2)
loss_ax.set(title='Training Loss', xlabel='Epoch', ylabel='Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: train vs. test accuracy on shared axes
accuracy_curves = (
    (train_accuracies, 'Training Accuracy'),
    (test_accuracies, 'Test Accuracy'),
)
for curve, curve_label in accuracy_curves:
    acc_ax.plot(curve, label=curve_label, linewidth=2)
acc_ax.set(title='Model Accuracy', xlabel='Epoch', ylabel='Accuracy')
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Final Test Accuracy: {test_accuracies[-1]:.4f}")

## Model Analysis

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

y_pred = mlp.predict(X_test)

# Pin the label set so the matrix is always 10x10 and its rows/columns line up
# with the tick labels, even if some digit appears in neither y_test nor y_pred.
digit_labels = list(range(10))
cm = confusion_matrix(y_test, y_pred, labels=digit_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=digit_labels, yticklabels=digit_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Show ten randomly drawn test digits with their true and predicted labels
# (uses y_pred computed for the confusion matrix)
fig, axes = plt.subplots(2, 5, figsize=(12, 6))
for ax in axes.flat:
    # Indices are drawn independently, so the same digit can appear twice
    idx = np.random.randint(0, len(X_test))
    true_label, pred_label = y_test[idx], y_pred[idx]
    is_correct = true_label == pred_label

    ax.imshow(X_test[idx].reshape(28, 28), cmap='gray')
    # Green title for a correct prediction, red for a mistake
    ax.set_title(f'True: {true_label}, Pred: {pred_label}',
                 color='green' if is_correct else 'red')
    ax.axis('off')

plt.tight_layout()
plt.show()

## Weight Visualization

In [None]:
# Visualize first layer weights: after the transpose each row holds the
# incoming weights of one hidden neuron, one value per input pixel
first_layer_weights = mlp.weights[0].T  # Shape: (128, 784)

fig, axes = plt.subplots(4, 8, figsize=(16, 8))

# zip stops at the shorter sequence, so at most one neuron is drawn per panel
for i, (ax, neuron_weights) in enumerate(zip(axes.flat, first_layer_weights)):
    # Fixed symmetric colour range so panels are comparable to one another
    ax.imshow(neuron_weights.reshape(28, 28), cmap='RdBu', vmin=-0.5, vmax=0.5)
    ax.set_title(f'Neuron {i}')

# Hide ticks and frames on every panel, including any left empty
for ax in axes.flat:
    ax.axis('off')

plt.suptitle('First Layer Weight Visualization')
plt.tight_layout()
plt.show()

## Comparison with Different Architectures

In [None]:
# Compare different architectures
architectures = {
    'Shallow': [784, 128, 10],
    'Medium': [784, 128, 64, 10],
    'Deep': [784, 256, 128, 64, 32, 10]
}

# Quick training: fewer epochs than the main run
QUICK_EPOCHS = 20


def train_minibatch(model, X, y, epochs, batch_size):
    """Train ``model`` in place with shuffled mini-batch gradient descent.

    Args:
        model: Object exposing ``forward(X_batch)`` and ``backward(X_batch, y_batch)``.
        X: Training inputs, one sample per row.
        y: Integer labels aligned with the rows of ``X``.
        epochs: Number of full passes over the data.
        batch_size: Samples per gradient step (the last batch may be smaller).
    """
    n_samples = len(X)
    for _ in range(epochs):
        # Fresh shuffle each epoch
        order = np.random.permutation(n_samples)
        X_epoch, y_epoch = X[order], y[order]

        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            X_mb, y_mb = X_epoch[start:stop], y_epoch[start:stop]
            # backward() consumes the activations cached by forward()
            model.forward(X_mb)
            model.backward(X_mb, y_mb)


# Architecture name -> test accuracy
results = {}

for name, arch in architectures.items():
    print(f"Training {name} network...")
    model = MLP(arch, learning_rate=0.1)
    train_minibatch(model, X_train, y_train, epochs=QUICK_EPOCHS, batch_size=batch_size)

    results[name] = model.accuracy(X_test, y_test)
    print(f"{name} Test Accuracy: {results[name]:.4f}")

# Plot comparison
names = list(results.keys())
accuracies = list(results.values())

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(names, accuracies, color=['skyblue', 'lightgreen', 'salmon'])
ax.set_title('Architecture Comparison')
ax.set_ylabel('Test Accuracy')
ax.set_ylim(0, 1)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f'{acc:.3f}', ha='center', va='bottom')

ax.grid(True, alpha=0.3)
plt.show()