In [None]:
# Import necessary libraries for building and training the neural network
import torch  # PyTorch for tensor operations and neural network computations
import matplotlib.pyplot as plt  # For creating visualizations and plots
from sklearn.datasets import make_moons  # Generate non-linearly separable dataset
from sklearn.model_selection import train_test_split  # Split data into train/test sets
from sklearn.preprocessing import StandardScaler  # Normalize features to have mean=0, std=1

In [None]:
# Build the two-moons toy problem: 10,000 noisy samples arranged as two
# interleaving half circles. No straight line separates the two classes, so
# the classifier has to learn a non-linear decision boundary.
X, y = make_moons(n_samples=10000, noise=0.2, random_state=42)

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=420
)

# Standardize both features to zero mean / unit variance. The statistics are
# estimated on the training split only and then re-used for the test split,
# so no information from the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Move everything into float32 PyTorch tensors.
# Features keep their (n_samples, 2) layout.
X_train, X_test = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
# Labels become (n_samples, 1) column vectors so they line up with the
# network's single-output predictions.
y_train, y_test = (
    torch.tensor(labels, dtype=torch.float32).reshape(-1, 1)
    for labels in (y_train, y_test)
)

In [None]:
# Show the training and testing splits next to each other so the data
# distribution and the quality of the split can be checked by eye.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Both panels share the same styling:
# - points are colored by class label (viridis colormap)
# - edgecolors='k' draws a black outline so overlapping points stay visible
# - alpha=0.7 adds slight transparency
for panel_ax, panel_X, panel_y, panel_title in (
    (ax1, X_train, y_train, 'Training Dataset'),
    (ax2, X_test, y_test, 'Testing Dataset'),
):
    panel_ax.scatter(
        panel_X[:, 0],
        panel_X[:, 1],
        c=panel_y.squeeze(),
        cmap='viridis',
        edgecolors='k',
        alpha=0.7,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Feature 1')
    panel_ax.set_ylabel('Feature 2')

plt.tight_layout()  # Tidy up the spacing between the two panels
plt.savefig('../figures/moons_dataset.png', dpi=300, bbox_inches='tight')  # Persist the figure to disk
plt.show()

In [None]:
class SimpleMLP:
    """
    A simple Multi-Layer Perceptron (MLP) neural network with one hidden layer.

    This implementation uses manual backpropagation to update weights and biases,
    demonstrating the fundamental mechanics of neural network training without
    relying on automatic differentiation frameworks.

    Architecture:
        Input Layer -> Hidden Layer (sigmoid) -> Output Layer (sigmoid)

    Loss:
        The manual gradients in ``backward`` are the exact gradients of the
        half mean squared error L = (1 / 2m) * Σ(output - y)^2, where m is the
        batch size. ``train`` reports the plain MSE (1 / m) * Σ(output - y)^2
        when output_size is 1, i.e. twice that value; the constant factor only
        rescales the effective learning rate.

    Attributes:
        W1 (torch.Tensor): Weight matrix for input to hidden layer (input_size x hidden_size)
        b1 (torch.Tensor): Bias vector for hidden layer (1 x hidden_size)
        W2 (torch.Tensor): Weight matrix for hidden to output layer (hidden_size x output_size)
        b2 (torch.Tensor): Bias vector for output layer (1 x output_size)
        z1 (torch.Tensor): Pre-activation values for hidden layer (set by forward)
        a1 (torch.Tensor): Activated values for hidden layer (set by forward)
        z2 (torch.Tensor): Pre-activation values for output layer (set by forward)
        a2 (torch.Tensor): Activated values for output layer, i.e. the predictions (set by forward)
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        Initialize the MLP with random weights and biases.

        Args:
            input_size (int): Number of input features
            hidden_size (int): Number of neurons in the hidden layer
            output_size (int): Number of output neurons (typically 1 for binary classification)
        """
        # Initialize all parameters from a standard normal distribution.
        # requires_grad=True is not needed for training (gradients are computed
        # by hand in backward), but it allows the manual gradients to be
        # checked against torch.autograd.
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=True)
        self.b1 = torch.randn(1, hidden_size, requires_grad=True)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=True)
        self.b2 = torch.randn(1, output_size, requires_grad=True)

    def forward(self, X):
        """
        Perform a forward pass through the network.

        This method computes the network's predictions by:
        1. Computing hidden layer pre-activations: z1 = X @ W1 + b1
        2. Applying sigmoid activation: a1 = sigmoid(z1)
        3. Computing output layer pre-activations: z2 = a1 @ W2 + b2
        4. Applying sigmoid activation: a2 = sigmoid(z2)

        The intermediate values z1, a1, z2 and a2 are stored on the instance
        because backward() needs a1 to compute the gradients.

        Args:
            X (torch.Tensor): Input data of shape (batch_size, input_size)

        Returns:
            torch.Tensor: Network predictions of shape (batch_size, output_size)
                         Values are in range [0, 1] due to sigmoid activation
        """
        # Hidden layer: affine transformation followed by sigmoid activation
        self.z1 = torch.matmul(X, self.W1) + self.b1

        # σ(z) = 1 / (1 + e^(-z)) introduces the non-linearity that lets the
        # network represent a curved decision boundary
        self.a1 = torch.sigmoid(self.z1)

        # Output layer: affine transformation of the hidden activations
        self.z2 = torch.matmul(self.a1, self.W2) + self.b2

        # Squash to [0, 1]; for binary classification this is read as P(class=1)
        self.a2 = torch.sigmoid(self.z2)

        return self.a2

    def backward(self, X, y, output, lr=0.01):
        """
        Perform backpropagation to compute gradients and update weights.

        This method implements the backpropagation algorithm manually:
        1. Computes gradients for output layer (dW2, db2)
        2. Propagates error back to hidden layer
        3. Computes gradients for hidden layer (dW1, db1)
        4. Updates all weights and biases using gradient descent

        The gradients are those of the half mean squared error
        L = (1 / 2m) * Σ(output - y)^2 with sigmoid activations in both layers.
        Every parameter gradient is averaged over the batch, so the step size
        does not depend on the number of examples.

        Must be called after forward(X) on the same batch: it reads the hidden
        activations cached in self.a1, and ``output`` is expected to be the
        value forward returned for X.

        Args:
            X (torch.Tensor): Input data of shape (batch_size, input_size)
            y (torch.Tensor): True labels of shape (batch_size, output_size)
            output (torch.Tensor): Network predictions from forward pass
            lr (float): Learning rate for gradient descent (default: 0.01)
        """
        m = X.shape[0]  # Number of training examples in the batch

        # All gradients are computed by hand, so autograd tracking is disabled
        # for the whole computation (not only for the in-place updates).
        with torch.no_grad():
            # ===== BACKPROPAGATION THROUGH OUTPUT LAYER =====
            # dL/da2 = (output - y) / m ; the 1/m factor is applied below when
            # the per-example terms are reduced over the batch.
            # dL/dz2 = dL/da2 * σ'(z2), with σ'(z2) = a2 * (1 - a2).
            dz2 = (output - y) * output * (1 - output)

            # dL/dW2 = a1^T @ dz2 / m (averaged over the batch, like every
            # other parameter gradient)
            dW2 = torch.matmul(self.a1.T, dz2) / m

            # dL/db2 = batch average of dz2; keepdim keeps the (1, output_size)
            # shape of b2
            db2 = torch.sum(dz2, dim=0, keepdim=True) / m

            # ===== BACKPROPAGATION THROUGH HIDDEN LAYER =====
            # Chain rule through the output weights: dL/da1 = dz2 @ W2^T
            # (uses W2 before it is updated below)
            da1 = torch.matmul(dz2, self.W2.T)

            # dL/dz1 = dL/da1 * σ'(z1), with σ'(z1) = a1 * (1 - a1)
            dz1 = da1 * self.a1 * (1 - self.a1)

            # dL/dW1 = X^T @ dz1 / m
            dW1 = torch.matmul(X.T, dz1) / m

            # dL/db1 = batch average of dz1, shaped like b1
            db1 = torch.sum(dz1, dim=0, keepdim=True) / m

            # ===== GRADIENT DESCENT UPDATE =====
            # θ_new = θ_old - learning_rate * gradient
            self.W1 -= lr * dW1  # Update input-to-hidden weights
            self.b1 -= lr * db1  # Update hidden layer biases
            self.W2 -= lr * dW2  # Update hidden-to-output weights
            self.b2 -= lr * db2  # Update output layer biases

    def train(self, X, y, epochs=1000, lr=0.01):
        """
        Train the neural network using full-batch gradient descent.

        Each epoch performs:
        1. Forward pass to compute predictions
        2. Mean Squared Error (MSE) computation, recorded for monitoring
        3. Backward pass to update weights
        4. A progress print every 1000 epochs

        Args:
            X (torch.Tensor): Training input data of shape (batch_size, input_size)
            y (torch.Tensor): Training labels of shape (batch_size, output_size)
            epochs (int): Number of training iterations (default: 1000)
            lr (float): Learning rate for gradient descent (default: 0.01)

        Returns:
            list: Loss value (Python float) for each epoch, measured before that
                  epoch's weight update; useful for plotting training progress
        """
        losses = []  # Loss history for visualization

        for epoch in range(epochs):
            # Forward pass: compute predictions on the whole batch
            output = self.forward(X)

            # MSE = mean((prediction - actual)^2); lower is better
            loss = torch.mean((output - y) ** 2)
            losses.append(loss.item())  # Store as a plain Python float

            # Backward pass: compute gradients and update weights
            self.backward(X, y, output, lr)

            # Print progress every 1000 epochs to monitor training
            if (epoch + 1) % 1000 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

        return losses

In [None]:
# Define the neural network architecture
input_size = 2  # Two features (x and y coordinates from the moons dataset)
hidden_size = 4  # Four neurons in the hidden layer
output_size = 1  # Single output for binary classification (probability of class 1)

# Seed PyTorch's global RNG so the random weight initialization (torch.randn
# inside SimpleMLP.__init__) is identical on every Restart & Run All.
# The data generation and split above are already seeded via random_state.
torch.manual_seed(42)

# Create an instance of the SimpleMLP model
model = SimpleMLP(input_size, hidden_size, output_size)

# Train the model on the training data
# epochs=5000: run 5,000 full-batch iterations over the entire training set
# lr=0.1: use a relatively high learning rate for faster convergence
# `losses` holds one loss value per epoch and is plotted in the next cell
losses = model.train(X_train, y_train, epochs=5000, lr=0.1)

In [None]:
# Plot the per-epoch training loss to check that learning is progressing:
# a falling curve means the model is fitting the training data better.
loss_fig, loss_ax = plt.subplots()

loss_ax.plot(losses)  # One loss value per epoch
loss_ax.set_xlabel("Epoch")  # X-axis: training iteration number
loss_ax.set_ylabel("Loss")  # Y-axis: Mean Squared Error loss value

# A logarithmic y-axis keeps late, small improvements visible instead of
# flattening them against the large initial loss.
loss_ax.set_yscale("log")

# Fixed, human-readable tick marks on the log axis
loss_ax.set_yticks([0.01, 0.1, 1])
loss_ax.set_yticklabels(['0.01', '0.1', '1'])

# Faint grid to make values easier to read off
loss_ax.grid(True, alpha=0.3)
plt.savefig('../figures/training_loss.png', dpi=300, bbox_inches='tight')  # Persist the figure to disk
plt.show()

In [None]:
# Measure how well the trained model generalizes by scoring it on the
# held-out test set.

# Inference only: no gradients are needed, so tracking is switched off.
with torch.no_grad():
    # Predicted probabilities for every test example
    test_probs = model.forward(X_test)

    # Hard class labels via a 0.5 threshold:
    # probability > 0.5 -> class 1, otherwise class 0
    test_output = (test_probs > 0.5).float()

# Accuracy = fraction of test examples whose predicted label equals the
# true label
accuracy = (test_output == y_test).float().mean()

# Report the accuracy as a percentage
print(f"Test Accuracy: {accuracy.item() * 100:.2f}%")

In [None]:
# Draw the decision boundary the MLP has learned, i.e. how the model splits
# the feature space between the two classes.

# Bounding box of the test points, padded by 0.5 on every side so the
# boundary is visible beyond the outermost samples.
h = 0.02  # Grid step (smaller = higher resolution, but slower)
x_min, x_max = X_test[:, 0].min() - 0.5, X_test[:, 0].max() + 0.5
y_min, y_max = X_test[:, 1].min() - 0.5, X_test[:, 1].max() + 0.5

# Regular 2D grid covering that box; every grid point gets classified to
# reveal the decision regions.
xx, yy = torch.meshgrid(
    torch.arange(x_min, x_max, h),
    torch.arange(y_min, y_max, h),
    indexing='ij',
)

# Flatten the grid into an (n_points, 2) batch of (x, y) coordinates that
# the model can process in one forward pass.
grid_points = torch.stack([xx.ravel(), yy.ravel()], dim=1)

# Predicted probability for every grid point, folded back into the grid's
# 2D shape for contour plotting.
with torch.no_grad():
    Z = model.forward(grid_points).reshape(xx.shape)

# Two panels over the same decision surface:
#   left  - test points colored by their PREDICTED labels
#   right - test points colored by their TRUE labels
# Differences between the panels mark the model's mistakes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

for panel_ax, panel_labels, panel_title in (
    (ax1, test_output, 'MLP Labels'),
    (ax2, y_test, 'Ground Truth'),
):
    # Filled contours of the predicted probability as colored background
    # (darker = closer to class 0, lighter = closer to class 1)
    panel_ax.contourf(xx, yy, Z, levels=20, cmap='viridis', alpha=0.6)

    # The decision boundary itself: the probability = 0.5 level set, in red
    panel_ax.contour(xx, yy, Z, levels=[0.5], colors='red', linewidths=2)

    # Test points on top of the decision regions
    panel_ax.scatter(
        X_test[:, 0],
        X_test[:, 1],
        c=panel_labels.squeeze(),
        cmap='viridis',
        edgecolors='k',
        alpha=0.8,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Feature 1')
    panel_ax.set_ylabel('Feature 2')

plt.tight_layout()  # Tidy up the spacing between the two panels
plt.savefig('../figures/decision_boundary.png', dpi=300, bbox_inches='tight')  # Persist the figure to disk
plt.show()