In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits, load_breast_cancer

In [19]:
class MLP:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, lambda_reg=0.01,
                 epochs=1000, tol=1e-3, patience=10, batch_size=32, dropout_rate=0.5, loss_func = 'mse'):
        """
        Initialize the 2-layer neural network model with dropout and batch normalization.

        Parameters:
            input_size (int): Number of features in the input.
            hidden_size (int): Number of neurons in the hidden layer.
            output_size (int): Number of output neurons (classes).
            learning_rate (float): Step size for gradient updates.
            lambda_reg (float): L2 regularization strength.
            epochs (int): Number of training iterations.
            tol (float): Tolerance for early stopping.
            patience (int): Number of epochs to wait for improvement before stopping.
            batch_size (int): Size of each mini-batch for gradient descent.
            dropout_rate (float): Dropout rate for regularization.
            loss_func (string): Loss function type for model (mse or cross_entropy)
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.epochs = epochs
        self.tol = tol
        self.patience = patience
        self.batch_size = batch_size
        self.dropout_rate = dropout_rate
        self.loss_func = loss_func
        
        # Initialize weights and biases using He initialization
        self.W_1 = np.random.randn(hidden_size, input_size) * np.sqrt(2. / input_size)
        self.W_2 = np.random.randn(output_size, hidden_size) * np.sqrt(2. / hidden_size)
        self.b_1 = np.zeros(hidden_size)
        self.b_2 = np.zeros(output_size)
        
        # Batch Norm parameters
        self.gamma = np.ones(hidden_size)  # Scale parameter for batch normalization
        self.beta = np.zeros(hidden_size)  # Shift parameter for batch normalization
        self.epsilon = 1e-5  # Small value to avoid division by zero
        self.running_mean = np.zeros(hidden_size)
        self.running_var = np.ones(hidden_size)
        
        self.loss_history = []

    def relu(self, x):
        """ReLU activation function."""
        return np.maximum(0, x)
    
    def batch_norm_forward(self, x, training=True):
        """
        Perform batch normalization on input data.
        
        Parameters:
            x (np.array): Input data.
            training (bool): Whether the model is in training mode.
        
        Returns:
            Tuple: Normalized data, mean, and variance.
        """
        if training:
            mean = np.mean(x, axis=0)
            var = np.var(x, axis=0)
            self.running_mean = 0.9 * self.running_mean + 0.1 * mean
            self.running_var = 0.9 * self.running_var + 0.1 * var
        else:
            mean = self.running_mean
            var = self.running_var
        
        x_norm = (x - mean) / np.sqrt(var + self.epsilon)
        return self.gamma * x_norm + self.beta, x_norm, mean, var
    
    def compute_loss(self, X, y, loss_type="mse"):
        """
        Compute the loss function with L2 regularization.

        Parameters:
            X (np.array): Feature matrix.
            y (np.array): Target labels.
            loss_type (str): "mse" for Mean Squared Error, "cross_entropy" for Cross Entropy Loss.

        Returns:
            float: The loss value.
        """
        m = X.shape[0]
        
        # Forward pass
        hidden_input = np.dot(X, self.W_1.T) + self.b_1
        hidden_output, _, _, _ = self.batch_norm_forward(self.relu(hidden_input), training=False)
        output_input = np.dot(hidden_output, self.W_2.T) + self.b_2

        if loss_type == "mse":
            output = output_input  # Regression output
            loss = np.mean((output - y) ** 2)

        elif loss_type == "cross_entropy":
            exp_scores = np.exp(output_input - np.max(output_input, axis=1, keepdims=True))  # Stability fix
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # Softmax
            loss = -np.mean(np.sum(y * np.log(probs + 1e-9), axis=1))  # Cross-Entropy
        
        else:
            raise ValueError("Invalid loss_type. Choose 'mse' or 'cross_entropy'.")

        # Add L2 regularization
        loss += (self.lambda_reg / 2) * (np.sum(self.W_1 ** 2) + np.sum(self.W_2 ** 2))
        
        return loss
    
    def fit(self, X, y, verbose=False):
        """
        Train the neural network using mini-batch gradient descent.
        
        Parameters:
            X (np.array): Feature matrix.
            y (np.array): Target labels.
            verbose (bool): If True, print loss progress.
        """
        m = X.shape[0]
        best_loss = float('inf')
        no_improve_count = 0
        
        for epoch in range(self.epochs):
            indices = np.random.permutation(m)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            
            for i in range(0, m, self.batch_size):
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]
                
                # Forward pass
                hidden_input = np.dot(X_batch, self.W_1.T) + self.b_1
                hidden_output, x_norm, mean, var = self.batch_norm_forward(self.relu(hidden_input))
                
                # Apply dropout during training
                dropout_mask = (np.random.rand(*hidden_output.shape) > self.dropout_rate) / (1.0 - self.dropout_rate)
                hidden_output *= dropout_mask
                
                output_input = np.dot(hidden_output, self.W_2.T) + self.b_2
                output = output_input
                
                # Compute loss
                loss = self.compute_loss(X_batch, y_batch, loss_type='cross_entropy')
                self.loss_history.append(loss)
                
                # Backpropagation
                d_output = 2 * (output - y_batch) / X_batch.shape[0]
                d_W_2 = np.dot(d_output.T, hidden_output) + self.lambda_reg * self.W_2
                d_b_2 = np.sum(d_output, axis=0)
                
                d_hidden = np.dot(d_output, self.W_2) * (hidden_output > 0) * dropout_mask
                d_hidden_norm = d_hidden * self.gamma / np.sqrt(var + self.epsilon)
                
                d_W_1 = np.dot(d_hidden_norm.T, X_batch) + self.lambda_reg * self.W_1
                d_b_1 = np.sum(d_hidden_norm, axis=0)
                
                # Update weights and biases
                self.W_1 -= self.learning_rate * d_W_1
                self.b_1 -= self.learning_rate * d_b_1
                self.W_2 -= self.learning_rate * d_W_2
                self.b_2 -= self.learning_rate * d_b_2
                
                # Update BatchNorm parameters
                self.gamma -= self.learning_rate * np.sum(d_hidden * x_norm, axis=0)
                self.beta -= self.learning_rate * np.sum(d_hidden, axis=0)
            
            if verbose and epoch % 5 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.6f}")
            
            if loss < best_loss - self.tol:
                best_loss = loss
                no_improve_count = 0
            else:
                no_improve_count += 1
                if no_improve_count >= self.patience:
                    print(f"Early stopping at epoch {epoch}. Best loss: {best_loss:.6f}")
                    break

    def predict(self, X):
        """
        Predict the class labels for the given input data.

        Parameters:
            X (np.array): Feature matrix.

        Returns:
            np.array: Predicted class labels (0 or 1).
        """
        hidden_input = np.dot(X, self.W_1.T) + self.b_1
        hidden_output = self.relu(hidden_input)
        output_input = np.dot(hidden_output, self.W_2.T) + self.b_2
        
        # For classification, return the class with the highest score (argmax)
        return np.argmax(output_input, axis=1)  # Use argmax to get the class label

In [26]:
mnist = load_digits()
X = mnist.data
y = mnist.target
y = np.eye(10)[y.astype(int)]
    
# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)
    
# Split data
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=282025
)
    
# Train model
model = MLP(input_size=64, 
            output_size=10, 
            hidden_size=256, 
            learning_rate=0.001, 
            epochs=1000,
            batch_size=32,
            loss_func='cross_entropy') #
model.fit(X_train, y_train)
    
# Evaluate
y_pred = model.predict(X_test)
y_test_labels = np.argmax(y_test, axis=1)
accuracy = np.mean(y_pred == y_test_labels)
print(f"\nTest accuracy: {accuracy:.4f}")

Early stopping at epoch 51. Best loss: 4.104257

Test accuracy: 0.9167
