In [3]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
def softmax(z):
    """
    Compute softmax activation.
    
    Parameters:
        z (np.array): Linear combination input of shape (n_samples, n_classes)
            
    Returns:
        np.array: Softmax probabilities
    """
    # Subtract max for numerical stability
    exp = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp / np.sum(exp, axis=1, keepdims=True)

class SoftmaxRegressionImplement:
    def __init__(self, learning_rate=0.01, epochs=100, batch_size=32, tol=1e-3, patience=10):
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.tol = tol
        self.patience = patience
        self.weights = None
        self.loss_history = []
        self.n_classes = None

    def compute_loss(self, y, y_pred):
        """
        Compute categorical cross-entropy loss.

        Parameters:
            y (np.array): True labels (one-hot encoded)
            y_pred (np.array): Predicted probabilities

        Returns:
            float: The categorical cross-entropy loss
        """
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(np.sum(y * np.log(y_pred), axis=1))

    def to_one_hot(self, y):
        """Convert label vector to one-hot encoded matrix."""
        one_hot = np.zeros((y.shape[0], self.n_classes))
        one_hot[np.arange(y.shape[0]), y] = 1
        return one_hot

    def fit(self, X, y, verbose=False):
        """
        Train the softmax regression model using Mini-batch SGD.
        
        Parameters:
            X (np.array): Feature matrix
            y (np.array): Target vector (class labels)
            verbose (bool): If True, print loss progress
        """
        # Get number of classes
        self.n_classes = len(np.unique(y))
        
        # Convert y to one-hot encoding
        y_one_hot = self.to_one_hot(y)
        
        # Add bias term
        X_b = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        n_samples, n_features = X_b.shape

        # Initialize weights for all classes
        self.weights = np.random.randn(n_features, self.n_classes) * 0.01

        # Early stopping variables
        best_loss = float("inf")
        no_improve_count = 0

        # Training loop
        for epoch in range(self.epochs):
            # Shuffle data
            indices = np.random.permutation(n_samples)
            X_shuffled = X_b[indices]
            y_shuffled = y_one_hot[indices]

            # Process mini-batches
            for i in range(0, n_samples, self.batch_size):
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]

                # Forward pass
                z = np.dot(X_batch, self.weights)
                y_pred = softmax(z)

                # Compute gradient
                error = y_pred - y_batch
                grad = np.dot(X_batch.T, error) / len(y_batch)

                # Update weights
                self.weights -= self.lr * grad

            # Compute loss over full dataset
            z_all = np.dot(X_b, self.weights)
            y_pred_all = softmax(z_all)
            loss = self.compute_loss(y_one_hot, y_pred_all)
            self.loss_history.append(loss)

            if verbose and epoch % 5 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.6f}")

            # Early stopping check
            if loss < best_loss - self.tol:
                best_loss = loss
                no_improve_count = 0
            else:
                no_improve_count += 1
                if no_improve_count >= self.patience:
                    print(f"Early stopping at epoch {epoch}. Best loss: {best_loss:.6f}")
                    break

        return self

    def predict_proba(self, X):
        """Predict class probabilities."""
        X_b = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        z = np.dot(X_b, self.weights)
        return softmax(z)

    def predict(self, X):
        """Predict class labels."""
        return np.argmax(self.predict_proba(X), axis=1)

In [6]:
iris = load_iris()
X, y = iris.data, iris.target
    
# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)
    
# Split data
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
)
    
# Train model
model = SoftmaxRegressionImplement(
    learning_rate=0.1, 
    epochs=100, 
    batch_size=32, 
    tol=1e-4, 
    patience=10
)
model.fit(X_train, y_train, verbose=True)
    
# Evaluate
y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"\nTest accuracy: {accuracy:.4f}")

Epoch 0: Loss = 0.820129
Epoch 5: Loss = 0.501041
Epoch 10: Loss = 0.424129
Epoch 15: Loss = 0.383676
Epoch 20: Loss = 0.356190
Epoch 25: Loss = 0.335218
Epoch 30: Loss = 0.317913
Epoch 35: Loss = 0.302942
Epoch 40: Loss = 0.289778
Epoch 45: Loss = 0.278012
Epoch 50: Loss = 0.267512
Epoch 55: Loss = 0.258033
Epoch 60: Loss = 0.249207
Epoch 65: Loss = 0.241185
Epoch 70: Loss = 0.233779
Epoch 75: Loss = 0.226923
Epoch 80: Loss = 0.220632
Epoch 85: Loss = 0.214768
Epoch 90: Loss = 0.209310
Epoch 95: Loss = 0.204215

Test accuracy: 1.0000
