# Softmax Regression

$$
\text{Let the model be defined as:} \quad z_i = \mathbf{w}_i^\top \mathbf{x} + b_i
$$

$$
\text{Softmax function:} \quad \text{Softmax}(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{K} e^{z_j}}
$$

$$
\text{Loss function (Cross-Entropy Loss):} \quad L = -\sum_{i=1}^{K} y_i \log(\text{Softmax}(z_i))
$$

$$
\text{Gradient of the loss with respect to } z_i \text{ (using } \sum_{k=1}^{K} y_k = 1 \text{ for one-hot labels):}
$$
$$
\frac{\partial L}{\partial z_i} = \text{Softmax}(z_i) - y_i
$$

$$
\text{Gradient with respect to the weight } \mathbf{w}_i:
$$
$$
\frac{\partial L}{\partial \mathbf{w}_i} = \frac{\partial L}{\partial z_i} \cdot \frac{\partial z_i}{\partial \mathbf{w}_i} = (\text{Softmax}(z_i) - y_i) \cdot \mathbf{x}
$$

$$
\text{Gradient with respect to the bias } b_i:
$$
$$
\frac{\partial L}{\partial b_i} = \frac{\partial L}{\partial z_i} \cdot \frac{\partial z_i}{\partial b_i} = \text{Softmax}(z_i) - y_i
$$

Summary of Gradients
$$
\boxed{
\begin{aligned}
\frac{\partial L}{\partial \mathbf{w}_i} &= (\text{Softmax}(z_i) - y_i) \mathbf{x} \\
\frac{\partial L}{\partial b_i} &= \text{Softmax}(z_i) - y_i
\end{aligned}
}
$$


$$
\text{Softmax}(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{K} e^{z_j}}
$$

$$
\frac{\partial \text{Softmax}(z_i)}{\partial z_j} =
\begin{cases}
\text{Softmax}(z_i) \left(1 - \text{Softmax}(z_i)\right) & \text{if } i = j \\
- \text{Softmax}(z_i) \text{Softmax}(z_j) & \text{otherwise}
\end{cases}
$$

$$
L = -\sum_{i=1}^{K} y_i \log(\text{Softmax}(z_i))
$$


In [None]:
import numpy as np

def softmax(z):
    """
    Row-wise softmax: turn each row of logits into a probability distribution.

    Computes exp(z) / sum(exp(z)) independently for every row.

    Args:
        z (np.ndarray): Logits of shape (n_samples, n_classes).

    Returns:
        np.ndarray: Probabilities of shape (n_samples, n_classes); each row sums to 1.
    """
    # Shifting every row by its own maximum leaves the result unchanged
    # mathematically but keeps exp() from overflowing on large logits.
    row_max = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(np.subtract(z, row_max))
    # keepdims=True keeps the (n_samples, 1) shape so the division broadcasts per row.
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators


class CCE_Loss:
    """
    Categorical Cross Entropy Loss

    loss = mean over samples of -sum(y_true * log(y_pred))

    Calling the instance computes the loss and caches its inputs;
    get_loss_grad() then returns the gradient for that same batch.
    """

    def __call__(self, y_true, y_pred):
        """
        Compute the mean cross-entropy loss and cache the inputs for the backward pass.

        Args:
            y_true (np.ndarray): Target distribution (e.g. one-hot labels),
                shape (n_samples, n_classes).
            y_pred (np.ndarray): Predicted probabilities, shape (n_samples, n_classes).

        Returns:
            float: Cross-entropy summed over classes and averaged over samples.
        """
        # Accept array-likes so that .shape is always available in get_loss_grad().
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        # Clip only the values that go through log(), to prevent log(0).
        epsilon = 1e-15
        y_pred_clipped = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.sum(y_true * np.log(y_pred_clipped), axis=1)  # Sum over classes

        # Cache the *unclipped* probabilities: the gradient formula below is
        # exact for the real predictions, and clipping them would leave a tiny
        # spurious gradient on perfectly predicted samples.
        self.y_true, self.y_pred = y_true, y_pred
        return np.mean(loss)  # Average over samples

    def get_loss_grad(self):
        """
        Gradient of CCE loss with respect to logits z.

        Assuming y_pred = softmax(z), the gradient is (y_pred - y_true) / n_samples.
        Uses the values cached by the most recent __call__, so the loss must
        have been evaluated at least once before calling this method.

        Returns:
            np.ndarray: Gradient of shape (n_samples, n_classes).
        """
        n_samples = self.y_true.shape[0]
        loss_grad = (self.y_pred - self.y_true) / n_samples
        return loss_grad


In [None]:
class SoftmaxModel:
    """Single linear layer followed by softmax, trained with plain gradient descent."""

    def __init__(self, input_dim, output_dim):
        """
        Create the parameters of a multiclass linear classifier.

        Args:
            input_dim (int): Number of input features.
            output_dim (int): Number of classes.
        """
        # Weights start as small Gaussian noise; one column per class.
        self.weight = 0.01 * np.random.randn(input_dim, output_dim)
        # A single bias row, shared by every sample through broadcasting.
        self.bias = np.zeros(shape=(1, output_dim))

    def __call__(self, X):
        """
        Forward pass: map inputs to class probabilities.

        Args:
            X (np.ndarray): Input data of shape (n_samples, input_dim).

        Returns:
            np.ndarray: Predicted probabilities of shape (n_samples, output_dim).
        """
        # backward() needs the inputs of the most recent forward pass.
        self.X = X
        logits = np.matmul(X, self.weight) + self.bias
        return softmax(logits)

    def backward(self, loss_grad):
        """
        Backward pass: store the parameter gradients for the last forward batch.

        Args:
            loss_grad (np.ndarray): Gradient of loss w.r. to logits z, shape (n_samples, output_dim).
        """
        inputs_transposed = self.X.T
        # Weight gradient: (input_dim, n_samples) @ (n_samples, output_dim).
        self.weight_grad = np.matmul(inputs_transposed, loss_grad)
        # Bias gradient: sum over samples, kept 2-D to match self.bias.
        self.bias_grad = np.sum(loss_grad, axis=0, keepdims=True)

    def step(self, lr=0.01):
        """
        Apply one gradient-descent update to the weights and bias in place.

        Args:
            lr (float): Learning rate.
        """
        weight_delta = lr * self.weight_grad
        self.weight -= weight_delta
        bias_delta = lr * self.bias_grad
        self.bias -= bias_delta


# Resource
- https://awjuliani.medium.com/simple-softmax-in-python-tutorial-d6b4c4ed5c16

In [None]:
def generate_multiclass_data(n_samples=300, n_features=2, n_classes=3, seed=42):
    """
    Generate synthetic multiclass classification data.

    Features are drawn from a standard normal distribution. Labels come from a
    randomly drawn linear model: each sample is assigned the class with the
    highest softmax probability under that model.

    Note: this reseeds NumPy's global random generator as a side effect.

    Args:
        n_samples (int): Total number of samples.
        n_features (int): Number of input features.
        n_classes (int): Number of classes.
        seed (int): Seed passed to np.random.seed for reproducibility.

    Returns:
        tuple: Tuple containing:
            - X (np.ndarray): Feature matrix of shape (n_samples, n_features).
            - y_true (np.ndarray): One-hot encoded labels of shape (n_samples, n_classes).
    """
    np.random.seed(seed)  # For reproducibility
    X = np.random.randn(n_samples, n_features)  # (n_samples, n_features)
    true_weights = np.random.randn(n_features, n_classes)  # (n_features, n_classes)
    # (1, n_classes), not (n_samples, n_classes): each class has a single bias
    # term that applies to all samples via broadcasting.
    true_bias = np.random.randn(1, n_classes)

    logits = X @ true_weights + true_bias
    y_prob = softmax(logits)
    y_indices = np.argmax(y_prob, axis=1)

    # One-hot encode the labels: set column y_indices[i] of row i to 1.
    y_true = np.zeros((n_samples, n_classes))
    y_true[np.arange(n_samples), y_indices] = 1

    return X, y_true
