In [None]:
import numpy as np

class SoftmaxRegression:
    """Multinomial logistic (softmax) regression trained with batch gradient descent.

    The model keeps an explicit weight matrix and a separate bias row. Integer
    class labels may be arbitrary (they do not have to be ``0..k-1``); they are
    mapped to consecutive column indices during ``fit`` and mapped back in
    ``predict``.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    epochs : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.
    """

    def __init__(self,learning_rate=0.01,epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.n_classes = None     # number of distinct labels seen by fit()
        self.classes_ = None      # sorted distinct labels; column j of the model <-> classes_[j]
        self.weights = None       # (n_features, n_classes), set by fit()
        self.bias = None          # (1, n_classes), set by fit()
        self.loss_history = []    # cross-entropy recorded once per epoch by fit()

    def softmax(self,z):
        """Return the row-wise softmax of a 2-D array of logits.

        The row maximum is subtracted before exponentiating so that large
        logits cannot overflow; the result is mathematically unchanged.
        """
        z = z - np.max(z,axis=1,keepdims=True)
        exp_z = np.exp(z)
        return exp_z / np.sum(exp_z,axis=1,keepdims=True)

    def calculate_loss(self,y_pred,y_true):
        """Return the mean cross-entropy between predicted probabilities and one-hot targets."""
        eps = 1e-9  # keeps log() finite when a predicted probability is exactly 0
        return -np.mean(np.sum(y_true*np.log(y_pred+eps),axis=1))

    def fit(self,X,Y):
        """Train the model with full-batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Y : array-like of n_samples integer class labels

        Returns
        -------
        self
        """
        X = np.asarray(X,dtype=np.float64)
        Y = np.asarray(Y,dtype=np.int64).ravel()

        # Map the labels onto 0..k-1 column indices. Indexing np.eye() with the
        # raw labels breaks as soon as a label value is >= the number of
        # distinct labels (e.g. labels {0, 2}).
        self.classes_, class_index = np.unique(Y,return_inverse=True)
        self.n_classes = len(self.classes_)
        y_onehot = np.eye(self.n_classes)[class_index]

        num_samples, num_features = X.shape
        self.weights = np.zeros((num_features,self.n_classes))  #(n,c)
        self.bias = np.zeros((1,self.n_classes))

        # Start a fresh history so repeated fit() calls do not accumulate.
        self.loss_history = []

        for _ in range(self.epochs):
            y_pred = self.softmax(X @ self.weights  + self.bias)

            # Loss of the parameters used for this forward pass (before the update).
            self.loss_history.append(self.calculate_loss(y_pred,y_onehot))

            # Gradient of the mean cross-entropy w.r.t. the logits is (p - y).
            error = y_pred - y_onehot
            dw = (1/num_samples)*(X.T @ error)
            db = (1/num_samples)*np.sum(error,axis=0,keepdims=True)

            self.weights -= self.learning_rate*dw
            self.bias -= self.learning_rate*db

        return self

    def predict(self,X):
        """Return the most probable class label for every row of ``X``."""
        X = np.asarray(X,dtype=np.float64)
        probabilities = self.softmax(X @ self.weights + self.bias)
        # Translate winning column indices back to the original label values.
        return self.classes_[np.argmax(probabilities,axis=1)]

    def score(self, X, Y):
        """Compute accuracy."""
        Y = np.asarray(Y, dtype=np.int64).ravel()
        preds = self.predict(X)
        return np.mean(preds == Y)
    

# Reproducible synthetic data: 200 samples, 4 Gaussian features, labels drawn from {0, 1, 2}.
# The labels are sampled independently of X, so accuracy is expected to sit near chance.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(3, size=200)

# fit() returns the estimator itself, so construction and training are chained.
model = SoftmaxRegression(learning_rate=0.01, epochs=1000).fit(X, Y)

# Only the last expression of a cell is displayed; the predict() result is discarded.
model.predict(X[0:20])
model.score(X[0:20], Y[0:20])

# Converting classes to numbers
# class_names = df["label"].unique()
# class_to_int = {name:i for i,name in enumerate(class_names)}

# label_int = df["label"].map(class_to_int).values

In [None]:
# Multi Class Softmax Cross Entropy
import numpy as np

class SoftmaxRegressionGD:
    """
    Multinomial Logistic Regression (Softmax Regression) using Gradient Descent.

    The intercept is learned as the first row of ``weights``: a column of ones
    is prepended to the feature matrix (see ``add_bias``). Integer class labels
    may be arbitrary (they do not have to be ``0..k-1``); they are mapped to
    consecutive column indices during ``fit`` and mapped back in ``predict``.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    epochs : int, default=1000
        Maximum number of full-batch gradient steps.
    tol : float, default=1e-8
        Training stops early once the absolute change in loss between two
        consecutive epochs is smaller than this value.
    verbose : bool, default=False
        If True, print the loss every 100 epochs and when stopping early.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, tol=1e-8, verbose=False):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.tol = tol
        self.verbose = verbose
        self.weights = None      # (n_features+1, n_classes)
        self.loss_history = []   # loss after each epoch of the latest fit()
        self.n_classes = None    # number of distinct labels seen by fit()
        self.classes_ = None     # sorted distinct labels; column j <-> classes_[j]

    def add_bias(self, X):
        """Add bias column (intercept term)."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def softmax(self, z):
        """Row-wise softmax function for probability distribution."""
        # Subtract the row maximum before exponentiating. Without the shift,
        # np.exp overflows to inf for large logits and the division yields NaN.
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def calculate_loss(self, X, Y_onehot):
        """Cross-entropy loss for multi-class classification.

        ``X`` must already contain the bias column.
        """
        m = X.shape[0]
        probs = self.softmax(X @ self.weights)
        eps = 1e-9  # prevent log(0)
        return -np.sum(Y_onehot * np.log(probs + eps)) / m

    def fit(self, X, Y):
        """
        Train the softmax regression model using batch gradient descent.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
        Y : np.ndarray of shape (n_samples,) with integer class labels

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.int64).ravel()

        # One-hot encode labels. Labels are first mapped onto 0..k-1 because
        # indexing the identity matrix with raw label values fails whenever a
        # label is >= the number of distinct labels (e.g. labels {1, 3}).
        self.classes_, class_index = np.unique(Y, return_inverse=True)
        self.n_classes = len(self.classes_)
        Y_onehot = np.eye(self.n_classes)[class_index]

        # Add bias
        X = self.add_bias(X)
        m, n = X.shape

        # Initialize weights
        self.weights = np.zeros((n, self.n_classes))

        # Start a fresh history so repeated fit() calls do not accumulate.
        self.loss_history = []

        prev_loss = float("inf")
        for epoch in range(self.epochs):
            # Forward pass: probabilities
            probs = self.softmax(X @ self.weights)

            # Gradient: X.T @ (probs - Y_onehot)
            gradients = (X.T @ (probs - Y_onehot)) / m

            # Update weights
            self.weights -= self.learning_rate * gradients

            # Compute and store loss (evaluated with the updated weights)
            loss = self.calculate_loss(X, Y_onehot)
            self.loss_history.append(loss)

            # Early stopping
            if abs(prev_loss - loss) < self.tol:
                if self.verbose:
                    print(f"Early stopping at epoch {epoch}, Loss = {loss:.6f}")
                break
            prev_loss = loss

            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss = {loss:.6f}")

        return self

    def predict_proba(self, X):
        """Predict class probabilities (column j corresponds to ``classes_[j]``)."""
        X = np.asarray(X, dtype=np.float64)
        X = self.add_bias(X)
        return self.softmax(X @ self.weights)

    def predict(self, X):
        """Predict class labels (argmax over probabilities)."""
        # Translate winning column indices back to the original label values.
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def score(self, X, Y):
        """Compute accuracy."""
        Y = np.asarray(Y, dtype=np.int64).ravel()
        preds = self.predict(X)
        return np.mean(preds == Y)

In [None]:
import pandas as pd
import numpy as np

# Toy frame: two numeric features and a string label with three distinct classes.
df = pd.DataFrame(
    {
        "feature1": [5.1, 4.9, 6.7, 1.3],
        "feature2": [3.5, 3.0, 3.1, 1.3],
        "label": ["cat", "dog", "cat", "elephant"],
    }
)

# Distinct labels in order of first appearance: ["cat", "dog", "elephant"].
class_names = df["label"].unique()
# Assign each class name a consecutive integer id starting at 0.
class_to_int = dict(zip(class_names, range(len(class_names))))

# Replace every string label by its integer id.
Y_int = df["label"].map(class_to_int).values
print(Y_int)
# [0 1 0 2]

[0 1 0 2]


In [5]:
# Synthetic dataset: 150 samples, 2 Gaussian features, labels drawn from {0, 1, 2}.
# The labels are sampled independently of X, so accuracy is expected to sit near chance.
np.random.seed(42)
X = np.random.randn(150, 2)
Y = np.random.choice(3, size=150)

# Train with progress logging; fit() returns the estimator, so the call is chained.
model = SoftmaxRegressionGD(learning_rate=0.1, epochs=1000, verbose=True).fit(X, Y)

# Hard labels, class probabilities, and training-set accuracy.
print("Predicted labels:", model.predict(X[0:20]))
print("Predicted probabilities:\n", model.predict_proba(X[0:5]))
print("Accuracy:", model.score(X, Y))

Epoch 0, Loss = 1.096622
Epoch 100, Loss = 1.067714
Epoch 200, Loss = 1.067621
Early stopping at epoch 228, Loss = 1.067621
Predicted labels: [0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0]
Predicted probabilities:
 [[0.4182568  0.23357357 0.34816963]
 [0.46319455 0.21116189 0.32564355]
 [0.42373916 0.26340791 0.31285292]
 [0.42613931 0.18666336 0.38719733]
 [0.44753514 0.26346265 0.28900221]]
Accuracy: 0.3933333333333333


Below is a better implementation of softmax regression from scratch.

In [None]:
import numpy as np

class SoftmaxRegression:
    """Multinomial logistic (softmax) regression trained with batch gradient descent.

    The model keeps an explicit weight matrix and a separate bias row. Integer
    class labels may be arbitrary (they do not have to be ``0..k-1``); they are
    mapped to consecutive column indices during ``fit`` and mapped back in
    ``predict``.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    epochs : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.
    """

    def __init__(self,learning_rate=0.01,epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.n_classes = None     # number of distinct labels seen by fit()
        self.classes_ = None      # sorted distinct labels; column j of the model <-> classes_[j]
        self.weights = None       # (n_features, n_classes), set by fit()
        self.bias = None          # (1, n_classes), set by fit()
        self.loss_history = []    # cross-entropy recorded once per epoch by fit()

    def softmax(self,z):
        """Return the row-wise softmax of a 2-D array of logits.

        The row maximum is subtracted before exponentiating so that large
        logits cannot overflow; the result is mathematically unchanged.
        """
        z = z - np.max(z,axis=1,keepdims=True)
        exp_z = np.exp(z)
        return exp_z / np.sum(exp_z,axis=1,keepdims=True)

    def calculate_loss(self,y_pred,y_true):
        """Return the mean cross-entropy between predicted probabilities and one-hot targets."""
        eps = 1e-9  # keeps log() finite when a predicted probability is exactly 0
        return -np.mean(np.sum(y_true*np.log(y_pred+eps),axis=1))

    def fit(self,X,Y):
        """Train the model with full-batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Y : array-like of n_samples integer class labels

        Returns
        -------
        self
        """
        X = np.asarray(X,dtype=np.float64)
        Y = np.asarray(Y,dtype=np.int64).ravel()

        # Map the labels onto 0..k-1 column indices. Indexing np.eye() with the
        # raw labels breaks as soon as a label value is >= the number of
        # distinct labels (e.g. labels {0, 2}).
        self.classes_, class_index = np.unique(Y,return_inverse=True)
        self.n_classes = len(self.classes_)
        y_onehot = np.eye(self.n_classes)[class_index]

        num_samples, num_features = X.shape
        self.weights = np.zeros((num_features,self.n_classes))  #(n,c)
        self.bias = np.zeros((1,self.n_classes))

        # Start a fresh history so repeated fit() calls do not accumulate.
        self.loss_history = []

        for _ in range(self.epochs):
            y_pred = self.softmax(X @ self.weights  + self.bias)

            # Loss of the parameters used for this forward pass (before the update).
            self.loss_history.append(self.calculate_loss(y_pred,y_onehot))

            # Gradient of the mean cross-entropy w.r.t. the logits is (p - y).
            error = y_pred - y_onehot
            dw = (1/num_samples)*(X.T @ error)
            db = (1/num_samples)*np.sum(error,axis=0,keepdims=True)

            self.weights -= self.learning_rate*dw
            self.bias -= self.learning_rate*db

        return self

    def predict(self,X):
        """Return the most probable class label for every row of ``X``."""
        X = np.asarray(X,dtype=np.float64)
        probabilities = self.softmax(X @ self.weights + self.bias)
        # Translate winning column indices back to the original label values.
        return self.classes_[np.argmax(probabilities,axis=1)]

    def score(self, X, Y):
        """Compute accuracy."""
        Y = np.asarray(Y, dtype=np.int64).ravel()
        preds = self.predict(X)
        return np.mean(preds == Y)
    

# Reproducible synthetic data: 200 samples, 4 Gaussian features, labels drawn from {0, 1, 2}.
# The labels are sampled independently of X, so accuracy is expected to sit near chance.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(3, size=200)

# fit() returns the estimator itself, so construction and training are chained.
model = SoftmaxRegression(learning_rate=0.01, epochs=1000).fit(X, Y)

# Only the last expression of a cell is displayed; the predict() result is discarded.
model.predict(X[0:20])
model.score(X[0:20], Y[0:20])

# Converting classes to numbers
# class_names = df["label"].unique()
# class_to_int = {name:i for i,name in enumerate(class_names)}

# label_int = df["label"].map(class_to_int).values

np.float64(0.2)

In [2]:
# Regenerate the same seeded dataset: 200 samples, 4 Gaussian features, labels in {0, 1, 2}.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(3, size=200)

# Train in one chained call; fit() hands back the fitted estimator.
model = SoftmaxRegression(learning_rate=0.01, epochs=1000).fit(X, Y)

# Predictions for the first 20 rows (not displayed: not the cell's last expression).
model.predict(X[0:20])

# Accuracy on those same 20 rows; this value is what the cell shows.
model.score(X[0:20], Y[0:20])

np.float64(0.2)

In [17]:
import pandas as pd
import numpy as np

# Toy frame: two numeric features and a string label with three distinct classes.
df = pd.DataFrame(
    {
        "feature1": [5.1, 4.9, 6.7, 1.3],
        "feature2": [3.5, 3.0, 3.1, 1.3],
        "label": ["cat", "dog", "cat", "elephant"],
    }
)

# Distinct labels in order of first appearance, each given a consecutive integer id.
class_names = df["label"].unique()
class_to_int = dict(zip(class_names, range(len(class_names))))

# Replace every string label by its integer id and show the encoded vector.
label_int = df["label"].map(class_to_int).values
print(label_int)

[0 1 0 2]
