# Goal

#### Implement batch gradient descent with early stopping for softmax regression from scratch. Use it
#### on a classification task on the Penguins dataset.

# Tools

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import accuracy_score

# Load dataset

In [2]:
# Load the 'penguins' example dataset through seaborn into a DataFrame
data = sns.load_dataset('penguins')
# Bare expression: lets the notebook render the DataFrame as the cell output
data


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


# Split data

In [3]:
# Remove every row that contains a missing value, modifying `data` in place.
data.dropna(inplace=True)

# Feature matrix: everything except the categorical columns.
X = data.drop(columns=['species', 'island', 'sex']).to_numpy()

# Target vector: the species name of each remaining row.
y = data['species'].to_numpy()

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (the bias term is added inside the model)

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split, so the test data does not influence
# the scaling parameters.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define softmax function

In [6]:
def softmax(z):
    """Return the row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    z : array-like of shape (n_samples, n_classes)
        Raw class scores (logits), one row per sample.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_classes)
        Probabilities; every row is non-negative and sums to 1.
    """
    z = np.asarray(z, dtype=float)
    if z.size == 0:
        # Nothing to normalize; also avoids a max() over an empty axis.
        return z.copy()

    # Softmax is invariant to adding a constant to each row, so subtract the
    # row maximum: the largest exponent becomes exp(0) = 1 and np.exp cannot
    # overflow to inf (which would otherwise produce inf / inf = NaN).
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Define softmax regression class trained by batch gradient descent with early stopping

In [7]:
class SoftmaxRegression:
    """Softmax (multinomial logistic) regression fitted by batch gradient descent.

    Every iteration uses the full training set to compute the cross-entropy
    gradient.  When a validation set is passed to ``fit``, the validation
    loss is evaluated periodically and training stops early as soon as it
    fails to improve by at least ``tol``.

    Parameters
    ----------
    n_features : int
        Number of input columns (excluding the bias column, which is added
        internally).
    n_classes : int
        Number of distinct class labels.
    learning_rate : float, default 0.1
        Step size of each gradient update.
    max_iter : int, default 1000
        Maximum number of gradient steps.
    tol : float, default 1e-4
        Minimum decrease in validation loss, between two consecutive
        evaluations, that counts as an improvement.
    verbose : bool, default False
        If true, print the losses each time they are evaluated.

    Attributes set by ``fit``
    -------------------------
    W : ndarray of shape (n_features + 1, n_classes)
        Learned weights; row 0 holds the bias.
    classes_ : ndarray
        Sorted unique training labels; ``predict`` returns indices into it.
    train_losses : list of float
        Training loss recorded at every iteration.
    val_losses : list of float
        Validation loss at each evaluation (empty without a validation set).
    """

    # Number of iterations between two validation-loss evaluations / log lines.
    _EVAL_EVERY = 10
    # Lower bound applied to probabilities before the log, to avoid log(0).
    _EPS = 1e-12

    def __init__(self, n_features, n_classes, learning_rate=0.1, max_iter=1000, tol=1e-4, verbose=False):
        self.n_features = n_features
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def _add_bias(self, X):
        """Return X as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                f"expected a 2-D array with {self.n_features} columns, got shape {X.shape}"
            )
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def _one_hot(self, y):
        """Encode labels as one-hot rows, with columns ordered like ``classes_``."""
        y = np.asarray(y)
        encoded = (y[:, None] == self.classes_[None, :]).astype(float)
        if not encoded.any(axis=1).all():
            raise ValueError("labels contain a class that is not in the training labels")
        return encoded

    def _cross_entropy(self, y_onehot, probs):
        """Mean cross-entropy between one-hot targets and predicted probabilities."""
        safe_probs = np.clip(probs, self._EPS, 1.0)
        return -np.sum(y_onehot * np.log(safe_probs)) / y_onehot.shape[0]

    def fit(self, X, y, X_val=None, y_val=None):
        """Learn the weight matrix from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training labels (any sortable values, e.g. strings).
        X_val, y_val : array-like, optional
            Validation features and labels.  They must be given together.
            Without them no early stopping is performed and training runs
            for ``max_iter`` iterations.

        Raises
        ------
        ValueError
            If the inputs are empty or have inconsistent shapes, if the
            number of distinct labels differs from ``n_classes``, or if only
            one of ``X_val`` / ``y_val`` is given.
        """
        if (X_val is None) != (y_val is None):
            raise ValueError("X_val and y_val must be provided together")

        # Add bias term to input features
        X = self._add_bias(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        # Convert labels to one-hot encoding (columns follow sorted label order)
        self.classes_ = np.unique(y)
        if len(self.classes_) != self.n_classes:
            raise ValueError(
                f"found {len(self.classes_)} distinct labels but n_classes={self.n_classes}"
            )
        y_onehot = self._one_hot(y)

        has_val = X_val is not None
        if has_val:
            X_val = self._add_bias(X_val)
            if X_val.shape[0] == 0:
                raise ValueError("validation set is empty")
            y_val_onehot = self._one_hot(y_val)
            if y_val_onehot.shape[0] != X_val.shape[0]:
                raise ValueError("X_val and y_val must contain the same number of samples")

        # Initialize weight matrix with small random values
        self.W = np.random.randn(self.n_features + 1, self.n_classes) * 0.01

        # Batch gradient descent with early stopping
        train_losses = []
        val_losses = []
        for i in range(self.max_iter):
            # Forward pass and loss with the current weights
            probs = softmax(X @ self.W)
            train_loss = self._cross_entropy(y_onehot, probs)
            train_losses.append(train_loss)

            # Gradient of the mean cross-entropy, then one descent step
            grad = X.T @ (probs - y_onehot) / n_samples
            self.W -= self.learning_rate * grad

            if i % self._EVAL_EVERY != 0:
                continue

            if not has_val:
                if self.verbose:
                    print(f"Iteration {i}: Train Loss = {train_loss:.4f}")
                continue

            # Validation loss is measured with the freshly updated weights
            val_loss = self._cross_entropy(y_val_onehot, softmax(X_val @ self.W))
            val_losses.append(val_loss)
            if self.verbose:
                print(f"Iteration {i}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

            # Stop once the validation loss no longer drops by at least tol
            if len(val_losses) > 1 and val_losses[-1] > val_losses[-2] - self.tol:
                if self.verbose:
                    print(f"Stopped early after {i} iterations")
                break

        self.train_losses = train_losses
        self.val_losses = val_losses

    def predict(self, X):
        """Predict a class index for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of int, shape (n_samples,)
            Column index of the most probable class.  Columns follow the
            sorted order of the training labels (``classes_[index]`` gives
            the original label).
        """
        # Add bias term to input features
        X = self._add_bias(X)

        # Compute softmax probabilities for each example
        probs = softmax(X @ self.W)

        # Convert probabilities to class indices
        return np.argmax(probs, axis=1)

# Run batch gradient descent with early stopping

In [8]:
# Size the model from the training data: one input per feature column and one output per distinct label in y_train
model = SoftmaxRegression(n_features=X_train.shape[1], n_classes=len(np.unique(y_train)))

In [9]:
# Train on the standardized training split; y_train holds the species names
model.fit(X_train, y_train)

In [10]:
# Make predictions on test data. predict() returns integer class indices
# (argmax over the softmax outputs), not species names.
y_pred = model.predict(X_test)
y_pred

array([0, 2, 0, 1, 0, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 1, 0, 2, 0, 0,
       2, 1, 0, 0, 2, 2, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 0, 0, 0, 0, 1, 1,
       0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 1, 1, 2, 2, 2, 0, 0, 2,
       0], dtype=int64)

In [15]:
# Display the true test labels (species names) for comparison with y_pred
y_test

array(['Adelie', 'Gentoo', 'Adelie', 'Chinstrap', 'Adelie', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Chinstrap', 'Chinstrap', 'Adelie',
       'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Adelie', 'Adelie',
       'Chinstrap', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo',
       'Chinstrap', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Chinstrap',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie',
       'Gentoo', 'Adelie'], dtype=object)

In [18]:
from sklearn.preprocessing import LabelEncoder

# Ground-truth labels of the held-out split. Use y_test directly instead of a
# pasted copy of its printed output, so this cell stays correct if the split,
# the seed or the dataset changes.
labels = np.asarray(y_test)

# Create LabelEncoder object
le = LabelEncoder()

# Fit the encoder on the training labels and only transform the test labels.
# LabelEncoder assigns integers in sorted label order, the same order the
# model uses for its output columns, and fitting on y_train keeps that mapping
# identical even if some class is absent from the test split.
labels_encoded = le.fit(y_train).transform(labels)

In [19]:
# Display the integer-encoded test labels
labels_encoded

array([0, 2, 0, 1, 0, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 1, 0, 2, 0, 0,
       2, 1, 0, 0, 2, 2, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 0, 0, 0, 0, 1, 1,
       0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 1, 1, 2, 2, 2, 0, 0, 2,
       0], dtype=int64)

In [20]:
# Element-wise comparison of predicted class indices with the encoded labels;
# the mean of the resulting booleans is the fraction classified correctly.
hits = y_pred == labels_encoded
accuracy = np.mean(hits)
print(f"Accuracy = {accuracy:.8f}")

Accuracy = 1.00000000
