Implement and train Softmax Regression with mini-batch SGD and early stopping.

The expected outcome.
* Implement Softmax Regression Model.
* Implement mini-batch SGD.
* The training should support early stopping.
* Train and evaluate the model with cross-validation. The evaluation metric is the *accuracy*.
* Retrain the model with early stopping.


**DO NOT USE SKLEARN**

In [1]:
import numpy as np
import pandas as pd 

from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit

np.random.seed(42)

In [2]:
iris = datasets.load_iris()
X = iris["data"]
y = iris["target"]
df = pd.DataFrame({fname: values for fname, values in zip(iris["feature_names"], X.T)})
df["target"] = y

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Your Code
You can start writing your code from here. Please don't modify any of the previous code.

In [3]:
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [4]:
def to_one_hot(y):
    n_classes = y.max() + 1
    Y_one_hot = np.zeros((len(y), n_classes))
    Y_one_hot[np.arange(len(y)), y] = 1
    return Y_one_hot

In [5]:
def prediction(X_valid, theta):
    return np.argmax(softmax(X_valid@theta), axis=1)

In [6]:
def mini_batch_gd(X, y, batch_size, epochs, alpha, tolerance=1e-6):
    m = X.shape[0]
    theta = np.zeros((4, 3))
    thetas, lols = [], []
    shuffled_indices = np.random.permutation(m)
    X_shuffled = X[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for epoch in range(epochs):
        for i in range(0, m, batch_size):
            xi = X_shuffled[i: i+batch_size]
            yi = y_shuffled[i: i+batch_size]
            
            gradients = (1/batch_size) * (xi.T @ (softmax(xi@theta) - yi))
            theta = theta - alpha * gradients
            
            thetas.append(theta)
        loss = -np.mean(np.sum(y * np.log((softmax(X@theta)) + 1e-7), axis=1))
        lols.append(loss)
        
        # Early Stopping
        if (epoch > 10) and (lols[-2] - lols[-1]) < tolerance:
            print('epoch:', epoch, 'loss:',(lols[-2] - lols[-1]), 'STOPPING')
            break
            
    return theta 

In [7]:
theta = mini_batch_gd(X, to_one_hot(y), 20, 5000, 0.05)

epoch: 3996 loss: 9.995146595248583e-07 STOPPING


In [8]:
x_pred = prediction(X, theta)
x_pred == y

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

Using the following cell to train and evaluate your model.

In [9]:
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
cntr = 0

for train_index, test_index in split.split(df, df["target"]):
    cntr += 1
    print(f"fold no {cntr}")
    
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
    
    X = strat_train_set.iloc[:, :-1].to_numpy()
    y = strat_train_set.iloc[:, -1].to_numpy()
    
    X_test = strat_test_set.iloc[:, :-1].to_numpy()
    y_test = strat_test_set.iloc[:, -1].to_numpy()
    
    theta = mini_batch_gd(X, to_one_hot(y), 20, 5000, 0.05)
    pred = prediction(X_test, theta)
    accuracy = np.sum(pred == y_test) / len(pred)
    
    print(f'Accuracy = {accuracy}')

fold no 1
epoch: 4251 loss: 9.996340306073215e-07 STOPPING
Accuracy = 1.0
fold no 2
Accuracy = 0.9
fold no 3
epoch: 4968 loss: 9.998329316834864e-07 STOPPING
Accuracy = 0.9
