<a href="https://colab.research.google.com/github/Naghiman/AppliedMachineLearning/blob/master/Multi-Class%20Logistic%20Regression%20and%20Gradient%20Descent/LR_GD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')

# if sklearn version is not up to date then
# load_digits(as_frame=True) will fail 
# run these commands the first time to get version 0.24
'''
!pip uninstall scikit-learn -y

!pip install Cython
!pip install git+git://github.com/scikit-learn/scikit-learn.git
!pip freeze | grep scikit
'''


import sklearn
sklearn.__version__


'0.23.2'

# Datasets

## 1. Digits

In [2]:
# LOAD AND PREPROCESS DIGITS DATASET

digits = load_digits(as_frame=True)
ddf = digits.frame
y_digits = ddf['target'].to_numpy()
ddf.drop("target", axis=1, inplace=True)
X_digits = ddf.to_numpy()

print(f'X_digits.shape: {X_digits.shape}')
print(f'y_digits.shape: {y_digits.shape}')

X_digits.shape: (1797, 64)
y_digits.shape: (1797,)


In [3]:
# train, val, test split
X_trn_digits, X_tst_digits, y_trn_digits, y_tst_digits = sklearn.model_selection.train_test_split(X_digits, y_digits, test_size=1/5, random_state=0)
X_trn_digits, X_val_digits, y_trn_digits, y_val_digits = sklearn.model_selection.train_test_split(X_trn_digits, y_trn_digits, test_size=1/4, random_state=0)

# preprocess scales
scaler_digits = sklearn.preprocessing.StandardScaler().fit(X_trn_digits)
X_trn_digits = scaler_digits.transform(X_trn_digits)
X_val_digits = scaler_digits.transform(X_val_digits)
X_tst_digits = scaler_digits.transform(X_tst_digits)

## 2. Credit

In [4]:
# LOAD AND PREPROCESS CREDIT-G DATASET
credit = fetch_openml(name='credit-g',as_frame=True)
cdf = credit.frame

# CONVERT CATEGORICAL FEATURES TO ONE-HOT ENCODING IN CREDIT-G
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(cdf[['checking_status','credit_history','purpose','savings_status','employment','personal_status','other_parties','property_magnitude','other_payment_plans','housing','job','own_telephone','foreign_worker']]).toarray())
cdf = cdf.join(enc_df)
cdf.drop(['checking_status','credit_history','purpose','savings_status','employment','personal_status','other_parties','property_magnitude','other_payment_plans','housing','job','own_telephone','foreign_worker'], axis=1, inplace=True)
class_dict = {"bad": 0, "good": 1}
y_credit = (cdf.replace({"class": class_dict})['class']).to_numpy()
cdf.drop("class", axis=1, inplace=True)
X_credit = cdf.to_numpy()

print(f'X_credit.shape: {X_credit.shape}')
print(f'y_credit.shape: {y_credit.shape}')

X_credit.shape: (1000, 61)
y_credit.shape: (1000,)


In [5]:
# train, val, test split
X_trn_credit, X_tst_credit, y_trn_credit, y_tst_credit = sklearn.model_selection.train_test_split(X_credit, y_credit, test_size=1/5, random_state=0)
X_trn_credit, X_val_credit, y_trn_credit, y_val_credit = sklearn.model_selection.train_test_split(X_trn_credit, y_trn_credit, test_size=1/4, random_state=0)

# preprocess scales
scaler_credit = sklearn.preprocessing.StandardScaler().fit(X_trn_credit)
X_trn_credit = scaler_credit.transform(X_trn_credit)
X_val_credit = scaler_credit.transform(X_val_credit)
X_tst_credit = scaler_credit.transform(X_tst_credit)

# Softmax Regression

## Multi-class logistic regression model

In [6]:
class SoftmaxRegression:
    def __init__(self, add_bias=True, reg=0):
        self.add_bias = add_bias
        self.reg = reg

    def fit(self, x, y, optimizer):
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            N = x.shape[0]
            x = np.column_stack([x, np.ones(N)])
        N,D = x.shape
        C = len(np.unique(y))

        def gradient(x, y, w):                          # define the gradient function
            N = x.shape[0]

            # Softmax calculation
            scores = x.dot(w)
            scores -= np.max(scores, axis=1, keepdims=True)
            exp_scores = np.exp(scores)
            softmax = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

            # dw calculation
            indices = np.arange(N)
            softmax_editted = softmax
            softmax_editted[indices, y] -= 1
            dw = np.dot(x.T, softmax_editted)
            dw /= N
            dw += self.reg * 2 * w
            return dw

        w0 = np.zeros((D, C))                                # initialize the weights to 0
        self.w = optimizer.run(gradient, x, y, w0)      # run the optimizer to get the optimal weights
        print(self.w.shape)
        return self
    
    def predict(self, x):
        if self.add_bias:
            x = np.column_stack([x, np.ones(x.shape[0])])
        yh = x@self.w
        y_pred = np.argmax(yh, axis=1)
        return y_pred

## Mini-batch optimization

In [7]:
class MiniBatchGradientMomentum:
    def __init__(self, learning_rate=.001, batch_size=16, momentum=0.9, max_iters=1e4, epsilon=1e-8, record_history=False):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.record_history = record_history
        self.epsilon = epsilon
        self.momentum = momentum
        self.batch_size = batch_size
        if record_history:
            self.w_history = []                

    def run(self, gradient_fn, x, y, w):
        grad = np.inf
        t = 1
        delta_w = 0
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            batch_inds = np.random.choice(x.shape[0], self.batch_size)
            grad = gradient_fn(x[batch_inds], y[batch_inds], w)
            delta_w = self.momentum * delta_w + (1 - self.momentum) * grad              
            w = w - self.learning_rate * delta_w       
            if self.record_history:
                self.w_history.append(w)
            t += 1
        return w

# Train

## 1. Digits

In [8]:
optimizer = MiniBatchGradientMomentum(learning_rate=.005, max_iters=1000, batch_size=16, record_history=True)
model = SoftmaxRegression()
model.fit(X_trn_digits, y_trn_digits, optimizer)

(65, 10)


<__main__.SoftmaxRegression at 0x2d3823e0888>

In [9]:
train_acc_digits = sklearn.metrics.accuracy_score(y_trn_digits, model.predict(X_trn_digits))
val_acc_digits = sklearn.metrics.accuracy_score(y_val_digits, model.predict(X_val_digits))
test_acc_digits = sklearn.metrics.accuracy_score(y_tst_digits, model.predict(X_tst_digits))
print(f'Digits train accuracy: {train_acc_digits}')
print(f'Digits validation accuracy: {val_acc_digits}')
print(f'Digits test accuracy: {test_acc_digits}')

Digits train accuracy: 0.9387186629526463
Digits validation accuracy: 0.9083333333333333
Digits test accuracy: 0.9222222222222223


## 2. Credit

In [10]:
optimizer = MiniBatchGradientMomentum(learning_rate=.005, max_iters=1000, batch_size=16, record_history=True)
model = SoftmaxRegression()
model.fit(X_trn_credit, y_trn_credit, optimizer)

(62, 2)


<__main__.SoftmaxRegression at 0x2d38239ce48>

In [11]:
train_acc_credit = sklearn.metrics.accuracy_score(y_trn_credit, model.predict(X_trn_credit))
val_acc_credit = sklearn.metrics.accuracy_score(y_val_credit, model.predict(X_val_credit))
test_acc_credit = sklearn.metrics.accuracy_score(y_tst_credit, model.predict(X_tst_credit))
print(f'Credit-G train accuracy: {train_acc_credit}')
print(f'Credit-G validation accuracy: {val_acc_credit}')
print(f'Credit-G test accuracy: {test_acc_credit}')

Credit-G train accuracy: 0.8
Credit-G validation accuracy: 0.755
Credit-G test accuracy: 0.725


# Analysis

In [12]:

# softmax class is refactored a bit, predict method moved to optimizer class
# validation sets given as parameters to calculate accuracy for each iteration
class SoftmaxRegression:
    def __init__(self, add_bias=True, reg=0):
        self.add_bias = add_bias
        self.reg = reg

    def fit(self, x, y, X_val, y_val, optimizer, limit):
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            N = x.shape[0]
            x = np.column_stack([x, np.ones(N)])
        N,D = x.shape
        C = len(np.unique(y))

        def gradient(x, y, w):                          # define the gradient function
            N = x.shape[0]

            # Softmax calculation
            scores = x.dot(w)
            scores -= np.max(scores, axis=1, keepdims=True)
            exp_scores = np.exp(scores)
            softmax = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

            # dw calculation
            indices = np.arange(N)
            softmax_editted = softmax
            softmax_editted[indices, y] -= 1
            dw = np.dot(x.T, softmax_editted)
            dw /= N
            dw += self.reg * 2 * w
            return dw

        w0 = np.zeros((D, C))                                                # initialize the weights to 0
        self.w = optimizer.run(gradient, x, y, w0, X_val, y_val, limit)      # run the optimizer to get the optimal weights      
        return self


class MiniBatchGradientMomentum:
    def __init__(self, add_bias=True, learning_rate=.001, batch_size=16, momentum=0.9, max_iters=25000, epsilon=1e-8, record_history=False):
        self.add_bias = add_bias
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.record_history = record_history
        self.epsilon = epsilon
        self.momentum = momentum
        self.batch_size = batch_size
        if record_history:
            self.w_history = []                

    def run(self, gradient_fn, x, y, w, X_val, y_val, limit):
        grad = np.inf
        t = 1
        delta_w = 0
        val_accs = []

        # need at least (limit) iterations to look at
        # check validation accuracy values have not decreased for last (limit) iterations
        # add a max iterations check so we don't loop forever in case 2 conditions above fail to stop loop
        while t < self.max_iters and (t < limit or not self.decreasing(val_accs, t, limit, self.epsilon)):
            batch_inds = np.random.choice(x.shape[0], self.batch_size)
            grad = gradient_fn(x[batch_inds], y[batch_inds], w)
            delta_w = self.momentum * delta_w + (1 - self.momentum) * grad              
            w = w - self.learning_rate * delta_w       
            if self.record_history:
                self.w_history.append(w)
            val_acc = sklearn.metrics.accuracy_score(y_val, self.predict(X_val, w))
            val_accs.append(val_acc)
            t += 1
      
        # it is possible the last (limit) iterations happened at the exact so we need an extra check
        # for non decreasing values
        if t == self.max_iters and not self.decreasing(val_accs, t-1, limit, self.epsilon):
            print(f'best validation accuracy ({val_acc}) overshoot to the maximum {t} iterations based on last {limit} values')
        else:
            print(f'best validation accuracy ({val_acc}) found after {t} iterations based on last {limit} values')

        # since we stored accuracy values at each iteration we can compare with the real best value
        print(f'true best result is {max(val_accs)} which happened after {val_accs.index(max(val_accs))} iterations')
        return w
    
    def decreasing(self, values, t, limit, epsilon):
        return all(0<=x-y<=epsilon for x, y in zip(values[t-limit:t], values[t-limit+1:t+1]))

    def predict(self, x, w):
        if self.add_bias:
            x = np.column_stack([x, np.ones(x.shape[0])])
        yh = x@w
        y_pred = np.argmax(yh, axis=1)
        return y_pred

In [None]:
model = SoftmaxRegression()

optimizer = MiniBatchGradientMomentum(learning_rate=.005, max_iters=10000, batch_size=16, record_history=True)
model.fit(X_trn_credit, y_trn_credit, X_val_credit, y_val_credit, optimizer, 200)

optimizer = MiniBatchGradientMomentum(learning_rate=.005, max_iters=20000, batch_size=16, record_history=True)
model.fit(X_trn_digits, y_trn_digits, X_val_digits, y_val_digits, optimizer, 4000)


best validation accuracy (0.765) found after 910 iterations based on last 200 values
true best result is 0.815 which happened after 21 iterations
