# Mini Batch Gradient Descent

In [1]:
# as usual we will need a reference
from sklearn.datasets import load_diabetes

from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# load the diabetes regression dataset as a (features, target) pair of arrays
x, y = load_diabetes(return_X_y=True)

In [3]:
# show the dimensions of the feature matrix and of the target vector
for dataset_part in (x, y):
    print(dataset_part.shape)

(442, 10)
(442,)


In [4]:
# hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

### Reference - Linear Regression (without Gradient Descent)

In [5]:
# baseline model (no gradient descent) that the later results are compared against
reg = LinearRegression()
reg.fit(x_train, y_train)

In [6]:
# baseline parameters: coefficients first, then the intercept
print(reg.coef_, reg.intercept_, sep="\n")

[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238]
151.88331005254165


In [7]:
# R^2 of the baseline model on the held-out rows
y_pred = reg.predict(x_test)
r2_score(y_test, y_pred)

0.43993386615689667

### Custom Class for Mini Batch Gradient Descent

In [8]:
import random

class MBGDRegressor:
    """Linear regression fitted with mini-batch gradient descent.

    Every epoch performs ``n_samples // batch_size`` parameter updates. Each
    update draws ``batch_size`` distinct row indices at random (batches drawn
    within one epoch are independent of each other and may overlap) and moves
    the parameters against the squared-error gradient of those rows.

    Parameters
    ----------
    batch_size : int
        Number of rows used for each update; must satisfy
        ``1 <= batch_size <= n_samples`` when ``fit`` is called.
    learning_rate : float, default 0.1
        Step size applied to both the intercept and the coefficient gradient.
    epochs : int, default 100
        Number of outer iterations.
    random_state : int or None, default None
        Seed for a private ``random.Random`` used to draw the batches, which
        makes ``fit`` reproducible. ``None`` keeps using the shared state of
        the ``random`` module.
    mean_gradient : bool, default False
        If False, the coefficient gradient is *summed* over the batch while
        the intercept gradient is *averaged*, so the effective coefficient
        step is ``batch_size * learning_rate``. This is the historical
        behaviour of this class and learning rates may have been tuned for
        it. If True, the coefficient gradient is averaged as well, giving the
        true gradient of the mean squared error.

    Attributes
    ----------
    coef_ : numpy.ndarray or None
        Learned weights, one per feature (``None`` before ``fit``).
    intercept_ : float or None
        Learned bias term (``None`` before ``fit``).
    """

    def __init__(self, batch_size, learning_rate = 0.1, epochs = 100,
                 random_state = None, mean_gradient = False):
        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.mean_gradient = mean_gradient

    def fit(self, x_train, y_train):
        """Fit the model and print the learned intercept and coefficients.

        Parameters
        ----------
        x_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``x_train`` is not two-dimensional, if the number of targets
            differs from the number of rows, or if ``batch_size`` is not in
            ``[1, n_samples]``.
        """
        # accept lists and other array-likes, not only ndarrays
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        if x_train.ndim != 2:
            raise ValueError("x_train must be 2-dimensional (n_samples, n_features)")
        n_samples, n_features = x_train.shape
        if y_train.shape[0] != n_samples:
            raise ValueError("x_train and y_train must contain the same number of rows")
        if self.batch_size < 1 or self.batch_size > n_samples:
            raise ValueError(
                "batch_size must be between 1 and the number of training rows "
                f"({n_samples}), got {self.batch_size}"
            )

        # initialise the parameters
        self.intercept_ = 0
        self.coef_ = np.ones(n_features)

        # a private generator keeps seeded runs repeatable without touching
        # the global random state; unseeded runs use the module-level functions
        sampler = random if self.random_state is None else random.Random(self.random_state)

        # instead of one update per epoch on all the rows, each epoch performs
        # n_samples // batch_size updates, each on a freshly drawn mini-batch
        updates_per_epoch = n_samples // self.batch_size
        for _ in range(self.epochs):
            for _ in range(updates_per_epoch):
                # distinct random row indices for this mini-batch
                idx = sampler.sample(range(n_samples), self.batch_size)
                x_batch = x_train[idx]

                # both gradients use the residual of the pre-update parameters
                residual = y_train[idx] - (np.dot(x_batch, self.coef_) + self.intercept_)

                intercept_der = -2 * np.mean(residual)
                coef_der = -2 * np.dot(residual, x_batch)
                if self.mean_gradient:
                    coef_der = coef_der / self.batch_size

                self.intercept_ = self.intercept_ - (self.lr * intercept_der)
                self.coef_ = self.coef_ - (self.lr * coef_der)

        print(self.intercept_)
        print(self.coef_)

    def predict(self, x_test):
        """Return ``x_test @ coef_ + intercept_`` for the fitted parameters."""
        return np.dot(x_test, self.coef_) + self.intercept_

In [9]:
# size each mini-batch so that the training rows split into about 50 parts
rows_per_batch = x_train.shape[0] // 50
mbr = MBGDRegressor(
    batch_size=rows_per_batch,
    learning_rate=0.01,
    epochs=100,
)

In [10]:
mbr.fit(x_train, y_train)
# note: this is better than just a Stochastic Regressor

151.2865040271679
[  27.93373245 -139.76662835  460.55799173  307.72570823  -23.58558278
  -89.23235736 -193.83552777  107.18198491  412.50752118  118.87189232]


In [11]:
# predictions of the mini-batch model on the held-out rows
y_pred = mbr.predict(x_test)

In [12]:
# checking the accuracy (R^2 on the test rows)
r2_score(y_test, y_pred)
# we got close enough to the LinearRegression score
# if we used cross_val_score we might get even nearer to sklearn

0.4525102020588744

### Using Sklearn SGDRegressor 
- we are making sure the learning rate is constant
- by default, SGDRegressor uses a variable (decaying) learning rate, which typically gives better results than our class
- to keep the comparison fair to our custom class, we have to fix the learning rate and feed the data in batches of a fixed size

In [13]:
from sklearn.linear_model import SGDRegressor

In [14]:
# learning_rate='constant' keeps the step size fixed at eta0 for every update
sgd = SGDRegressor(learning_rate = 'constant', eta0=0.1)

In [15]:
# use the same number of rows for every mini-batch
batch_size = 35
n_updates = 100

# 100 partial_fit calls, each on one freshly drawn random mini-batch
# (this is 100 mini-batch passes, not 100 passes over the whole training set)
n_rows = x_train.shape[0]
for step in range(n_updates):
    batch_idx = random.sample(range(n_rows), batch_size)
    # partial_fit runs a single epoch over just the rows it is given
    sgd.partial_fit(x_train[batch_idx], y_train[batch_idx])

In [16]:
# coefficients learned by the SGD model
sgd.coef_

array([  62.39306818,  -68.36031491,  363.45441375,  235.37719928,
         12.44224441,  -33.66058404, -167.00934673,  116.60886505,
        307.09635181,  138.20649363])

In [17]:
# intercept learned by the SGD model
sgd.intercept_

array([155.91680966])

In [18]:
# predictions of the SGD model on the held-out rows
y_pred = sgd.predict(x_test)

In [19]:
# R^2 of the SGD model on the test rows
r2_score(y_test, y_pred)
# yup we are pretty close!!

0.43327822777022706

Note that the "learning schedule" is implemented automatically in SGDRegressor. We just have to leave `learning_rate` at its default value (`'invscaling'`).