# Gradient Descent


In [1]:
import numpy as np
import matplotlib.pyplot as plt

## Linear Function

In [3]:
def LinearFunction(W, b, x):
    """Evaluate the linear model ``W * x + b``.

    W is the slope (weight), b is the intercept (bias) and x is the input.
    The result is whatever ``W * x + b`` produces for the given operands.
    """
    scaled = W * x
    return scaled + b

## Mean Squared Error / Loss Function

In [5]:
def MeanSquareError(W, b, x, y):
    """Loss function: mean of the squared residuals of the linear model.

    The residual is ``LinearFunction(W, b, x) - y``; the squared residuals
    are averaged over all elements with ``np.mean``.
    """
    residual = LinearFunction(W, b, x) - y
    squared = residual ** 2
    return np.mean(squared)

## Gradient Function

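Change in the loss function with respect to W = mean(x * Error),

change in the loss function with respect to b = mean(Error), where Error = W*x + b - y (the constant factor 2 that comes from differentiating the square is left out).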

In [7]:
def gradient(W, b, x, y):
    """Return the loss derivatives with respect to W and b.

    With ``residual = W*x + b - y`` the function returns
    ``(mean(x * residual), mean(residual))``, both averaged over the last
    axis. The constant factor 2 of the exact mean-squared-error derivative
    is not included.
    """
    residual = W * x + b - y
    grad_W = np.mean(x * residual, axis=-1)
    grad_b = np.mean(residual, axis=-1)
    return grad_W, grad_b

## Vanilla Gradient Descent

In [None]:
def vanilla_gradient_descent(W, b, x, y, lr=1e-5, epsilon=1e-4):
    """Fit ``W`` and ``b`` with plain full-batch gradient descent.

    Each iteration evaluates the loss on (x, y); the loop stops as soon as
    the loss changes by less than ``epsilon`` compared with the previous
    iteration (the "previous" loss starts at 0).

    Parameters:
        W, b    -- initial weight and bias.
        x, y    -- inputs and targets passed to ``gradient`` and
                   ``MeanSquareError``.
        lr      -- learning rate (step size).
        epsilon -- convergence threshold on the change in loss.

    Returns:
        (W, b, error) where ``error`` is a 1-D float array holding the loss
        recorded at every iteration that performed an update.
    """
    prev_error = 0
    history = []  # plain list: appending is O(1), unlike repeated np.insert
    while True:
        # Evaluate the loss once per iteration and reuse it below.
        current_error = MeanSquareError(W, b, x, y)
        if abs(current_error - prev_error) < epsilon:
            break

        prev_error = current_error
        history.append(current_error)

        # Gradients are only needed when an update is actually performed.
        gradient_W, gradient_b = gradient(W, b, x, y)
        W -= lr * gradient_W
        b -= lr * gradient_b

    # The original returned the undefined name ``a`` here (NameError).
    return W, b, np.array(history, dtype=float)

## Momentum Gradient Descent

In [14]:
def momentum_gradient_descent(W, b, x, y, lr=1e-5, momentum=0.9, epsilon=1e-4, batch_size=0):
    """Gradient descent with a momentum term and optional mini-batches.

    Every iteration shuffles (x, y) via ``shuffle`` (not defined in this
    section; expected to be available in the enclosing scope), computes the
    gradient on the first ``batch_size`` shuffled samples and evaluates the
    loss on the whole shuffled data. ``batch_size == 0`` means "use
    ``len(x)`` samples". The loop ends once the loss changes by less than
    ``epsilon`` between consecutive iterations.

    Returns (W, b, error), where ``error`` is the array of recorded losses.
    """
    if batch_size == 0:
        batch_size = len(x)

    step_W = 0  # previous update applied to W
    step_b = 0  # previous update applied to b
    prev_error = 0
    error = np.array([])

    while True:
        x_shuffled, y_shuffled = shuffle(x, y)
        x_batch = x_shuffled[:batch_size]
        y_batch = y_shuffled[:batch_size]
        gradient_W, gradient_b = gradient(W, b, x_batch, y_batch)

        current_error = MeanSquareError(W, b, x_shuffled, y_shuffled)
        converged = abs(current_error - prev_error) < epsilon
        if converged:
            break
        prev_error = current_error
        error = np.append(error, current_error)

        # New step = learning-rate-scaled gradient plus a fraction of the
        # previous step; it is applied now and remembered for the next pass.
        step_W = lr * gradient_W + momentum * step_W
        step_b = lr * gradient_b + momentum * step_b
        W -= step_W
        b -= step_b

    return W, b, error

## ADAGRAD
- Adaptive technique: the effective learning rate is adjusted based on the squared gradients accumulated over all previous iterations.

In [15]:
def adagrad_gradient_descent(W, b, x, y, lr=1e-5, epsilon=1e-4):
    """Gradient descent with AdaGrad-style per-parameter step scaling.

    The squared gradients of W and b are summed over all iterations; each
    update divides ``lr`` by the square root of that running sum (plus 1e-8
    to avoid division by zero). Iteration stops when the loss changes by
    less than ``epsilon`` between consecutive iterations.

    Returns (W, b, error), where ``error`` is the array of recorded losses.
    """
    prev_error = 0
    sq_sum_W = 0  # running sum of squared W-gradients
    sq_sum_b = 0  # running sum of squared b-gradients
    error = np.array([])

    while True:
        gradient_W, gradient_b = gradient(W, b, x, y)

        current_error = MeanSquareError(W, b, x, y)
        if abs(current_error - prev_error) < epsilon:
            break
        prev_error = current_error
        error = np.append(error, current_error)

        sq_sum_W += gradient_W ** 2
        sq_sum_b += gradient_b ** 2

        rate_W = lr / (sq_sum_W ** 0.5 + 1e-8)
        rate_b = lr / (sq_sum_b ** 0.5 + 1e-8)
        W -= rate_W * gradient_W
        b -= rate_b * gradient_b

    return W, b, error

## RMS Prop

- Damps out oscillation in vertical direction and so converges quickly
- Allows use of larger value of learning rate

In [16]:
def rmsprop_gradient_descent(a, b, x, y, lr=1e-5, gamma=0.9, epsilon=1e-4):
    """Gradient descent with RMSProp-style step scaling.

    Keeps an exponentially weighted average of the squared gradients
    (decay ``gamma``) for the weight ``a`` and the bias ``b``; each update
    divides ``lr`` by the square root of that average (plus 1e-8). The loop
    ends when the loss changes by less than ``epsilon`` between
    consecutive iterations.

    Returns (a, b, error), where ``error`` is the array of recorded losses.
    """
    prev_error = 0
    avg_sq_a = 0  # running average of squared a-gradients
    avg_sq_b = 0  # running average of squared b-gradients
    error = np.array([])

    while True:
        gradient_a, gradient_b = gradient(a, b, x, y)

        current_error = MeanSquareError(a, b, x, y)
        if abs(current_error - prev_error) < epsilon:
            break
        prev_error = current_error
        error = np.append(error, current_error)

        avg_sq_a = gamma * avg_sq_a + (1 - gamma) * (gradient_a ** 2)
        avg_sq_b = gamma * avg_sq_b + (1 - gamma) * (gradient_b ** 2)

        rate_a = lr / (avg_sq_a ** 0.5 + 1e-8)
        rate_b = lr / (avg_sq_b ** 0.5 + 1e-8)
        a -= rate_a * gradient_a
        b -= rate_b * gradient_b

    return a, b, error

## AdamOptimizer

- Momentum + RMS_prop

In [17]:
def adam_gradient_descent(a, b, x, y, lr=1e-5, b1=0.9, b2=0.999, epsilon=1e-4):
    """Gradient descent using Adam-style first and second moment estimates.

    For the weight ``a`` and the bias ``b`` the function tracks an
    exponentially weighted average of the gradient (decay ``b1``) and of
    the squared gradient (decay ``b2``), applies the ``1 - decay**t`` bias
    correction and steps by ``lr * m_hat / (sqrt(v_hat) + 1e-8)``.

    Note that ``epsilon`` is the convergence threshold on the change in
    loss between consecutive iterations, not the numerical stabiliser
    (which is the fixed 1e-8 above).

    Returns (a, b, error), where ``error`` is the array of recorded losses.
    """
    prev_error = 0
    m_a = 0
    v_a = 0
    m_b = 0
    v_b = 0
    t = 0  # number of updates performed so far
    error = np.array([])

    while True:
        gradient_a, gradient_b = gradient(a, b, x, y)

        current_error = MeanSquareError(a, b, x, y)
        if abs(current_error - prev_error) < epsilon:
            break
        t += 1
        prev_error = current_error
        error = np.append(error, current_error)

        # Update the biased first (m) and second (v) moment estimates.
        m_a = b1 * m_a + (1 - b1) * gradient_a
        v_a = b2 * v_a + (1 - b2) * gradient_a ** 2
        m_b = b1 * m_b + (1 - b1) * gradient_b
        v_b = b2 * v_b + (1 - b2) * gradient_b ** 2

        # Bias-correction denominators shared by both parameters.
        correction_1 = 1 - b1 ** t
        correction_2 = 1 - b2 ** t
        m_hat_a = m_a / correction_1
        v_hat_a = v_a / correction_2
        m_hat_b = m_b / correction_1
        v_hat_b = v_b / correction_2

        a -= (lr * m_hat_a) / (v_hat_a ** 0.5 + 1e-8)
        b -= (lr * m_hat_b) / (v_hat_b ** 0.5 + 1e-8)

    return a, b, error