In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from numpy.linalg import norm
from sklearn.metrics import r2_score

In [2]:
def grad_descent(x, y, alpha, max_iterations, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with full-batch gradient descent.

    Both parameters start at zero. Each iteration evaluates the halved
    mean-squared-error cost and its gradient on the whole data set and then
    moves the parameters against the gradient with the fixed step ``alpha``.

    From the second iteration on, the loop stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the gradient that was just applied,
    * the absolute change of the cost between consecutive iterations,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Learning rate.
    max_iterations : int
        Upper bound on the number of parameter updates. With a value of
        zero or less no update is performed and ``(0.0, 0.0)`` is returned.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    if no_samples == 0:
        raise ValueError("grad_descent needs at least one sample")

    theta0, theta1 = 0.0, 0.0
    # Only the previous iteration is needed for the stopping rules, so no
    # per-iteration history is retained.
    prev_cost = prev_theta_norm = None
    for _ in range(max_iterations):
        err = theta0 + theta1 * x - y
        cost = np.sum(err ** 2) / (2 * no_samples)
        grad0 = np.sum(err) / no_samples
        grad1 = np.sum(err * x) / no_samples

        theta0 = theta0 - alpha * grad0
        theta1 = theta1 - alpha * grad1
        theta_norm = norm([theta0, theta1])

        # Stopping rules are evaluated from the second iteration on, once a
        # previous cost / parameter norm exists to compare against.
        if prev_cost is not None and (
            norm([grad0, grad1]) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1

In [3]:
def mini_batch_grad_descent(x, y, alpha, max_iterations, batches, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with mini-batch gradient descent.

    The samples are split, in their given order, into ``batches`` contiguous
    chunks of near-equal size (every sample belongs to exactly one chunk; no
    shuffling is done). One epoch performs one parameter update per chunk,
    using the mean gradient of the halved squared error over that chunk only.

    After every epoch the cost and gradient are evaluated on the full data
    set. From the second epoch on, training stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the full-data gradient,
    * the absolute change of the full-data cost between consecutive epochs,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Learning rate.
    max_iterations : int
        Upper bound on the number of epochs. With a value of zero or less no
        update is performed and ``(0.0, 0.0)`` is returned.
    batches : int
        Number of mini-batches per epoch; must satisfy
        ``1 <= batches <= len(y)``.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``batches`` is outside ``1 .. len(y)`` (this includes empty data).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    batches = int(batches)
    if not 1 <= batches <= no_samples:
        raise ValueError(
            "batches must be between 1 and the number of samples "
            "(got batches=%d for %d samples)" % (batches, no_samples)
        )

    # Chunk boundaries: bounds[k]..bounds[k+1] is the k-th mini-batch. The
    # integer arithmetic spreads any remainder so no sample is dropped and
    # no chunk is empty.
    bounds = [(k * no_samples) // batches for k in range(batches + 1)]

    theta0, theta1 = 0.0, 0.0
    prev_cost = prev_theta_norm = None
    for _ in range(max_iterations):
        for start, stop in zip(bounds[:-1], bounds[1:]):
            x_batch = x[start:stop]
            err = theta0 + theta1 * x_batch - y[start:stop]
            batch_size = stop - start
            # Both gradient components come from the same pre-update error.
            grad0 = np.sum(err) / batch_size
            grad1 = np.sum(err * x_batch) / batch_size
            theta0 = theta0 - alpha * grad0
            theta1 = theta1 - alpha * grad1

        # Epoch-level diagnostics on the full data set drive the stop rules,
        # so they do not depend on which chunk happened to come last.
        full_err = theta0 + theta1 * x - y
        cost = np.sum(full_err ** 2) / (2 * no_samples)
        full_grad = [np.sum(full_err) / no_samples,
                     np.sum(full_err * x) / no_samples]
        theta_norm = norm([theta0, theta1])

        if prev_cost is not None and (
            norm(full_grad) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1

In [4]:
def stochastic_grad_descent(x, y, alpha, max_iterations, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with stochastic gradient descent.

    One epoch visits the samples in their given order (no shuffling) and
    updates both parameters after every single sample, using the gradient of
    that sample's halved squared error.

    After every epoch the cost and gradient are evaluated on the full data
    set. From the second epoch on, training stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the full-data gradient,
    * the absolute change of the full-data cost between consecutive epochs,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Learning rate.
    max_iterations : int
        Upper bound on the number of epochs. With a value of zero or less no
        update is performed and ``(0.0, 0.0)`` is returned.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    if no_samples == 0:
        raise ValueError("stochastic_grad_descent needs at least one sample")

    theta0, theta1 = 0.0, 0.0
    prev_cost = prev_theta_norm = None
    for _ in range(max_iterations):
        for j in range(no_samples):
            err = theta0 + theta1 * x[j] - y[j]
            # Both gradient components come from the same pre-update error.
            grad0 = err
            grad1 = err * x[j]
            theta0 = theta0 - alpha * grad0
            theta1 = theta1 - alpha * grad1

        # A single sample's cost/gradient is too noisy to judge convergence,
        # so the stop rules look at full-data quantities once per epoch.
        full_err = theta0 + theta1 * x - y
        cost = np.sum(full_err ** 2) / (2 * no_samples)
        full_grad = [np.sum(full_err) / no_samples,
                     np.sum(full_err * x) / no_samples]
        theta_norm = norm([theta0, theta1])

        if prev_cost is not None and (
            norm(full_grad) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1

In [5]:
def adagrad(x, y, alpha, epsilon, max_iterations, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with full-batch AdaGrad.

    Both parameters start at zero. Each iteration computes the gradient of
    the halved mean-squared-error cost on the whole data set, adds the
    squared gradient components to per-parameter accumulators ``v0``/``v1``
    and scales the step of each parameter by
    ``alpha / (sqrt(v) + epsilon)``.

    From the second iteration on, the loop stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the gradient that was just applied,
    * the absolute change of the cost between consecutive iterations,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Base learning rate.
    epsilon : float
        Small constant added to ``sqrt(v)`` in the step-size denominator.
    max_iterations : int
        Upper bound on the number of parameter updates. With a value of
        zero or less no update is performed and ``(0.0, 0.0)`` is returned.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    if no_samples == 0:
        raise ValueError("adagrad needs at least one sample")

    theta0, theta1 = 0.0, 0.0
    v0, v1 = 0.0, 0.0
    prev_cost = prev_theta_norm = None
    for _ in range(max_iterations):
        err = theta0 + theta1 * x - y
        cost = np.sum(err ** 2) / (2 * no_samples)
        grad0 = np.sum(err) / no_samples
        grad1 = np.sum(err * x) / no_samples

        # Running sums of squared gradients; they only ever grow, so the
        # effective step of each parameter shrinks over time.
        v0 = v0 + grad0 ** 2
        v1 = v1 + grad1 ** 2
        step0 = alpha / (np.sqrt(v0) + epsilon)
        step1 = alpha / (np.sqrt(v1) + epsilon)

        theta0 = theta0 - step0 * grad0
        theta1 = theta1 - step1 * grad1
        theta_norm = norm([theta0, theta1])

        # Stopping rules are evaluated from the second iteration on.
        if prev_cost is not None and (
            norm([grad0, grad1]) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1

In [6]:
def rmsprob(x, y, alpha, epsilon, beta, max_iterations, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with full-batch RMSProp.

    Both parameters start at zero. Each iteration computes the gradient of
    the halved mean-squared-error cost on the whole data set, updates an
    exponentially weighted average of the squared gradient components
    (``v = beta * v + (1 - beta) * grad**2``) and scales the step of each
    parameter by ``alpha / (sqrt(v) + epsilon)``.

    From the second iteration on, the loop stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the gradient that was just applied,
    * the absolute change of the cost between consecutive iterations,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Base learning rate.
    epsilon : float
        Small constant added to ``sqrt(v)`` in the step-size denominator.
    beta : float
        Decay factor of the squared-gradient average.
    max_iterations : int
        Upper bound on the number of parameter updates. With a value of
        zero or less no update is performed and ``(0.0, 0.0)`` is returned.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    if no_samples == 0:
        raise ValueError("rmsprob needs at least one sample")

    theta0, theta1 = 0.0, 0.0
    v0, v1 = 0.0, 0.0
    prev_cost = prev_theta_norm = None
    for _ in range(max_iterations):
        err = theta0 + theta1 * x - y
        cost = np.sum(err ** 2) / (2 * no_samples)
        grad0 = np.sum(err) / no_samples
        grad1 = np.sum(err * x) / no_samples

        # Exponentially weighted average of squared gradients (no bias
        # correction is applied here).
        v0 = beta * v0 + (1 - beta) * (grad0 ** 2)
        v1 = beta * v1 + (1 - beta) * (grad1 ** 2)
        step0 = alpha / (np.sqrt(v0) + epsilon)
        step1 = alpha / (np.sqrt(v1) + epsilon)

        theta0 = theta0 - step0 * grad0
        theta1 = theta1 - step1 * grad1
        theta_norm = norm([theta0, theta1])

        # Stopping rules are evaluated from the second iteration on.
        if prev_cost is not None and (
            norm([grad0, grad1]) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1

In [7]:
def adam(x, y, alpha, epsilon, beta1, beta2, max_iterations, tol=0.0001):
    """Fit ``y ~ theta0 + theta1 * x`` with full-batch Adam.

    Both parameters start at zero. Each iteration computes the gradient of
    the halved mean-squared-error cost on the whole data set, updates
    exponentially weighted averages of the gradient (``m``, decay ``beta1``)
    and of the squared gradient (``v``, decay ``beta2``), divides both by
    ``1 - beta**t`` (``t`` being the 1-based iteration number) to correct
    their start-up bias, and then moves each parameter by
    ``alpha / (sqrt(v_hat) + epsilon) * m_hat``.

    From the second iteration on, the loop stops early as soon as any of the
    following drops below ``tol``:

    * the Euclidean norm of the raw gradient of the current iteration,
    * the absolute change of the cost between consecutive iterations,
    * the absolute change of the Euclidean norm of ``(theta0, theta1)``.

    Parameters
    ----------
    x, y : array-like
        One-dimensional inputs and targets of equal length. They are
        converted to float arrays, so plain lists are accepted.
    alpha : float
        Base learning rate.
    epsilon : float
        Small constant added to ``sqrt(v_hat)`` in the step-size denominator.
    beta1 : float
        Decay factor of the gradient average.
    beta2 : float
        Decay factor of the squared-gradient average.
    max_iterations : int
        Upper bound on the number of parameter updates. With a value of
        zero or less no update is performed and ``(0.0, 0.0)`` is returned.
    tol : float, optional
        Threshold shared by the three stopping rules (default ``0.0001``).

    Returns
    -------
    tuple
        ``(theta0, theta1)`` after the last update performed.

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    no_samples = len(y)
    if no_samples == 0:
        raise ValueError("adam needs at least one sample")

    theta0, theta1 = 0.0, 0.0
    m0, m1 = 0.0, 0.0
    v0, v1 = 0.0, 0.0
    prev_cost = prev_theta_norm = None
    # The step counter is 1-based because the bias correction divides by
    # 1 - beta**step, which would be zero for step 0.
    for step in range(1, max_iterations + 1):
        err = theta0 + theta1 * x - y
        cost = np.sum(err ** 2) / (2 * no_samples)
        grad0 = np.sum(err) / no_samples
        grad1 = np.sum(err * x) / no_samples

        # First moment (mean of gradients).
        m0 = beta1 * m0 + (1 - beta1) * grad0
        m1 = beta1 * m1 + (1 - beta1) * grad1
        # Second moment (mean of squared gradients).
        v0 = beta2 * v0 + (1 - beta2) * (grad0 ** 2)
        v1 = beta2 * v1 + (1 - beta2) * (grad1 ** 2)

        # Bias-corrected moments.
        m_scale = 1 - beta1 ** step
        v_scale = 1 - beta2 ** step
        m0_hat, m1_hat = m0 / m_scale, m1 / m_scale
        v0_hat, v1_hat = v0 / v_scale, v1 / v_scale

        step0 = alpha / (np.sqrt(v0_hat) + epsilon)
        step1 = alpha / (np.sqrt(v1_hat) + epsilon)
        theta0 = theta0 - step0 * m0_hat
        theta1 = theta1 - step1 * m1_hat
        theta_norm = norm([theta0, theta1])

        # Stopping rules are evaluated from the second iteration on.
        if prev_cost is not None and (
            norm([grad0, grad1]) < tol
            or abs(cost - prev_cost) < tol
            or abs(theta_norm - prev_theta_norm) < tol
        ):
            break
        prev_cost, prev_theta_norm = cost, theta_norm
    return theta0, theta1