In [18]:
import numpy as np
import math
import numpy as np
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

class Regressor:

    def __init__(self) -> None: 
        self.X, self.y = self.generate_dataset(n_samples=200, n_features=1)
        n, d = self.X.shape
        self.w = np.zeros((d, 1))
        self.history = list()
        self.weights=np.zeros((d, 1))

    def generate_dataset(self, n_samples, n_features):
        """
        Generates a regression dataset
        Returns:
            X: a numpy.ndarray of shape (100, 2) containing the dataset
            y: a numpy.ndarray of shape (100, 1) containing the labels
        """
        from sklearn.datasets import make_regression
        
        np.random.seed(42)
        X, y = make_regression(n_samples=n_samples, n_features=n_features, noise=30)
        y = y.reshape(n_samples, 1)
        return X, y


    def linear_regression(self):
        """
        Performs linear regression on a dataset
        Returns:
            y: a numpy.ndarray of shape (n, 1) containing the predictions
        """
        y = np.dot(self.X, self.w)
        return y


    def predict(self, X):
        """
        Predicts the labels for a given dataset
        X: a numpy.ndarray of shape (n, d) containing the dataset
        Returns:
            y: a numpy.ndarray of shape (n,) containing the predictions
        """
        y = np.dot(X, self.w).reshape(X.shape[0])
        return y


    def compute_loss(self):
        """
        Computes the MSE loss of a prediction
        Returns:
            loss: the loss of the prediction
        """
        predictions = self.linear_regression()
        loss = np.mean((predictions - self.y)**2)
        return loss


    def compute_gradient(self):
        """
        Computes the gradient of the MSE loss
        Returns:
            grad: the gradient of the loss with respect to w
        """
        predictions = self.linear_regression()
        dif = (predictions - self.y)
        grad = 2 * np.dot(self.X.T, dif)
        return grad


    def fit(self, optimizer="adam", n_iters=1000, render_animation=False):
        """
        Trains the model
        optimizer: the optimization algorithm to use
        X: a numpy.ndarray of shape (n, d) containing the dataset
        y: a numpy.ndarray of shape (n, 1) containing the labels
        n_iters: the number of iterations to train for
        """        

        figs = []

        for i in range(1, n_iters+1):
            

            if optimizer == 'gd':
                # TODO: implement gradient descent
                self.weights = self.gradient_descent(alpha=0.01)
                
                
            elif optimizer == "sgd":
                # TODO: implement stochastic gradient descent
                self.sgd_optimizer(alpha=0.01)
                
            elif optimizer == "sgdMomentum":
                # TODO: Implement the SGD with momentum
                self.sgd_momentum(alpha=0.01, momentum=0.7)
                
            elif optimizer == "adagrad":
                # TODO: Implement Adagrad
                self.adagrad_optimizer(g=0, alpha=0.3, epsilon=10**-8 )
                
            elif optimizer == "rmsprop":
                # TODO: implement RMSprop
                self.rmsprop_optimizer(g=0, alpha=0.3, beta=0.7, epsilon=10**-8)
                
            elif optimizer == "adam":
                # TODO: implement Adam optimizer
                self.adam_optimizer(m=0, v=0, alpha=0.3, beta1=0.9, beta2=0.999, epsilon=10**-8, iter_num=i)

            # TODO: implement the stop criterion
            if math.isinf(float(self.compute_loss())):
                    break
                    
                 
            
            if i % 10 == 0:
                print("Iteration: ", i)
                J = self.compute_loss()
                self.history.append(J)
                print("Loss: ", J)
            
            if render_animation:
                import matplotlib.pyplot as plt
                from moviepy.video.io.bindings import mplfig_to_npimage

                fig = plt.figure()
                plt.scatter(self.X, self.y, color='red')
                plt.plot(self.X, self.predict(self.X), color='blue')
                plt.xlim(self.X.min(), self.X.max())
                plt.ylim(self.y.min(), self.y.max())
                plt.title(f'Optimizer:{optimizer}\nIteration: {i}')
                plt.close()
                figs.append(mplfig_to_npimage(fig))
            
        
        if render_animation and len(figs) > 0:
            from moviepy.editor import ImageSequenceClip
            clip = ImageSequenceClip(figs, fps=5)
            clip.write_gif(f'{optimizer}_animation.gif', fps=5)


    def gradient_descent(self, alpha):
        """
        Performs gradient descent to optimize the weights
        alpha: the learning rate
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
        """
        w=self.w
        # TODO: implement gradient descent
        n, d = self.X.shape
        diff=np.dot(self.X,self.w)-self.y
        TEMP=np.dot(self.X.T,diff)
        coef=alpha/n
        self.w=self.w-coef*TEMP

        return self.w


    def sgd_optimizer(self, alpha):
        """
        Performs stochastic gradient descent to optimize the weights
        alpha: the learning rate
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
        """
        w=self.w
        gradian = self.compute_gradient()
        # TODO: implement gradient descent
        n, d = self.X.shape
#         diff=np.dot(self.X,self.w)-self.y
#         TEMP=np.dot(self.X.T,diff)
        coef=alpha/n
        self.w=self.w-coef*gradian

        return self.w


    def sgd_momentum(self, alpha=0.01, momentum=0.7):
        """
        Performs SGD with momentum to optimize the weights
        alpha: the learning rate
        momentum: the momentum
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
        """
        change = 0.0
        w = self.w
        # TODO: implement stochastic gradient descent
        gradient=self.compute_gradient()
        change = 0.0
        new_change = alpha * gradient + momentum * change
        self.w=self.w-new_change
        change =new_change

        return self.w

    
    def adagrad_optimizer(self, g, alpha, epsilon):
        
        """
        Performs Adagrad optimization to optimize the weights
        alpha: the learning rate
        epsilon: a small number to avoid division by zero
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
            ...
        """
        w = self.w
        # TODO: implement stochastic gradient descent
        gradient=self.compute_gradient()
#         squered_grad = np.zeros(gradient.shape)
#         new_squered_grad =  np.power(gradient,2) 
        new_g= g + np.power(gradient,2) 
        temp = np.sqrt(new_g  )        
        self.w=self.w -alpha*(gradient/temp+ epsilon)
        g = new_g
        return self.w

    
    def rmsprop_optimizer(self, g, alpha, beta, epsilon):
        """
        Performs RMSProp optimization to optimize the weights
        g: sum of squared gradients
        alpha: the learning rate
        beta: the momentum
        epsilon: a small number to avoid division by zero
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
            ...
        """
        w = self.w
        # TODO: implement stochastic gradient descent
        gradient=self.compute_gradient()
        
        new_g= beta*g + (1-beta)*np.power(gradient,2) 
        temp = np.sqrt(new_g  )        
        self.w=self.w -alpha*(gradient/temp+ epsilon)
        g = new_g
        return self.w


    def adam_optimizer(self, m, v, alpha, beta1, beta2, epsilon, iter_num):
        """
        Performs Adam optimization to optimize the weights
        m: the first moment vector
        v: the second moment vector
        alpha: the learning rate
        beta1: the first momentum
        beta2: the second momentum
        epsilon: a small number to avoid division by zero
        Returns:
            w: a numpy.ndarray of shape (d, 1) containing the optimized weights
            ...
        """
        w = self.w
        # TODO: implement stochastic gradient descent
        gradient=self.compute_gradient()
        
        new_m= beta1*m + (1-beta1)*gradient
        new_v= beta2*v + (1-beta2)*np.power(gradient,2)
        
        m = new_m
        v = new_v
        
        
        m_corrected =m/(1-beta1**iter_num)
        v_corrected =v/(1-beta2**iter_num)
        
        
        
        temp = np.sqrt(v_corrected)        
        self.w=self.w -alpha*(m_corrected/temp+ epsilon)
        
        return self.w


#     def plot_gradient(self):
#         """
#         Plots the gradient descent path for the loss function
#         Useful links: 
#         -   http://www.adeveloperdiary.com/data-science/how-to-visualize-gradient-descent-using-contour-plot-in-python/
#         -   https://www.youtube.com/watch?v=zvp8K4iX2Cs&list=LL&index=2
#         """
#         # TODO: Bonus!
#         # grid over which we will calculate J
# #         theta0_vals = np.linspace(-10, 10, 100)
# #         theta1_vals = np.linspace(-1, 4, 100)

#         # initialize J_vals to a matrix of 0's
#         J_vals = self.history

# #         # Fill out J_vals
# #         for i, theta0 in enumerate(theta0_vals):
# #             for j, theta1 in enumerate(theta1_vals):
# #                 J_vals[i, j] = computeCost(X, y, [theta0, theta1])

       
# #         J_vals = J_vals.T

#         # surface plot
#         fig = pyplot.figure(figsize=(12, 5))
#         ax = fig.add_subplot(121, projection='3d')
#         ax.plot_surface(self.weights, self.history, cmap='viridis')
#         pyplot.xlabel('theta0')
#         pyplot.ylabel('theta1')
#         pyplot.title('Surface')

        
#         ax = pyplot.subplot(122)
#         pyplot.contour(self.weights, self.history, linewidths=2, cmap='viridis', levels=np.logspace(-2, 3, 20))
#         pyplot.xlabel('theta0')

#         pyplot.plot(self.weights , 'ro', ms=10, lw=2)
#         pyplot.title('Contour, showing minimum')
#         pass
        
        
        

In [19]:
R=Regressor()
R.fit()
# R.plot_gradient()

Iteration:  10
Loss:  7120.4637724083805
Iteration:  20
Loss:  6907.244158003389
Iteration:  30
Loss:  6677.092566674425
Iteration:  40
Loss:  6424.156026588362
Iteration:  50
Loss:  6149.717994214673
Iteration:  60
Loss:  5857.0917597460375
Iteration:  70
Loss:  5550.077707476951
Iteration:  80
Loss:  5232.425481357666
Iteration:  90
Loss:  4907.672793404865
Iteration:  100
Loss:  4579.12340455296
Iteration:  110
Loss:  4249.869374398241
Iteration:  120
Loss:  3922.8222367334447
Iteration:  130
Loss:  3600.741676376906
Iteration:  140
Loss:  3286.2590914425637
Iteration:  150
Loss:  2981.89627039719
Iteration:  160
Loss:  2690.0801001133123
Iteration:  170
Loss:  2413.1542164314833
Iteration:  180
Loss:  2153.388333613759
Iteration:  190
Loss:  1912.9858085578621
Iteration:  200
Loss:  1694.08985055113
Iteration:  210
Loss:  1498.788679651622
Iteration:  220
Loss:  1329.1198589528647
Iteration:  230
Loss:  1187.0739699868354
Iteration:  240
Loss:  1074.5977599932166
Iteration:  250
Lo