In [33]:
import numpy as np

In [34]:
def gradient_descent(X, y, alpha, num_iters):
    """
    Fit linear-regression parameters with batch gradient descent.

    Starting from theta = 0, takes ``num_iters`` steps of size ``alpha`` along
    the negative gradient of the cost
    ``J(theta) = 1/(2m) * sum((X @ theta - y) ** 2)``.

    Parameters
    ----------
    X : array-like
        The feature matrix of shape (m, n+1) where m is the number of examples
        and n is the number of features. The first column should be all ones;
        this function does not add an intercept column itself.
    y : array-like
        The target values of shape (m,). A column vector of shape (m, 1) is
        also accepted and flattened.
    alpha : float
        The learning rate.
    num_iters : int
        The number of iterations to run gradient descent.

    Returns
    -------
    theta : numpy.ndarray
        The learned linear regression parameters. A vector of shape (n+1,).
    J_history : list
        One cost value per iteration. Entry ``i`` is the cost of the
        parameters in effect at the start of iteration ``i`` (i.e. before that
        iteration's update), so ``J_history[0]`` is the cost at theta = 0.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional, has no rows, or ``y`` does not hold
        exactly one target per row of ``X``.
    """
    # Accept any array-like (lists, tuples, ndarrays), as documented.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")

    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one example")
    if y.size != m:
        raise ValueError(
            f"y must contain one target per row of X: expected {m}, got {y.size}"
        )

    # Flatten y so that an (m, 1) column vector cannot broadcast the residual
    # h - y into an (m, m) matrix and silently corrupt the update.
    y = y.reshape(m)

    theta = np.zeros(n)  # initialize parameters to zero
    J_history = []  # cost recorded at every iteration

    for _ in range(num_iters):
        # Hypothesis (predicted values) for the current parameters.
        h = np.dot(X, theta)

        # Gradient of the mean squared error cost with respect to theta.
        grad = (1/m) * np.dot(X.T, (h - y))

        # Take one step against the gradient.
        theta = theta - alpha * grad

        # Cost of the parameters that produced h (the pre-update theta).
        J = 1/(2*m) * np.sum((h - y)**2)
        J_history.append(J)

    return theta, J_history


In [40]:
# Toy training set: four examples with two raw feature columns each.
y = np.array([3, 4, 5, 6])
X = np.array([[1, 2], [1, 3], [1, 4], [1, 5]])

# Prepend a column of ones so the model can learn a bias (intercept) term.
# np.hstack joins arrays side by side: stacking a (4, 1) block of ones with a
# (4, 10) block of zeros, for example, yields a (4, 11) array whose first
# column is all ones and whose remaining columns are zeros.
#
# NOTE(review): the first raw feature column is already a constant 1, so the
# design matrix built here has two identical columns. The intercept is then not
# uniquely determined -- gradient descent from zero splits it evenly between
# the two columns. Confirm whether the raw data was meant to contain a
# genuinely varying first feature.
bias_column = np.ones((len(X), 1))
X = np.hstack((bias_column, X))
X

array([[1., 1., 2.],
       [1., 1., 3.],
       [1., 1., 4.],
       [1., 1., 5.]])

In [41]:
# Fit the model on the toy data: 1000 gradient steps at learning rate 0.1.
theta, J_history = gradient_descent(
    X,
    y,
    alpha=0.1,
    num_iters=1000,
)

In [42]:
# Report the fitted parameters and the tail of the cost curve; the last few
# costs show whether the descent has levelled off.
recent_costs = J_history[-5:]
print("theta: ", theta)
print("Last few values of J_history: ", recent_costs)

theta:  [0.49999998 0.49999998 1.00000001]
Last few values of J_history:  [5.0590067994971576e-17, 4.8954226890936755e-17, 4.737128076267675e-17, 4.583952215259846e-17, 4.435728835764239e-17]


In [43]:
# Score the unseen data point [1, 6, 7]. Its entries line up with the three
# columns of the design matrix: [bias, feature 1, feature 2].
x_new = np.array([1, 6, 7])
y_pred = x_new.dot(theta)
print("Predicted value of y for [1, 6, 7]: ", y_pred)

Predicted value of y for [1, 6, 7]:  10.499999949400335
