In [55]:
import copy, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Multiple Linear Regression
We're going to use a small toy data set to apply what we've learnt about multiple linear regression. Each row is an observation of a student's test results for three term exams and their final exam. We'll use three features (Exams 1, 2, and 3) to try to predict the student's final exam result. 

In [6]:
# Read the exam-score table from a CSV given by relative path, then preview the first rows.
df = pd.read_csv("psych_test_results.csv")
df.head()

Unnamed: 0,EXAM1,EXAM2,EXAM3,FINAL
0,73,80,75,152
1,93,88,93,185
2,89,91,90,180
3,96,98,100,196
4,73,66,70,142


In [15]:
# No hold-out split yet: every row is used for fitting, so there is no unseen
# data left to judge the model's performance on. This gets fixed later.

X_train = np.array(df.iloc[:, :3])   # first three columns -> feature matrix
y_train = np.array(df.iloc[:, 3])    # fourth column -> target vector

In [21]:
# Sanity check: report the dimensions of the feature matrix and the target vector.
print(f"X_train shape: {X_train.shape} and\ny_train shape: {y_train.shape}")

X_train shape: (25, 3) and
y_train shape: (25,)


In [44]:
# Start gradient descent from all-zero parameters.
b_init = 0
# One weight per feature: size the vector from the data instead of hard-coding 3,
# so it stays in step with the number of columns in X_train.
w_init = np.zeros(X_train.shape[1])
print(w_init.shape)

(3,)


We're carefully keeping track of the shape of our data and ensuring that our model's weights (w) have the correct shape for the number of features. 

The model's prediction with multiple variables is given by the linear model:

$$ f_{\mathbf{w},b}(\mathbf{x}) =  w_0x_0 + w_1x_1 +... + w_{n-1}x_{n-1} + b \tag{1}$$
or in vector notation:
$$ f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b  \tag{2} $$ 
where $\cdot$ is a vector `dot product`

In [48]:
def compute_cost(X, y, w, b):
    """
    Halved mean squared error of the linear model f(x) = w . x + b.

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      cost (scalar): sum of squared prediction errors divided by 2*m
    """
    n_examples = X.shape[0]
    squared_error_sum = 0.0

    # Walk the rows in order, adding each example's squared residual to the total.
    for i, x_i in enumerate(X):
        prediction = np.dot(x_i, w) + b          # scalar model output for example i
        squared_error_sum = squared_error_sum + (prediction - y[i])**2

    return squared_error_sum / (2 * n_examples)

In [49]:
def compute_gradient(X, y, w, b):
    """
    Computes gradient for linear regression
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
    Returns (in this order -- the bias gradient comes first):
      dj_db (scalar)      : The gradient of the cost w.r.t. the parameter b
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w
    Raises:
      ZeroDivisionError: if X contains no examples (m == 0)
    """
    m, _ = X.shape  # m examples; unpacking also enforces that X is 2-D

    # Averaging over zero examples is undefined; fail loudly rather than return NaN.
    if m == 0:
        raise ZeroDivisionError("cannot compute a gradient from zero examples")

    # Prediction error for every example at once, shape (m,)
    err = (np.dot(X, w) + b) - y

    # Column j of X dotted with err is sum_i err[i] * X[i, j]; average it over m
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m

    return dj_db, dj_dw

In [50]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    """
    Batch gradient descent. Starting from (w_in, b_in), moves the parameters
    against the gradient num_iters times, each step scaled by alpha.

    Args:
      X (ndarray (m,n))  : Data, m examples with n features
      y (ndarray (m,))   : target values
      w_in (ndarray (n,)): initial model parameters
      b_in (scalar)      : initial model parameter
      cost_function      : function to compute cost (accepted but not called here)
      gradient_function  : function to compute gradient; called as
                           gradient_function(X, y, w, b) and expected to
                           return (dj_db, dj_dw)
      alpha (float)      : learning rate
      num_iters (int)    : number of iterations to run gradient descent

    Returns:
      w (ndarray (n,)) : updated values of parameters
      b (scalar)       : updated value of parameter
    """
    # Work on a private copy so the caller's initial weights are never touched.
    w, b = copy.deepcopy(w_in), b_in

    for _ in range(num_iters):
        # Both gradients are evaluated at the current (w, b) before either is updated.
        grad_b, grad_w = gradient_function(X, y, w, b)

        w = w - alpha * grad_w
        b = b - alpha * grad_b

    return w, b

In [51]:
# Baseline: cost at the initial parameters, before any gradient steps are taken.
cost = compute_cost(X=X_train, y=y_train, w=w_init, b=b_init)
print(f'cost for parameters w and b initialized at 0: {cost}')

cost for parameters w and b initialized at 0: 13405.98


In [57]:
alpha = 1e-5  # learning rate (step size) handed to gradient_descent

In [66]:
# Fit the model: 1000 gradient steps starting from the initial parameters.
w, b = gradient_descent(
    X_train,
    y_train,
    w_init,
    b_init,
    cost_function=compute_cost,
    gradient_function=compute_gradient,
    alpha=alpha,
    num_iters=1000,
)

In [67]:
# Cost on the training data with the learned parameters (shown as the cell's output).
compute_cost(X=X_train, y=y_train, w=w, b=b)

4.974475611055593