# Block 4: Multivariate Linear Regression

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the cars dataset from disk
df = pd.read_json('../data/cars.json')

# Columns that must all be populated for a row to be kept
required_cols = [
    'Miles_per_Gallon', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight_in_lbs', 'Acceleration', 'Origin',
]

# Boolean mask: True for rows with no NaN in any of the required columns
ix_included = df[required_cols].notna().all(axis=1)

# Drop the incomplete rows, reporting the frame's shape before and after
print("Before: ", df.shape)
df = df.loc[ix_included]
print("After: ", df.shape)

Before:  (406, 9)
After:  (392, 9)


In [5]:
# One-hot encoding demo: get_dummies yields one indicator column per distinct label
fruit_labels = ['Apple', 'Banana', 'Strawberry']
s = pd.Series(fruit_labels)
r = pd.get_dummies(s)

In [8]:
def optimize(X, y, eta, steps, seed=None):
    """Fit a linear regression by batch gradient descent on the mean squared error.

    The features are z-scored column-wise and a leading column of ones is
    prepended, so the first coefficient is the intercept and the remaining
    ones are expressed per standard deviation of the corresponding feature.

    Parameters
    ----------
    X : array-like, shape (N, K)
        Raw (unstandardized) feature matrix.
    y : array-like, shape (N,)
        Target values.
    eta : float
        Learning rate (step size).
    steps : int
        Number of gradient-descent iterations.
    seed : int or None, optional
        Seed for the random initial coefficients. With the default ``None``
        the global NumPy random state is used.

    Returns
    -------
    list
        One ``[Beta, mse]`` entry per iteration, where ``Beta`` (length K + 1)
        and ``mse`` are recorded *before* that iteration's update is applied.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # z-score (NxK - 1xK) / 1xK = NxK
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # A constant column has zero spread; dividing by it would be 0/0 and turn
    # every coefficient into NaN. Use 1 so such a column becomes all zeros.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X = (X - mu[None, :]) / sigma[None, :]

    # add a column of ones
    ones_col = np.ones((X.shape[0], 1)) # Nx1
    X = np.hstack((ones_col, X))

    # randomly initialize solution (uniform in [0, 1))
    if seed is None:
        Beta = np.random.rand(X.shape[1]) # K
    else:
        Beta = np.random.default_rng(seed).random(X.shape[1]) # K

    # iterate for steps
    history = []

    for i in range(steps):
        # compute model predictions
        yhat = X @ Beta # N
        mse = np.mean(np.square(yhat - y))
        history.append([Beta, mse])

        # compute gradient at those predictions
        # (NxK).T @ N = K
        Beta_grad = 2 * X.T @ (yhat - y) / X.shape[0]

        # update solution (creates a new array, so stored history entries are not mutated)
        Beta = Beta - eta * Beta_grad

    return history

# Assemble the design matrix: numeric columns followed by one-hot Origin indicators
numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration']
dummies = pd.get_dummies(df['Origin']).to_numpy().astype(float)
X = np.concatenate([df[numeric_features].to_numpy(), dummies], axis=1)

# Run gradient descent with learning rate 0.1 for 100 iterations
history = optimize(X, df.Miles_per_Gallon.to_numpy(), 0.1, 100)

# The last history entry holds the coefficients and MSE recorded in the final iteration
final_p, final_mse = history[-1]
print(final_p, final_mse, sep='\n')

[23.44591836 -0.53549452 -0.2016696  -2.2226008  -3.36943651 -0.20985255
  0.50922226  1.29688988  0.25837811]
17.060868202428217
