# Multivariate Linear Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw cars dataset
df = pd.read_json('../data/cars.json')

#
# Goal: exclude ANY row that has ANY missing value
#

# boolean NxK frame that is True wherever a value is missing
missing_vals = pd.isna(df)

# count how many missing values there are in every row
num_missing = np.sum(missing_vals, axis=1) # this will take NxK => N

# boolean mask over the rows: True for rows without any missing value
ix = (num_missing == 0)

# use ix to index into the dataframe and make it clean
# .copy() turns the filtered rows into an independent dataframe, so assigning
# to its columns later does not raise pandas' SettingWithCopyWarning
print("Before cleaning: ", df.shape)
df = df[ix].copy()
print("After cleaning: ", df.shape)

Before cleaning:  (406, 9)
After cleaning:  (392, 9)


In [3]:
# Toy example of one-hot encoding: every distinct string in the Series
# becomes its own indicator column
fruit_names = ['Apple', 'Banana', 'Strawberry']
s = pd.Series(fruit_names)

pd.get_dummies(s)

Unnamed: 0,Apple,Banana,Strawberry
0,True,False,False
1,False,True,False
2,False,False,True


In [4]:
# Display the dataframe (pandas truncates the middle rows of long frames)
df

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA
...,...,...,...,...,...,...,...,...,...
401,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,1982-01-01,USA
402,vw pickup,44.0,4,97.0,52.0,2130,24.6,1982-01-01,Europe
403,dodge rampage,32.0,4,135.0,84.0,2295,11.6,1982-01-01,USA
404,ford ranger,28.0,4,120.0,79.0,2625,18.6,1982-01-01,USA


In [5]:
# We want to convert the Year column (a date such as 1970-01-01) into a number
# representing the year so we don't have to categorically encode it.
#
# The dtype guard keeps this cell safe to re-run: once Year already holds
# integers, pd.to_datetime would read them as nanoseconds since the epoch and
# every row would collapse to the year 1970.
if not pd.api.types.is_integer_dtype(df['Year']):
    df['Year'] = pd.to_datetime(df['Year']).dt.year

In [6]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 405
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              392 non-null    object 
 1   Miles_per_Gallon  392 non-null    float64
 2   Cylinders         392 non-null    int64  
 3   Displacement      392 non-null    float64
 4   Horsepower        392 non-null    float64
 5   Weight_in_lbs     392 non-null    int64  
 6   Acceleration      392 non-null    float64
 7   Year              392 non-null    int32  
 8   Origin            392 non-null    object 
dtypes: float64(4), int32(1), int64(2), object(2)
memory usage: 29.1+ KB


In [7]:
# Let's identify categorical columns in a dataframe
categorical_cols = df.select_dtypes(include='object').columns
print(categorical_cols)
# Let's identify the numeric columns in the dataframe
numeric_cols = df.select_dtypes(include='number').columns
# We do not want the output column among the inputs. Drop it by name rather
# than by position so a change in column order cannot remove the wrong
# feature (a KeyError is raised if the column is missing).
numeric_cols = numeric_cols.drop('Miles_per_Gallon')
numeric_cols

Index(['Name', 'Origin'], dtype='object')


Index(['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs',
       'Acceleration', 'Year'],
      dtype='object')

In [28]:
# Build the model's input matrix: z-scored numeric features, followed by the
# one-hot encoded categorical features, with a column of ones in front
X = df[numeric_cols].to_numpy() # 392x6 (NxK)

# z-score every numeric column so gradient descent can learn efficiently
col_means = X.mean(axis=0) # K
col_stds = X.std(axis=0, ddof=1) # K

# broadcasting lines the K statistics up with the K columns: (NxK - K) / K
Z = (X - col_means) / col_stds

# one-hot encode the categorical columns; each yields an NxC block
# (C is the number of unique values in the column, so Nx3 for Origin)
categorical_cols = ['Origin']
one_hot_blocks = [pd.get_dummies(df[col]).to_numpy() for col in categorical_cols]

# append all one-hot blocks to the right of the numeric features
Z = np.hstack([Z, *one_hot_blocks])

# finally put a column of ones (Nx1) at the start of Z
ones_col = np.ones((Z.shape[0], 1))
Z = np.concatenate((ones_col, Z), axis=1)
Z.shape

(392, 10)

In [26]:
# Digression: hstack versus vstack

a = np.array([1,2,3])
b = np.array([4,5,6])

# we want to create a matrix of size 3x2 where first column is a and second is b

# option 1: turn each vector into a 3x1 column, then stack the columns side by side
via_hstack = np.hstack((a[:, None], b[:, None]))

# option 2: stack the vectors as rows (2x3), then transpose to 3x2
via_vstack = np.vstack((a, b)).T

# only the last expression of a cell is displayed, so check explicitly that
# both routes build the same matrix
assert np.array_equal(via_hstack, via_vstack)
via_vstack

array([[1, 4],
       [2, 5],
       [3, 6]])

In [43]:
# let's create a function that encapsulates all the complicated input creation operations
# we are assuming that input_df contains ONLY the input columns (no output column)
def prepare_inputs(input_df):
    """
        Build the input matrix Z for the linear regression model.

        Input parameters:
            input_df: dataframe containing ONLY the input columns (no output column)
        Output:
            Z: numpy array with one row per row of input_df. Its columns are,
               in order: a column of ones, the numeric columns z-scored with
               this dataframe's own column means and sample standard
               deviations, and the one-hot encoding of every object-dtype
               column. A numeric column without any spread (constant) is
               mapped to all zeros.
    """

    # Let's identify categorical columns in a dataframe
    categorical_cols = input_df.select_dtypes(include='object').columns

    # Let's identify the numeric columns in the dataframe
    numeric_cols = input_df.select_dtypes(include='number').columns

    # Read the numeric features from the dataframe that was passed in, not
    # from a notebook-level variable, so the function works on any input
    X = input_df[numeric_cols].to_numpy(dtype=float) # (NxK)

    # Now we need to z-score the numeric features so that they can lead to efficient learning
    col_means = np.mean(X, axis=0) # K
    col_stds = np.std(X, axis=0, ddof=1) # K

    # A column whose standard deviation is zero (or undefined) would turn into
    # NaN/inf when divided; dividing by 1 instead leaves it at zero
    safe_stds = np.where(col_stds > 0, col_stds, 1.0) # K

    # Z-score
    # (NxK -
    #  1xK)
    #  /
    #  (1xK)
    Z = (X - col_means[None, :]) / safe_stds[None, :]

    # Now we want to code the categorical columns using one-hot encoding
    for col in categorical_cols:
        # NxC (C is the number of unique values in the column)
        dummies = pd.get_dummies(input_df[col], dtype=float).to_numpy()

        # concatenate dummies matrix onto Z
        Z = np.hstack((Z, dummies))

    # finally we want to add a column of ones at the start of Z
    ones_col = np.ones((Z.shape[0], 1)) # Nx1

    Z = np.hstack((ones_col, Z))

    return Z

# Columns fed to the model as inputs (the output column is deliberately left out)
feature_names = [
    'Cylinders',
    'Displacement',
    'Origin',
    'Horsepower',
    'Year',
    'Weight_in_lbs',
    'Acceleration',
]
input_df = df[feature_names]

# Turn the selected columns into the model's input matrix and check its size
Z = prepare_inputs(input_df)
Z.shape

(392, 2)

In [44]:
def forward_fn(Beta, Z):
    """
        The linear regression model: one prediction per row of Z.

        Beta: parameter vector of size K
        Z: input matrix of size NxK
        Returns the matrix-vector product of Z and Beta, a vector of size N.
    """
    predictions = Z @ Beta # (NxK) @ K = N
    return predictions

# let's test this function: initialize some random Beta
# Beta is [b0, b1, b2, ...]
Beta = np.random.randn(Z.shape[1]) # K

# A bare expression in the middle of a cell is neither displayed nor checked,
# so verify the result explicitly: the model returns one prediction per row of Z
assert forward_fn(Beta, Z).shape == (Z.shape[0],)

def predict(Beta, input_df):
    """
        Predict the output for every row of input_df using parameters Beta.

        input_df is first converted into model inputs with prepare_inputs;
        those inputs and Beta are then passed through forward_fn.
    """
    return forward_fn(Beta, prepare_inputs(input_df))
#predict(Beta, input_df)

In [45]:
#
# Objective: write the learning algorithm (gradient descent)
#
def optimize(input_df, y, learning_rate, epochs, seed=None):
    """
        Fit the linear regression parameters by gradient descent on the
        mean squared error.

        Input parameters:
            input_df: dataframe containing input columns
            y: a vector of outputs that we wish to predict (one per row of input_df)
            learning_rate: how quickly we want gradient descent learning
            epochs: the number of steps of gradient descent
            seed: optional seed for the random initialization of Beta;
                  pass an integer to get the same fit on every run
        Output:
            Beta: fitted model parameters
        Raises:
            ValueError: if y does not hold exactly one value per row of input_df
    """

    # Prepare our inputs into the linear regression
    Z = prepare_inputs(input_df) # NxK

    # Flatten y into a plain vector of N floats. An Nx1 column would otherwise
    # broadcast against yhat into an NxN matrix and silently corrupt the gradient.
    y = np.asarray(y, dtype=float).reshape(-1) # N
    if y.shape[0] != Z.shape[0]:
        raise ValueError(
            f"y has {y.shape[0]} values but input_df has {Z.shape[0]} rows"
        )

    # Randomly initialize our solution (reproducible when a seed is given)
    rng = np.random.default_rng(seed)
    Beta = rng.standard_normal(Z.shape[1]) # K

    # Run gradient descent loop
    for _ in range(epochs):

        # Compute model's predictions
        yhat = forward_fn(Beta, Z) # N

        # Gradient of mean((yhat - y)^2) with respect to Beta
        # Z is NxK, yhat is N, y is N
        # KxN @ N = K
        Beta_grad = 2 * Z.T @ (yhat - y) / Z.shape[0]

        # Update the parameters
        Beta = Beta - learning_rate * Beta_grad

    # Beta is the fitted parameter values
    return Beta

# The outputs we want the model to predict
y = df.Miles_per_Gallon.to_numpy() # N

# Fit the model parameters with gradient descent
best_Beta = optimize(input_df=input_df, y=y, learning_rate=0.1, epochs=100)

# How good was the model?
# Run the model forward on the same inputs, then compute the loss
yhat = predict(Beta=best_Beta, input_df=input_df) # N

# mean squared error between the predictions and the true outputs
squared_errors = np.square(yhat - y)
np.mean(squared_errors)

np.float64(24.02017956815553)