# Block 4: Multivariate Linear Regression

## Objectives

- Learn categorical coding
- Create a function that prepares our inputs for training
- Create a function that implements the "forward computation"
- Create a function that implements the optimization
- Create a function that implements the prediction
- Run the model using the functions created above

In [18]:
import numpy as np 
import pandas as pd 

In [19]:
# read the cars dataset into a dataframe
df = pd.read_json('../data/cars.json')

# columns that must all be present for a row to be kept
required_cols = [
    'Miles_per_Gallon',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
]

# boolean row mask: True where none of the required columns is nan
ix_included = df[required_cols].notna().all(axis=1)

# drop the incomplete rows, printing the shape before and after
print("Before: ", df.shape)
df = df[ix_included]
print("After: ", df.shape)

Before:  (406, 9)
After:  (392, 9)


In [21]:
# categorical (one-hot) encoding demo:
# every distinct label in the series becomes its own indicator column
s = pd.Series(data=['Apple', 'Banana', 'Strawberry'])
r = pd.get_dummies(data=s)
r

Unnamed: 0,Apple,Banana,Strawberry
0,True,False,False
1,False,True,False
2,False,False,True


In [22]:
def prepare_inputs(df):
    """Build the design matrix used for training and prediction.

    Numeric columns are z-scored, object-dtype columns are one-hot
    encoded, and a leading column of ones is prepended.

    Args:
        df: DataFrame with N rows holding the raw input features.

    Returns:
        Float array of shape NxK laid out as
        [ones | z-scored numeric columns | dummy columns].
    """
    # we need to separate categorical from numeric features
    # because they require separate processing
    categorical_cols = df.select_dtypes(include='object').columns
    ordinal_cols = df.select_dtypes(include='number').columns

    # numeric features as a float array, NxK'
    X = df[ordinal_cols].to_numpy(dtype=float)

    # z-score (NxK' - 1xK') / 1xK' = NxK'
    mean = np.mean(X, axis=0, keepdims=True)
    std = np.std(X, axis=0, keepdims=True)
    # a constant column has zero std; dividing by it would give 0/0 = nan.
    # Dividing by 1 instead leaves such a column at its centered value, 0.
    std[std == 0] = 1.0
    X = (X - mean) / std

    # code categorical features, one block of indicator columns per feature
    dummy_blocks = [
        pd.get_dummies(df[feature]).to_numpy().astype(float)
        for feature in categorical_cols
    ]

    # add a column of ones in front and stack everything side by side
    ones_col = np.ones((X.shape[0], 1))  # Nx1
    return np.hstack([ones_col, X] + dummy_blocks)  # NxK
# try it on one numeric column (Cylinders) and one categorical column (Origin)
prepare_inputs(df[['Cylinders', 'Origin']])

array([[ 1.        ,  1.48394702,  0.        ,  0.        ,  1.        ],
       [ 1.        ,  1.48394702,  0.        ,  0.        ,  1.        ],
       [ 1.        ,  1.48394702,  0.        ,  0.        ,  1.        ],
       ...,
       [ 1.        , -0.86401356,  0.        ,  0.        ,  1.        ],
       [ 1.        , -0.86401356,  0.        ,  0.        ,  1.        ],
       [ 1.        , -0.86401356,  0.        ,  0.        ,  1.        ]],
      shape=(392, 5))

In [23]:
def forward_fn(Beta, X):
    """
        Forward computation of the linear model.

        Beta: K   (one coefficient per column of X)
        X: NxK    (one row per example)

        returns: N (one prediction per row of X)
    """
    # NxK @ K = N
    yhat = X @ Beta
    return yhat

def optimize(df, y, eta, steps):
    """Fit the linear model by gradient descent on the mean squared error.

    Args:
        df: DataFrame of raw input features (N rows); it is passed through
            prepare_inputs to obtain the NxK design matrix.
        y: the N target values. Any array-like is accepted (1-D array,
            Nx1 column, pandas Series); it is flattened to shape (N,).
        eta: learning rate (step size of each update).
        steps: number of gradient steps to run.

    Returns:
        Beta: the K coefficients after `steps` updates.
        history: list with one [Beta, mse] entry per step, recorded
            before that step's update is applied.

    Raises:
        ValueError: if y does not hold exactly one value per row of df.
    """
    X = prepare_inputs(df)
    n_samples = X.shape[0]

    # Flatten the targets to shape (N,). With an Nx1 column, `yhat - y`
    # would broadcast to NxN and silently corrupt the loss and the gradient.
    y = np.asarray(y, dtype=float).reshape(-1)
    if y.shape[0] != n_samples:
        raise ValueError(
            f"y has {y.shape[0]} values but df has {n_samples} rows"
        )

    # randomly initialize solution
    Beta = np.random.rand(X.shape[1])  # K

    # iterate for steps
    history = []

    for _ in range(steps):
        yhat = forward_fn(Beta, X)
        residual = yhat - y  # N
        mse = np.mean(np.square(residual))
        history.append([Beta, mse])

        # compute gradient at those predictions
        # (NxK).T @ N = K
        Beta_grad = 2 * X.T @ residual / n_samples

        # update solution (rebinds Beta, so history entries are not mutated)
        Beta = Beta - eta * Beta_grad

    return Beta, history

# features fed to the model; the target (Miles_per_Gallon) is passed separately
input_features = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
]
Beta, history = optimize(
    df[input_features],
    df['Miles_per_Gallon'].to_numpy(),
    eta=0.1,
    steps=100,
)
# last [Beta, mse] entry recorded during the run
final_p, final_mse = history[-1]
print(Beta)
print(final_mse)

[17.52842327 -0.38014354 -0.57425414 -1.86862122 -3.55452421 -0.11681402
  5.8074257   7.67167712  5.36318299]
17.1171629392896


In [25]:
def predict(Beta, df):
    """Predict one value per row of df using the coefficients Beta.

    Beta: K coefficients, as returned by optimize.
    df: DataFrame of raw input features (N rows).

    NOTE: the inputs are prepared from df itself, so the z-score
    statistics and the dummy columns are derived from the rows passed
    in here rather than from the data Beta was fitted on.
    """
    return forward_fn(Beta, prepare_inputs(df))

# predictions for the same rows and features the model was fitted on
yhat = predict(Beta, df[input_features])
yhat

array([18.41225338, 15.70394505, 17.7069566 , 17.75405727, 18.24768659,
       11.01419255,  9.79535561, 10.3125067 ,  9.20692558, 13.73828016,
       15.88787964, 16.50228715, 15.95820713, 14.81744594, 28.99657753,
       23.82192666, 23.96642921, 25.30658783, 30.45995212, 31.61897202,
       26.17466728, 27.1865113 , 27.06321956, 26.89763001, 24.85583895,
        9.24931318, 11.22869888, 10.72062177,  9.94539118, 30.45995212,
       27.21410153, 29.64234485, 24.33187769, 20.64829162, 21.21487706,
       21.91131493, 21.48557514, 13.52066479, 11.71256818, 14.26533708,
       14.85671129,  9.50560369, 10.75281123,  8.85887474, 22.30738254,
       27.33730402, 21.43300866, 22.63669382, 27.75539826, 28.44461198,
       29.59223119, 29.50081405, 33.02589682, 33.53877554, 31.00623203,
       29.53827637, 29.36924755, 28.44835275, 29.34727068, 26.44114783,
       27.62426912, 13.24830944, 12.02238696, 14.67210035, 14.39128687,
       16.7738253 ,  9.26220499, 12.7154655 , 12.66516527, 11.11