# Block 5: Evaluation

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn.model_selection

In [4]:
# Let's define a toy dataset 
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [10, 11], [-1, -2], [0, 0], [3, 3]])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])

# use KFold to split the dataset

# try using n_splits greater than the number of data points
kf = sklearn.model_selection.KFold(n_splits=5, 
                                   shuffle=True, 
                                   random_state=43)
for train_index, test_index in kf.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    
    # build training and testing splits
    Xtrain, ytrain = X[train_index,:], y[train_index]
    Xtest, ytest = X[test_index,:], y[test_index]
    
    # train and evaluate the model ...

TRAIN index: [0 1 2 4 5 6] TEST index: [3 7]
TRAIN index: [0 1 2 3 4 7] TEST index: [5 6]
TRAIN index: [0 3 4 5 6 7] TEST index: [1 2]
TRAIN index: [1 2 3 4 5 6 7] TEST index: [0]
TRAIN index: [0 1 2 3 5 6 7] TEST index: [4]


In [5]:
# load dataset
df = pd.read_json('../data/cars.json')

# Filter dataframe
required_cols = ['Miles_per_Gallon', 'Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration', 'Origin']

# only include rows where ALL columns are not nan
ix_included = np.sum(pd.isna(df[required_cols]), axis=1) == 0

# exclude examples with no horsepower or mpg
print("Before: ", df.shape)
df = df[ix_included]
print("After: ", df.shape)


Before:  (406, 9)
After:  (392, 9)


In [9]:
def optimize(X, y, eta, steps):

    # z-score (NxK - 1xK) / 1xK = NxK
    X = (X - np.mean(X, axis=0)[None, :]) / np.std(X, axis=0)[None, :]

    # add a column of ones
    ones_col = np.ones((X.shape[0], 1)) # Nx1
    X = np.hstack((ones_col, X))
    
    # randomly initialize solution 
    Beta = np.random.rand(X.shape[1]) # K

    # iterate for steps
    history = []

    for i in range(steps):
        # compute model predictions
        yhat = X @ Beta # N
        mse = np.mean(np.square(yhat - y))
        history.append([Beta, mse])

        # compute gradient at those predictions
        # (NxK).T @ N = K
        Beta_grad = 2 * X.T @ (yhat - y) / X.shape[0]
        
        # update solution
        Beta = Beta - eta * Beta_grad
        
    return history 

def predict(params, Xtest):
    # z-score (NxK - 1xK) / 1xK = NxK
    Xtest = (Xtest - np.mean(Xtest, axis=0)[None, :]) / np.std(Xtest, axis=0)[None, :]

    # add a column of ones
    ones_col = np.ones((Xtest.shape[0], 1)) # Nx1
    Xtest = np.hstack((ones_col, Xtest))

    # compute model predictions
    yhat = Xtest @ params # N

    return yhat 

# test the new code

# construct input features
numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration']
X = df[numeric_features].to_numpy()
dummies = pd.get_dummies(df['Origin']).to_numpy().astype(float)
X = np.hstack((X, dummies))

history = optimize(X, df.Miles_per_Gallon.to_numpy(), 0.1, 100)
print("MSE: ", history[-1][1])

yhat = predict(history[-1][0], X)
y = df.Miles_per_Gallon.to_numpy()
mse = np.mean(np.square(yhat - y))
print("MSE from predict() func: ", mse)


MSE:  17.044916646981076
MSE from predict() func:  17.044916646981076


In [None]:
# run CV
# try using n_splits greater than the number of data points
kf = sklearn.model_selection.KFold(n_splits=5, 
                                   shuffle=True, 
                                   random_state=43)

# construct inputs and outputs
numeric_features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration']
X = df[numeric_features].to_numpy()
dummies = pd.get_dummies(df['Origin']).to_numpy().astype(float)
X = np.hstack((X, dummies))
y = df.Miles_per_Gallon.to_numpy()
mses = []
for train_index, test_index in kf.split(X):
    
    # build training and testing splits
    Xtrain, ytrain = X[train_index,:], y[train_index]
    Xtest, ytest = X[test_index,:], y[test_index]
    
    # train and predict
    history = optimize(Xtrain, ytrain, eta = 0.1, steps=100)
    final_params, _ = history[-1]
    yhat = predict(final_params, Xtest)

    # evaluate
    mse = np.mean(np.square(yhat - ytest))
    mses.append(mse)
print("Average MSE: ", np.mean(mses))
print("Stderr MSE: ", np.std(mses)/np.sqrt(len(mses)))

Average MSE:  36.688542846253334
Stderr MSE:  2.7042467351928168
