# Block 5: Evaluation

## Objectives

- Do a single random split and train and test
- Use the class to perform cross-validation on the multivariate regression model
- Learn how `KFold` class works and helps in cross-validation
- Report mean MSE and standard error

In [1]:
import numpy as np 
import pandas as pd 
import sklearn.model_selection

## Single Random Split

In [2]:
#
# Quick demo
#

# Let's define a toy dataset 
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [10, 11], [-1, -2], [0, 0], [3, 3]])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])

# this is the list of sequential indecies into X (just the length of axis 0 of X)
ind = np.arange(X.shape[0])

# we shuffle it (this returns a copy)
ind = np.random.permutation(ind)

# now we take the first 80%
prop = 0.8
train_len = int(prop * X.shape[0]) # number of training elements (6)

# get training and testing indecies
train_ind = ind[:train_len]
test_ind = ind[train_len:]
print("Train: ", train_ind)
print("Test: ", test_ind)

# now use them to grab the training and testing portions of X and y
X_train = X[train_ind, :] # first 80% of X
X_test = X[test_ind, :]
ytrain = y[train_ind]
ytest = y[test_ind]
ytrain

Train:  [2 0 1 6 7 4]
Test:  [5 3]


array([3, 1, 2, 7, 8, 5])

In [3]:
#
# Copied these functions from block4
#

def prepare_inputs(df):
    # we need to separate categorical from numeric features
    # because they require separate processing
    # let's get categorical columns
    categorical_cols = df.select_dtypes(include='object').columns
    
    # let's get numeric
    ordinal_cols = df.select_dtypes(include='number').columns

    # construct input features
    X = df[ordinal_cols].to_numpy()

    # z-score (NxK' - 1xK') / 1xK' = NxK'
    X = (X - np.mean(X, axis=0)[None, :]) / np.std(X, axis=0)[None, :]

    # code categorical features
    for feature in categorical_cols:
        dummies = pd.get_dummies(df[feature]).to_numpy().astype(float)
        X = np.hstack((X, dummies)) 

    # add a column of ones
    ones_col = np.ones((X.shape[0], 1)) # Nx1
    X = np.hstack((ones_col, X)) # K

    return X 

def forward_fn(Beta, X):
    """
        Beta: K
        X: NxK
    """
   
    return X @ Beta # NxK @ K = N

def optimize(df, y, eta, steps):

    X = prepare_inputs(df)
    
    # randomly initialize solution 
    Beta = np.random.rand(X.shape[1]) # K
    
    # iterate for steps
    history = []

    for i in range(steps):
        yhat = forward_fn(Beta, X)
        mse = np.mean(np.square(yhat - y))
        history.append([Beta, mse])

        # compute gradient at those predictions
        # (NxK).T @ N = K
        Beta_grad = 2 * X.T @ (yhat - y) / X.shape[0]
        
        # update solution
        Beta = Beta - eta * Beta_grad
        
    return Beta, history 
def predict(Beta, df):
    X = prepare_inputs(df)
    
    return forward_fn(Beta, X)


In [4]:
# load dataset
df = pd.read_json('../data/cars.json')

# Filter dataframe
required_cols = ['Miles_per_Gallon', 'Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration', 'Origin']

# only include rows where ALL columns are not nan
ix_included = np.sum(pd.isna(df[required_cols]), axis=1) == 0

# exclude examples with no horsepower or mpg
print("Before: ", df.shape)
df = df[ix_included]
print("After: ", df.shape)

# this is the list of sequential indecies into X (just the length of axis 0 of X)
ind = np.arange(df.shape[0])

# we shuffle it (this returns a copy)
ind = np.random.permutation(ind)

train_ind = ind[:train_len]
test_ind = ind[train_len:]

# create train and test dfs
prop = 0.8
train_len = int(prop * df.shape[0]) # number of training elements
df_train = df.iloc[train_ind]
df_test = df.iloc[test_ind]

features = ['Cylinders']
output_col = 'Miles_per_Gallon'
params, _ = optimize(df_train[features], df_train[output_col], eta = 0.1, steps=100)
yhat = predict(params, df_test[features])
ytest = df_test[output_col].to_numpy()
mse = np.mean(np.square(yhat - ytest))
print(f"Test MSE: {mse}")

Before:  (406, 9)
After:  (392, 9)
Test MSE: 26.70546578505351


### Making training/evaluation more reusable

In [5]:
#
# this is a function ... that returns a function :)
#
def create_train_test_fn(features, output_col):
    
    def train_test_fn(df_train, df_test):
        params, _ = optimize(df_train[features], df_train[output_col], eta = 0.1, steps=100)
        yhat = predict(params, df_test[features])
        ytest = df_test[output_col].to_numpy()
        mse = np.mean(np.square(yhat - ytest))
        return mse 

    return train_test_fn

train_test_fn = create_train_test_fn(features, output_col)

loss = train_test_fn(df_train, df_test)
print(loss)

26.705465785051246


# Cross-validation

In [6]:
# Let's define a toy dataset 
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [10, 11], [-1, -2], [0, 0], [3, 3]])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])

# use KFold to split the dataset

# try using n_splits greater than the number of data points
kf = sklearn.model_selection.KFold(n_splits=5, 
                                   shuffle=True, 
                                   random_state=43)
for train_index, test_index in kf.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)

TRAIN index: [0 1 2 4 5 6] TEST index: [3 7]
TRAIN index: [0 1 2 3 4 7] TEST index: [5 6]
TRAIN index: [0 3 4 5 6 7] TEST index: [1 2]
TRAIN index: [1 2 3 4 5 6 7] TEST index: [0]
TRAIN index: [0 1 2 3 5 6 7] TEST index: [4]


In [8]:
def cv(df, train_test_fn, folds, random_state):

    # instantiate the splitter
    kf = sklearn.model_selection.KFold(n_splits=folds, 
                                       shuffle=True, 
                                       random_state=random_state)
    
    metrics = []
    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index]
        test_df = df.iloc[test_index]
        
        # evaluate
        metric = train_test_fn(train_df, test_df)
        metrics.append(metric)
    
    return metrics 

metrics = cv(df, train_test_fn, folds = 5, random_state= 1341341234)
print(metrics)
print(np.mean(metrics))
print(np.std(metrics) / np.sqrt(len(metrics)))

[np.float64(31.202771479362852), np.float64(27.734259136877203), np.float64(20.89097853293615), np.float64(24.185752934081133), np.float64(16.25489560189607)]
24.05373153703068
2.32806643949547
