# Block 6: Feature Selection

## Objectives

- Learn about functions as first-class objects
- Implement CV function
- Implement and test brute force selection
- Implement and test forward greedy selection
- Use brute force and greedy within the proper nested-CV setup

In [33]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn.model_selection
import itertools
from typing import List
import linear_model_numpy

## Functions are First Class Objects in Python

In [34]:
# consider this simple function: it takes two arguments and returns their sum
def myfunc(a, b):
    return a + b

print(myfunc(3, 5)) # prints 8

# we can "wrap" the function within another function:
# wrapper() builds a new function object and returns it, just like any other value!
def wrapper():
    def wrapped(a, b):
        return a + b 
    return wrapped 

myfunc = wrapper() # myfunc is now rebound to the function that was returned from wrapper()

print(myfunc(3, 5)) # 8
print(wrapper()(3, 5)) # same thing: call wrapper(), then immediately call what it returned

# so what is the point? we just did the same thing in a more complicated way!
# the point is that you can pass arguments to the wrapper that will always be available to the wrapped
# function (a "closure"): below, `a` is remembered by wrapped() after wrapper(a) has returned

# NOTE: this redefines wrapper, replacing the zero-argument version above
def wrapper(a):
    def wrapped(b):
        return a + b 
    return wrapped

myfunc = wrapper(10) # myfunc is a function that takes one argument and adds 10 to it

print(myfunc(3)) # 13
print(myfunc(-10)) # 0

# This is why functions are first-class objects: you can pass them around, return them, etc.


8
8
8
13
0


In [35]:
# Closure time!
# This is a factory: it fixes the input and output columns once and returns a
# two-argument train/test function that remembers them.
def mvlr_train_test_function_factory(input_cols, output_col):
    """
        Build a train/test function bound to a specific set of columns.

        Inputs:
            input_cols: columns forwarded as the inputs of the model
            output_col: column forwarded as the output of the model
        Output:
            a function f(train_df, test_df) that forwards both dataframes,
            together with the captured columns, to
            linear_model_numpy.mvlr_train_test_function and returns its result
    """
    def train_test_fn(train_df, test_df):
        # input_cols / output_col come from the enclosing call (closure)
        evaluate = linear_model_numpy.mvlr_train_test_function
        result = evaluate(train_df, test_df, input_cols, output_col)
        return result

    return train_test_fn

# Load dataset

In [36]:
# read the cars dataset from disk
df = pd.read_json('../data/cars.json')

# columns that must all be present (not NaN) for a row to be kept
required_cols = [
    'Miles_per_Gallon',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
]

print("Before: ", df.shape)

# boolean mask: True for rows that have a value in EVERY required column
ix_included = df[required_cols].notna().all(axis=1)

# drop every row that is missing at least one required value
df = df[ix_included]
print("After: ", df.shape)


Before:  (406, 9)
After:  (392, 9)


# Create a CV function and test our new factory

In [37]:
def cv(df, train_test_fn, folds, random_state):
    """
        K-fold cross-validation: shuffles and splits the rows of df into `folds` parts,
        then, for each part, trains on the other parts and tests on the held-out one.

        Inputs:
            df: dataframe of inputs and outputs
            train_test_fn: function called as train_test_fn(train_df, test_df);
                           its return value is recorded as the metric of that split
            folds: number of folds
            random_state: pseudo random number generator state used for shuffling
        Output:
            metrics: list with one train_test_fn result per split (length = folds)

    """
    # shuffled K-fold splitter; the same random_state reproduces the same splits
    splitter = sklearn.model_selection.KFold(
        n_splits=folds,
        shuffle=True,
        random_state=random_state,
    )

    # the splitter yields positional row indices, hence .iloc
    return [
        train_test_fn(df.iloc[train_rows], df.iloc[test_rows])
        for train_rows, test_rows in splitter.split(df)
    ]

# try it out: bind the factory to two input columns and the MPG output column,
# then run 5-fold cross-validation with the resulting train/test function
train_test_fn = mvlr_train_test_function_factory(['Horsepower', 'Cylinders'], 'Miles_per_Gallon') 
metrics = cv(df, train_test_fn, folds=5, random_state=2342)
# one metric per fold (shown as the cell output)
metrics

[np.float64(18.6710588079303),
 np.float64(20.827477521411375),
 np.float64(23.804879577197713),
 np.float64(21.837305759462094),
 np.float64(21.32855280367554)]

## Brute Force Selection

In [38]:
# let's play with the combinations function because we'll use it in brute force
# selection
cols = ['a', 'b', 'c']

# combinations of length 1
print(list(itertools.combinations(cols, 1)))
# combinations of length 2
print(list(itertools.combinations(cols, 2)))
# combinations of length 3
print(list(itertools.combinations(cols, 3)))

# collect the combinations of every length in range(1, len(cols)), i.e. lengths 1 and 2 here
# NOTE(review): range() excludes its stop value, so the full-length combination
# ('a', 'b', 'c') is NOT part of this list; range(1, len(cols) + 1) would include it
all_combinations = [comb for i in range(1, len(cols)) for comb in itertools.combinations(cols, i)]
print(all_combinations)

[('a',), ('b',), ('c',)]
[('a', 'b'), ('a', 'c'), ('b', 'c')]
[('a', 'b', 'c')]
[('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c')]


In [39]:
def brute_force_selection(df,
                          input_cols,
                          output_col,
                          train_test_factory):
    """
        Brute-force feature selection: cross-validates EVERY non-empty subset of
        input_cols (2^K - 1 subsets for K features, including the full set) and
        returns the subset with the lowest average metric.

        Inputs:
            df: dataframe of inputs and outputs
            input_cols: candidate feature columns to search over
            output_col: name of the output column
            train_test_factory: called as train_test_factory(columns, output_col);
                                must return a function f(train_df, test_df) -> metric
        Output:
            best_combination: list of columns with the lowest mean metric over the folds
                              (on ties, the first combination generated wins)
        Raises:
            ValueError: if input_cols is empty (there is nothing to select from)
    """
    if len(input_cols) == 0:
        raise ValueError("input_cols must contain at least one feature")

    # generate all combinations of every size from 1 up to AND INCLUDING len(input_cols);
    # range() excludes its stop value, hence the + 1
    all_combinations = [comb
                        for size in range(1, len(input_cols) + 1)
                        for comb in itertools.combinations(input_cols, size)]

    # start cross validation FOR EACH combination
    all_metrics = []
    for combination in all_combinations:

        # call our factory to get the train_test_fn
        train_test_fn = train_test_factory(list(combination), output_col)

        # important to keep random state the same for all combinations
        # so that generated splits are the same
        metrics = cv(df, train_test_fn, folds=5, random_state=23412341)
        all_metrics.append(metrics)

    # organize all results in a 2D matrix (Rows = combinations, cols = folds)
    all_metrics = np.array(all_metrics) # Combinations x Folds

    # compute average metric for each combination
    avg_metric = np.mean(all_metrics, axis=1) # Combinations

    # pick best (lowest average metric)
    best_ix = np.argmin(avg_metric)
    best_combination = list(all_combinations[best_ix])

    return best_combination

# the features we want to search over ... the more -> the slower the algorithm
# if you have K features, the number of models the algorithm examines is on the
# order of 2^K ... so exponential scaling
features = ['Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration', 'Origin', 'Year']

# search the feature subsets for the one with the lowest average CV metric when predicting MPG
best_combination = brute_force_selection(df, input_cols=features, output_col='Miles_per_Gallon', train_test_factory=mvlr_train_test_function_factory)

best_combination

['Weight_in_lbs', 'Origin', 'Year']

## Forward Greedy Selection

In [40]:
def forward_greedy_selection(df,
                             input_cols,
                             output_col,
                             train_test_factory):
    """
        Forward greedy feature selection: starts from no features and, in each round,
        adds the single remaining feature that gives the lowest average CV metric.
        Stops when no features remain or when the best candidate of a round is
        worse than the metric already reached.

        Inputs:
            df: dataframe of inputs and outputs
            input_cols: candidate feature columns
            output_col: name of the output column
            train_test_factory: called as train_test_factory(columns, output_col);
                                must return a function f(train_df, test_df) -> metric
        Output:
            list of selected columns, in the order they were added
    """
    selected = []
    best_so_far = np.inf
    remaining = input_cols

    while True:
        if len(remaining) == 0:
            # every feature has been added
            break

        # Rem Features x Folds: one row of fold metrics per candidate feature.
        # The random state is the same for every candidate so that all of them
        # are evaluated on identical splits.
        fold_scores = np.array([
            cv(df,
               train_test_factory(selected + [feature], output_col),
               folds=5,
               random_state=23412341)
            for feature in remaining
        ])

        # average over the folds, then locate the best candidate of this round
        mean_scores = np.mean(fold_scores, axis=1)
        winner_ix = np.argmin(mean_scores)
        winner_score = mean_scores[winner_ix]

        if winner_score > best_so_far:
            # no candidate improved on the current best, stop
            break

        # accept the winning feature and take it out of the pool
        best_so_far = winner_score
        winner = remaining[winner_ix]
        selected.append(winner)
        remaining = [f for f in remaining if f != winner]

    return selected

# run the greedy search over the same candidate features and the same output column
best_combination = forward_greedy_selection(df, 
                                            input_cols=features, 
                                            output_col='Miles_per_Gallon', 
                                            train_test_factory=mvlr_train_test_function_factory)

# selected features, in the order they were added
best_combination

['Weight_in_lbs', 'Year', 'Origin', 'Acceleration', 'Horsepower']

# Nested Cross-Validation

In [42]:
# This is ANOTHER factory!
# this one performs feature selection and trains the model on the best combination
# this is the power of functional composition
def feature_selection_train_test_factory(features, output_col, method='greedy'):
    """
        Build a train/test function that first selects features on the training
        data only and then fits and evaluates a model on the selected features.

        Inputs:
            features: candidate feature columns handed to the selection function
            output_col: name of the output column
            method: 'greedy' (forward_greedy_selection) or 'brute' (brute_force_selection)
        Output:
            a function f(df_train, df_test) that returns the mean squared error
            on df_test of the model fitted on df_train
        Raises:
            ValueError: if method is neither 'greedy' nor 'brute'
    """

    # pick the strategy for feature selection based on the method parameter
    if method == 'greedy':
        feature_selection_fn = forward_greedy_selection
    elif method == 'brute':
        feature_selection_fn = brute_force_selection
    else:
        # ValueError (a subclass of Exception) names the offending value
        raise ValueError(f"Unknown method: {method!r} (expected 'greedy' or 'brute')")

    def train_test_fn(df_train, df_test):
        # find best combination; selection only ever sees the training split
        best_combination = feature_selection_fn(df_train, input_cols=features, output_col=output_col, train_test_factory=mvlr_train_test_function_factory)
        print(best_combination)

        # fit the model to the whole training dataset
        best_p = linear_model_numpy.optimize(df_train[best_combination], df_train[output_col].to_numpy(), learning_rate=0.1, epochs=100)

        # evaluate it on the held-out split
        yhat = linear_model_numpy.predict(best_p, df_test[best_combination])
        ytest = df_test[output_col].to_numpy()
        mse = np.mean(np.square(yhat - ytest))
        return mse
    return train_test_fn

# nested CV: the outer cv() below makes 5 splits; for each one, train_test_fn runs
# feature selection on the outer training split before fitting and scoring the model
train_test_fn = feature_selection_train_test_factory(features, 'Miles_per_Gallon', 'greedy')
metrics = cv(df, train_test_fn, folds=5, random_state=23424)
# average of the per-fold values returned by train_test_fn (MSE)
np.mean(metrics)

['Weight_in_lbs', 'Year', 'Origin', 'Horsepower']
['Weight_in_lbs', 'Year', 'Origin', 'Acceleration']
['Weight_in_lbs', 'Year', 'Origin']
['Weight_in_lbs', 'Year', 'Origin']
['Weight_in_lbs', 'Year', 'Origin', 'Horsepower', 'Acceleration']


np.float64(10.880129732377553)

# Why are We Using these Factories?

Answer: Reusability.

Notice that `cv()` does not know anything about the model being tested, nor do the `brute_force_selection` and `forward_greedy_selection` functions. We can reuse the same functions with other models, which we will do.