# Block 6: Feature Selection

In [13]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn.model_selection
import itertools
from typing import List

## Preliminaries: Some Cleanup

### Model Class 

Lets encapsulate our model in a class so that it is more easy to use.


In [48]:
class LinearModel:

    def __init__(self, 
                 features: List[str]):
        self._features = features 
    
    def train(self, df: pd.DataFrame, y: np.ndarray, steps: int = 100, eta: float = 0.01):

        # prepare inputs and outputs
        X = self._prepare_input_matrix(df)
        
        # randomly initialize solution 
        Beta = np.random.rand(X.shape[1]) # K

        # iterate for steps
        history = []
        for i in range(steps):
            # compute model predictions
            yhat = X @ Beta # N
            mse = np.mean(np.square(yhat - y))
            history.append([Beta, mse])

            # compute gradient at those predictions
            # (NxK).T @ N = K
            Beta_grad = 2 * X.T @ (yhat - y) / X.shape[0]
            
            # update solution
            Beta = Beta - eta * Beta_grad
        
        # save the parameters
        self._params, _ = history[-1]

        return history
    
    def _prepare_input_matrix(self, df: pd.DataFrame):

        # we need to separate categorical from numeric features
        # because they require separate processing
        # let's get categorical columns
        categorical_cols = df.select_dtypes(include='object').columns
        
        # let's get numeric
        ordinal_cols = df.select_dtypes(include='number').columns

        # construct input features
        X = df[ordinal_cols].to_numpy()

        # z-score (NxK' - 1xK') / 1xK' = NxK'
        X = (X - np.mean(X, axis=0)[None, :]) / np.std(X, axis=0)[None, :]

        # code categorical features
        for feature in categorical_cols:
            dummies = pd.get_dummies(df[feature]).to_numpy().astype(float)
            X = np.hstack((X, dummies)) 

        # add a column of ones
        ones_col = np.ones((X.shape[0], 1)) # Nx1
        X = np.hstack((ones_col, X)) # K
        
        return X 
    
    def predict(self, df: pd.DataFrame):
         
        X = self._prepare_input_matrix(df)

        # compute model predictions
        yhat = X @ self._params # N

        return yhat 


### Cross-Validation

Let's make it easy to run CV on a model.

In [62]:
def cv(df, y, folds, random_state):

    kf = sklearn.model_selection.KFold(n_splits=folds, 
                                       shuffle=True, 
                                       random_state=random_state)
    
    features = df.columns.tolist()
    mses = []
    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index]
        ytrain = y[train_index]

        test_df = df.iloc[test_index]
        ytest = y[test_index]

        # train and predict
        model = LinearModel(features)
        model.train(train_df[features], ytrain)
        yhat = model.predict(test_df[features])
        
        # evaluate
        mse = np.mean(np.square(yhat - ytest))
        mses.append(mse)
    
    return mses 


In [63]:
# load dataset
df = pd.read_json('../data/cars.json')

# Filter dataframe
required_cols = ['Miles_per_Gallon', 'Cylinders', 'Displacement', 'Horsepower', 'Weight_in_lbs', 'Acceleration', 'Origin']

# only include rows where ALL columns are not nan
ix_included = np.sum(pd.isna(df[required_cols]), axis=1) == 0

# exclude examples with no horsepower or mpg
print("Before: ", df.shape)
df = df[ix_included]
print("After: ", df.shape)
df = df[required_cols]

Before:  (406, 9)
After:  (392, 9)


In [64]:
# Let's test the new class 
model = LinearModel(['Displacement', 'Origin'])
history = model.train(df, df[ 'Miles_per_Gallon'].to_numpy())
print(history[-1][1])
yhat = model.predict(df)
mse = np.mean(np.square(yhat - df.Miles_per_Gallon))
print(mse)


7.08978163351333
7.08978163351333


In [66]:
# run CV 
mses = cv(df[['Displacement', 'Origin']], df['Miles_per_Gallon'].to_numpy(), folds=5, random_state=4234)
mses

[np.float64(21.818435572593533),
 np.float64(38.102850174784635),
 np.float64(25.744303249836637),
 np.float64(20.829373256247006),
 np.float64(26.318520526734744)]

## Brute Force Selection

In [51]:
# let's play with combinations function because we'll use it in brute force
# selection
cols = ['a', 'b', 'c']

# combinations of length 3
print(list(itertools.combinations(cols, 1)))
# combinations of length 2
print(list(itertools.combinations(cols, 2)))
# combinations of length 3
print(list(itertools.combinations(cols, 3)))

# to get all combinations
all_combinations = [comb for i in range(1, len(cols)) for comb in itertools.combinations(cols, i)]
print(all_combinations)

[('a',), ('b',), ('c',)]
[('a', 'b'), ('a', 'c'), ('b', 'c')]
[('a', 'b', 'c')]
[('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c')]


In [69]:
def brute_force_selection(df, 
                          input_cols, 
                          output_col):

    # generate all combinations
    all_combinations = [comb for i in range(1, len(input_cols)) for comb in itertools.combinations(input_cols, i)]

    # start cross validation
    all_mses = []
    for combination in all_combinations:
        # important to keep random state the same for all combinations
        # so that generated splits are the same
        mses = cv(df[list(combination)], df[output_col].to_numpy(), folds=5, random_state=234234)   
        all_mses.append(mses)
        
    # organize all results in a 2D matrix (Rows = combinations, cols = folds)
    all_mses = np.array(all_mses) # Combinations x Folds

    # compute average MSE for each combination
    avg_mse = np.mean(all_mses, axis=1) # Combinations
    
    # pick best
    best_ix = np.argmin(avg_mse)
    best_combination = list(all_combinations[best_ix])

    # now train the best combination on the whole dataset and return the fitted model
    model = LinearModel(best_combination)
    model.train(df[best_combination], df[output_col].to_numpy())
    
    return model


model = brute_force_selection(df, ['Cylinders', 
                                   'Displacement', 
                                   'Horsepower', 
                                   'Weight_in_lbs', 
                                   'Acceleration', 
                                   'Origin'], 'Miles_per_Gallon')
print(model._features)


['Displacement', 'Weight_in_lbs', 'Origin']
