# Day 3

In [1]:
import numpy as np
import pandas as pd
import linear_model_numpy

## Functions as First Class Objects

In [2]:
# A plain two-argument function: it hands back the sum of its inputs.
def myfunc(a, b):
    """Return ``a + b``."""
    total = a + b
    return total

# Calling it through its name, like any other function
myfunc(3, 5)

8

In [3]:
# A function can build another function and hand it back to its caller.
def wrapper():
    """Return a freshly defined two-argument adder (takes no arguments itself)."""
    def wrapped(a, b):
        total = a + b
        return total
    # return the inner function object itself (no parentheses: it is not called here)
    return wrapped

# Calling wrapper() produces the inner function; bind it to the name myfunc
myfunc = wrapper()
myfunc(3, 5)

8

In [7]:
# The outer function may take arguments too; the inner function keeps
# access to them (a closure) after the outer call has returned.
def wrapper(a):
    """Return a one-argument function that adds ``a`` to its input."""
    def wrapped(b):
        total = a + b
        return total
    return wrapped

# myfunc now adds 5 to whatever it is given
myfunc = wrapper(5)
myfunc(-5)

0

In [8]:
# Goal: a version of linear_model_numpy.mvlr_train_test_function that needs
# ONLY train_df and test_df, with the column choices fixed ahead of time.
def factory(input_cols, output_col):
    """Build a ``(train_df, test_df)`` function with the columns baked in.

    Parameters
    ----------
    input_cols
        Forwarded unchanged as the third argument of
        ``linear_model_numpy.mvlr_train_test_function``.
    output_col
        Forwarded unchanged as the fourth argument.

    Returns
    -------
    callable
        ``train_test_fn(train_df, test_df)``, which returns whatever
        ``mvlr_train_test_function`` returns for those four arguments.
    """
    def train_test_fn(train_df, test_df):
        # input_cols / output_col are captured from the enclosing call
        fit_and_score = linear_model_numpy.mvlr_train_test_function
        return fit_and_score(train_df, test_df, input_cols, output_col)

    return train_test_fn

In [9]:
df = pd.read_json('../data/cars.json')

#
# Goal: keep only the rows that have NO missing value in ANY column
#

# boolean frame that is True wherever a cell is missing
missing_vals = df.isna()

# count the missing cells in each row (N x K frame => length-N series)
num_missing = missing_vals.sum(axis=1)

# mask that is True for the complete rows, i.e. the rows we keep
ix = num_missing.eq(0)

# apply the mask and report the shape before and after
print("Before cleaning: ", df.shape)
df = df.loc[ix]
print("After cleaning: ", df.shape)

Before cleaning:  (406, 9)
After cleaning:  (392, 9)


In [13]:
# Let's do a quick cross-validation run to test our little factory

# brings in the KFold class which does CV splits
from sklearn.model_selection import KFold

def cv(df, train_test_fn, folds, random_state):
    """Run shuffled K-fold cross-validation and collect one result per fold.

    Parameters
    ----------
    df
        Data set to split; rows are selected positionally with ``.iloc``.
    train_test_fn
        Callable invoked as ``train_test_fn(train_df, test_df)`` once per fold.
    folds
        Number of splits, handed to ``KFold(n_splits=...)``.
    random_state
        Seed handed to ``KFold``; shuffling is always switched on.

    Returns
    -------
    list
        The values returned by ``train_test_fn``, in fold order.
    """
    kfold = KFold(n_splits=folds, shuffle=True, random_state=random_state)

    # every split yields positional row indices for the train and test parts
    return [
        train_test_fn(df.iloc[train_rows], df.iloc[test_rows])
        for train_rows, test_rows in kfold.split(df)
    ]


# Bake the column choices into a (train_df, test_df) function via the factory
train_test_fn = factory(
    input_cols=['Cylinders', 'Displacement'],
    output_col='Miles_per_Gallon',
)

# 5-fold CV on df; the cell displays the list of per-fold results
cv(df=df, train_test_fn=train_test_fn, folds=5, random_state=123123)


[np.float64(24.391855009262596),
 np.float64(23.039246113140706),
 np.float64(13.333147700048498),
 np.float64(21.440681627535316),
 np.float64(25.82010852738354)]

# Brute Force Feature Selection

In [27]:
import itertools

# Toy example: three candidate features
my_features = ['a', 'b', 'c']

# Goal: every non-empty subset of these features
# ('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ...

# itertools.combinations(items, r) yields the subsets of one fixed size r
list(itertools.combinations(my_features, 1))  # size 1
list(itertools.combinations(my_features, 2))  # size 2
list(itertools.combinations(my_features, 3))  # size 3

# Way 1: a single nested comprehension over every subset size
subset_sizes = range(1, len(my_features) + 1)
all_combinations = [
    subset
    for size in subset_sizes
    for subset in itertools.combinations(my_features, size)
]

# Way 2: the same list, grown step by step from an empty list
all_combinations = []
for size in subset_sizes:
    all_combinations.extend(itertools.combinations(my_features, size))

# show the result
all_combinations

[('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')]

In [32]:
def brute_force_selection(df, # dataset
                          input_cols, # list of features to search over
                          output_col, # output column
                          factory, # factory
                          folds = 5, # number of CV folds per combination
                          random_state = 234234): # seed for the CV shuffling
    """Exhaustively search every non-empty feature subset by CV error.

    Each subset of ``input_cols`` is turned into a train/test function via
    ``factory(input_cols=..., output_col=...)`` and scored with ``cv``.
    The metric is treated as a loss: the subset with the lowest mean
    across folds wins.

    Parameters
    ----------
    df
        Data set handed to ``cv`` for every subset.
    input_cols
        Candidate features; any iterable is accepted and copied into a list.
    output_col
        Output column name, forwarded to ``factory``.
    factory
        Callable ``factory(input_cols=..., output_col=...)`` returning a
        ``train_test_fn(train_df, test_df)``.
    folds
        Number of CV folds per subset (default 5, the previous fixed value).
    random_state
        Seed forwarded to ``cv`` (default 234234, the previous fixed value).
        Every subset uses the same seed, so all are scored on the same splits.

    Returns
    -------
    list
        The best-scoring subset, as a list of feature names.

    Raises
    ------
    ValueError
        If ``input_cols`` is empty (there is nothing to search over).
    """
    input_cols = list(input_cols)
    if len(input_cols) == 0:
        # without this guard the failure surfaces later as an obscure
        # numpy axis error inside np.mean
        raise ValueError("input_cols must contain at least one feature")

    # generate all possible input feature sets
    all_combinations = [
        list(combination) # we have to convert from a tuple to a list,
        # otherwise we can't index into pandas dataframes
        for combination_length in range(1, len(input_cols)+1)
        for combination in itertools.combinations(input_cols, combination_length)
    ]

    # we want to go over every combination and test it
    # so we need to track the performance of each combination
    all_metrics = []
    for combination in all_combinations:

        # call our factory to get the train function
        train_test_fn = factory(input_cols = combination,
                                output_col = output_col)

        # run CV on this model
        metrics = cv(df = df,
                     train_test_fn = train_test_fn,
                     folds = folds,
                     random_state = random_state)

        # metrics holds one value per fold
        all_metrics.append(metrics)

    # organize all results in a 2D matrix
    # (rows correspond to combinations, and columns to individual CV folds)
    all_metrics = np.array(all_metrics)

    # compute the average CV performance of every combination
    avg_metric = np.mean(all_metrics, axis=1) # size V (V is # of combinations)

    # pick the best
    best_ix = np.argmin(avg_metric) # the row index of the model with the least MSE

    # this is the best combination
    best_combination = all_combinations[best_ix]

    return best_combination

# Candidate features for the exhaustive search
input_cols = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
    'Year',
]

# The cell displays the feature subset that the search returns
brute_force_selection(
    df=df,
    input_cols=input_cols,
    output_col='Miles_per_Gallon',
    factory=factory,
)

['Weight_in_lbs', 'Origin', 'Year']

# Forward Greedy Selection

In [50]:
def greedy_selection(df, # dataset
                     input_cols, # list of features to search over
                     output_col, # output column
                     factory, # factory
                     folds = 5, # number of CV folds per candidate
                     random_state = 234234): # seed for the CV shuffling
    """Forward greedy feature selection driven by cross-validated error.

    Starting from an empty feature set, each round tries adding every
    remaining feature, scores each candidate set with ``cv`` and keeps the
    feature giving the lowest mean metric. The search stops when no
    features remain or when the best candidate is strictly worse than the
    best metric found so far.

    Parameters
    ----------
    df
        Data set handed to ``cv`` for every candidate.
    input_cols
        Candidate features; any iterable is accepted and copied into a list,
        so the caller's object is never modified.
    output_col
        Output column name, forwarded to ``factory``.
    factory
        Callable ``factory(input_cols=..., output_col=...)`` returning a
        ``train_test_fn(train_df, test_df)``.
    folds
        Number of CV folds per candidate (default 5, the previous fixed value).
    random_state
        Seed forwarded to ``cv`` (default 234234, the previous fixed value).
        Every candidate uses the same seed, so all are scored on the same splits.

    Returns
    -------
    list
        Selected features in the order they were added; empty if
        ``input_cols`` is empty.
    """
    # we need to track the current set of features
    # that we have selected so far
    current_combination = []

    # we also need to know what is the best metric value
    # we have encountered so far
    current_metric = np.inf

    # and we need to know which features are still left to consider
    remaining_features = list(input_cols) # initially, it's ALL the features

    # As long as there are remaining features ... do the following
    while len(remaining_features) > 0:

        # track the performance of all candidate features
        all_metrics = []

        for feature in remaining_features:

            # create the candidate feature set
            # this is [what we have selected so far] + the feature
            # For example, if current_combination = ['Horsepower']
            # candidate = ['Horsepower'] + ['Cylinders'] = ['Horsepower', 'Cylinders']
            candidate_combination = current_combination + [feature]

            # create train test fn
            train_test_fn = factory(input_cols = candidate_combination,
                                    output_col = output_col)

            # run CV on the candidate feature set
            metrics = cv(df = df,
                         train_test_fn = train_test_fn,
                         folds = folds,
                         random_state = random_state)
            # metrics holds one value per CV fold
            all_metrics.append(metrics)

        # Now we are outside the inner loop

        # Organize results into 2D Matrix
        all_metrics = np.array(all_metrics) # V x folds (V is # candidates)

        # Compute average metric for each candidate feature
        avg_metric = np.mean(all_metrics, axis=1) # V

        # Pick the best feature in terms of metric
        best_ix = np.argmin(avg_metric)
        best_metric = avg_metric[best_ix] # the loss of the best possible next feature

        # If the best metric we got at this point does not even improve
        # upon the best we have so far ... STOP
        # None of the features improved the performance of the model so far
        if best_metric > current_metric:
            break # stops the while loop

        # if we are still here, that means that the best_metric <= current_metric
        # (a tie still adds the feature), so let's update our state

        # new current best metric
        current_metric = best_metric

        # new best combination
        best_next_feature = remaining_features[best_ix]
        current_combination = current_combination + [best_next_feature]

        # update the remaining features
        remaining_features = [f for f in remaining_features if f != best_next_feature]

    # "best" combination
    return current_combination

# Candidate features for the forward greedy search
input_cols = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
    'Year',
]

# The cell displays the selected features in the order they were added
greedy_selection(
    df=df,
    input_cols=input_cols,
    output_col='Miles_per_Gallon',
    factory=factory,
)

['Weight_in_lbs', 'Year', 'Origin']

# Nested CV

In [55]:
def feature_selection_factory(input_cols, # features to consider
                              output_col, # output
                              method): # 'greedy' or 'brute'
    """Build a train/test function that performs feature selection first.

    The returned function selects features using ONLY the training frame,
    then fits and evaluates a model restricted to those features. Passing
    it to an outer ``cv`` call therefore gives nested cross-validation.

    Parameters
    ----------
    input_cols
        Candidate features handed to the selection routine.
    output_col
        Output column name.
    method
        ``'greedy'`` (uses ``greedy_selection``) or ``'brute'``
        (uses ``brute_force_selection``).

    Returns
    -------
    callable
        ``train_test_fn(train_df, test_df)``; it prints the selected
        feature list and returns the result of evaluating the final model
        on ``test_df``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'greedy'`` nor ``'brute'``.

    Notes
    -----
    The model factory is the module-level ``factory`` function; it is not
    a parameter of this function.
    """
    # reject unknown strategies when the function is built, before any
    # data is touched
    if method not in ('greedy', 'brute'):
        raise ValueError("Unknown method. Please select 'greedy' or 'brute'")

    # pick the strategy for doing feature selection
    if method == 'greedy':
        feature_selection_fn = greedy_selection
    else:
        feature_selection_fn = brute_force_selection

    # now we define the function that we return to the caller
    def train_test_fn(train_df, test_df):

        # First, find the best feature combination using the training
        # frame only, so the test frame never influences the selection
        best_combination = feature_selection_fn(df = train_df,
                                                input_cols = input_cols,
                                                output_col = output_col,
                                                factory = factory)
        print(best_combination)

        # Then, build the model for the best combination
        final_train_test_fn = factory(input_cols = best_combination,
                                      output_col = output_col)

        # Fit on the whole training frame and evaluate on the test frame
        return final_train_test_fn(train_df, test_df)

    return train_test_fn

# Candidate features for the nested cross-validation run
input_cols = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight_in_lbs',
    'Acceleration',
    'Origin',
    'Year',
]

# train/test function that runs brute-force feature selection before fitting
feature_selection_model_fn = feature_selection_factory(
    input_cols=input_cols,
    output_col='Miles_per_Gallon',
    method='brute',
)

# this is the outer CV loop
cv(df=df, train_test_fn=feature_selection_model_fn, folds=5, random_state=32234)

['Weight_in_lbs', 'Origin', 'Year']
['Weight_in_lbs', 'Origin', 'Year']
['Weight_in_lbs', 'Acceleration', 'Origin', 'Year']
['Weight_in_lbs', 'Acceleration', 'Origin', 'Year']
['Weight_in_lbs', 'Origin', 'Year']


[np.float64(10.702766968454496),
 np.float64(12.16392085857553),
 np.float64(7.634407766738132),
 np.float64(10.859485493381863),
 np.float64(10.85191612094271)]

# JAX

In [57]:
import jax
import jax.numpy as jnp

In [82]:
# Automatic differentiation with JAX

jnp.sin(jnp.pi/3)

# jax.grad(f) builds a new function that evaluates the derivative of f
grad_fn = jax.grad(jnp.sin)
grad_fn(jnp.pi/2) # d/dx sin(x) = cos(x), so this is cos(pi/2), i.e. approximately 0

# the same works for our own single-parameter function
def myfunc(w):
    return 2 * w

grad_fn = jax.grad(myfunc) # derivative of myfunc with respect to w
grad_fn(10.)

# with several parameters, jax.grad differentiates with respect to the
# first one by default (argnums=0), which is w here
def myfunc(w, x):
    return w * x

grad_fn = jax.grad(myfunc, argnums=0)

In [84]:
# Real models usually have several parameters; here they travel together
# in one dict
def myfunc(params, x):
    intercept = params['b0']
    slope = params['b1']
    return intercept + slope * x

val = myfunc(params={'b0': 1., 'b1': 5.}, x=10.)

# jax.grad differentiates with respect to the first argument, here the
# whole dict, so the result has one gradient entry per key (b0 and b1)
grad_fn = jax.grad(myfunc)
grad_fn({'b0': 1., 'b1': 5.}, x=15)

{'b0': Array(1., dtype=float32, weak_type=True),
 'b1': Array(15., dtype=float32, weak_type=True)}

# Randomness

In [100]:
key = jax.random.key(42) # initializing a random "key" with value 42

# let's generate some standard normal variates
# JAX random functions do not advance any hidden state: calling with the
# SAME key gives the SAME three values every time
# (shape is passed as a tuple, the documented form)
for _ in range(5):
    print(jax.random.normal(key, (3,))) # generates three values

# to get different values, derive new keys from the old one
key1, key2 = jax.random.split(key, 2) # split key into two new keys
print(jax.random.normal(key1, (3,))) # generates three values
print(jax.random.normal(key2, (3,))) # generates three values
print()

# inside a loop, fold the iteration counter into the key so that every
# iteration gets its own key
for i in range(10):

    loop_key = jax.random.fold_in(key, i)

    val = jax.random.normal(loop_key, (3,))
    print(val)

[-0.02830462  0.46713185  0.29570296]
[-0.02830462  0.46713185  0.29570296]
[-0.02830462  0.46713185  0.29570296]
[-0.02830462  0.46713185  0.29570296]
[-0.02830462  0.46713185  0.29570296]
[ 0.07592554 -0.48634264  1.2903206 ]
[ 0.60576403  0.7990441  -0.908927  ]

[ 0.07592554 -0.48634264  1.2903206 ]
[ 0.60576403  0.7990441  -0.908927  ]
[ 0.4323065  0.5872638 -1.1416743]
[-0.2818947 -1.367489  -1.6350379]
[0.6549178  0.17345214 1.6018405 ]
[-0.2166012  -1.9878021  -0.61060226]
[-0.25440374 -0.6385937  -0.68521845]
[ 0.2886397  -0.00963292  0.15268941]
[ 0.14384735 -0.15262456 -1.7989424 ]
[-1.3462586   0.5520057  -0.75974613]
