# 4 - Feature Selection

In [31]:
import os

import pandas as pd
from sklearn.linear_model import Lasso

In [32]:
# Load the pre-split train/test pickles for both datasets.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source.

# DC dataset
X_train_dc, y_train_dc = (
    pd.read_pickle('../pickles/split/X_train_dc.pkl'),
    pd.read_pickle('../pickles/split/y_train_dc.pkl'),
)
X_test_dc, y_test_dc = (
    pd.read_pickle('../pickles/split/X_test_dc.pkl'),
    pd.read_pickle('../pickles/split/y_test_dc.pkl'),
)

# London dataset
X_train_l, y_train_l = (
    pd.read_pickle('../pickles/split/X_train_l.pkl'),
    pd.read_pickle('../pickles/split/y_train_l.pkl'),
)
X_test_l, y_test_l = (
    pd.read_pickle('../pickles/split/X_test_l.pkl'),
    pd.read_pickle('../pickles/split/y_test_l.pkl'),
)

Using Lasso Linear Regression.

In [33]:
# Fit a Lasso regression with default hyperparameters on the DC training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lasso_dc = Lasso().fit(X_train_dc, y_train_dc)

# Keep only the columns whose fitted coefficient is non-zero.
dc_nonzero_mask = lasso_dc.coef_ != 0
dc_selected = X_train_dc.columns[dc_nonzero_mask]
print('Selected Features:', dc_selected)

Selected Features: Index(['season', 'year', 'month', 'day', 'hour', 'weekday', 'precip', 'temp',
       'humidity'],
      dtype='object')


In [34]:
# Fit a Lasso regression with default hyperparameters on the London training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lasso_l = Lasso().fit(X_train_l, y_train_l)

# Keep only the columns whose fitted coefficient is non-zero.
l_nonzero_mask = lasso_l.coef_ != 0
l_selected = X_train_l.columns[l_nonzero_mask]
print('Selected Features:', l_selected)

Selected Features: Index(['season', 'year', 'month', 'day', 'hour', 'weekday', 'workingday',
       'weather', 'precip', 'temp', 'humidity', 'windspeed'],
      dtype='object')


Lasso keeps more features for the London dataset than for the DC dataset: on top of the nine features kept for DC, it also retains `workingday`, `weather` and `windspeed`.

Using Forward Selection

In [35]:
from feat_select_functions import forward_selection

In [36]:
# Run the project's forward-selection helper on the DC training split
# and report how many features it kept, followed by the features themselves.
dc_forward = forward_selection(X_train_dc, y_train_dc)
print(
    'Number selected:', len(dc_forward),
    '\nSelected features:', dc_forward,
)

Number selected: 11 
Selected features: ['atemp', 'humidity', 'hour', 'year', 'season', 'precip', 'windspeed', 'weather', 'month', 'holiday', 'weekday']


In [37]:
# Run the project's forward-selection helper on the London training split
# and report how many features it kept, followed by the features themselves.
l_forward = forward_selection(X_train_l, y_train_l)
print(
    'Number selected:', len(l_forward),
    '\nSelected features:', l_forward,
)

Number selected: 11 
Selected features: ['atemp', 'humidity', 'hour', 'workingday', 'precip', 'temp', 'windspeed', 'day', 'season', 'month', 'weather']


Using Backward Selection:

In [38]:
from feat_select_functions import backward_selection

In [39]:
# Run the project's backward-selection helper on the DC training split
# and report how many features it kept, followed by the features themselves.
dc_backward = backward_selection(X_train_dc, y_train_dc)
print(
    'Number selected:', len(dc_backward),
    '\nSelected features:', dc_backward,
)

Number selected: 11 
Selected features: ['season', 'year', 'month', 'hour', 'weekday', 'holiday', 'weather', 'precip', 'atemp', 'humidity', 'windspeed']


In [40]:
# Run the project's backward-selection helper on the London training split.
l_backward = backward_selection(X_train_l, y_train_l)
# Report the size of the backward-selection result itself; counting
# l_forward here would show the wrong list's length.
print('Number selected:', len(l_backward), '\nSelected features:', l_backward)

Number selected: 11 
Selected features: ['season', 'month', 'day', 'hour', 'workingday', 'weather', 'precip', 'temp', 'atemp', 'humidity', 'windspeed']


I'm curious to see how good a model I can get from different feature combinations, so I'm going to make new versions of each train and test set: one based on Lasso and, in the case of the DC set, one each for forward and backward selection.

In [41]:
# Show both London selections, then print every feature that appears in
# only one of them. Nothing printed between the lists and the trailing '-'
# means the two methods chose the same set of features.
print(l_forward)
print(l_backward)
# Check both directions: testing only backward-against-forward would miss
# a feature that forward selection kept but backward selection dropped.
for i in l_backward:
    if i not in l_forward:
        print(i)
for i in l_forward:
    if i not in l_backward:
        print(i)
print('-')

['atemp', 'humidity', 'hour', 'workingday', 'precip', 'temp', 'windspeed', 'day', 'season', 'month', 'weather']
['season', 'month', 'day', 'hour', 'workingday', 'weather', 'precip', 'temp', 'atemp', 'humidity', 'windspeed']
-


However, forward and backward selection chose identical feature lists for the London dataset, so I am making a single pair of train/test sets that covers both methods for that dataset.

In [42]:
# Lasso: turn the selected-column Index objects into plain Python lists
# (DC first, then London).
dc_lasso, lon_lasso = (
    selected.to_list() for selected in (dc_selected, l_selected)
)

In [43]:
# Restrict each train/test pair to the columns chosen by a selection method.
# Every pair is built from the same column list so train and test stay aligned.

# Lasso
X_train_dc_lasso, X_test_dc_lasso = (
    split[dc_lasso] for split in (X_train_dc, X_test_dc)
)
X_train_l_lasso, X_test_l_lasso = (
    split[lon_lasso] for split in (X_train_l, X_test_l)
)

# Forward selection (DC)
X_train_dc_fw, X_test_dc_fw = (
    split[dc_forward] for split in (X_train_dc, X_test_dc)
)

# Backward selection (DC)
X_train_dc_bw, X_test_dc_bw = (
    split[dc_backward] for split in (X_train_dc, X_test_dc)
)

# Forward/backward selection (London) - a single pair built from l_forward
X_train_l_fwbw, X_test_l_fwbw = (
    split[l_forward] for split in (X_train_l, X_test_l)
)

Pickle the new trains and tests into their own folders.

In [44]:
# Persist every feature-selected train/test pair in the folder for its
# selection method.
# to_pickle does not create missing parent directories, so make sure each
# destination folder exists first (a no-op when it is already there).
for folder in ('lasso', 'forward', 'backward', 'fwbw'):
    os.makedirs('../pickles/split/feat-select/' + folder, exist_ok=True)

# lasso
X_train_dc_lasso.to_pickle('../pickles/split/feat-select/lasso/X_train_dc_lasso.pkl')
X_test_dc_lasso.to_pickle('../pickles/split/feat-select/lasso/X_test_dc_lasso.pkl')
X_train_l_lasso.to_pickle('../pickles/split/feat-select/lasso/X_train_l_lasso.pkl')
X_test_l_lasso.to_pickle('../pickles/split/feat-select/lasso/X_test_l_lasso.pkl')

# forward select (DC)
X_train_dc_fw.to_pickle('../pickles/split/feat-select/forward/X_train_dc_fw.pkl')
X_test_dc_fw.to_pickle('../pickles/split/feat-select/forward/X_test_dc_fw.pkl')

# backward select (DC)
X_train_dc_bw.to_pickle('../pickles/split/feat-select/backward/X_train_dc_bw.pkl')
X_test_dc_bw.to_pickle('../pickles/split/feat-select/backward/X_test_dc_bw.pkl')

# FwBw select (London)
X_train_l_fwbw.to_pickle('../pickles/split/feat-select/fwbw/X_train_l_fwbw.pkl')
X_test_l_fwbw.to_pickle('../pickles/split/feat-select/fwbw/X_test_l_fwbw.pkl')