# 4 - Feature Selection

In [15]:
import pandas as pd
from sklearn.linear_model import Lasso
import statsmodels.api as sm

In [16]:
X_train_dc = pd.read_pickle('pickles/split/X_train_dc.pkl')
y_train_dc = pd.read_pickle('pickles/split/y_train_dc.pkl')
X_test_dc = pd.read_pickle('pickles/split/X_test_dc.pkl')
y_test_dc = pd.read_pickle('pickles/split/y_test_dc.pkl')

X_train_l = pd.read_pickle('pickles/split/X_train_l.pkl')
y_train_l = pd.read_pickle('pickles/split/y_train_l.pkl')
X_test_l = pd.read_pickle('pickles/split/X_test_l.pkl')
y_test_l = pd.read_pickle('pickles/split/y_test_l.pkl')

Using Lasso Linear Regression.

In [17]:
lasso_dc = Lasso()
lasso_dc.fit(X_train_dc, y_train_dc)

dc_selected = X_train_dc.columns[(lasso_dc.coef_ != 0)]
print('Selected Features:', dc_selected)

Selected Features: Index(['season', 'year', 'month', 'day', 'hour', 'weekday', 'precip', 'temp',
       'humidity'],
      dtype='object')


In [18]:
lasso_l = Lasso()
lasso_l.fit(X_train_l, y_train_l)

l_selected = X_train_l.columns[(lasso_l.coef_ != 0)]
print('Selected Features:', l_selected)

Selected Features: Index(['season', 'year', 'month', 'day', 'hour', 'weekday', 'workingday',
       'weather', 'precip', 'temp', 'humidity', 'windspeed'],
      dtype='object')


The London dataset selects more features than the DC dataset using Lasso, also keeping `workingday`, `weather` and `windspeed`.

Using Forward Selection

In [19]:
from feat_select_functions import forward_selection

In [20]:
dc_forward = forward_selection(X_train_dc,y_train_dc)
print('Number selected:', len(dc_forward), '\nSelected features:',dc_forward)

Number selected: 12 
Selected features: ['temp', 'humidity', 'hour', 'year', 'month', 'precip', 'holiday', 'weather', 'season', 'atemp', 'windspeed', 'weekday']


In [21]:
l_forward = forward_selection(X_train_l,y_train_l)
print('Number selected:', len(l_forward), '\nSelected features:',l_forward)

Number selected: 11 
Selected features: ['temp', 'humidity', 'hour', 'workingday', 'precip', 'day', 'windspeed', 'atemp', 'season', 'month', 'weather']


Using Backward Selection:

In [22]:
from feat_select_functions import backward_selection

In [23]:
dc_backward = backward_selection(X_train_dc,y_train_dc)
print('Number selected:', len(dc_backward), '\nSelected features:',dc_backward)

Number selected: 11 
Selected features: ['season', 'year', 'month', 'hour', 'weekday', 'holiday', 'weather', 'precip', 'atemp', 'humidity', 'windspeed']


In [24]:
l_backward = backward_selection(X_train_l,y_train_l)
print('Number selected:', len(l_forward), '\nSelected features:',l_forward)

Number selected: 11 
Selected features: ['temp', 'humidity', 'hour', 'workingday', 'precip', 'day', 'windspeed', 'atemp', 'season', 'month', 'weather']


Forward and backward selection have the same selected features, and as a result the same differences in selected features between the datasets.<p>
Selected for both<ul>
`season`<br>
`month`<br>
`hour`<br>
`weather`<br>
`precip`<br>
`atemp`<br>
`humidity`<br>
`windspeed`</ul>

Only for DC:<ul>
`year`<br>
`weekday`<br>
`holiday`</ul>

Only for London:<ul>
`temp`<br>
`workingday`<br>
`day`</ul>

Never selected:<ul> 
DC - `workingday`<p>
London - `Holiday`

I'm curious to see how good of a model I can get based on different feature combinations, so I'm going to make two more versions of each set; one based on Lasso and one based on forward/backward selection (since they chose the same features for each dataset.)

In [25]:
# list of selected columns
dc_lasso = dc_selected.to_list()
lon_lasso = l_selected.to_list()

In [26]:
# apply selected columns to trains and tests
X_train_dc_lasso = X_train_dc[dc_lasso]
X_test_dc_lasso = X_test_dc[dc_lasso]

X_train_l_lasso = X_train_l[lon_lasso]
X_test_l_lasso = X_test_l[lon_lasso]

For the FwBw selection, each dataset had one feature that was not selected, so only that will be dropped. 

In [27]:
X_train_dc_fwbw = X_train_dc.drop('workingday',axis=1)
X_test_dc_fwbw = X_test_dc.drop('workingday',axis=1)

X_train_l_fwbw = X_train_l.drop('holiday',axis=1)
X_test_l_fwbw = X_test_l.drop('holiday',axis=1)

Pickle the new trains and tests into their own folders.

In [28]:
# lasso selected
X_train_dc_lasso.to_pickle('pickles/split/lasso/X_train_dc_lasso.pkl')
X_test_dc_lasso.to_pickle('pickles/split/lasso/X_test_dc_lasso.pkl')
X_train_l_lasso.to_pickle('pickles/split/lasso/X_train_l_lasso.pkl')
X_test_l_lasso.to_pickle('pickles/split/lasso/X_test_l_lasso.pkl')

# forward/backward selected
X_train_dc_fwbw.to_pickle('pickles/split/fwbw/X_train_dc_fwbw.pkl')
X_test_dc_fwbw.to_pickle('pickles/split/fwbw/X_test_dc_fwbw.pkl')
X_train_l_fwbw.to_pickle('pickles/split/fwbw/X_train_l_fwbw.pkl')
X_test_l_fwbw.to_pickle('pickles/split/fwbw/X_test_l_fwbw.pkl')