# 0.0 IMPORTS

In [1]:
import numpy               as np
import pandas              as pd

import datetime

from sklearn.ensemble      import RandomForestRegressor

!pip install boruta
from boruta import BorutaPy

from IPython.display import display #only outside of Jupyter Notebook



# 6.0 Feature Selection

In [2]:
# Load the dataset for the feature-selection step from 'df5.csv'
# (presumably the output of the previous notebook section — confirm).
df6 = pd.read_csv('df5.csv')

Remove repeated and auxiliary variables:

In [3]:
# Columns to discard before feature selection.
remove_cols = [
    'day_of_week',
    'week_of_year',
    'day',
    'month',
    'promo_since',
    'competition_since',
    'year_week',
]

# `columns=` is the keyword form of `axis=1`; a new frame is returned and rebound to df6.
df6 = df6.drop(columns=remove_cols)

In [4]:
# Inspect the dtype of every remaining column to spot the ones that need casting.
df6.dtypes

store                             int64
date                             object
sales                           float64
promo                             int64
school_holiday                    int64
store_type                        int64
assortment                        int64
competition_distance            float64
competition_open_since_month      int64
competition_open_since_year       int64
promo2                            int64
promo2_since_week                 int64
promo2_since_year                 int64
is_promo                          int64
year                            float64
competition_time_month          float64
promo_time_week                 float64
state_holiday_christmas           int64
state_holiday_easter_holiday      int64
state_holiday_public_holiday      int64
state_holiday_regular_day         int64
month_sin                       float64
month_cos                       float64
day_sin                         float64
day_cos                         float64


In [5]:
# Parse the date strings into datetime64 so the time-based split can compare dates.
df6['date'] = pd.to_datetime(df6['date'])

# Cast three float64 columns to int with a single astype mapping.
# NOTE(review): astype(int) truncates toward zero and raises on NaN; if `sales`
# was rescaled or log-transformed upstream this would discard information — confirm.
df6 = df6.astype({
    'sales': int,
    'year': int,
    'competition_time_month': int,
})

## 6.1 Split original DataFrame into train and test

In [6]:
# Cut-off date: six weeks (42 days) before the most recent date in the data.
date_filter = df6['date'].max() - datetime.timedelta(weeks=6)

# Rows strictly before the cut-off are used for training; rows on or after it for testing.
df6_train = df6.loc[df6['date'].lt(date_filter)]
df6_test = df6.loc[df6['date'].ge(date_filter)]

In [7]:
# Features are every column except the date and the target; the target is `sales`.
X_train, y_train = df6_train.drop(columns=['date', 'sales']), df6_train['sales']
X_test, y_test = df6_test.drop(columns=['date', 'sales']), df6_test['sales']

## 6.2 Boruta as Feature selector

In [8]:
# Random forest used as the base estimator for feature selection;
# n_jobs=-1 lets scikit-learn use every available CPU core.
rf = RandomForestRegressor(n_jobs=-1)

BorutaPy only accepts NumPy arrays as input, so convert the training data first:

In [9]:
# Convert the training features and target to plain NumPy arrays.
X_train_arr = X_train.to_numpy()
# Series.ravel() is deprecated since pandas 2.2; to_numpy() on a Series
# already returns the same 1-D array.
y_train_arr = y_train.to_numpy()

In [10]:
# Build the Boruta selector around the random forest and fit it on the training arrays.
# fit() returns the fitted selector, so `boruta` is bound to the fitted object.
# This cell is long-running (hours, per the notes further down).
boruta = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=42,
).fit(X_train_arr, y_train_arr)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	9 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	10 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	11 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	1
Rejected: 	14
Iteration: 	16 / 100
Confirmed: 	12
Tentative: 	1
Reject

In [11]:
# Boolean mask of the features Boruta kept, as a plain list
# (presumably excludes still-tentative features — verify against BorutaPy docs).
cols_selected = boruta.support_.tolist()

# Names of the selected features, in X_train column order.
cols_selected_boruta = X_train.columns[boruta.support_].to_list()

# Names of all remaining features; np.setdiff1d returns them sorted and de-duplicated.
cols_not_selected_boruta = list(np.setdiff1d(X_train.columns, cols_selected_boruta))

In [12]:
# Show the names of the features selected by Boruta.
display(cols_selected_boruta)

['store',
 'promo',
 'store_type',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2_since_week',
 'promo2_since_year',
 'promo_time_week',
 'day_cos',
 'day_of_week_sin',
 'day_of_week_cos']

Boruta ran for 27 iterations (3h50min) and selected twelve features; the other fifteen were rejected:

cols_selected_boruta = ['store', 'promo', 'store_type', 'competition_distance', 'competition_open_since_month', 'competition_open_since_year', 'promo2_since_week', 'promo2_since_year', 'promo_time_week', 'day_cos', 'day_of_week_sin', 'day_of_week_cos']

The same output was obtained when running from the command line, in about one third of the time!