# 0.0 IMPORTS

In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.ensemble      import RandomForestRegressor
from boruta import BorutaPy

## 0.1. Loading Data

In [2]:
# Load the semicolon-separated 'df5.csv' file as the working frame for this step.
# NOTE(review): presumably this file is the saved output of the previous step (df5) — confirm.
df6 = pd.read_csv('df5.csv', sep=';')

# 6.0 STEP 06: FEATURE SELECTION

In [3]:
# Display the columns available before feature selection.
df6.columns

Index(['store', 'day_of_week', 'date', 'sales', 'promo', 'school_holiday',
       'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'is_promo', 'year', 'month',
       'day', 'week_of_year', 'year_week', 'competition_since',
       'competition_time_month', 'promo_since', 'promo_time_week',
       'state_holiday_christmas', 'state_holiday_easter_holiday',
       'state_holiday_public_holiday', 'state_holiday_regular_day',
       'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos',
       'day_sin', 'day_cos', 'week_of_year_sin', 'week_of_year_cos'],
      dtype='object')

## 6.1 Split original DataFrame into train and test

In [4]:
# Parse the 'date' column into datetime so it can be compared against the split date below:
df6['date'] = pd.to_datetime(df6['date'])

# Delete auxiliary variables
cols_drop = ['week_of_year', 'day', 'month', 'day_of_week', 'promo_since', 'competition_since', 'year_week']

# Reassign instead of mutating in place. errors='ignore' makes the cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError, and the
# postcondition (none of `cols_drop` remain in df6) holds either way.
df6 = df6.drop(columns=cols_drop, errors='ignore')

In [5]:
# Important dates: first and last day in the data, and the length of the 6-week hold-out window.
print( f"First date: {df6['date'].min().date()}")
print( f"Last date: {df6['date'].max().date()}")
print( f"6 weeks is equivalent to {datetime.timedelta(days=6*7)}")

Fisrt date: 2013-01-01
Fisrt date: 2015-07-31
6 weeks is equivalent to 42 days, 0:00:00


In [6]:
# Split data into train and test:
# The last 6 weeks of the dataset are held out as the test period.
# last day - 42 days → split date: 2015-06-19
# Use the global last date rather than the last date of whichever store happens to sort
# first, so the cut does not depend on that single store having data up to the final day.
cut_date = df6['date'].max() - datetime.timedelta(days=7 * 6)

# split the dataset into train (strictly before the cut) and test (cut date onwards):
df6_train = df6[df6['date'] < cut_date]
df6_test = df6[df6['date'] >= cut_date]

# split the training set into features and target ('date' is only used for the split):
X_train = df6_train.drop(['sales', 'date'], axis=1)
y_train = df6_train['sales']

# split the test set into features and target:
X_test = df6_test.drop(['date', 'sales'], axis=1)
y_test = df6_test['sales']

## 6.2. Boruta as Feature Selector

In [7]:
# Instantiate the Random Forest regressor passed to Boruta as its estimator;
# n_jobs=-1 runs it on all available CPU cores:
rf = RandomForestRegressor(n_jobs=-1)

In [8]:
# BorutaPy takes plain NumPy arrays, so convert the training frame and target first:
X_train_array = X_train.values
y_train_array = np.ravel(y_train.values)

# Configure the Boruta selector around the random forest, then fit it on the training data
# (fit returns the fitted selector itself):
boruta = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=42,
)
boruta = boruta.fit(X_train_array, y_train_array)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	9


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	9


### 6.2.1. Best Features from Boruta

In [13]:
# Boolean mask over X_train's columns: True where Boruta confirmed the feature
cols_selected = boruta.support_.tolist()

# names of the confirmed features, in X_train column order
cols_selected_boruta = X_train.columns[boruta.support_].to_list()

# names of the remaining features (np.setdiff1d returns them sorted)
cols_not_selected_boruta = np.setdiff1d(X_train.columns, cols_selected_boruta).tolist()

In [15]:
# Display the features confirmed by Boruta.
cols_selected_boruta

['store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo_time_week',
 'day_of_week_sin',
 'day_of_week_cos',
 'month_cos',
 'day_sin',
 'day_cos',
 'week_of_year_cos']

In [16]:
# Display the features Boruta did not confirm.
cols_not_selected_boruta

['is_promo',
 'month_sin',
 'school_holiday',
 'state_holiday_christmas',
 'state_holiday_easter_holiday',
 'state_holiday_public_holiday',
 'state_holiday_regular_day',
 'week_of_year_sin',
 'year']

In [21]:
### Cols selected by Boruta (hard-coded copy of the result, so the Boruta fit need not be re-run):
cols_selected_by_boruta = [
    'store', 'promo', 'store_type', 'assortment',
    'competition_distance',
    'competition_open_since_month', 'competition_open_since_year',
    'promo2', 'promo2_since_week', 'promo2_since_year',
    'competition_time_month', 'promo_time_week',
    'day_of_week_sin', 'day_of_week_cos',
    'month_cos',
    'day_sin', 'day_cos',
    'week_of_year_cos',
]

### Cols not selected by Boruta:
cols_not_selected_by_boruta = [
    'is_promo',
    'month_sin',
    'school_holiday',
    'state_holiday_christmas', 'state_holiday_easter_holiday',
    'state_holiday_public_holiday', 'state_holiday_regular_day',
    'week_of_year_sin',
    'year',
]

## 6.3. Manual Feature Selection

In [None]:
# Boruta's selection plus two manually added features: 'month_sin' and 'week_of_year_sin'
# (the sine halves of cyclical encodings whose cosine halves Boruta kept).
cols_selected_boruta = [
    'store', 'promo', 'store_type', 'assortment',
    'competition_distance',
    'competition_open_since_month', 'competition_open_since_year',
    'promo2', 'promo2_since_week', 'promo2_since_year',
    'competition_time_month', 'promo_time_week',
    'day_of_week_sin', 'day_of_week_cos',
    'month_cos', 'month_sin',
    'day_sin', 'day_cos',
    'week_of_year_cos', 'week_of_year_sin',
]

# columns to add:
feat_to_add = ['date', 'sales']

# final features:
# NOTE(review): after this line the list also contains 'date' and the target 'sales';
# presumably downstream code removes them before fitting a model — confirm.
cols_selected_boruta += feat_to_add