In [3]:
import os
import numpy as np
import pandas as pd
import datetime 

# Show up to 100 columns when a DataFrame is displayed (the feature frame is wide).
pd.options.display.max_columns = 100

In [4]:
# Work from the parent of the directory the kernel started in.
# The start directory is remembered the first time this cell runs, so that
# re-running the cell always lands in the same place instead of climbing one
# level further up on every execution.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [6]:
# Show the current working directory. Plain Python is used instead of the
# `pwd` automagic so the cell still works when automagic is switched off,
# when a variable named `pwd` exists, or when the notebook is exported as a script.
os.getcwd()

'/Users/A.IVA/Documents/jupyter_notebooks/coursera_and_blogs/rossmann_competition'

### Load data and generate new features

In [32]:
from rossmann.preprocess import load_and_preprocess
from rossmann.feature_generator import promo2_running

In [36]:
# The source files are addressed relative to the current working directory
# (which the `os.chdir` cell above has already changed).
data_dir = os.path.join('..', 'data', 'rossmann-store-sales', 'source')

# `load_and_preprocess` is a project helper; it returns the frame used by every cell below.
df = load_and_preprocess(data_dir)

# Peek at the first two rows.
df.head(n=2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,competition_since,promos2
0,1,2,2013-01-01,0.0,0.0,0.0,0,a,1,,c,a,1270.0,2008-09-01,
1,1,3,2013-01-02,5530.0,668.0,1.0,0,0,1,,c,a,1270.0,2008-09-01,


##### generate features (set a)

In [37]:
# Is promo2 running?
# `promo2_running` is called once per row with that row's `Date` and `promos2`
# values. The two columns are paired explicitly rather than read back as
# `x[0]` / `x[1]`: integer keys on a label-indexed row rely on a positional
# fallback that recent pandas versions deprecate, and a row-wise
# `DataFrame.apply` is much slower than iterating the two columns directly.
df['Promo2'] = [promo2_running(date, promos)
                for date, promos in zip(df['Date'], df['promos2'])]

# Has the last competitor already arrived?
df['last_competitor_here'] = (df['competition_since'] <= df['Date']).astype(int)

# I will drop the `Date` column and use this column as the trend generator:
# the number of whole days elapsed since 2013-01-01.
# NOTE(review): assumes `Date` is a datetime64 column (required by `.dt`) --
# confirm against `load_and_preprocess`.
df['days_since_start'] = (df['Date'] - pd.Timestamp(2013, 1, 1)).dt.days

# One-hot encoding of the categorical columns happens in a later cell,
# after all derived features have been created.


In [38]:
# Check that Promo2, last_competitor_here and days_since_start were added.
df.head(n=2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,competition_since,promos2,Promo2,last_competitor_here,days_since_start
0,1,2,2013-01-01,0.0,0.0,0.0,0,a,1,,c,a,1270.0,2008-09-01,,0,1,0
1,1,3,2013-01-02,5530.0,668.0,1.0,0,0,1,,c,a,1270.0,2008-09-01,,0,1,1


##### generate features (set b)

In [None]:
# Take into account the unusual sales behaviour around Christmas / New Year:
# encode how many weeks are left until the end of the year and how many have passed since its start.

In [57]:
# Position of each date inside its calendar year, computed with the
# vectorised `.dt` accessor instead of a per-row Python lambda.
# NOTE(review): assumes `Date` is a datetime64 column holding date-only
# values with no missing entries -- confirm against `load_and_preprocess`.
# Days remaining until 31 December of the same year (0 on 31 December).
df['days_left_yr'] = (365 + df['Date'].dt.is_leap_year.astype('int64')
                      - df['Date'].dt.dayofyear.astype('int64'))
# Days elapsed since 1 January of the same year (0 on 1 January).
df['days_since_yr'] = df['Date'].dt.dayofyear.astype('int64') - 1

# Whole weeks to the end / from the start of the year. Both are capped so
# that every date further away than the cap shares one bucket:
# 12 or more weeks before year end -> 12, 3 or more weeks into the year -> 3.
df['weeks_left_yr'] = (df['days_left_yr'] // 7).clip(upper=12)
df['weeks_since_yr'] = (df['days_since_yr'] // 7).clip(upper=3)

# 1 when the row is a state holiday (StateHoliday other than '0') or has
# DayOfWeek == 7, otherwise 0.
df['any_state_holiday'] = ((df['StateHoliday'] != '0') | (df['DayOfWeek'] == 7)).astype(int)

In [58]:
# this part will be utilized better with xgboost
#
# Lead/lag copies of `any_state_holiday` within each store:
#   any_holiday_in_{i}d holds the flag of the row `i` positions later for the
#   same store (i = -1 is the previous row, i = 0 is the row itself).
# Positions that fall outside a store's rows are filled with 0.
# NOTE(review): the shift is by rows, not by calendar days, so "in i days"
# only holds if every store has exactly one row per consecutive date --
# confirm that the data has no gaps.

# The sort and the grouping do not depend on `i`, so they are built once.
# The shifted values keep the original row index, so assigning them back to
# `df` aligns each value with its own row.
holiday_flag_by_store = (
    df.sort_values(['Store', 'Date'])
      .groupby(['Store'])['any_state_holiday']
)

for i in range(-1, 7):
    df[f'any_holiday_in_{i}d'] = holiday_flag_by_store.shift(-i).fillna(0)

In [61]:
# display(df['weeks_left_yr'].value_counts())
# display(df['weeks_since_yr'].value_counts())

In [44]:
# Spot-check the look-ahead holiday flags of store 1 against `any_state_holiday`.
df.loc[
    df['Store'] == 1,
    ['any_state_holiday', 'any_holiday_in_1d', 'any_holiday_in_2d', 'any_holiday_in_3d']
].head(10)

Unnamed: 0,any_state_holiday,any_holiday_in_1d,any_holiday_in_2d,any_holiday_in_3d
0,1,0.0,0.0,0.0
1,0,0.0,0.0,0.0
2,0,0.0,0.0,1.0
3,0,0.0,1.0,0.0
4,0,1.0,0.0,0.0
5,1,0.0,0.0,0.0
6,0,0.0,0.0,0.0
7,0,0.0,0.0,0.0
8,0,0.0,0.0,0.0
9,0,0.0,0.0,1.0


In [62]:
# Inspect the frame after the calendar and holiday features were added.
# NOTE(review): the recorded output of this cell lists `days_left` and
# `weeks_left`, which no visible cell creates -- presumably left over from an
# earlier kernel state; re-run from a fresh kernel to confirm.
df.head(n=2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,competition_since,promos2,Promo2,last_competitor_here,days_since_start,days_left,weeks_left,any_state_holiday,any_holiday_in_-1d,any_holiday_in_0d,any_holiday_in_1d,any_holiday_in_2d,any_holiday_in_3d,any_holiday_in_4d,any_holiday_in_5d,any_holiday_in_6d,days_left_yr,days_since_yr,weeks_left_yr,weeks_since_yr
0,1,2,2013-01-01,0.0,0.0,0.0,0,a,1,,c,a,1270.0,2008-09-01,,0,1,0,364,52,1,0.0,1,0.0,0.0,0.0,0.0,1.0,0.0,364,0,12,0
1,1,3,2013-01-02,5530.0,668.0,1.0,0,0,1,,c,a,1270.0,2008-09-01,,0,1,1,363,51,0,1.0,0,0.0,0.0,0.0,1.0,0.0,0.0,363,1,12,0


In [64]:
# OHE of some variables
#
# * `StateHoliday` is deliberately not encoded; `any_state_holiday` is used
#   in its place.
# * Every prefix ends with '_' and `get_dummies` inserts its own '_'
#   separator, so the generated names carry a double underscore
#   (`day__1`, `weeks_left_yr__0`, ...). Later cells refer to these names.
# * With `columns=`, each listed column is removed and its dummy columns are
#   appended at the end of the frame, in the order listed here.
# * The dtype is pinned to uint8 so the dummies are 0/1 integers on every
#   pandas version (newer releases would otherwise produce booleans).
# * The positional axis argument to `drop` is avoided (it is rejected by
#   pandas >= 2.0).
# * Not idempotent: the source columns are consumed, so re-running this cell
#   without re-running the cells above raises a KeyError.
df = pd.get_dummies(
    df,
    columns=['DayOfWeek', 'weeks_left_yr', 'weeks_since_yr'],
    prefix={
        'DayOfWeek': 'day_',
        'weeks_left_yr': 'weeks_left_yr_',
        'weeks_since_yr': 'weeks_since_yr_',
    },
    dtype='uint8',
)

In [65]:
# Confirm the dummy columns (day__*, weeks_left_yr__*, weeks_since_yr__*) replaced their sources.
df.head(n=2)

Unnamed: 0,Store,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,competition_since,promos2,Promo2,last_competitor_here,days_since_start,days_left,weeks_left,any_state_holiday,any_holiday_in_-1d,any_holiday_in_0d,any_holiday_in_1d,any_holiday_in_2d,any_holiday_in_3d,any_holiday_in_4d,any_holiday_in_5d,any_holiday_in_6d,days_left_yr,days_since_yr,day__1,day__2,day__3,day__4,day__5,day__6,day__7,weeks_left_yr__0,weeks_left_yr__1,weeks_left_yr__2,weeks_left_yr__3,weeks_left_yr__4,weeks_left_yr__5,weeks_left_yr__6,weeks_left_yr__7,weeks_left_yr__8,weeks_left_yr__9,weeks_left_yr__10,weeks_left_yr__11,weeks_left_yr__12,weeks_since_yr__0,weeks_since_yr__1,weeks_since_yr__2,weeks_since_yr__3
0,1,2013-01-01,0.0,0.0,0.0,0,a,1,,c,a,1270.0,2008-09-01,,0,1,0,364,52,1,0.0,1,0.0,0.0,0.0,0.0,1.0,0.0,364,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,1,2013-01-02,5530.0,668.0,1.0,0,0,1,,c,a,1270.0,2008-09-01,,0,1,1,363,51,0,1.0,0,0.0,0.0,0.0,1.0,0.0,0.0,363,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0


In [None]:
# Names of the exogenous feature columns, in a fixed order:
# base flags and trend, day-of-week dummies, capped weeks-to-year-end dummies,
# capped weeks-since-year-start dummies, then the holiday lead/lag flags.
exogen_vars = (
    ['Promo', 'Promo2', 'SchoolHoliday', 'any_state_holiday',
     'last_competitor_here', 'days_since_start']
    + [f'day__{day}' for day in range(1, 8)]
    + [f'weeks_left_yr__{week}' for week in range(13)]
    + [f'weeks_since_yr__{week}' for week in range(4)]
    + [f'any_holiday_in_{offset}d' for offset in range(-1, 7)]
)