In [117]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.ensemble import RandomForestRegressor

In [4]:
sys.path.insert(0, '../modules/')
import cleaning as cln
import feature_eng as feng

In [5]:
#Import data
store = pd.read_csv('../data/store.csv', index_col='Store')
train = pd.read_csv('../data/train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Merging and cleaning data

In [84]:
#Merge data
df = cln.merge(train, store)
df.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,2013-01-01,1115.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,c,5350.0,,,1.0,22.0,2012.0,"Mar,Jun,Sept,Dec"
1,2013-01-01,379.0,2.0,0.0,0.0,0.0,0.0,a,1.0,d,a,6630.0,,,0.0,,,
2,2013-01-01,378.0,2.0,0.0,0.0,0.0,0.0,a,1.0,a,c,2140.0,8.0,2012.0,0.0,,,
3,2013-01-01,377.0,2.0,0.0,0.0,0.0,0.0,a,1.0,a,c,100.0,6.0,2010.0,1.0,18.0,2010.0,"Feb,May,Aug,Nov"
4,2013-01-01,376.0,2.0,0.0,0.0,0.0,0.0,a,1.0,a,a,160.0,8.0,2012.0,0.0,,,


In [85]:
#Drop customers column
df = cln.drop_column(df)

In [86]:
#Clean the StateHoliday column
df = cln.clean_column_values(df)

In [87]:
#Remove null and zero values for Sales
df = cln.clean_targets(df)

In [88]:
#Use the cleaning function
df = cln.rough_features_cleaning(df)

Total number of rows before cleaning:  515849
Total number of rows after cleaning:  425689


In [89]:
df.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
27,2013-01-01,353.0,2.0,3139.0,1.0,0.0,a,1.0,b,b,900.0,1.0
115,2013-01-01,335.0,2.0,2401.0,1.0,0.0,a,1.0,b,a,90.0,1.0
147,2013-01-01,512.0,2.0,2646.0,1.0,0.0,a,1.0,b,b,590.0,1.0
162,2013-01-01,494.0,2.0,3113.0,1.0,0.0,a,1.0,b,a,1260.0,0.0
199,2013-01-01,530.0,2.0,2907.0,1.0,0.0,a,1.0,a,c,18160.0,0.0


In [90]:
df.isna().any()

Date                   False
Store                  False
DayOfWeek              False
Sales                  False
Open                   False
Promo                  False
StateHoliday           False
SchoolHoliday          False
StoreType              False
Assortment             False
CompetitionDistance    False
Promo2                 False
dtype: bool

### Feature engineering

In [91]:
#Add new dates features
df = feng.dates_features(df)

In [92]:
df.head()

Unnamed: 0,Date,Store,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,month,day_of_week,day_of_month,is_monday,is_saturday
27,2013-01-01,353.0,3139.0,1.0,0.0,a,1.0,b,b,900.0,1.0,1,1,1,0,0
115,2013-01-01,335.0,2401.0,1.0,0.0,a,1.0,b,a,90.0,1.0,1,1,1,0,0
147,2013-01-01,512.0,2646.0,1.0,0.0,a,1.0,b,b,590.0,1.0,1,1,1,0,0
162,2013-01-01,494.0,3113.0,1.0,0.0,a,1.0,b,a,1260.0,0.0,1,1,1,0,0
199,2013-01-01,530.0,2907.0,1.0,0.0,a,1.0,a,c,18160.0,0.0,1,1,1,0,0


In [93]:
df = feng.one_hot_encoding(df, 'StateHoliday')
df = feng.one_hot_encoding(df, 'StoreType')
df = feng.one_hot_encoding(df, 'Assortment')

In [94]:
#Splitting data in train and test
df_train, df_test = feng.date_split_train_test(df, '2014-05-01')

In [95]:
#One hot encoding of StateHoliday, StoreType, Assortment for train
#df_train = feng.one_hot_encoding(df_train, 'StateHoliday')
#df_train = feng.one_hot_encoding(df_train, 'StoreType')
#df_train = feng.one_hot_encoding(df_train, 'Assortment')

In [96]:
#One hot encoding of StateHoliday, StoreType, Assortment for test
#df_test = feng.one_hot_encoding(df_test, 'StateHoliday')
#df_test = feng.one_hot_encoding(df_test, 'StoreType')
#df_test = feng.one_hot_encoding(df_test, 'Assortment')

In [97]:
#Add mean encoding for the Store id for train
df_train, dict_store_values = feng.mean_encoding(df_train, 'Store')

In [121]:
#Apply same transformation to test with the values from train
df_test.loc[:, 'Store_mean_encoded'] = df_test.loc[:, 'Store'].replace(to_replace=dict_store_values)

In [122]:
df_test.head()

Unnamed: 0,Date,Store,Sales,Promo,SchoolHoliday,CompetitionDistance,Promo2,month,day_of_week,day_of_month,...,StateHoliday _b,StateHoliday _c,StoreType _a,StoreType _b,StoreType _c,StoreType _d,Assortment _a,Assortment _b,Assortment _c,Store_mean_encoded
540889,2014-05-01,335.0,10770.0,1.0,0.0,90.0,1.0,5,3,1,...,0,0,0,1,0,0,1,0,0,13029.339286
540921,2014-05-01,512.0,7793.0,1.0,0.0,590.0,1.0,5,3,1,...,0,0,0,1,0,0,0,1,0,5075.170732
540934,2014-05-01,494.0,9561.0,1.0,0.0,1260.0,0.0,5,3,1,...,0,0,0,1,0,0,1,0,0,7478.446429
541068,2014-05-01,85.0,11656.0,1.0,0.0,1870.0,0.0,5,3,1,...,0,0,0,1,0,0,1,0,0,7096.483544
541169,2014-05-01,54.0,5829.0,1.0,0.0,7170.0,1.0,5,3,1,...,0,0,0,0,0,1,0,0,1,7798.825


In [99]:
#drop Store columns from train
df_train = df_train.drop('Store', axis=1)

In [109]:
#Drop Open (only 1)
df_train = df_train.drop('Open', axis=1)
df_test = df_test.drop('Open', axis=1)

KeyError: "['Open'] not found in axis"

In [110]:
df_test = df_test.drop('Open', axis=1)

## Models

In [101]:
#Error function
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [123]:
X_train = df_train.drop(['Date','Sales'], axis=1)
y_train = df_train.loc[:, 'Sales']

X_test = df_test.drop(['Date','Sales', 'Store'], axis=1)
y_test = df_test.loc[:, 'Sales']

In [124]:
X_test.head()

Unnamed: 0,Promo,SchoolHoliday,CompetitionDistance,Promo2,month,day_of_week,day_of_month,is_monday,is_saturday,StateHoliday _0,...,StateHoliday _b,StateHoliday _c,StoreType _a,StoreType _b,StoreType _c,StoreType _d,Assortment _a,Assortment _b,Assortment _c,Store_mean_encoded
540889,1.0,0.0,90.0,1.0,5,3,1,0,0,0,...,0,0,0,1,0,0,1,0,0,13029.339286
540921,1.0,0.0,590.0,1.0,5,3,1,0,0,0,...,0,0,0,1,0,0,0,1,0,5075.170732
540934,1.0,0.0,1260.0,0.0,5,3,1,0,0,0,...,0,0,0,1,0,0,1,0,0,7478.446429
541068,1.0,0.0,1870.0,0.0,5,3,1,0,0,0,...,0,0,0,1,0,0,1,0,0,7096.483544
541169,1.0,0.0,7170.0,1.0,5,3,1,0,0,0,...,0,0,0,0,0,1,0,0,1,7798.825


### Baseline models - averages

#### Basic average

In [112]:
avg_sales = df_train.Sales.mean()

In [113]:
actuals = df_test.Sales
preds = np.full_like(actuals, avg_sales)

In [114]:
actuals = actuals.to_numpy()

In [115]:
base_model_1_res = metric(preds, actuals)
base_model_1_res

55.04215945266602

### XGBoost tree

In [116]:
model_xgb_1 = xgboost.XGBRegressor()
model_xgb_1.fit(X_train, y_train)



XGBRegressor()

In [133]:
y_pred_xgb_1 = model_xgb_1.predict(X_test)

In [128]:
metric(y_pred, y_test.to_numpy())

20.52322760690766

In [120]:
X_test.columns

Index(['Promo', 'SchoolHoliday', 'CompetitionDistance', 'Promo2', 'month',
       'day_of_week', 'day_of_month', 'is_monday', 'is_saturday',
       'StateHoliday _0', 'StateHoliday _a', 'StateHoliday _b',
       'StateHoliday _c', 'StoreType _a', 'StoreType _b', 'StoreType _c',
       'StoreType _d', 'Assortment _a', 'Assortment _b', 'Assortment _c'],
      dtype='object')

### Random forest

In [136]:
model_rf_1 = RandomForestRegressor(n_estimators=500, max_depth=3)


In [None]:
model_rf_1.fit(X_train, y_train)

In [134]:
y_pred_rf_1 = model_rf_1.predict(X_test)

In [135]:
metric(y_pred_rf_1, y_test.to_numpy())

34.27410775618792

In [None]:
#next steps: mean encore storetype and assortment