# Predicting sales for Rossmann stores: pipeline

In this notebook we present the development of our model.

In [23]:
# Project-local helpers: data cleaning, feature engineering, train/validation
# splitting, the evaluation metric and the XGBoost train/predict utilities.
from data_cleaning_rossman import clean_rossman
from feature_eng import feat_eng
# A bare trailing comma after the last imported name is a SyntaxError unless
# the names are wrapped in parentheses, so the line must end at the name.
from models import split_rossmann_feat
from utils import metric, plot_stores_sales, train_xgboost, predict_xgboost

# Standard library
import warnings

In [24]:
# ignore future warning  when using TargetEnconder()
warnings.filterwarnings("ignore", category=FutureWarning)

# Data cleaning

The function `clean_rossman()` performs some initial cleaning tasks on the data.
It takes as input a timeseries csv file, a csv with store info and the directory where the
two csv files can be found.
The function reads the csv files as pandas DataFrames. First the two DataFrames are merged on the Store column, using a left join so that no data is lost.
The cleaning functions performed are:
* Replace missing values for Customers data with the median of the column
* Drop all rows without a store id or a sales value as we don't have any info in that case
* Drop the Open Column since we assume the store is open on days when there are sales
* Fill in the missing values of the DayOfWeek column based on the Date
* Drop the rows with missing values from the Promo column
* Replace all the missing values from StateHoliday and SchoolHoliday with the most frequent value of the column
* Replace all the missing values from the CompetitionDistance column with the most frequent distance for that store
* Change the missing values in the PromoInterval to a 'No Promo' string to be able to use them in the feature engineering
* If we are cleaning the train Dataframe, the second percentile outliers get removed from the Sales and Customers columns

In [25]:
# Run the cleaning step on the training time series (train.csv) together with
# the store information (store.csv); both files are looked up in the data folder.
train_cleaned = clean_rossman(
    csv_tseries='train.csv',
    csv_store='store.csv',
    data_folder='data',
)

In [26]:
# Run the same cleaning step on the holdout time series (holdout.csv) together
# with the store information (store.csv); both files are looked up in the data folder.
holdout_cleaned = clean_rossman(
    csv_tseries='holdout.csv',
    csv_store='store.csv',
    data_folder='data',
)

# Feature engineering

The user function `feat_eng()` performs some engineering tasks having as input a pandas dataframe. The main tasks performed by this function are:
* Create new features (year, day of the month and week of the year) from the `Date` column and
     change types when necessary
* Create a new feature dividing the Sales per Customer per Store
* Apply one-hot and target encoding to some categorical features
* Drop some features
The user can choose to work with the train or the test file through the parameter `train_data`.

In [27]:
# Feature-engineer the cleaned training frame. No te_* objects are supplied on
# this training pass (all None); the call returns the engineered frame plus
# three te_* objects (presumably the fitted target encoders - confirm in feature_eng).
train_eng, te_store, te_week, te_day = feat_eng(
    train_cleaned,
    train_data=True,
    te_store=None,
    te_week=None,
    te_day=None,
)

In [28]:
# Feature-engineer the cleaned holdout frame with train_data=False, passing in
# the te_store / te_week / te_day objects already bound in the notebook.
# The three te_* names are rebound to whatever feat_eng returns.
holdout_eng, te_store, te_week, te_day = feat_eng(
    holdout_cleaned,
    train_data=False,
    te_store=te_store,
    te_week=te_week,
    te_day=te_day,
)

In [29]:
# peek at the first three engineered training rows
train_eng.head(n=3)

Unnamed: 0,Sales,Promo,CompetitionDistance,Promo2,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,SchoolHoliday_0,SchoolHoliday_1,...,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7,Store_target,WeekofYear_target,Day_target
27,3139,0,900,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,5104.747082,5852.078279,6798.600823
115,2401,0,90,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,8061.232558,5852.078279,6798.600823
147,2646,0,590,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,5161.451883,5852.078279,6798.600823


In [30]:
# peek at the first three engineered holdout rows
holdout_eng.head(n=3)

Unnamed: 0,Sales,Promo,CompetitionDistance,Promo2,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,SchoolHoliday_0,SchoolHoliday_1,...,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7,Store_target,WeekofYear_target,Day_target
0,7195,1,1970,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,6843.828979,7760.831193,6798.600823
1,9525,1,4880,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,8365.822581,7760.831193,6798.600823
2,5621,1,11120,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,4349.10984,7760.831193,6798.600823


# Modeling

In [63]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor

In [46]:
# Split each engineered frame into its predictors and the 'Sales' target.

# training set: predictors (X) and target (y)
X_train = train_eng.drop(columns='Sales')
y_train = train_eng['Sales']

# holdout set, used as the test set: predictors (X) and target (y)
X_test = holdout_eng.drop(columns='Sales')
y_test = holdout_eng['Sales']

# Decision tree

In [47]:
# Baseline: decision tree with default hyper-parameters.
# random_state is pinned (42, the same seed the XGBoost cells use) because the
# tree permutes the features at every split; without a fixed seed the fitted
# tree, and therefore the reported error, can differ from run to run.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

DecisionTreeRegressor()

In [52]:
# predict Sales for the holdout features with the fitted decision tree
y_pred = dt_model.predict(X_test)

In [53]:
# score the decision-tree predictions against the true holdout Sales
# NOTE(review): metric() comes from utils; presumably an error score where lower is better - confirm
metric(np.array(y_pred), np.array(y_test))

29.088800863266727

# XGBoost

In [56]:
# Baseline XGBoost: default hyper-parameters with a fixed seed, fitted on the
# current X_train / y_train and used to predict the holdout features.
xgb_base = XGBRegressor(random_state=42).fit(X_train, y_train)
pred = xgb_base.predict(X_test)

In [58]:
metric(np.array(pred), np.array(y_test))

23.923788580096033

In [59]:
# Features left out of the second XGBoost run; listed once so the train and
# holdout matrices cannot drift apart.
dropped_features = [
    'StateHoliday_a',
    'StateHoliday_b',
    'StateHoliday_c',
    'PromoInterval_No Promo',
    'SchoolHoliday_1',
]

# Rebuild both matrices from the engineered frames instead of dropping from
# X_train / X_test themselves. Dropping in place made the cell non-idempotent:
# a second run raised KeyError because the columns were already gone.
# The result on a first run is unchanged (engineered frame minus 'Sales' minus
# the listed features), and a missing or misspelled column still raises.
X_train = train_eng.drop(columns=['Sales'] + dropped_features)
X_test = holdout_eng.drop(columns=['Sales'] + dropped_features)

In [60]:
# XGBoost on the reduced feature set: same defaults and seed as the baseline,
# fitted on the current X_train / y_train and used to predict the holdout features.
xgb_sel = XGBRegressor(random_state=42).fit(X_train, y_train)
pred = xgb_sel.predict(X_test)

In [61]:
# score the reduced-feature XGBoost predictions against the true holdout Sales
# NOTE(review): metric() comes from utils; presumably an error score where lower is better - confirm
metric(np.array(pred), np.array(y_test))

24.205623429260886

# Random Forest

In [64]:
# Random forest with default hyper-parameters on the current X_train / y_train.
# The forest draws bootstrap samples and randomises feature selection, so the
# seed is pinned (42, the same seed the XGBoost cells use) to make the fitted
# model and its reported error reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

RandomForestRegressor()

In [66]:
# predict Sales for the holdout features with the fitted random forest
y_pred = rf.predict(X_test)

In [67]:
# score the random-forest predictions against the true holdout Sales
# NOTE(review): metric() comes from utils; presumably an error score where lower is better - confirm
metric(np.array(y_pred), np.array(y_test))

25.596893670443926

# XGBoost Regressor tuned

In [None]:
    # End-to-end run for the tuned XGBoost model: clean and engineer the
    # training data, split it, then score the saved regressor on the holdout set.
    rossman_cleaned = clean_rossman(
        csv_tseries='train.csv',
        csv_store='store.csv',
        data_folder='data',
    )
    rossman_featured, te_store, te_week, te_day = feat_eng(rossman_cleaned)

    X_train, X_val, y_train, y_val = split_rossmann_feat(rossman_featured)

    xgb_file_name = "xgb_reg.pkl"
    # Training is switched off below, so the split above is currently unused.
    # NOTE(review): predict_xgboost presumably loads a model that was saved
    # earlier under xgb_file_name - confirm in utils before a fresh run.
    # score_val = train_xgboost(X_train, X_val, y_train, y_val, xgb_file_name)
    # print(f"XGBRegressor score val (error) = {score_val:.1f}%")

    rossman_cleaned_test = clean_rossman(
        csv_tseries='holdout.csv',
        csv_store='store.csv',
        data_folder='data',
    )
    # Reuse the te_* objects returned by the training pass for the holdout data.
    rossman_featured_test, te_store, te_week, te_day = feat_eng(
        rossman_cleaned_test,
        train_data=False,
        te_store=te_store,
        te_week=te_week,
        te_day=te_day,
    )

    # holdout predictors and target
    X_test = rossman_featured_test.drop(columns="Sales")
    y_test = rossman_featured_test["Sales"]

    score_pred = predict_xgboost(X_test, y_test, xgb_file_name)
    print(f"XGBRegressor score holdout (error) = {score_pred:.1f}%")
