### Rohlik Orders Forecasting Challenge
#### Use historical data to predict customer orders


### Overview
Rohlik Group, a leading European e-grocery innovator, is revolutionising the food retail industry. We operate across 11 warehouses in Czech Republic, Germany, Austria, Hungary, and Romania.
Our challenge focuses on predicting the number of orders (grocery deliveries) at selected warehouses for the next 60 days.

In [27]:
# for analysis and data processing
import pandas as pd
import numpy as np
import seaborn as sns
import ppscore as pps
import datetime as dt
from datetime import date
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.preprocessing import StandardScaler
# for modelling
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score,classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, StratifiedShuffleSplit
#from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
import plotly.express as px
import math
import warnings
warnings.filterwarnings("ignore")

In [8]:
def read_data(path_to_csv_file):
    '''
    Reads csv files from specified paths

    Parameters:
            path to csv file locations
    Returns:
            dataframes of imported csv files
    '''
    # Read CSV files
    df = pd.read_csv(path_to_csv_file)
    
    return df

# call the function to import train data - update with relevant paths
df_train = read_data('/Users/mncedisimncwabe/Downloads/rohlik-orders-forecasting-challenge/train.csv')
df_test = read_data('/Users/mncedisimncwabe/Downloads/rohlik-orders-forecasting-challenge/test.csv')
train_calendar = read_data('/Users/mncedisimncwabe/Downloads/rohlik-orders-forecasting-challenge/train_calendar.csv')
test_calendar = read_data('/Users/mncedisimncwabe/Downloads/rohlik-orders-forecasting-challenge/test_calendar.csv')


In [9]:
df_train['set'] = 'train'
df_test['set'] = 'test'
df = pd.concat([df_train, df_test])

In [10]:
df["date"] = pd.to_datetime(df["date"])
df["day"] = df["date"].dt.strftime("%A")
df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year
df["is_weekend"] = "No"
df.loc[(df["day"] == "Saturday"), "is_weekend"] = "Yes"
df.loc[(df["day"] == "Sunday"), "is_weekend"] = "Yes"
df["time_since"] = (df["date"]-df["date"].min()).dt.days
df['cum_time_since'] = df['time_since'].cumsum()
df['week_of_year'] = df["date"].apply(lambda x: x.weekofyear)
df['quarter'] = df["date"].apply(lambda x: x.quarter)
df["month"] = df["month"].astype(str)
df["week_of_year"] = df["week_of_year"].astype(int)
df["quarter"] = df["quarter"].astype(str)
df["month_start"] = df["date"].dt.is_month_start.astype(int)
df["month_end"] = df["date"].dt.is_month_end.astype(int)

In [11]:
def add_season(num):
    if num in [3 ,4, 5]:
        return "Spring"
    elif num in [6, 7, 8]:
        return "Summer"
    elif num in [9, 10, 11]:
        return "Fall"
    else:
        return "winter"
    
df["season"] = df["month"].apply(add_season)

In [12]:
## fill missing value in holiday name with no holiday 
df['holiday_name'] = df['holiday_name'].fillna('no_holiday')

In [13]:
## Fill in the missing values ​​in snow feature with the average value of the month to which the value belongs.
snow_dict = df.groupby('month')['snow'].mean()

def fill_missing_snow(row):
    if pd.isna(row['snow']):
        return snow_dict[row['month']]
    else:
        return row['snow']
        
df['snow'] = df.apply(fill_missing_snow, axis=1)

In [14]:
## Fill in the missing values ​​in precipitation feature with the average value of the month to which the value belongs.

precipitation_dict = df.groupby('month')['precipitation'].mean()

def fill_missing_precipitation(row):
    if pd.isna(row['precipitation']):
        return precipitation_dict[row['month']]
    else:
        return row['precipitation']
        
df['precipitation'] = df.apply(fill_missing_precipitation, axis=1)

In [15]:
## note: i try use this feature by do shifting but i dont get good score with this way
cols_to_lags = [col for col in df.columns if df[col].isnull().sum()>0]
cols_to_lags.remove('orders')
df.drop(cols_to_lags, axis=1, inplace=True)

In [16]:
df.reset_index(inplace=True, drop=True)

In [17]:
df.head()

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,precipitation,snow,...,month,year,is_weekend,time_since,cum_time_since,week_of_year,quarter,month_start,month_end,season
0,Prague_1,2020-12-05,6895.0,no_holiday,0,0,0,0,0.0,0.0,...,12,2020,Yes,0,0,49,4,0,0,winter
1,Prague_1,2020-12-06,6584.0,no_holiday,0,0,0,0,0.0,0.0,...,12,2020,Yes,1,1,49,4,0,0,winter
2,Prague_1,2020-12-07,7030.0,no_holiday,0,0,0,0,0.0,0.0,...,12,2020,No,2,3,50,4,0,0,winter
3,Prague_1,2020-12-08,6550.0,no_holiday,0,0,0,0,0.8,0.0,...,12,2020,No,3,6,50,4,0,0,winter
4,Prague_1,2020-12-09,6910.0,no_holiday,0,0,0,0,0.5,0.0,...,12,2020,No,4,10,50,4,0,0,winter


In [18]:
df_new = df.copy()

In [19]:
one_hot_cols = ['holiday_name', 'is_weekend', 'day', 'year', 'season', 'quarter'] 

encoded = pd.get_dummies(df_new[one_hot_cols], drop_first=True)
df_new = pd.concat([df_new, encoded], axis=1)
df_new.drop(columns=one_hot_cols, inplace=True) 

In [20]:
df_new.drop('month', axis=1, inplace=True) # delete the month give us a better score 

In [22]:
submission = pd.DataFrame({'id': [], 'orders':[], 'warehouse':[],'date':[]})
models = {}
MAPE_scores = {}

for warehouse in df['warehouse'].unique():
    train = df_new[(df_new['set']=='train') & (df_new['warehouse']==warehouse) & (df_new['date']<='2024-01-15')]
    val = df_new[(df_new['set']=='train') & (df_new['warehouse']==warehouse) & (df_new['date']>'2024-01-15')]
    test = df_new[(df_new['set']=='test') & (df_new['warehouse']==warehouse)]
    # split data to X and Y
    X_train = train.drop(['orders', 'set', 'id', 'warehouse', 'date'], axis=1)
    y_train = train['orders']
    X_val = val.drop(['orders', 'set', 'id', 'warehouse', 'date'], axis=1)
    y_val = val['orders']
    X_test = test.drop(['orders', 'set', 'id', 'warehouse', 'date'], axis=1)
    
    # fit model
    model = CatBoostRegressor(verbose=0)
    #model = XGBRegressor()
    model.fit(X_train, y_train)
    # predict y_val_pre 
    y_val_pre = model.predict(X_val)

    # save and print score
    MAPE_scores[warehouse]= mean_absolute_percentage_error(y_val , y_val_pre)
    print(f'{warehouse}: {MAPE_scores[warehouse]}')

    # fit model with all train data
    train_full = df_new[(df_new['set']=='train') & (df_new['warehouse']==warehouse)]
    X_full_train = train_full.drop(['orders', 'set', 'id', 'warehouse', 'date'], axis=1)
    y_full_train = train_full['orders']
    model.fit(X_full_train, y_full_train)
    
    # predict y_test for submission
    preds = model.predict(X_test)
    sub = pd.DataFrame({'id': test['id'].values, 'orders':preds, 'warehouse': warehouse, 'date':test['date'].values})
    submission = pd.concat([submission, sub])
    models[warehouse] = model


Prague_1: 0.04382982401522946
Brno_1: 0.03862986803796035
Prague_2: 0.037411500910134524
Prague_3: 0.04339211230981668
Munich_1: 0.1075922698511162
Frankfurt_1: 0.044378538356116166
Budapest_1: 0.023429042460005253


In [23]:
# public submission score is 0.0468
print(f'\nthe mean of score is {np.mean(np.array(list(MAPE_scores.values())))}')


the mean of score is 0.04838045084862552


In [24]:
submission.head()

Unnamed: 0,id,orders,warehouse,date
0,Prague_1_2024-03-16,10735.828485,Prague_1,2024-03-16
1,Prague_1_2024-03-17,10626.529895,Prague_1,2024-03-17
2,Prague_1_2024-03-18,10156.709556,Prague_1,2024-03-18
3,Prague_1_2024-03-19,9921.026605,Prague_1,2024-03-19
4,Prague_1_2024-03-20,9872.255836,Prague_1,2024-03-20


In [25]:
submission.shape

(397, 4)

In [26]:
submission.reset_index(drop=True, inplace=True)
submission[['id', 'orders']].to_csv('/Users/mncedisimncwabe/Downloads/order_predictions.csv',index=False)