In [133]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Intro**

The aim of this project is to demonstrate a simple model for forecasting upcoming sales. There are a huge number of ways to deal with this, including framing this as either a classification or regression problem, and also a range of models one can use. In this case, we will read in our data, and compile a problem statement, workflow and model, as if in a real world business use case. As always, we first import the relevant libraries, read in our data and do a basic overview analysis.

This Kaggle dataset features 2 restaurants and there are 4 files in total. In a normal business setting we would be using SQL, in a relational database, but for this notebook, I will carry out the same operations in python. For ease, I am going to only only forecast Restaurant 2.


In [134]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error
from xgboost import XGBRegressor

In [135]:
orders = pd.read_csv("../input/19560-indian-takeaway-orders/restaurant-2-orders.csv")
products = pd.read_csv("../input/19560-indian-takeaway-orders/restaurant-2-products-price.csv")
orders

In [136]:
products.head(10)
#from this we can see that we didn't actually need the products data. It is already joined into the orders database
del products

In [137]:
#drop nan values and convert to datetime
orders = orders.dropna()
orders['Order Date'] = pd.to_datetime(orders['Order Date'], dayfirst = True)


In [138]:
#check how bad the time gaps are in the data
#first we group orders by day, summing total products
time_grouped = orders[['Order Date', 'Quantity']].resample('D', on='Order Date').sum().reset_index()
time_grouped 


In [139]:
#now we make a list of xticks
times = pd.date_range(start='2016-06-10',end='2019-08-03')
#and plot
plt.figure(figsize = (40,10))
sns.scatterplot(x = time_grouped['Order Date'], y = time_grouped['Quantity'])
plt.xticks(times)
plt.plot

In [140]:
#The plot shows generally few gaps for the latter portion, though there are 2 flats early on.
#Lets zoom in on this part of the graph
#and plot
plt.figure(figsize = (20,20))
sns.scatterplot(x = time_grouped['Order Date'].iloc[200:525], y = time_grouped['Quantity'].iloc[200:525])
plt.plot
#we can see that from 08/2016 is when orders become consistent, so these the values we will use for the model
orders = orders.loc[orders['Order Date'] >='2016-08-01' ]

In [141]:
#Groupby items to see what's popular
grouped = orders.groupby('Item Name').sum().sort_values(by = 'Total products', ascending = 0)
grouped
#we can see there are 337 unique products, with some much more popular than others

In [142]:
#Investigating average order volume by periods
print("Daily:\n", orders.groupby([pd.Grouper(key='Order Date', freq='D')])['Quantity'].sum().mean())
print("Weekly:\n", orders.groupby([pd.Grouper(key='Order Date', freq='W-MON')])['Quantity'].sum().mean())
print("Monthly:\n", orders.groupby([pd.Grouper(key='Order Date', freq='M')])['Quantity'].sum().mean())

**1. Define Problem Statement**


In this case, let’s say we are the owner of the takeaway business. Why would forecasting be useful? In this case, it may be helpful to know how may orders we will receive in total, so that we can get adequate staffing in place, without overpaying for staff unnecessarily. It may also be worth knowing what the sales of major items are likely to be, to ensure these do not run out.

So to address this case, the best method is to model as a regression problem. In the real world the likely situation would be a periodically run model or models, predicting x time period(s) ahead, which can be retrained/updated as needed either manually, or through an automated process. For this notebook, we will show the first step, which is a trained model ready to deploy. The process could then be manually updated/optimised monthly.

*** Based on the simple EDA above, the daily volume is too small and the monthly too large,  so we will create a model which predicts Total Sales Volume for the next week, as well as next week's sales volume for Bombay Aloo separately, as this is the most popular item which is not a side/condiment**


First we orgainise our table by date, drop unnecessary columns, and feature engineer, as well as creating labels

In [154]:
#create relevant Database df1 for total and df2 for bombay aloo
df = orders[['Order Date', 'Quantity']]
df2 = orders[orders['Item Name'] == 'Bombay Aloo']
df2 = df2[['Order Date', 'Quantity']]
df = df.groupby([pd.Grouper(key='Order Date', freq='W-MON')])['Quantity'].sum().reset_index().sort_values('Order Date')
df2 = df2.groupby([pd.Grouper(key='Order Date', freq='W-MON')])['Quantity'].sum().reset_index().sort_values('Order Date')
#Add Seasonality features
df['Week'] = df['Order Date'].dt.isocalendar().week
df['Month'] = df['Order Date'].dt.month
df2['Week'] = df2['Order Date'].dt.isocalendar().week
df2['Month'] = df2['Order Date'].dt.month
#Add past volume features
for i in range (1,15):
    label = "Quantity_" + str(i)
    df[label] = df['Quantity'].shift(i)
    df2[label] = df2['Quantity'].shift(i)
    label = "Average_" + str(i)
    df[label] = df['Quantity'].rolling(i).mean()
    df2[label] = df2['Quantity'].rolling(i).mean()
df = df.dropna()
df2 = df2.dropna()

In [155]:
#one hot encode df using pandas get_dummies
for column in ['Week','Month']:
    tempdf = pd.get_dummies(df[column], prefix=column)
    df = pd.merge(
        left=df,
        right=tempdf,
        left_index=True,
        right_index=True,
    )
    df = df.drop(columns=column)
df.shape

In [156]:
#one hot encode df2 using pandas get_dummies
for column in ['Week','Month']:
    tempdf = pd.get_dummies(df2[column], prefix=column)
    df2 = pd.merge(
        left=df2,
        right=tempdf,
        left_index=True,
        right_index=True,
    )
    df2 = df2.drop(columns=column)
df.shape

In [157]:
df2.head()

In [None]:
#143 rows so we split the data up to  row 107 for train and test sets for df
train = df[:107].drop('Order Date', axis = 1)
test = df[107:].drop('Order Date', axis = 1)
xtrain = train.drop(['Quantity'], axis = 1)
xtest = test.drop(['Quantity'], axis = 1)
ytrain = train['Quantity']
ytest =test['Quantity']

In [160]:
#143 rows so we split the data up to  row 107 for train and test sets for df2
train2 = df2[:107].drop('Order Date', axis = 1)
test2 = df2[107:].drop('Order Date', axis = 1)
xtrain2 = train2.drop(['Quantity'], axis = 1)
xtest2 = test2.drop(['Quantity'], axis = 1)
ytrain2 = train2['Quantity']
ytest2 =test2['Quantity']

**2. Build Model**

Now we build a model. There are various models we could use including the (S)ARIMA(X) models, and FBProphet, as well as a LSTM network in Keras. Because of the small dataset size and poor quality, and the sparsity of the data, plus the 2018 sales drop due to the global financial crisis, nothing is really going to be any good at making predictions. But in the abscence of more data length, and more explanatory variables, I have chosen XGBoost (it can handle sparcity without needing to convert to CSR and can handle multiple regressors).

We could run this model and save the results through a loop, to run it across multipel epochs for further into the future predictions. But as per our problem statement, we are going to focus on just the next week at a time (in a real life situation, as the busniess owner, we may run this at the end of each week, to get orders and staff prepared over the weekend for next week).

In [163]:
#Model for df
model = XGBRegressor(n_estimators=500, learning_rate=0.01)
eval_set = [(xtrain, ytrain)]
model.fit(xtrain, ytrain, eval_metric="rmsle", eval_set=eval_set, early_stopping_rounds=20, verbose=False)
ypred = model.predict(xtest)

In [164]:
#Model for df2
model2 = XGBRegressor(n_estimators=500, learning_rate=0.01)
eval_set = [(xtrain2, ytrain2)]
model2.fit(xtrain2, ytrain2, eval_metric="rmsle", eval_set=eval_set, early_stopping_rounds=20, verbose=False)
ypred2 = model2.predict(xtest2)

**3. Evaluation**

Finally, I evaluate both the models: both on graphs, and then using the relevant metrics. We optmised scoring for root mean square log error as it standardises for when a quantity is already high and thus the absolute percentage change is low

In [166]:
#First we add the results to our original dataframe, after first aligning the indexes

#df
ypred = pd.Series(ypred)
eval_df = df[107:].reset_index(drop = True)
eval_df['ypred'] = ypred
eval_df = eval_df[['Order Date','Quantity', 'ypred']]
eval_df.head()

#df2
ypred2 = pd.Series(ypred2)
eval_df2 = df2[107:].reset_index(drop = True)
eval_df2['ypred'] = round(ypred2)
eval_df2 = eval_df2[['Order Date','Quantity', 'ypred']]
eval_df2.head()

In [168]:
#And Now we plot the results of the train vs test sets
#df
plt.figure(figsize = (20,8))
plt.plot(eval_df['Order Date'], eval_df['Quantity'], label = "Actual Quanitity")
plt.plot(eval_df['Order Date'], eval_df['ypred'], color = 'red', label = 'Predicted Quantity')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.legend()
plt.title('Total Sales')

#df2
plt.figure(figsize = (20,8))
plt.plot(eval_df2['Order Date'], eval_df2['Quantity'], label = "Actual Quanitity")
plt.plot(eval_df2['Order Date'], eval_df2['ypred'], color = 'red', label = 'Predicted Quantity')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.legend()
plt.title('Bombay Aloo Sales')


In [171]:
#Lastly metrics mean_absolute_error, r2_score, mean_squared_log_error
#df
print("Metrics for Total Sale\n")
print("Mean Absolute Error:\n", mean_absolute_error(ytest, ypred))
print("R Squared:\n", r2_score(ytest, ypred))
print("Mean Squared Log Error:\n", mean_squared_log_error(ytest, ypred))

#df2
print("\n")
print("Metrics for Bombay Aloo Sales\n")
print("Mean Absolute Error:\n", mean_absolute_error(ytest2, ypred2))
print("R Squared:\n", r2_score(ytest2, ypred2))
print("Mean Squared Log Error:\n", mean_squared_log_error(ytest2, ypred2))

**4. Conclusion**



The clearest metric is the RMSE. This shows clearly that the model for Total sales worked really quite well. The Bombay Aloo model however was less successful, most likely due to the far smaller quantity per week.

So the Total sales model looks like it would be good to start deploying, but perhaps the Bombay Aloo model should be rethought/scrapped, or alternatively changed to a monthly model, which may improve accuracy (though may not be as useful to the business)

As always, improvement could be had with cross validation, more data, hyperparameter optimisation, and possibly tryign some of the other models mentioned above)