### Importação das bibliotecas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import lightgbm as lgb
import os

from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from functools import reduce
from bizdays import Calendar

from functions import forecast_comparisson, extract_date_features, add_lags

CAL = Calendar.load('ANBIMA')

In [2]:
jan = pd.read_csv(r"C:\Users\Yamac\OneDrive\Documentos\Programação\Trampo\Forecasting\Forecasting-techniques\Datasets\Sales_January_2019.csv")
jan.head(2)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,141234,iPhone,1,700.0,01/22/19 21:25,"944 Walnut St, Boston, MA 02215"
1,141235,Lightning Charging Cable,1,14.95,01/28/19 14:15,"185 Maple St, Portland, OR 97035"


### 1. Reading and preparing the data

* The information is split by month in different spreadsheets
* As all the spreadsheets follow the same structure, I'll just loop through them and concatenate them all

In [3]:
folderpath = r"C:\Users\Yamac\OneDrive\Documentos\Programação\Trampo\Forecasting\Forecasting-techniques\Datasets"
datasets = []

for file in os.listdir(folderpath):
    dataset = pd.read_csv(os.path.join(folderpath, file))
    dataset = dataset[dataset['Order ID'] != 'Order ID']
    datasets.append(dataset)

In [4]:
df = pd.concat(datasets, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558.0,USB-C Charging Cable,2.0,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
1,,,,,,
2,176559.0,Bose SoundSport Headphones,1.0,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560.0,Google Phone,1.0,600.0,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560.0,Wired Headphones,1.0,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [5]:
df.shape

(186495, 6)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186495 entries, 0 to 186494
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Order ID          185950 non-null  object
 1   Product           185950 non-null  object
 2   Quantity Ordered  185950 non-null  object
 3   Price Each        185950 non-null  object
 4   Order Date        185950 non-null  object
 5   Purchase Address  185950 non-null  object
dtypes: object(6)
memory usage: 8.5+ MB


* To identify the month to which the sales refert to, it is necessary to change the format of the "Order Date" column
* Also gonna split the time information in another column

In [5]:
# Convert Order Data to datetime
df['Order Date'] = pd.to_datetime(df['Order Date'])

# Drop null values
df.dropna(inplace=True)

# Get the hour value from Order Date
df['Hour'] = df['Order Date'].dt.hour
# Remove hour from Ordeer Date
df['Order Date'] = df['Order Date'].dt.date
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Hour
0,176558,USB-C Charging Cable,2,11.95,2019-04-19,"917 1st St, Dallas, TX 75001",8
2,176559,Bose SoundSport Headphones,1,99.99,2019-04-07,"682 Chestnut St, Boston, MA 02215",22
3,176560,Google Phone,1,600.0,2019-04-12,"669 Spruce St, Los Angeles, CA 90001",14
4,176560,Wired Headphones,1,11.99,2019-04-12,"669 Spruce St, Los Angeles, CA 90001",14
5,176561,Wired Headphones,1,11.99,2019-04-30,"333 8th St, Los Angeles, CA 90001",9


In [6]:
# Sort dataframe cronologically
df.sort_values(by='Order Date', inplace=True)

In [7]:
# Change the column names
df.rename(columns={'Quantity Ordered':'Amount', 'Price Each':'Price', 'Order Date':'Date', 'Purchase Address':'Address'}, inplace=True)

In [8]:
df['Hour'] = pd.to_datetime(df['Hour'], format='%H').dt.time
df['Date'] = pd.to_datetime(df['Date'])

In [9]:
# Create columns for city and state
df['City'] = df['Address'].apply(lambda x: x.split(',')[1])
df['State'] = df['Address'].apply(lambda x: x.split(',')[2].split(' ')[1])
df['Price'] = df['Price'].astype('float').apply(lambda x: round(x, 2))

In [10]:
df.head(2)

Unnamed: 0,Order ID,Product,Amount,Price,Date,Address,Hour,City,State
68289,142066,27in 4K Gaming Monitor,1,389.99,2019-01-01,"110 Dogwood St, Seattle, WA 98101",22:00:00,Seattle,WA
76162,149579,Macbook Pro Laptop,1,1700.0,2019-01-01,"61 Lakeview St, Dallas, TX 75001",10:00:00,Dallas,TX


In [11]:
df['Amount'] = df['Amount'].astype('int')
df['Price'] = df['Price'].astype('float')

### 2. Data viz and exploration

In [15]:
daily_sales = pd.DataFrame(df.groupby('Date')['Amount'].sum())
px.line(daily_sales, x=daily_sales.index, y='Amount', title='Daily Sales')

In [16]:
daily_revenue = pd.DataFrame(df.groupby('Date')['Price'].sum())
px.line(daily_revenue, x=daily_revenue.index, y='Price', title='Daily Revenue')

In [17]:
hyped_hours = pd.DataFrame(df.groupby('Hour')['Amount'].sum())
px.bar(hyped_hours, x=hyped_hours.index, y='Amount', title='Periods of the day with more sales')

In [18]:
# Number of unique products
df['Product'].unique()

array(['27in 4K Gaming Monitor', 'Macbook Pro Laptop', 'ThinkPad Laptop',
       'Flatscreen TV', 'USB-C Charging Cable', '34in Ultrawide Monitor',
       'Lightning Charging Cable', 'Bose SoundSport Headphones',
       'LG Dryer', 'AA Batteries (4-pack)', 'Apple Airpods Headphones',
       'Wired Headphones', 'AAA Batteries (4-pack)', '27in FHD Monitor',
       'iPhone', '20in Monitor', 'Google Phone', 'Vareebadd Phone',
       'LG Washing Machine'], dtype=object)

* Kinda weird, but in fact, there are only 19 different products
* That allows us to more easily visualliase the sales of each product

In [19]:
product_sales_amount = pd.DataFrame(df.groupby('Product')['Amount'].sum()).sort_values('Amount', ascending=True)
px.bar(product_sales_amount, x='Amount', y=product_sales_amount.index, orientation='h', title='Amount sold by each product', text='Amount')

In [20]:
product_sales_rev = pd.DataFrame(df.groupby('Product')['Price'].sum()).sort_values('Price', ascending=True)
product_sales_rev['Price'] = product_sales_rev['Price'].apply(lambda x: round(x, 2))
px.bar(product_sales_rev, x='Price', y=product_sales_rev.index, orientation='h', title='Revenue by each product', text='Price')	

In [21]:
sales_city = pd.DataFrame(df.groupby('City')[['Amount', 'Price']].sum()).sort_values('Amount', ascending=True)
px.bar(sales_city, x='Amount', y=sales_city.index, orientation='h', title='Amount sold by each city', text='Amount')

In [22]:
sales_state = pd.DataFrame(df.groupby('State')[['Amount', 'Price']].sum()).sort_values('Amount', ascending=True)
px.bar(sales_state, x='Amount', y=sales_state.index, orientation='h', title='Amount sold by each state', text='Amount')

##### Insights
* When comparing the amount sold and revenue, we see that the best selling products are the ones that generate the least amount of revenue
* It is relevant to have an idea of the difference in the price of the products, because later on we want to calculate an error function based on loss
* There is a clear preference in the purchase times, which are higher at lunchtime and early evening and lower at dawn, early morning and mid-afternoon, which are times when people are sleeping or working. Knowing the times of greatest consumption can help to better direct notifications and promotions

## Forecasting models

* Here we start the sales forecasting work
* The idea is to try to predict how the sale of each product will be in the next 7 days

#### Naive models

* Naive models will serve as plain forms of forecasting
* The idea is to serve as a benchmark for more sophisticated models

#### 1. Last day based forecasting

* Here, we just assume the sales for the next days will be equal to the previous day

In [23]:
df.head()

Unnamed: 0,Order ID,Product,Amount,Price,Date,Address,Hour,City,State
68289,142066,27in 4K Gaming Monitor,1,389.99,2019-01-01,"110 Dogwood St, Seattle, WA 98101",22:00:00,Seattle,WA
76162,149579,Macbook Pro Laptop,1,1700.0,2019-01-01,"61 Lakeview St, Dallas, TX 75001",10:00:00,Dallas,TX
69491,143202,ThinkPad Laptop,1,999.99,2019-01-01,"129 Walnut St, Los Angeles, CA 90001",18:00:00,Los Angeles,CA
74481,147963,Flatscreen TV,1,300.0,2019-01-01,"655 Meadow St, Austin, TX 73301",11:00:00,Austin,TX
73301,146844,Flatscreen TV,1,300.0,2019-01-01,"593 Church St, New York City, NY 10001",11:00:00,New York City,NY


In [None]:
forecasts_last_day = []

# We're using a loop to forecast each product individually
for product in df['Product'].unique():
    sales_filtered_last = df[df['Product'] == product].sort_values(by='Date', ascending=True)
    sales_filtered_last = sales_filtered_last[['Product', 'Date', 'Amount']]
    sales_filtered_last = sales_filtered_last.groupby('Date').sum()
    sales_filtered_last['Product'] = product
    sales_filtered_last = sales_filtered_last.resample('D').ffill().fillna(0)  # Fill missing dates with 0

    train_last = sales_filtered_last.iloc[:-7] # Split in train and test by date -> in this case we want to use the last 30 days to test
    test_last = sales_filtered_last.iloc[-7:]
    
    train_last['Forecast'] = train_last['Amount'].shift(1)  # Shift the amount of the previous day to the next day
    test_last['Forecast'] = train_last['Forecast'].iloc[-1]  # As the test won't have the real last day sales, we're filling it with the last 
                                                             # predicted value
    forecasts_last_day.append(train_last)
    forecasts_last_day.append(test_last)

last_day = pd.concat(forecasts_last_day)
last_day['Forecast'].fillna(0, inplace=True)

In [24]:
# Forecasts for the next 30 days
last_day.tail()

Unnamed: 0_level_0,Amount,Product,Forecast
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-12-27,1,LG Washing Machine,2.0
2019-12-28,3,LG Washing Machine,2.0
2019-12-29,4,LG Washing Machine,2.0
2019-12-30,3,LG Washing Machine,2.0
2019-12-31,5,LG Washing Machine,2.0


In [150]:
# Evaluation metrics for the last 30 days for each product
eval_last_day = pd.DataFrame(last_day[last_day.index >= last_day.index.max() - pd.Timedelta(days=6)].groupby('Product')[['Amount', 'Forecast']].sum())
eval_last_day['MAE'] = mean_absolute_error(eval_last_day['Amount'], eval_last_day['Forecast'])
eval_last_day['MAPE'] = mean_absolute_percentage_error(eval_last_day['Amount'], eval_last_day['Forecast'])
eval_last_day['MPE'] = eval_last_day['MAPE'] / 100
eval_last_day['RMSE'] = mean_squared_error(eval_last_day['Amount'], eval_last_day['Forecast'], squared=False)
eval_last_day

Unnamed: 0_level_0,Amount,Forecast,MAE,MAPE,MPE,RMSE
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20in Monitor,126,126.0,46.842105,0.21332,0.002133,66.093236
27in 4K Gaming Monitor,163,175.0,46.842105,0.21332,0.002133,66.093236
27in FHD Monitor,196,203.0,46.842105,0.21332,0.002133,66.093236
34in Ultrawide Monitor,144,186.0,46.842105,0.21332,0.002133,66.093236
AA Batteries (4-pack),756,684.0,46.842105,0.21332,0.002133,66.093236
AAA Batteries (4-pack),871,931.0,46.842105,0.21332,0.002133,66.093236
Apple Airpods Headphones,424,490.0,46.842105,0.21332,0.002133,66.093236
Bose SoundSport Headphones,341,385.0,46.842105,0.21332,0.002133,66.093236
Flatscreen TV,136,161.0,46.842105,0.21332,0.002133,66.093236
Google Phone,127,189.0,46.842105,0.21332,0.002133,66.093236


In [26]:
forecast_comparisson(last_day, product_col='Product', real_sales_col='Amount', forecast_col='Forecast')

#### 2. Rolling average based model forecasting

In [None]:
pred = []

for product in df['Product'].unique():
    sales_filtered = df[df['Product'] == product].sort_values(by='Date', ascending=True)
    sales_filtered = sales_filtered[['Product', 'Date', 'Amount']]
    sales_filtered = sales_filtered.groupby('Date').sum()
    sales_filtered['Product'] = product
    sales_filtered = sales_filtered.resample('D').ffill().fillna(0)

    train = sales_filtered.iloc[:-7] 
    window_size = 7
    windows = train['Amount'].rolling(window_size)  # Every group of 7 days
    moving_averages = windows.mean().tolist()  # Series with the mean of every window
    train = train.assign(Forecast=moving_averages)   
    train['Forecast'] = train['Forecast'].shift(1).fillna(0)

    test = pd.concat([train.tail(7), sales_filtered.iloc[-7:]])  # Test is the last 7 days plus the last 7 days of train to get the mean. 
                                                                 # After that, the mean will be calculated from the forecasted values

    for index, value in enumerate(test['Forecast']):
        if pd.isna(value):
            test['Forecast'].iloc[index] = test['Forecast'].iloc[max(0, index-7):index].mean()
    
    sales_pred = pd.concat([train[:-7], test])
    pred.append(sales_pred)

mov_avg = pd.concat(pred)
mov_avg['Forecast'] = mov_avg['Forecast'].apply(lambda x: int(x))

In [28]:
mov_avg.head(10)

Unnamed: 0_level_0,Amount,Product,Forecast
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,7,27in 4K Gaming Monitor,0
2019-01-02,14,27in 4K Gaming Monitor,0
2019-01-03,14,27in 4K Gaming Monitor,0
2019-01-04,8,27in 4K Gaming Monitor,0
2019-01-05,13,27in 4K Gaming Monitor,0
2019-01-06,8,27in 4K Gaming Monitor,0
2019-01-07,12,27in 4K Gaming Monitor,0
2019-01-08,12,27in 4K Gaming Monitor,10
2019-01-09,14,27in 4K Gaming Monitor,11
2019-01-10,12,27in 4K Gaming Monitor,11


In [148]:
eval_mov_avg = pd.DataFrame(mov_avg[mov_avg.index >= mov_avg.index.max() - pd.Timedelta(days=6)].groupby('Product')[['Amount', 'Forecast']].sum())
eval_mov_avg['MAPE'] = np.abs(np.mean((eval_mov_avg['Amount'] - eval_mov_avg['Forecast'])/eval_mov_avg['Amount'])*100)
eval_mov_avg['MAE'] = mean_absolute_error(eval_mov_avg['Amount'], eval_mov_avg['Forecast'])
eval_mov_avg['MPE'] = np.abs(np.mean((eval_mov_avg['Amount'] - eval_mov_avg['Forecast'])/eval_mov_avg['Amount'])*100)
eval_mov_avg['RMSE'] = np.sqrt(mean_squared_error(eval_mov_avg['Amount'], eval_mov_avg['Forecast']))    
eval_mov_avg

Unnamed: 0_level_0,Amount,Forecast,MAPE,MAE,MPE,RMSE
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20in Monitor,126,131,9.497157,42.684211,9.497157,64.424987
27in 4K Gaming Monitor,163,185,9.497157,42.684211,9.497157,64.424987
27in FHD Monitor,196,206,9.497157,42.684211,9.497157,64.424987
34in Ultrawide Monitor,144,156,9.497157,42.684211,9.497157,64.424987
AA Batteries (4-pack),756,734,9.497157,42.684211,9.497157,64.424987
AAA Batteries (4-pack),871,1008,9.497157,42.684211,9.497157,64.424987
Apple Airpods Headphones,424,458,9.497157,42.684211,9.497157,64.424987
Bose SoundSport Headphones,341,452,9.497157,42.684211,9.497157,64.424987
Flatscreen TV,136,140,9.497157,42.684211,9.497157,64.424987
Google Phone,127,161,9.497157,42.684211,9.497157,64.424987


In [149]:
forecast_comparisson(mov_avg, product_col='Product', real_sales_col='Amount', forecast_col='Forecast')

In [31]:
# Comparing the naive models
print('Mean MAE last day forecast: ', eval_last_day['MAE'].mean())
print('Mean MPE last day forecast: ', eval_last_day['MPE'].mean(), end='\n\n')
print('Mean MAE moving average forecast: ', eval_mov_avg['MAE'].mean())
print('Mean MPE moving average forecast: ', eval_mov_avg['MPE'].mean())

Mean MAE last day forecast:  11.43233082706767
Mean MPE last day forecast:  191.65103120086124

Mean MAE moving average forecast:  10.56015037593985
Mean MPE moving average forecast:  184.5927412832194


### Econometrics and ML models

#### 3. ARIMA (and variants)

In [32]:
df.head()

Unnamed: 0,Order ID,Product,Amount,Price,Date,Address,Hour,City,State
68289,142066,27in 4K Gaming Monitor,1,389.99,2019-01-01,"110 Dogwood St, Seattle, WA 98101",22:00:00,Seattle,WA
76162,149579,Macbook Pro Laptop,1,1700.0,2019-01-01,"61 Lakeview St, Dallas, TX 75001",10:00:00,Dallas,TX
69491,143202,ThinkPad Laptop,1,999.99,2019-01-01,"129 Walnut St, Los Angeles, CA 90001",18:00:00,Los Angeles,CA
74481,147963,Flatscreen TV,1,300.0,2019-01-01,"655 Meadow St, Austin, TX 73301",11:00:00,Austin,TX
73301,146844,Flatscreen TV,1,300.0,2019-01-01,"593 Church St, New York City, NY 10001",11:00:00,New York City,NY


In [None]:
forecast_arima = []

for product in tqdm(df['Product'].unique()):
    df_filtered = df[df['Product'] == product]
    df_filtered = pd.DataFrame(df_filtered.groupby('Date').agg({'Amount': 'sum', 'Price': 'mean'})).reset_index()
    df_filtered = extract_date_features(df_filtered, date_column='Date')  # Extracting date features
    df_filtered.set_index('Date', inplace=True)
    df_filtered['Product'] = product
    train_df = df_filtered[df_filtered.index <= df_filtered.index.max() - pd.Timedelta(days=6)]
    test_df = df_filtered[df_filtered.index >= df_filtered.index.max() - pd.Timedelta(days=6)]

    # Here we're using the auto_arima function to find the best parameters for the SARIMAX model
    arima_model = auto_arima(train_df['Amount'], exogenous=train_df[['Price', 'day_of_week', 'is_month_start', 'is_month_end']],         
                            max_order=None, max_p=5, max_q=5, max_d=1, m=1, max_P=4, max_Q=4, max_D=1, njobs=-1, stepwise=True, 
                            out_of_sample_size=7, scoring='mae', test='adf')
    arima_setup = SARIMAX(train_df['Amount'], order=arima_model.order)
    arima_fit = arima_setup.fit()
    predictions = arima_fit.forecast(steps=len(test_df), exogenous=test_df[['Price', 'day_of_week', 'is_month_start', 'is_month_end']])
    
    arima_preds = pd.DataFrame(predictions)
    arima_preds.columns = ['Forecast']
    arima_preds['Forecast'] = arima_preds['Forecast'].apply(lambda x: int(x))
    arima_preds['Actual_values'] = test_df['Amount'].values
    arima_preds['Product'] = product
    forecast_arima.append(arima_preds)

forecast_arima = pd.concat(forecast_arima)

In [174]:
test_df

Unnamed: 0_level_0,Amount,Price,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_leap_year,is_month_start,is_month_end,is_holiday,is_bizdays,Product
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-12-26,1,600.0,2019,12,26,3,360,52,4,False,False,False,False,True,LG Washing Machine
2019-12-28,3,600.0,2019,12,28,5,362,52,4,False,False,False,False,False,LG Washing Machine
2019-12-29,4,600.0,2019,12,29,6,363,52,4,False,False,False,False,False,LG Washing Machine
2019-12-30,3,600.0,2019,12,30,0,364,1,4,False,False,False,False,True,LG Washing Machine
2019-12-31,5,600.0,2019,12,31,1,365,1,4,False,False,True,False,True,LG Washing Machine


In [181]:
pd.DataFrame(forecast_arima.groupby('Product')[['Forecast', 'Actual_values']].sum())

Unnamed: 0_level_0,Forecast,Actual_values
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
20in Monitor,127,126
27in 4K Gaming Monitor,188,163
27in FHD Monitor,213,196
34in Ultrawide Monitor,203,179
AA Batteries (4-pack),798,869
AAA Batteries (4-pack),980,871
Apple Airpods Headphones,490,424
Bose SoundSport Headphones,406,341
Flatscreen TV,140,136
Google Phone,161,127


In [185]:
eval_arima = forecast_arima.groupby('Product')[['Forecast', 'Actual_values']].sum()
eval_arima['MAPE'] = np.abs(np.mean((eval_arima['Actual_values'] - eval_arima['Forecast'])/eval_arima['Actual_values'])*100)
eval_arima['MAE'] = mean_absolute_error(eval_arima['Actual_values'], eval_arima['Forecast'])
eval_arima['MPE'] = np.abs(np.mean((eval_arima['Actual_values'] - eval_arima['Forecast'])/eval_arima['Actual_values'])*100)
eval_arima['RMSE'] = np.sqrt(mean_squared_error(eval_arima['Actual_values'], eval_arima['Forecast']))
eval_arima

Unnamed: 0_level_0,Forecast,Actual_values,MAPE,MAE,MPE,RMSE
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20in Monitor,127,126,7.44695,41.842105,7.44695,56.633169
27in 4K Gaming Monitor,188,163,7.44695,41.842105,7.44695,56.633169
27in FHD Monitor,213,196,7.44695,41.842105,7.44695,56.633169
34in Ultrawide Monitor,203,179,7.44695,41.842105,7.44695,56.633169
AA Batteries (4-pack),798,869,7.44695,41.842105,7.44695,56.633169
AAA Batteries (4-pack),980,871,7.44695,41.842105,7.44695,56.633169
Apple Airpods Headphones,490,424,7.44695,41.842105,7.44695,56.633169
Bose SoundSport Headphones,406,341,7.44695,41.842105,7.44695,56.633169
Flatscreen TV,140,136,7.44695,41.842105,7.44695,56.633169
Google Phone,161,127,7.44695,41.842105,7.44695,56.633169


In [63]:
z = df[df['Product'] == 'USB-C Charging Cable']
px.line(x=z['Date'], y=z['Amount'])

In [64]:
# Checking for stationariety, so we can apply ARIMA
from statsmodels.tsa.stattools import adfuller
#z = pd.DataFrame(df.groupby('Date')['Amount'].sum())

result = adfuller(z['Amount'])
print('ADF statistics', result[0])
print('p-value', result[1])
print('Critical values', result[4])

if result[0] <= result[4]['5%']:
    print('Serie is stationary')
else:
    print('Serie is not stationary')

ADF statistics -149.67454686766402
p-value 0.0
Critical values {'1%': -3.4306486059072054, '5%': -2.861671973947546, '10%': -2.5668402460164885}
Serie is stationary


In [183]:
print('Mean MAE last day forecast: ', eval_last_day['MAE'].mean())
print('Mean MPE last day forecast: ', eval_last_day['MPE'].mean(), end='\n\n')
print('Mean MAE moving average forecast: ', eval_mov_avg['MAE'].mean())
print('Mean MPE moving average forecast: ', eval_mov_avg['MPE'].mean(), end='\n\n')
print('Mean MAE ARIMA forecast: ', eval_arima['MAE'].mean())
print('Mean MPE ARIMA forecast: ', eval_arima['MPE'].mean())

Mean MAE last day forecast:  46.8421052631579
Mean MPE last day forecast:  0.002133200496168675

Mean MAE moving average forecast:  42.684210526315795
Mean MPE moving average forecast:  9.49715744731524

Mean MAE ARIMA forecast:  10.419847328244275
Mean MPE ARIMA forecast:  192.98637955910368


### 4. LGBM

In [14]:
pd.set_option('display.max_columns', None)

In [58]:
df.head()

Unnamed: 0,Order ID,Product,Amount,Price,Date,Address,Hour,City,State
68289,142066,27in 4K Gaming Monitor,1,389.99,2019-01-01,"110 Dogwood St, Seattle, WA 98101",22:00:00,Seattle,WA
76162,149579,Macbook Pro Laptop,1,1700.0,2019-01-01,"61 Lakeview St, Dallas, TX 75001",10:00:00,Dallas,TX
69491,143202,ThinkPad Laptop,1,999.99,2019-01-01,"129 Walnut St, Los Angeles, CA 90001",18:00:00,Los Angeles,CA
74481,147963,Flatscreen TV,1,300.0,2019-01-01,"655 Meadow St, Austin, TX 73301",11:00:00,Austin,TX
73301,146844,Flatscreen TV,1,300.0,2019-01-01,"593 Church St, New York City, NY 10001",11:00:00,New York City,NY


In [63]:
df_filtered = df.drop(['Order ID', 'Address', 'Hour', 'City', 'State'], axis=1)
df_filtered = pd.DataFrame(df.groupby(['Product', 'Date']).agg({'Amount': 'sum', 'Price': 'mean'})).reset_index(level='Product')
df_filtered.reset_index(level='Date', inplace=True)
df_filtered = add_lags(df_filtered,lags=[1,2,3], date_column='Date', target_col='Amount')
df_filtered = extract_date_features(df_filtered, date_column='Date')
df_filtered.head()

Unnamed: 0,Date,Product,Amount,Price,Amount_lag_1,Amount_lag_2,Amount_lag_3,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_leap_year,is_month_start,is_month_end,is_holiday,is_bizdays
0,2019-01-01,20in Monitor,4,109.99,0.0,0.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False
2926,2019-01-01,Flatscreen TV,9,300.0,4.0,0.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False
6095,2019-01-01,Wired Headphones,31,11.99,9.0,4.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False
366,2019-01-01,27in 4K Gaming Monitor,7,389.99,31.0,9.0,4.0,2019,1,1,1,1,1,1,False,True,False,True,False
5732,2019-01-01,Vareebadd Phone,2,400.0,7.0,31.0,9.0,2019,1,1,1,1,1,1,False,True,False,True,False


In [64]:
next_7 = df_filtered.sort_values(by=['Product', 'Date'])
next_7 = pd.DataFrame(next_7.groupby(['Product', 'Date'])['Amount'].sum())
next_7['next'] = next_7['Amount'].rolling(window=7, min_periods=1).sum().shift(-7)
next_7.reset_index(level='Date', inplace=True)
next_7.reset_index(level='Product', inplace=True)

In [67]:
df_filtered = df_filtered.merge(next_7, on=['Product', 'Date'], how='left')
df_filtered.drop('Amount_x', axis=1, inplace=True)
df_filtered.rename(columns={'Amount_y':'Amount'}, inplace=True)
df_filtered['Product'] = df_filtered['Product'].astype('category')
df_filtered['next'].fillna(0, inplace=True)
df_filtered.set_index('Date', inplace=True)
df_filtered.drop('Amount', axis=1, inplace=True)
df_filtered['week_of_year'] = df_filtered['week_of_year'].astype('int')
df_filtered.head()

Unnamed: 0,Date,Product,Price,Amount_lag_1,Amount_lag_2,Amount_lag_3,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_leap_year,is_month_start,is_month_end,is_holiday,is_bizdays,Amount,next
0,2019-01-01,20in Monitor,109.99,0.0,0.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False,4,43.0
1,2019-01-01,Flatscreen TV,300.0,4.0,0.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False,9,43.0
2,2019-01-01,Wired Headphones,11.99,9.0,4.0,0.0,2019,1,1,1,1,1,1,False,True,False,True,False,31,228.0
3,2019-01-01,27in 4K Gaming Monitor,389.99,31.0,9.0,4.0,2019,1,1,1,1,1,1,False,True,False,True,False,7,81.0
4,2019-01-01,Vareebadd Phone,400.0,7.0,31.0,9.0,2019,1,1,1,1,1,1,False,True,False,True,False,2,19.0


In [68]:
param_grid = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric':'mape',
    'num_leaves': 50,
    'learning_rate': 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "max_depth": 10, 
    'min_split_gain':0.5,
    'force_row_wise': True,
    }

In [99]:
# Train test split
train = df_filtered[df_filtered.index <= df_filtered.index.max() - pd.Timedelta(days=8)]
test = df_filtered[df_filtered.index == df_filtered.index.max() - pd.Timedelta(days=7)]

x_train = train.drop('next', axis=1)
y_train = train['next']

x_test = test.drop('next', axis=1)
y_test = test[['Product', 'next']]

In [110]:
test.head()

Unnamed: 0_level_0,Product,Price,Amount_lag_1,Amount_lag_2,Amount_lag_3,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_leap_year,is_month_start,is_month_end,is_holiday,is_bizdays,next
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-12-25,Apple Airpods Headphones,150.0,0.0,0.0,0.0,2019,12,25,2,359,52,4,False,False,False,True,False,424.0
2019-12-25,27in FHD Monitor,149.99,70.0,0.0,0.0,2019,12,25,2,359,52,4,False,False,False,True,False,196.0
2019-12-25,Vareebadd Phone,400.0,37.0,70.0,0.0,2019,12,25,2,359,52,4,False,False,False,True,False,63.0
2019-12-25,34in Ultrawide Monitor,379.99,5.0,37.0,70.0,2019,12,25,2,359,52,4,False,False,False,True,False,177.0
2019-12-25,LG Dryer,600.0,35.0,5.0,37.0,2019,12,25,2,359,52,4,False,False,False,True,False,19.0


In [None]:
pred_lgbm = []

for product in df_filtered['Product'].unique():
    train = df_filtered[df_filtered.index <= df_filtered.index.max() - pd.Timedelta(days=8)]
    test = df_filtered[df_filtered.index == df_filtered.index.max() - pd.Timedelta(days=7)]

    train = train[train['Product'] == product]
    x_train = train.drop('next', axis=1)
    y_train = train['next']

    x_test = test.drop('next', axis=1)
    y_test = test[['Product', 'next']]
    d_train = lgb.Dataset(x_train, label=y_train)
    model = lgb.train(param_grid, d_train)

    x_test = x_test[x_test['Product'] == product]
    if x_test.shape[0] != 0:
        y_pred = pd.DataFrame(model.predict(x_test), columns=['Forecast'])
        y_pred['Product'] = product
        pred_lgbm.append(y_pred)
    else:
        pass
pred_lgbm = pd.concat(pred_lgbm)

In [130]:
pred_lgbm['Forecast'] = pred_lgbm['Forecast'].apply(lambda x: int(x))
pred_lgbm = pred_lgbm.merge(y_test.rename({'next':'Actual_values'}, axis=1), on='Product', how='left')
pred_lgbm.head(2)

Unnamed: 0,Forecast,Product,Actual_values
0,123,20in Monitor,126.0
1,146,Flatscreen TV,136.0


In [131]:
eval_lgbm = pred_lgbm.copy()
eval_lgbm['mape'] = np.abs((eval_lgbm['Forecast'] - eval_lgbm['Actual_values'])/eval_lgbm['Actual_values'])
eval_lgbm['mae'] = np.abs(eval_lgbm['Forecast'] - eval_lgbm['Actual_values'])
eval_lgbm['mpe'] = np.abs((eval_lgbm['Forecast'] - eval_lgbm['Actual_values'])/eval_lgbm['Actual_values'])*100
eval_lgbm['rmse'] = np.sqrt((eval_lgbm['Forecast'] - eval_lgbm['Actual_values'])**2)
eval_lgbm

Unnamed: 0,Forecast,Product,Actual_values,mape,mae,mpe,rmse
0,123,20in Monitor,126.0,0.02381,3.0,2.380952,3.0
1,146,Flatscreen TV,136.0,0.073529,10.0,7.352941,10.0
2,607,Wired Headphones,570.0,0.064912,37.0,6.491228,37.0
3,197,27in 4K Gaming Monitor,163.0,0.208589,34.0,20.858896,34.0
4,59,Vareebadd Phone,63.0,0.063492,4.0,6.349206,4.0
5,192,34in Ultrawide Monitor,177.0,0.084746,15.0,8.474576,15.0
6,145,Macbook Pro Laptop,102.0,0.421569,43.0,42.156863,43.0
7,739,USB-C Charging Cable,647.0,0.142195,92.0,14.219474,92.0
8,209,27in FHD Monitor,196.0,0.066327,13.0,6.632653,13.0
9,464,Apple Airpods Headphones,424.0,0.09434,40.0,9.433962,40.0


In [186]:
print('Mean MAE last day forecast: ', eval_last_day['MAE'].mean())
print('Mean MPE last day forecast: ', eval_last_day['MPE'].mean(), end='\n\n')
print('Mean MAE moving average forecast: ', eval_mov_avg['MAE'].mean())
print('Mean MPE moving average forecast: ', eval_mov_avg['MPE'].mean(), end='\n\n')
print('Mean MAE ARIMA forecast: ', eval_arima['MAE'].mean())
print('Mean MPE ARIMA forecast: ', eval_arima['MPE'].mean(), end='\n\n')
print('Mean MAE LGBM forecast: ', eval_lgbm['mae'].mean())
print('Mean MPE LGBM forecast: ', eval_lgbm['mpe'].mean())

Mean MAE last day forecast:  46.8421052631579
Mean MPE last day forecast:  0.002133200496168675

Mean MAE moving average forecast:  42.684210526315795
Mean MPE moving average forecast:  9.49715744731524

Mean MAE ARIMA forecast:  41.8421052631579
Mean MPE ARIMA forecast:  7.446950366536926

Mean MAE LGBM forecast:  35.77777777777778
Mean MPE LGBM forecast:  13.468477141546757
