### <b><span style='color:#F1C40F'>|</span> Home task</b>

- Choose any store from the initial dataset
- Check for NaNs (missing values) and fill them
- Make a forecast for 30, 180, 270, 365 days ahead
- Perform model evaluation

In [51]:
import pandas as pd
import numpy as np

import plotly.express as px

# model evaluation
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [52]:
def preprocess_data(df:pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a parsed ``date`` and a ``day_of_week`` column.

    The ``date`` column is converted with ``pd.to_datetime`` and a new
    ``day_of_week`` column holding the English day name of each date is
    appended as the last column.

    The input frame is left untouched; callers must use the returned frame.

    Args:
        df: Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the converted ``date`` column and the added
        ``day_of_week`` column.

    Raises:
        KeyError: If ``df`` has no ``date`` column.
    """
    # Bracket access instead of ``df.date``: attribute-style assignment would
    # silently create a plain Python attribute if the column were missing.
    dates = pd.to_datetime(df["date"])
    return df.assign(date=dates, day_of_week=dates.dt.day_name())

# Read the training CSV and run it through preprocess_data, which parses the
# "date" column to datetime and adds the "day_of_week" column.
stores_df = preprocess_data(pd.read_csv("train.csv"))

In [53]:
# Keep only the rows that belong to store number 54.
store54_train = stores_df.loc[stores_df["store_nbr"] == 54]

In [54]:
# Show the store-54 subset as the cell output.
store54_train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week
1617,1617,2013-01-01,54,AUTOMOTIVE,0.000,0,Tuesday
1618,1618,2013-01-01,54,BABY CARE,0.000,0,Tuesday
1619,1619,2013-01-01,54,BEAUTY,0.000,0,Tuesday
1620,1620,2013-01-01,54,BEVERAGES,0.000,0,Tuesday
1621,1621,2013-01-01,54,BOOKS,0.000,0,Tuesday
...,...,...,...,...,...,...,...
3000751,3000751,2017-08-15,54,POULTRY,59.619,0,Tuesday
3000752,3000752,2017-08-15,54,PREPARED FOODS,94.000,0,Tuesday
3000753,3000753,2017-08-15,54,PRODUCE,915.371,76,Tuesday
3000754,3000754,2017-08-15,54,SCHOOL AND OFFICE SUPPLIES,0.000,0,Tuesday


In [55]:
# sum up sales for the day
def sum_sales_per_day(df: pd.DataFrame, store_number:int=54) -> pd.DataFrame:
    """Aggregate one store's sales to a single row per date.

    Args:
        df: Frame with ``store_nbr``, ``date``, ``sales`` and ``day_of_week``
            columns.
        store_number: Value of ``store_nbr`` to keep.

    Returns:
        DataFrame with columns ``date``, ``sales`` (sum over all rows of that
        date) and ``day_of_week`` (first value seen for that date), one row
        per date.
    """
    wanted_columns = ["date", "sales", "day_of_week"]
    store_rows = df[df["store_nbr"] == store_number][wanted_columns]

    # Sales are summed per date; the day name is identical within a date,
    # so taking the first value is enough.
    aggregations = {"sales": "sum", "day_of_week": "first"}
    per_day = store_rows.groupby("date").agg(aggregations)

    return per_day.reset_index()


# Daily sales totals for store 54 (the function's default store, spelled out here).
day_level_df = sum_sales_per_day(stores_df, store_number=54)

In [56]:
# Days whose summed sales are exactly 0 are treated as missing observations.
day_level_df["sales"] = day_level_df["sales"].mask(day_level_df["sales"] == 0, np.nan)

# Index labels of the rows that are about to be imputed.
nan_indices = day_level_df[day_level_df["sales"].isna()].index

# Impute with the mean of the non-missing days. The result is assigned back to
# the column: calling fillna(inplace=True) on day_level_df["sales"] is a
# chained in-place operation that leaves the frame unchanged when pandas
# Copy-on-Write is active.
day_level_df["sales"] = day_level_df["sales"].fillna(day_level_df["sales"].mean())

day_level_df

Unnamed: 0,date,sales,day_of_week
0,2013-01-01,6585.755282,Tuesday
1,2013-01-02,4973.285000,Wednesday
2,2013-01-03,3901.570000,Thursday
3,2013-01-04,3266.966000,Friday
4,2013-01-05,4394.549000,Saturday
...,...,...,...
1679,2017-08-11,8513.834000,Friday
1680,2017-08-12,9139.678002,Saturday
1681,2017-08-13,14246.827996,Sunday
1682,2017-08-14,11882.994000,Monday


In [57]:
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly, plot_cross_validation_metric, add_changepoints_to_plot

In [58]:
# preprocess data to needed format: Prophet expects the time column to be
# named "ds" and the target column "y".
# rename() returns a new frame, so nothing is modified in place on the column
# selection taken from day_level_df (avoids pandas' SettingWithCopy hazard).
fbp_set = day_level_df[["date", "sales"]].rename(columns={"date": "ds", "sales": "y"})
fbp_set

Unnamed: 0,ds,y
0,2013-01-01,6585.755282
1,2013-01-02,4973.285000
2,2013-01-03,3901.570000
3,2013-01-04,3266.966000
4,2013-01-05,4394.549000
...,...,...
1679,2017-08-11,8513.834000
1680,2017-08-12,9139.678002
1681,2017-08-13,14246.827996
1682,2017-08-14,11882.994000


In [59]:
# model evaluation
def evaluate_forecasting_model(actual_values:pd.Series, predicted_values:pd.Series, round_nbr:int=2) -> None:
    """Print MAE, MSE and MAPE of a forecast against the observed values.

    Args:
        actual_values: Observed values.
        predicted_values: Forecasted values, aligned with ``actual_values``.
        round_nbr: Number of decimals the printed metrics are rounded to.

    Returns:
        None. The three metrics are written to stdout, one per line.
    """
    scores = {
        "MAPE": mean_absolute_percentage_error(actual_values, predicted_values),
        "MAE": mean_absolute_error(actual_values, predicted_values),
        "MSE": mean_squared_error(actual_values, predicted_values),
    }

    # Report order: MAE, MSE, MAPE.
    for label in ("MAE", "MSE", "MAPE"):
        print(f"{label} - {round(scores[label], round_nbr)}")

### <b>365 Days predict</b>

In [60]:
window = 365
# Hold out the last `window` rows for evaluation and fit on everything before.
train, test = fbp_set[:-window], fbp_set[-window:]

# init and fit the model
m = Prophet()
m.fit(train)

# Create Future dates.
# make_future_dataframe counts calendar days while `window` counts rows; the
# two differ when the series skips dates (a 365-row window can span 366 days).
# Deriving the horizon from the dates makes the forecast reach the last test
# day without a hand-tuned constant.
horizon_days = (test["ds"].max() - train["ds"].max()).days
future_sales = m.make_future_dataframe(periods=horizon_days)

# Predict sales
forecast = m.predict(future_sales)

# merge test set with forecasted values
benchmark_df = test.merge(forecast[["ds", "yhat"]], on="ds", how="left")

# Plot actual and forecasted data
fig = px.line(benchmark_df, x='ds', y=["y", "yhat"], markers=True, title=f"Prophet {window} days forecast")
# Show plot
fig.show()

evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

15:06:20 - cmdstanpy - INFO - Chain [1] start processing
15:06:20 - cmdstanpy - INFO - Chain [1] done processing


MAE - 1378.36
MSE - 4040233.797
MAPE - 0.16


### <b>270 Days predict</b>

In [61]:
window = 270
# Hold out the last `window` rows for evaluation and fit on everything before.
train, test = fbp_set[:-window], fbp_set[-window:]

# init and fit the model
m = Prophet()
m.fit(train)

# Create Future dates.
# make_future_dataframe counts calendar days while `window` counts rows; the
# two differ when the series skips dates (a 270-row window can span 271 days).
# Deriving the horizon from the dates makes the forecast reach the last test
# day without a hand-tuned constant.
horizon_days = (test["ds"].max() - train["ds"].max()).days
future_sales = m.make_future_dataframe(periods=horizon_days)

# Predict sales
forecast = m.predict(future_sales)

# merge test set with forecasted values
benchmark_df = test.merge(forecast[["ds", "yhat"]], on="ds", how="left")

# Plot actual and forecasted data
fig = px.line(benchmark_df, x='ds', y=["y", "yhat"], markers=True, title=f"Prophet {window} days forecast")
# Show plot
fig.show()

evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

15:06:20 - cmdstanpy - INFO - Chain [1] start processing
15:06:20 - cmdstanpy - INFO - Chain [1] done processing


MAE - 1412.332
MSE - 5207336.101
MAPE - 0.143


### <b>180 Days predict</b>

In [62]:
window = 180
# Hold out the last `window` rows for evaluation and fit on everything before.
train, test = fbp_set[:-window], fbp_set[-window:]

# init and fit the model
m = Prophet()
m.fit(train)

# Create Future dates.
# make_future_dataframe counts calendar days while `window` counts rows.
# Deriving the horizon from the dates makes the forecast reach the last test
# day even if the series skips dates inside the hold-out window.
horizon_days = (test["ds"].max() - train["ds"].max()).days
future_sales = m.make_future_dataframe(periods=horizon_days)

# Predict sales
forecast = m.predict(future_sales)

# merge test set with forecasted values
benchmark_df = test.merge(forecast[["ds", "yhat"]], on="ds", how="left")

# Plot actual and forecasted data
fig = px.line(benchmark_df, x='ds', y=["y", "yhat"], markers=True, title=f"Prophet {window} days forecast")
# Show plot
fig.show()

evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

15:06:21 - cmdstanpy - INFO - Chain [1] start processing
15:06:21 - cmdstanpy - INFO - Chain [1] done processing


MAE - 1561.458
MSE - 6394061.114
MAPE - 0.152


### <b>30 Days predict</b>

In [63]:
window = 30
# Hold out the last `window` rows for evaluation and fit on everything before.
train, test = fbp_set[:-window], fbp_set[-window:]

# init and fit the model
m = Prophet()
m.fit(train)

# Create Future dates.
# make_future_dataframe counts calendar days while `window` counts rows.
# Deriving the horizon from the dates makes the forecast reach the last test
# day even if the series skips dates inside the hold-out window.
horizon_days = (test["ds"].max() - train["ds"].max()).days
future_sales = m.make_future_dataframe(periods=horizon_days)

# Predict sales
forecast = m.predict(future_sales)

# merge test set with forecasted values
benchmark_df = test.merge(forecast[["ds", "yhat"]], on="ds", how="left")

# Plot actual and forecasted data
fig = px.line(benchmark_df, x='ds', y=["y", "yhat"], markers=True, title=f"Prophet {window} days forecast")
# Show plot
fig.show()

evaluate_forecasting_model(
    actual_values=benchmark_df['y'],
    predicted_values=benchmark_df['yhat'],
    round_nbr=3
)

15:06:22 - cmdstanpy - INFO - Chain [1] start processing
15:06:22 - cmdstanpy - INFO - Chain [1] done processing


MAE - 1497.967
MSE - 3832540.004
MAPE - 0.132
