
# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex2 import *

# Setup notebook
import warnings
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression

warnings.simplefilter("ignore")

plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

# Input locations: course datasets and the Store Sales competition data.
data_dir = Path('../input/ts-course-data/')
comp_dir = Path('../input/store-sales-time-series-forecasting')

# US Retail Sales: one column per industry, indexed by the `Month` column.
# NOTE(review): the series is described as monthly but is converted to a
# daily ('D') PeriodIndex here -- confirm this is intended.
retail_sales_path = data_dir / "us-retail-sales.csv"
retail_sales = pd.read_csv(
    retail_sales_path,
    index_col='Month',
    parse_dates=['Month'],
)
retail_sales = retail_sales.to_period('D')

# Individual industry series used in the exercises.
food_sales = retail_sales.loc[:, 'FoodAndBeverage']
auto_sales = retail_sales.loc[:, 'Automobiles']

# Store Sales competition data, reduced to a single series: the mean of
# `sales` over all stores and product families for each date.
#
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; `.squeeze("columns")` is the documented replacement and turns the
# one-column frame into a Series on every pandas version.
# `infer_datetime_format` is omitted because it is deprecated (and a no-op)
# since pandas 2.0.
store_sales = (
    pd.read_csv(
        comp_dir / 'train.csv',
        usecols=['store_nbr', 'family', 'date', 'sales'],
        dtype={
            'store_nbr': 'category',
            'family': 'category',
            'sales': 'float32',
        },
        parse_dates=['date'],
        index_col=['store_nbr', 'family', 'date'],
    )
    .squeeze("columns")
    .sort_index()
    .groupby(['date'])
    .mean()
)

-------------------------------------------------------------------------------

# 1) Determine trend with a moving average plot

The *US Retail Sales* dataset contains monthly sales data for a number of retail industries in the United States. Run the next cell to see a plot of the *Food and Beverage* series.

In [None]:
# Time plot of the raw Food and Beverage series.
ax = food_sales.plot(**plot_params)
ax.set_title("US Food and Beverage Sales")
# Trailing semicolon suppresses the stray `Text(...)` repr, matching the
# other plotting cells in this notebook.
ax.set_ylabel("Millions of Dollars");

Now make a moving average plot to estimate the trend for this series.

In [None]:
# YOUR CODE HERE: Add methods to `food_sales` to compute a moving
# average with appropriate parameters for trend estimation.
trend = food_sales

# Plot the raw series with your trend estimate drawn on top.
ax = food_sales.plot(**plot_params, alpha=0.5)
ax = trend.plot(ax=ax, linewidth=3)

# Check your answer
q_1.check()

In [None]:
# Uncomment a line below to get a hint or to see the solution for question 1.
# (Each `_COMMENT_IF(PROD)_` marker applies to the line that follows it.)
#_COMMENT_IF(PROD)_
q_1.hint()
#_COMMENT_IF(PROD)_
q_1.solution()

In [None]:
#%%RM_IF(PROD)%%
# Wrong answer: `trend` is left as the unmodified series (the starter-code
# placeholder), so no moving average is computed at all.
trend = food_sales

# The failure assertion below is currently disabled.
#q_1.assert_check_failed()

In [None]:
#%%RM_IF(PROD)%%
# Wrong answer: a 365-observation window. The series is described as monthly,
# and the reference solution uses window=12 with min_periods=6.
trend = food_sales.rolling(
    window=365,
    center=True,
    min_periods=183,
).mean()

# The failure assertion below is currently disabled.
#q_1.assert_check_failed()

In [None]:
#%%RM_IF(PROD)%%
# Wrong answer: same window size as the reference solution, but with
# `center=False` instead of `center=True`.
trend = food_sales.rolling(
    window=12,
    center=False,
    min_periods=6,
).mean()

# The failure assertion below is currently disabled.
#q_1.assert_check_failed()

In [None]:
#%%RM_IF(PROD)%%
# Wrong answer: aggregates with `.std()` rather than `.mean()` (and uses
# `center=False`), so the result is not a moving average.
trend = food_sales.rolling(
    window=12,
    center=False,
    min_periods=6,
).std()

# The failure assertion below is currently disabled.
#q_1.assert_check_failed()

In [None]:
#%%RM_IF(PROD)%%
# Wrong answer: stops at `.rolling(...)` without calling an aggregation, so
# `trend` is the rolling-window object rather than a series of averages.
trend = food_sales.rolling(
    window=12,
    center=True,
    min_periods=6,
)

# The failure assertion below is currently disabled.
#q_1.assert_check_failed()

In [None]:
#%%RM_IF(PROD)%%
# Reference solution: a centered 12-observation moving average that needs at
# least 6 observations in the window to produce a value.
yearly_window = food_sales.rolling(window=12, center=True, min_periods=6)
trend = yearly_window.mean()

# Raw series (faded) with the moving-average trend drawn over it.
ax = food_sales.plot(alpha=0.5, **plot_params)
ax = trend.plot(linewidth=3, ax=ax)

q_1.assert_check_passed()

-------------------------------------------------------------------------------

# 2) Identifying trend

What order polynomial trend might be appropriate for the *Food and Beverage Sales* series? Can you think of a non-polynomial curve that might work even better?

Once you've thought about it, run this cell for some discussion.

In [None]:
# View the solution (Run this cell to receive credit!)
# Question 2 is a thinking exercise: running the check shows the discussion.
q_2.check()

-------------------------------------------------------------------------------

Now we'll get started with the *Store Sales - Time Series Forecasting* competition data. The entire dataset comprises almost 1800 series recording store sales across a variety of product families from 2013 into 2017. For this lesson, we'll just work with a single series (`store_sales`) of the average sales each day.

In [None]:
# Run this cell to see a time plot
ax = store_sales.plot(
    title="Average Daily Sales (all products and stores)",
    **plot_params,
);

Run this cell to see a moving average plot of `store_sales` estimating the trend.

In [None]:
# Centered 365-observation moving average of the daily series; a value is
# produced once at least 183 observations fall inside the window.
annual_window = store_sales.rolling(window=365, center=True, min_periods=183)
trend = annual_window.mean()

# Raw series (faded) with the trend estimate on top.
ax = store_sales.plot(alpha=0.5, **plot_params)
ax = trend.plot(linewidth=3, ax=ax);

# 3) Create a Trend Feature

There appears to be a trend in average sales. Use `DeterministicProcess` to create a feature set for a linear trend model.

In [None]:
# Imported here (rather than in the setup cell) so the exercise shows where
# `DeterministicProcess` comes from; later cells reuse this import.
from statsmodels.tsa.deterministic import DeterministicProcess

y = store_sales.copy()  # the target

# YOUR CODE HERE: Instantiate `DeterministicProcess` with arguments
# appropriate for a trend model: y = weight * time + bias
# (uncomment the three keyword arguments below and fill in their values)
dp = DeterministicProcess(
    index=y.index,
    # constant=
    # order=
    # drop=
)
# YOUR CODE HERE: Create the feature set for the dates given in y.index
X = ____

# Check your answer
q_3.check()

In [None]:
# Lines below will give you a hint or solution code
# (Each `_COMMENT_IF(PROD)_` marker applies to the line that follows it, as
# in the question 1 hint cell, so the answer is not revealed unprompted.)
#_COMMENT_IF(PROD)_
q_3.hint()
#_COMMENT_IF(PROD)_
q_3.solution()

In [None]:
#%%RM_IF(PROD)%%
from statsmodels.tsa.deterministic import DeterministicProcess

# Reference solution for the linear trend model y = weight * time + bias:
# `constant=True` supplies the bias feature and `order=1` the linear time
# feature. (order=2 would add a quadratic term, which is not a linear trend.)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    drop=True,
)
# Features for the dates in `y.index`.
X = dp.in_sample()

q_3.assert_check_passed()

--------------------------------------------------------------------------------

# 4) Fit and forecast trend

Using the `LinearRegression` model from scikit-learn, fit a linear trend model to `y` on the feature set `X` you just created.

In [None]:
# `X` and `y` are the feature set and target from question 3.
# YOUR CODE HERE: Create the trend model
model = ____
# YOUR CODE HERE: Fit the model to y using X
____

# Check your answer
q_4.check()

In [None]:
# Lines below will give you a hint or solution code
# (Each `_COMMENT_IF(PROD)_` marker applies to the line that follows it, as
# in the question 1 hint cell, so the answer is not revealed unprompted.)
#_COMMENT_IF(PROD)_
q_4.hint()
#_COMMENT_IF(PROD)_
q_4.solution()

In [None]:
#%%RM_IF(PROD)%%
# Reference solution. `fit_intercept=False` because the solution feature set
# was built with `constant=True`, so X already carries the bias feature.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

q_4.assert_check_passed()

You can see a plot of the result by running the next cell.

In [None]:
# In-sample predictions of the fitted trend model, aligned to the target's dates.
fitted_values = model.predict(X)
y_pred = pd.Series(fitted_values, index=y.index)

# Raw series (faded) with the fitted trend line and a legend.
ax = y.plot(alpha=0.5, title="Average Sales", ylabel="items sold", **plot_params)
ax = y_pred.plot(linewidth=3, label="Linear Trend", ax=ax)
ax.legend();

--------------------------------------------------------------------------------

# (Optional) Fit trend with splines

The *Multivariate Adaptive Regression Splines* (MARS) algorithm in the `pyearth` library is powerful and easy to use. There are a lot of hyperparameters you may want to investigate.

In [None]:
from pyearth import Earth

# Same target as before. The feature set here is only the linear time
# feature (order=1) with no constant column.
# NOTE(review): presumably `Earth` fits its own intercept, which is why
# `constant=False` is used -- confirm against the py-earth documentation.
y = store_sales.copy()
dp = DeterministicProcess(
    index=y.index,
    order=1,
    drop=True,
    constant=False,
)
X = dp.in_sample()

# Fit a MARS model with `Earth`
model = Earth()
model.fit(X, y)

# In-sample predictions, aligned to the feature index.
y_pred = pd.Series(model.predict(X), index=X.index)

ax = y.plot(**plot_params, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, linewidth=3, label="Trend")
# Draw the legend so the "Trend" label is actually shown, as in the linear
# trend plot earlier in the notebook.
ax.legend();

Forecasting complicated trends like this will typically be difficult, if not impossible. With historical data, however, you can use them to isolate other patterns in a time series by *detrending*.

In [None]:
# Remove the fitted trend from store_sales, leaving the remaining variation.
y_detrended = y.sub(y_pred)

y_detrended.plot(title="Detrended Average Sales", **plot_params);

--------------------------------------------------------------------------------

# (Optional) Trends in variation

The trends we've looked at so far have all been trends in the mean. As we mentioned in the tutorial, other properties of a time series can also display trends. Trends in variation are especially common.

The next cell demonstrates how to visualize a trend in standard deviation.

In [None]:
# Centered 365-observation window with "triangular" weighting applied to the
# observations in each window.
weighted_window = store_sales.rolling(
    365,
    center=True,
    min_periods=183,
    win_type='triang',
)
rolling_std = weighted_window.std()

# Top panel: the raw series; bottom panel: its rolling standard deviation.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(11, 6))
ax1 = store_sales.plot(alpha=0.5, ax=ax1, **plot_params)
ax2 = rolling_std.plot(linewidth=2.5, ax=ax2)

Plots like these can help you choose good training sets for your forecasting model. Years prior to 2016 seem to be fairly different from the years after, suggesting they may not be as useful as training data.

# Keep Going #
