# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
# Hand this notebook's global namespace to the learntools binder
binder.bind(globals())
# NOTE(review): the star-import presumably supplies the q_1 ... q_4 question
# objects used by the check cells below -- confirm against learntools.
from learntools.time_series.ex3 import *

# Setup notebook
import warnings
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Silence every warning so library deprecation notices do not clutter the
# exercise output.
warnings.simplefilter("ignore")

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever name this
# installation actually provides.
_GRID_STYLE = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(_GRID_STYLE)

# Figure-level defaults shared by every plot in the notebook.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults (label and title typography).
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Keyword arguments reused via `**plot_params` when plotting a raw series:
# light grey line with dark point markers and no legend.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
# IPython magic: ask the inline backend for 'retina' (high-resolution) figures
%config InlineBackend.figure_format = 'retina'


def seasonal_plot(X, y, period, freq, ax=None):
    """Draw one line per distinct `period` value, with `freq` on the x-axis.

    Parameters
    ----------
    X : DataFrame
        Frame containing the columns named by `y`, `period` and `freq`.
    y : str
        Name of the column plotted on the y-axis.
    period : str
        Name of the column whose distinct values each get their own line
        (used as the seaborn `hue`) and their own label at the right edge.
    freq : str
        Name of the column plotted on the x-axis.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    # Fix the hue order explicitly so the lines and their labels are paired
    # deterministically. Numeric levels are sorted (seaborn's own default for
    # numeric hues); other dtypes keep their order of first appearance.
    # Relying on `X[period].unique()` alone mislabels lines whenever the
    # first-seen order is not sorted (e.g. data starting in ISO week 52).
    levels = X[period].dropna().unique()
    if pd.api.types.is_numeric_dtype(X[period]):
        levels = np.sort(levels)
    levels = list(levels)

    palette = sns.color_palette("husl", n_colors=len(levels))

    # Remember how many lines a caller-supplied axes already holds so that
    # only the lines added below are annotated.
    n_existing = len(ax.lines)
    ax = sns.lineplot(
        x=freq,
        y=y,
        hue=period,
        hue_order=levels,
        data=X,
        ci=False,
        ax=ax,
        palette=palette,
        legend=False,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")

    # Label each new line at the right edge of the axes, level with its last
    # y-value and in the line's own colour.
    for line, name in zip(ax.lines[n_existing:], levels):
        y_ = line.get_ydata()[-1]
        ax.annotate(
            name,
            xy=(1, y_),
            xytext=(6, 0),
            color=line.get_color(),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            size=14,
            va="center",
        )
    return ax


def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series with frequencies in cycles/year.

    Parameters
    ----------
    ts : array-like
        The observations, passed straight to `scipy.signal.periodogram`.
    detrend : str, default 'linear'
        Detrending option forwarded to `scipy.signal.periodogram`.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    from scipy.signal import periodogram

    # Sampling frequency: observations per year for daily data, using the
    # mean Gregorian year of 365.2425 days. This is the value older pandas
    # produced for pd.Timedelta("1Y") / pd.Timedelta("1D"); pandas 2.x no
    # longer accepts the ambiguous "Y" unit, so the constant is spelled out.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are cycles per year; labels name the matching season.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax


# Directory holding the competition CSV files, relative to the notebook.
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Holiday calendar indexed by daily Period.
# `infer_datetime_format` is intentionally not passed: it is deprecated and
# has been a no-op since pandas 2.0.
holidays_events = (
    pd.read_csv(
        comp_dir / "holidays_events.csv",
        parse_dates=['date'],
    )
    .set_index('date')
    .to_period('D')
)

# Sales records; only the four columns needed here are read, with compact
# dtypes for the two key columns and the sales figures.
store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

# Mean sales per date across all stores and families, restricted to 2017;
# `.squeeze()` collapses the single-column frame to a Series.
average_sales = (
    store_sales
    .groupby('date').mean()
    .squeeze()
    .loc['2017']
)

-------------------------------------------------------------------------------

Examine the following seasonal plot:

In [None]:
# Calendar features derived from the period index: week-of-year picks the
# line, day-of-week is the x-axis position.
X = average_sales.to_frame().assign(
    week=lambda frame: frame.index.week,
    day=lambda frame: frame.index.dayofweek,
)
seasonal_plot(X, y='sales', period='week', freq='day');

And also the periodogram:

In [None]:
# Periodogram of the 2017 average sales; the trailing ';' hides the returned Axes repr
plot_periodogram(average_sales);

# 1) Determine seasonality

What kind of seasonality do you see evidence of? Once you've thought about it, run the next cell for some discussion.

In [None]:
# View the solution (Run this cell to receive credit!)
# NOTE(review): q_1 is not defined in this notebook; presumably it comes from
# the learntools star-import in the setup cell -- confirm.
q_1.check()

-------------------------------------------------------------------------------

# 2) Create seasonal features

Use `DeterministicProcess` and `CalendarFourier` to create:
- indicators for weekly seasons and
- Fourier features of order 4 for monthly seasons.

In [None]:
# Work on a copy so `average_sales` itself is left untouched
y = average_sales.copy()

# YOUR CODE HERE
# Replace the `____` placeholder with the Fourier-feature object for the
# monthly seasons described above.
fourier = ____
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    # YOUR CODE HERE
    # ____
    drop=True,
)
# Feature matrix for the dates in `y.index`
X = dp.in_sample()

# Check your answer
q_2.check()

In [None]:
# Run the lines below to get a hint or the solution code for question 2
#_COMMENT_IF(PROD)_
q_2.hint()
#_COMMENT_IF(PROD)_
q_2.solution()

In [None]:
#%%RM_IF(PROD)%%
# Reference solution for question 2, mirroring the exercise cell with the
# placeholders filled in.
y = average_sales.copy()
# Order-4 Fourier features for the monthly seasons
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,  # the weekly seasonal indicators the exercise asks for
    additional_terms=[fourier],  # plus the monthly Fourier terms
    drop=True,
)
X = dp.in_sample()

# Fails if the checker does not accept the values defined above
q_2.assert_check_passed()

Now run this cell to fit the seasonal model.

In [None]:
# Fit the seasonal model and keep its in-sample predictions as a named
# series. (The fitted series used to be built twice; the second, unnamed copy
# silently replaced this one.)
model = LinearRegression().fit(X, y)
y_pred = pd.Series(
    model.predict(X),
    index=X.index,
    name='Fitted',
)

# Observed series in the shared grey style, fitted values drawn on top.
ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax.legend();

-------------------------------------------------------------------------------

Run the next cell to see the deseasonalized series with holidays indicated.

In [None]:
# Residual series once the fitted seasonal component is removed
y_deseason = y - y_pred

# Dates of national and regional holidays in the training set
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017-01':'2017-08-15', :]
    .index
)

ax = y_deseason.plot(**plot_params)
# Draw the holiday markers on the explicit axes rather than through pyplot's
# implicit "current axes" state.
# NOTE(review): `plot_date` is deprecated in recent Matplotlib releases;
# revisit once the supported Matplotlib version is pinned.
ax.plot_date(holidays, y_deseason.loc[holidays], color='C3')
ax.set_title('National and Regional Holidays');

# 3) Explore holiday events

Does it appear that features representing holidays could be useful to include?

In [None]:
# View the solution (Run this cell to receive credit!)
# NOTE(review): q_3 is not defined in this notebook; presumably it comes from
# the learntools star-import in the setup cell -- confirm.
q_3.check()

A one-hot encoding would work well: add one indicator (0/1) column per holiday to the feature set.


In [None]:
# TODO: add a one-hot holiday indicator column to X, for example:
#X['NewYear'] = 

-------------------------------------------------------------------------------

# 4) Check for remaining seasonality

Does there appear to be any regular or periodic behavior still present? How could you check?

In [None]:
# View the solution (Run this cell to receive credit!)
# NOTE(review): q_4 is not defined in this notebook; presumably it comes from
# the learntools star-import in the setup cell -- confirm.
q_4.check()

Look at the periodogram of the deseasonalized series.

In [None]:
# Two stacked panels on shared axes so the spectra are directly comparable:
# the original series on top, the deseasonalized residual below.
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    sharey=True,
    figsize=(10, 7),
)
# plot_periodogram returns the axes it was given, so the title can be chained.
plot_periodogram(y, ax=ax1).set_title("Product Sales Frequency Components")
plot_periodogram(y_deseason, ax=ax2).set_title("Deseasonalized");

Based on this periodogram, how effectively does it appear your model captured the seasonality in this series? Does it agree with your observations of the deseasonalized series?

-------------------------------------------------------------------------------

# (Optional) Understand log transforms and the RMSLE metric

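Sometimes a logarithmic transform is effective at stabilizing the variation in a series. When the components of a series combine multiplicatively, taking the logarithm turns the product into a sum, which an additive (linear) model can fit: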

```
log(trend * seasons * error) = log(trend) + log(seasons) + log(error)
```

The next cell plots the change in store sales after a log-transform. Notice how the variation in the transformed series, instead of increasing over time, appears to be almost constant from start to finish.

In [None]:
# numpy is already imported as `np` in the setup cell, so it is not
# re-imported here.
# Raw sales on top, log1p-transformed sales below, on a shared time axis.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(10, 7))
average_sales.plot(ax=ax1)
ax1.set_title("Sales")
ax2 = np.log1p(average_sales).plot(ax=ax2)
ax2.set_title("Log Sales");

In [None]:
from sklearn.metrics import mean_squared_log_error

# Hold out the final N_VALID days of `y` so that true values exist to score
# the forecast against. (Forecasting past the end of `y` would leave nothing
# to compare with, and `y_train` / `y_valid` were previously never defined.)
N_VALID = 16
y_train, y_valid = y.iloc[:-N_VALID], y.iloc[-N_VALID:]

# Rebuild the seasonal features on the training window only, then extend the
# same deterministic process over the held-out days. `X`, `dp`, `model` and
# `y_pred` are deliberately rebound so that later cells using them together
# with `y_train` see matching lengths.
dp = DeterministicProcess(
    index=y_train.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[CalendarFourier(freq='M', order=4)],
    drop=True,
)
X = dp.in_sample()
X_valid = dp.out_of_sample(steps=N_VALID)

model = LinearRegression().fit(X, y_train)
y_pred = pd.Series(
    model.predict(X),
    index=X.index,
    name='Fitted',
)
y_valid_pred = pd.Series(model.predict(X_valid), index=X_valid.index)

# mean_squared_log_error rejects negative values, which a linear model can
# produce; clip predictions at zero for scoring only.
rmsle_train = mean_squared_log_error(y_train, y_pred.clip(lower=0)) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, y_valid_pred.clip(lower=0)) ** 0.5

print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')

In [None]:
from sklearn.metrics import mean_squared_error

# Train on log1p(sales): the model then predicts on the log scale, so plain
# RMSE between log1p targets and predictions is the RMSLE.
log_train = np.log1p(y_train)
log_valid = np.log1p(y_valid)

model = LinearRegression().fit(X, log_train)
y_pred = pd.Series(model.predict(X), index=X.index, name='Fitted')

rmsle_train = mean_squared_error(log_train, y_pred) ** 0.5
rmsle_valid = mean_squared_error(log_valid, model.predict(X_valid)) ** 0.5

print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')

-------------------------------------------------------------------------------


# Keep Going #