# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex4 import *
from learntools.time_series.utils import plot_lags, make_lag_features

# Setup notebook
import warnings
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.tsa.tsatools import lagmat

warnings.simplefilter("ignore")

plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


# Directory holding the competition CSV files.
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Sales and promotion counts, indexed by store, product family and daily period.
store_sales = (
    pd.read_csv(
        comp_dir / 'train.csv',
        usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
        dtype={
            'store_nbr': 'category',
            'family': 'category',
            'sales': 'float32',
            'onpromotion': 'float32',
        },
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

# Mean of each column per date, restricted to 2017.
average_sales = (
    store_sales
    .groupby('date')
    .mean()
    .squeeze()
    .loc['2017']
)

# Oil price file, indexed by daily period; squeeze() collapses a
# single-column frame into a Series.
oil = (
    pd.read_csv(
        comp_dir / "oil.csv",
        dtype='float32',
        parse_dates=["date"],
        infer_datetime_format=True,
    )
    .set_index('date')
    .to_period('D')
    .squeeze()
)

-------------------------------------------------------------------------------


In [None]:
# Seeded generator: the simulated series, and the plots built from them,
# are identical on every run of the notebook.
rng = np.random.default_rng(0)

N = 250
time = np.linspace(0, 100, num=N)
error_1 = rng.normal(size=N, scale=10.0)
error_2 = rng.normal(size=N, scale=2.0)

# Linear trend in time plus noise.
trending = pd.Series(time + error_1)
# Sine wave completing 20 cycles over the time range, plus smaller noise.
seasonal = pd.Series(10 * np.sin(20*2*np.pi*time/100) + error_2)

# One panel per series, stacked vertically.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = trending.plot(ax=ax1)
ax1.set_title("Trending")
ax2 = seasonal.plot(ax=ax2)
ax2.set_title("Seasonal");

In [None]:
# Call plot_lags with lags=8 and nrows=2 on each simulated series and
# title the figure it returns.
for lag_series, lag_title in ((trending, 'Trending'), (seasonal, 'Seasonal')):
    fig = plot_lags(lag_series, lags=8, nrows=2)
    fig.suptitle(lag_title, fontweight='bold', fontsize=16)

# 1) Examine the relationship between time and serial dependence



In [None]:
# View the solution (Run this cell to receive credit!)
# Form your own answer from the plots above before running the check.
q_1.check()

-------------------------------------------------------------------------------

Since we're interested in what *new* information we can capture through lag features, let's remove from *Store Sales* the parts we've already captured: the trend and the seasons. Removing from a series its trend or seasons is called **detrending** or **deseasonalizing** the series.

Use the code in the next cell to deseasonalize *Average Store Sales*.

In [None]:
# Target series: the `sales` column of the 2017 averages.
y = average_sales['sales']

# Deterministic regressors built on the target's index: a constant, a
# first-order trend, seasonal terms, and order-4 calendar Fourier terms at
# monthly frequency. drop=True asks statsmodels to drop collinear columns.
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
# Add an indicator column for January 1st.
X_time = dp.in_sample().assign(
    NewYearsDay=lambda frame: frame.index.dayofyear == 1,
)

# The residual of the fitted deterministic model is the deseasonalized series.
model = LinearRegression().fit(X_time, y)
sales_deseasoned = (y - model.predict(X_time)).rename('sales_deseasoned')

ax = sales_deseasoned.plot()
ax.set_title("Average Store Sales (deseasonalized)");

Now let's examine our deseasonalized series for serial dependence.

First take a look at the partial autocorrelation correlogram. Do any of the lags seem significant?

In [None]:
# Partial autocorrelation plot of the deseasonalized series, called with lags=8.
# The trailing semicolon suppresses the cell's return value.
plot_pacf(sales_deseasoned, lags=8);

Now look at the lag plot.

In [None]:
# Lag plots of the deseasonalized series (lags=8, laid out with nrows=2).
plot_lags(sales_deseasoned, lags=8, nrows=2);

Do you notice any potentially useful relationships that weren't apparent from the correlogram?

# 2) Examine serial dependence in *Store Sales*

After you've thought about your answer, run the next cell.

In [None]:
# View the solution (Run this cell to receive credit!)
# Compare it with what you saw in the correlogram and the lag plot.
q_2.check()

-------------------------------------------------------------------------------

Recall from the tutorial that a *leading indicator* is a series that can be used to predict the target at a future time -- a leading indicator provides "advance notice" of changes in the target.

The competition dataset includes two time series that could potentially be useful as leading indicators: 
- `onpromotion`: number of items on a special promotion that day, and
- `oil`: daily oil price.

We have values for both of these series throughout the training and test periods.

Use the next cell to examine lags for `onpromotion`.

In [None]:
onpromotion = average_sales['onpromotion']

# Both series are sliced to start at 2017-01-02 before plotting
# (presumably to leave out New Year's Day -- confirm).
plot_lags(onpromotion.loc['2017-01-02':], y.loc['2017-01-02':], lags=8, nrows=2);

And use this cell to examine lags for `oil`.

In [None]:
# Inner join: keep only the index labels present in both `y` and `oil`.
# Note that this rebinds `oil` to its aligned version.
y_oil, oil = y.align(oil, join='inner')

plot_lags(oil, y_oil, lags=8, nrows=2);

# 3) Examine time series features

Which of these series do the plots suggest might be useful to include as features?

In [None]:
# View the solution (Run this cell to receive credit!)
q_3.check()

-------------------------------------------------------------------------------

# 4) Create time series features

Create the features indicated in the solution to Question 3. If no features from a given series would be useful, use an empty dataframe `pd.DataFrame()` as your answer for that series.

In [None]:
# YOUR CODE HERE: Make features from `sales_deseasoned`
X_lags = ____

# YOUR CODE HERE: Make features from `onpromotion`
# You may want to use `pd.concat`
X_promo = ____

# YOUR CODE HERE: Make features from `oil`
X_oil = ____

# Combine with the time features, drop rows that have missing values
# (e.g. the ones shifting introduces), then align the target to the rows
# that remain so the model cell receives matching X and y.
#_UNCOMMENT_IF(PROD)_
#X = pd.concat([X_time, X_lags, X_promo, X_oil], axis=1).dropna()
#_UNCOMMENT_IF(PROD)_
#y, X = y.align(X, join='inner')

# Check your answer
q_4.check()

In [None]:
# Lines below will give you a hint or solution code
# NOTE(review): the `_COMMENT_IF(PROD)_` markers look like build directives
# acting on the line that follows each one -- confirm before editing them.
#_COMMENT_IF(PROD)_
q_4.hint()
#_COMMENT_IF(PROD)_
q_4.solution()

In [None]:
#%%RM_IF(PROD)%%
# One-step lag of the deseasonalized target. The series is bound to the
# name `sales_deseasoned` where it is created; no `y_deseason` exists.
X_lags = make_lag_features(sales_deseasoned, lags=1)

# Promotion features: lags=1, the series itself, and lags=-1.
X_promo = pd.concat([
    make_lag_features(onpromotion, lags=1),
    onpromotion,
    make_lag_features(onpromotion, lags=-1),
], axis=1)

# No oil features are used.
X_oil = pd.DataFrame()

# Combine with the time features, drop rows with missing values, and keep
# only the target rows that still have a matching feature row.
X = pd.concat([X_time, X_lags, X_promo, X_oil], axis=1).dropna()
y, X = y.align(X, join='inner')

q_4.assert_check_passed()

Use the code in the next cell if you'd like to see predictions from the resulting model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the final 30 rows for validation; shuffle=False keeps row order.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=30,
    shuffle=False,
)

model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

# Clip predictions at zero: the log-based error is not defined for
# negative values.
fitted_values = model.predict(X_train)
forecast_values = model.predict(X_valid)
y_fit = pd.Series(fitted_values, index=X_train.index).clip(lower=0.0)
y_pred = pd.Series(forecast_values, index=X_valid.index).clip(lower=0.0)

# RMSLE is the square root of the mean squared log error.
rmsle_train = mean_squared_log_error(y_train, y_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, y_pred) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')

# Observed series with the fitted values and the forecast drawn on the same axes.
ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
y_fit.plot(ax=ax, label="Fitted", color='C0')
y_pred.plot(ax=ax, label="Forecast", color='C3')
ax.legend();

-------------------------------------------------------------------------------

# 5) Create statistical features

Winners of Kaggle forecasting competitions have often included moving averages and other rolling statistics in their feature sets. Such features seem to be especially useful when used with GBDT algorithms like XGBoost.

In Lesson 2 you learned how to compute moving averages to estimate trends. Computing rolling statistics to be used as features is similar except we need to take care to avoid lookahead leakage. First, the result should be set at the right end of the window instead of the center -- that is, we should use `center=False` (the default) in the `rolling` method. Second, the target should be lagged a step.

Edit the code in the next cell to create the following features:
- 14-day rolling median (`median`) of lagged target
- 7-day rolling standard deviation (`std`) of lagged target
- 7-day sum (`sum`) of items "on promotion", with centered window

In [None]:
y_lag = average_sales.loc[:, 'sales'].shift(1)  # lagged target
onpromo = average_sales.loc[:, 'onpromotion']  # items on promotion

# Statistical features
X_stats = pd.concat({
    # 7-day mean of lagged target
    'mean_7': y_lag.rolling(7).mean(),
    # YOUR CODE HERE: Edit to create the rolling statistic
    # 14-day median of lagged target
#_UNCOMMENT_IF(PROD)_
#    'median_14': ____,
    # 7-day rolling standard deviation of lagged target
#_UNCOMMENT_IF(PROD)_
#    'std_7': ____,
    # 7-day sum of promotions with centered window
#_UNCOMMENT_IF(PROD)_
#    'promo_7': ____,
}, axis=1).dropna()


# Check your answer
q_5.check()

In [None]:
# Lines below will give you a hint or solution code
# NOTE(review): the `_COMMENT_IF(PROD)_` markers look like build directives
# acting on the line that follows each one -- confirm before editing them.
#_COMMENT_IF(PROD)_
q_5.hint()
#_COMMENT_IF(PROD)_
q_5.solution()

In [None]:
#%%RM_IF(PROD)%%
# Target shifted by one step, so each window only sees earlier values.
y_lag = average_sales['sales'].shift(1)
onpromo = average_sales['onpromotion']

# Trailing windows over the lagged target; centered window for promotions.
lagged_week = y_lag.rolling(7)
stat_columns = {
    'mean_7': lagged_week.mean(),
    'median_14': y_lag.rolling(14).median(),
    'std_7': lagged_week.std(),
    'promo_7': onpromo.rolling(7, center=True).sum(),
}
X_stats = pd.concat(stat_columns, axis=1).dropna()


q_5.assert_check_passed()

Check out the Pandas [`Window` documentation](https://pandas.pydata.org/pandas-docs/stable/reference/window.html) for more statistics you can compute. Also try "exponential weighted" windows by using `ewm` in place of `rolling`; exponential decay is often a more realistic representation of how effects propagate over time.

-------------------------------------------------------------------------------

# (Optional) Explore non-linear dynamics


# Keep Going #