# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex4 import *

# Setup notebook
from pathlib import Path
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.offsetbox import AnchoredText
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import ar_select_order
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend)
from statsmodels.tsa.tsatools import lagmat

# Silence all warnings for the rest of the notebook.
warnings.simplefilter("ignore")

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use the new name when this install
# offers it and fall back to the original name otherwise.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)


# Utility functions
def add_trend(X, freq="D", order=1):
    """Return `X` joined with calendar time-trend columns.

    The trend terms come from statsmodels' `CalendarTimeTrend`, built without
    a constant column and with the first entry of `X.index` as base period.

    Parameters
    ----------
    X : DataFrame whose index the trend terms are generated for.
    freq : frequency string passed to `CalendarTimeTrend`.
    order : order of the time trend, passed to `CalendarTimeTrend`.

    Returns
    -------
    The result of joining the in-sample trend terms onto `X`.
    """
    time_trend = CalendarTimeTrend(
        freq=freq,
        order=order,
        constant=False,
        base_period=X.index[0],
    )
    trend_terms = time_trend.in_sample(X.index)
    return X.join(trend_terms)


def add_seasonal(X, freq="D", period="W"):
    """Return `X` joined with calendar seasonal indicator columns.

    The indicators come from statsmodels' `CalendarSeasonality`. The `freq`
    and `period` arguments are forwarded to it (previously they were accepted
    but ignored in favour of hard-coded "D" / "W"); the defaults reproduce the
    earlier behaviour.

    Parameters
    ----------
    X : DataFrame whose index the seasonal terms are generated for.
    freq : frequency of the observations, passed to `CalendarSeasonality`.
    period : seasonal period, passed to `CalendarSeasonality`.

    Returns
    -------
    The result of joining the in-sample seasonal terms onto `X`.
    """
    seasonality = CalendarSeasonality(freq=freq, period=period)
    return X.join(seasonality.in_sample(X.index))


def add_fourier(X, freq="A", order=52):
    """Return `X` joined with calendar Fourier terms.

    The terms come from statsmodels' `CalendarFourier`, evaluated in-sample on
    `X.index`.

    Parameters
    ----------
    X : DataFrame whose index the Fourier terms are generated for.
    freq : frequency string passed to `CalendarFourier`.
    order : order passed to `CalendarFourier`.

    Returns
    -------
    The result of joining the in-sample Fourier terms onto `X`.
    """
    fourier_terms = CalendarFourier(freq=freq, order=order).in_sample(X.index)
    return X.join(fourier_terms)


def lagplot(ts, ts2=None, lag=1, ax=None):
    """Scatter `ts` against a lagged series and annotate the correlation.

    The lagged series is `ts2.shift(lag)` when `ts2` is given, otherwise
    `ts.shift(lag)`. `ts` is drawn on the x-axis and the lagged series on the
    y-axis; their correlation is shown in a box in the upper-left corner.

    Parameters
    ----------
    ts : Series plotted on the x-axis.
    ts2 : optional Series to lag instead of `ts`.
    lag : number of steps to shift by.
    ax : Axes to draw on; a new figure and Axes are created when None.

    Returns
    -------
    The Axes that was drawn on.
    """
    lag_source = ts if ts2 is None else ts2
    ts_lag = lag_source.shift(lag)
    corr = ts.corr(ts_lag)

    if ax is None:
        _, ax = plt.subplots()

    ax.plot(ts, ts_lag, ".")
    ax.set_aspect("equal")

    # Correlation annotation in the corner of the panel.
    corr_box = AnchoredText(
        f"{corr:.2f}",
        loc="upper left",
        frameon=True,
        prop={"size": 18},
    )
    corr_box.patch.set_boxstyle("square, pad=0.0")
    ax.add_artist(corr_box)

    ax.set(title=f"Lag {lag}", xlabel=ts.name, ylabel=ts_lag.name)
    return ax


def plot_ccf(x, y, lags=10, ax=None):
    """Plot the normalized cross-correlation of `x` and `y`.

    Uses `Axes.xcorr` with vertical lines for lags from `-lags` to `+lags`
    and places one tick at every integer lag.

    Parameters
    ----------
    x, y : the two series to cross-correlate (passed straight to `Axes.xcorr`).
    lags : maximum lag shown on either side of zero.
    ax : Axes to draw on; a new figure and Axes are created when None.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.xcorr(
        x, y, maxlags=lags, usevlines=True, normed=True, lw=2, color="C0",
    )
    # Use the default integer width for the ticks: an int8 tick array cannot
    # represent lags beyond 127.
    ax.set_xticks(np.arange(-lags, lags + 1, dtype=int))
    ax.set(
        xlabel="Lag",
        ylabel="Correlation",
        title="Cross-Correlation",
    )
    return ax


# Load 1C data
# The path is relative to the notebook's working directory.
data_dir = Path("../input/ts-course-data")
# Parse the "date" column as datetimes so it can be used as a time index below.
df_train = pd.read_csv(data_dir / "1c_train.csv", parse_dates=["date"])

# Aggregate item sales into a single time series
# One row per date: the sum of item_cnt_day and the mean of item_price.
# The date index is then converted to a daily PeriodIndex.
ts = df_train.pivot_table(
    index="date", values=["item_cnt_day", "item_price"],
    aggfunc={"item_cnt_day": "sum", "item_price": "mean"}
).to_period("D")
ts_cnt = ts["item_cnt_day"]  # daily total of item_cnt_day
ts_price = ts["item_price"]  # daily mean of item_price

-------------------------------------------------------------------------------

Identify whether the following features are deterministic or stochastic:
- the daily average temperature
- whether a day is a weekend or a weekday

# 1) Deterministic or Stochastic

After you've thought about your answer, run this cell for the solution.

In [None]:
# View the solution (Run this cell to receive credit!)
# `q_1` is not defined in this notebook; presumably it is supplied by the
# `from learntools.time_series.ex4 import *` line in the setup cell.
q_1.check()

-------------------------------------------------------------------------------

Run this cell to see the data with its linear trend and weekly seasonality removed.

In [None]:
# Design matrix on the same index as the sales series: an order-1 time trend
# plus weekly seasonal indicators (the defaults of add_trend / add_seasonal).
X = pd.DataFrame(index=ts_cnt.index)
X = add_trend(X)
X = add_seasonal(X)
# Raise any negative daily totals to zero before fitting.
y_cnt = ts_cnt.clip(0.0)
model = LinearRegression().fit(X, y_cnt)
fitted = pd.Series(model.predict(X), y_cnt.index, name="item_cnt_day")

# Residuals: what is left after subtracting the fitted trend and seasonality.
resid_cnt = y_cnt - fitted
resid_cnt.plot();

Here are the ACF and PACF plots.

In [None]:
# TODO: maybe auto-lag plots, too
# Two stacked panels sharing the x-axis: ACF of the residuals on top,
# PACF of the residuals below.
_, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(11, 7))
plot_acf(resid_cnt, ax=ax1)
plot_pacf(resid_cnt, ax=ax2);

# 2) Detecting Autocorrelation and the AR/MA Properties

Do the above plots give evidence of autocorrelation? What can you say about the AR/MA orders of the series?

After you've thought about your answer, run this cell for the solution.

In [None]:
# View the solution (Run this cell to receive credit!)
# `q_2` is not defined in this notebook; presumably it is supplied by the
# `from learntools.time_series.ex4 import *` line in the setup cell.
q_2.check()

Now try confirming this with the statsmodels `ar_select_order` function.

In [None]:
# Let statsmodels pick the AR lags, considering lags up to 7.
# - glob=True searches over combinations of lags rather than only the
#   consecutive sets 1..k.
# - trend="ct" and seasonal=True put a constant, a time trend and seasonal
#   terms in the model, so this runs on the raw `ts_cnt` rather than on the
#   residual series plotted above.
lags = ar_select_order(
    ts_cnt, maxlag=7, trend="ct", seasonal=True, glob=True, missing="drop"
).ar_lags
print("Detected AR Order: ", lags)

Do the answers agree?

-------------------------------------------------------------------------------

This shows the cross-correlation of the daily change in the number of items sold with the daily change in the average item price.

In [None]:
# TODO: maybe cross-lag plots, too
# Cross-correlate the day-over-day change in item count with the day-over-day
# change in mean item price, for lags -7..7. `.diff()` leaves a NaN in the
# first position, which `.dropna()` removes from both series.
plot_ccf(ts_cnt.diff().dropna(), ts_price.diff().dropna(), lags=7);

# 3) Detecting Cross-correlation

Does the above plot give evidence of cross-correlation? If so, which series appears to be leading?

In [None]:
# View the solution (Run this cell to receive credit!)
# `q_3` is not defined in this notebook; presumably it is supplied by the
# `from learntools.time_series.ex4 import *` line in the setup cell.
q_3.check()

-------------------------------------------------------------------------------

# 4) Create Lag Features

Based on the conclusions above, join the appropriate lag features to your feature set.

In [None]:
# YOUR CODE HERE
# (Build the lag features you identified in the questions above.)
#_UNCOMMENT_IF(PROD)_
#____

# Check your answer
q_4.check()

In [None]:
# Lines below will give you a hint or solution code
# (q_4.hint() for the hint, q_4.solution() for the solution code).
#_COMMENT_IF(PROD)_
q_4.hint()
#_COMMENT_IF(PROD)_
q_4.solution()

In [None]:
#%%RM_IF(PROD)%%
# Appears to be the reference answer for question 4, used to exercise the checker.
# lagmat yields one column per lag 1..6; positions 0, 2 and 5 keep lags 1, 3 and 6.
X_cnt_lag = lagmat(ts_cnt, maxlag=6, use_pandas=True, trim="both")
X_cnt_lag = X_cnt_lag.iloc[:, [0, 2, 5]]
# NOTE(review): with trim="both" the incomplete leading rows should already be
# gone, so this dropna looks like a safeguard rather than a needed step — confirm.
X_cnt_lag.dropna(inplace=True)

# A single lag (lag 1) of the daily mean item price.
X_price_lag = lagmat(ts_price, maxlag=1, use_pandas=True, trim="both")
X_price_lag.dropna(inplace=True)

# Presumably raises if the variables defined above do not pass the q_4 check.
q_4.assert_check_passed()

-------------------------------------------------------------------------------

# 5) Create MA Features

Now create the appropriate moving-average features.

In [None]:
# YOUR CODE HERE
# (Build the moving-average features described above.)
#_UNCOMMENT_IF(PROD)_
#____

# Check your answer
q_5.check()

In [None]:
# Lines below will give you a hint or solution code
# (q_5.hint() for the hint, q_5.solution() for the solution code).
#_COMMENT_IF(PROD)_
q_5.hint()
#_COMMENT_IF(PROD)_
q_5.solution()

In [None]:
#%%RM_IF(PROD)%%
# Appears to be the reference answer for question 5, used to exercise the checker.
# Exponentially weighted means (alpha=0.3) of each series. Each series is
# shifted by one step first, so a row's average uses only earlier observations.
X_ma = pd.DataFrame(index=ts_cnt.index)
X_ma["item_cnt_day_MA"] = ts_cnt.shift(1).ewm(alpha=0.3).mean()
X_ma["item_price_MA"] = ts_price.shift(1).ewm(alpha=0.3).mean()

# Presumably raises if the variables defined above do not pass the q_5 check.
q_5.assert_check_passed()

Run the cell below to see your model with all the features you've created so far: trend, seasonality, lag, and moving-average features.


In [None]:
# TODO: move this
def create_features_4(ts_cnt, ts_price):
    """Assemble the full feature matrix for the lag model.

    Combines, on the index of `ts_cnt`:
      * trend, weekly seasonal and Fourier (order 52) terms,
      * columns 0, 2 and 5 of the 6-lag matrix of `ts_cnt`,
      * the 1-lag matrix of `ts_price`,
      * exponentially weighted means (alpha=0.3) of both series, each shifted
        by one step beforehand.

    Rows containing any missing value are dropped from the result.

    Parameters
    ----------
    ts_cnt : Series of item counts; its index defines the feature index.
    ts_price : Series of item prices.

    Returns
    -------
    DataFrame of features with incomplete rows removed.
    """
    index = ts_cnt.index

    # Deterministic terms.
    deterministic = add_fourier(
        add_seasonal(add_trend(pd.DataFrame(index=index))),
        order=52,
    )

    # Lagged values of the two series.
    cnt_lags = lagmat(
        ts_cnt, maxlag=6, use_pandas=True, trim="both"
    ).iloc[:, [0, 2, 5]]
    price_lags = lagmat(ts_price, maxlag=1, use_pandas=True, trim="both")

    # Moving averages computed from past values only.
    moving_averages = pd.DataFrame(index=index)
    for column, series in (
        ("item_cnt_day_MA", ts_cnt),
        ("item_price_MA", ts_price),
    ):
        moving_averages[column] = series.shift(1).ewm(alpha=0.3).mean()

    features = deterministic
    for block in (cnt_lags, price_lags, moving_averages):
        features = features.join(block)
    return features.dropna()


X = create_features_4(ts_cnt, ts_price)
# Align the target with the rows that remain after create_features_4 drops
# incomplete rows.
y = ts_cnt.loc[X.index]
lag_model = LinearRegression()
lag_model.fit(X, y)

lag_fit = pd.Series(lag_model.predict(X).reshape(-1), index=y.index)
ax = ts_cnt.plot(
    color="0.25", style=".", title="Total Sales", ylabel="items sold"
)
# The curve is the in-sample fit of the full model (trend, seasonality, lags
# and moving averages), so the legend should not call it a linear trend.
lag_fit.plot(ax=ax, label="Fitted")
_ = plt.legend()

# Keep Going #