# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex1 import *

# Setup notebook
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

# Load data
data_dir = Path('../input/ts-course-data/')
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Hardcover book sales indexed by date, plus the two features used in the
# first exercises: a time dummy (0, 1, 2, ...) and a one-step lag of the
# target (missing for the first row, which has no predecessor).
book_sales = pd.read_csv(
    data_dir / 'book_sales.csv',
    index_col='Date',
    parse_dates=['Date'],
).drop('Paperback', axis=1)
book_sales['Time'] = np.arange(len(book_sales.index))
book_sales['Lag_1'] = book_sales['Hardcover'].shift(1)
book_sales = book_sales.reindex(columns=['Hardcover', 'Time', 'Lag_1'])

# Two series (columns 'ar1' and 'ar2') plotted in question 2.
ar = pd.read_csv(data_dir / 'ar.csv')

# Explicit dtypes for the competition training data.
dtype = {
    'store_nbr': 'category',
    'family': 'category',
    'sales': 'float32',
    'onpromotion': 'uint64',
}

# `infer_datetime_format` is deliberately not passed: pandas 2.0 deprecated
# it (it has no effect there and only emits a warning).
sales = pd.read_csv(
    comp_dir / 'train.csv',
    dtype=dtype,
    parse_dates=['date'],
)
# Index by daily period, then by store and product family.
sales = sales.set_index('date').to_period('D')
sales = sales.set_index(['store_nbr', 'family'], append=True)
# NOTE: despite its name, `total_sales` is the per-date *mean* of `sales`
# over all (store_nbr, family) rows, not a sum. Selecting the column before
# aggregating avoids averaging the other columns only to discard them.
total_sales = sales.groupby('date')['sales'].mean()

--------------------------------------------------------------------------------

# 1) Interpreting linear regression with the time dummy

In [None]:
# Time plot of hardcover sales (grey line) with the observations and the
# fitted least-squares line drawn on top.
fig, ax = plt.subplots()
ax.plot('Time', 'Hardcover', data=book_sales, color='0.75')
# Pass `ax` explicitly so the regression is drawn on this figure's axes
# rather than on whatever pyplot currently considers the "current" axes.
sns.regplot(
    x='Time',
    y='Hardcover',
    data=book_sales,
    ci=None,
    scatter_kws=dict(color='0.25'),
    ax=ax,
)
ax.set_title('Time Plot of Hardcover Sales');

The linear regression line has an equation of (approximately) `Hardcover = 3.33 * Time + 150.5`. Over 3 time steps, how much on average would you expect hardcover sales to change?

In [None]:
# Run this cell for a hint on question 1 (the call below is already uncommented)
q_1.hint()

In [None]:
# View the solution to question 1 (Run this cell to receive credit!)
q_1.check()

-------------------------------------------------------------------------------


# 2) Interpreting linear regression with a lag feature

Run the following cell.

In [None]:
# Plot the two series from `ar` in stacked panels that share an x-axis.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 5), sharex=True)
# `Axes.plot` returns a list of Line2D objects, so its result is not
# assigned back to `ax1`/`ax2`; those names keep referring to the axes.
ax1.plot(ar['ar1'])
ax2.plot(ar['ar2'])
# Label each panel with its column name so the two series can be told
# apart when answering the question below.
ax1.set_title('ar1')
ax2.set_title('ar2');

One of these series has the equation `target = 0.95 * lag_1 + error` and the other has the equation `target = -0.95 * lag_1 + error`, differing only by the sign on the lag feature. Can you tell which equation goes with each series?

In [None]:
# Run this cell for a hint on question 2 (the call below is already uncommented)
q_2.hint()

In [None]:
# View the solution to question 2 (Run this cell to receive credit!)
q_2.check()

-------------------------------------------------------------------------------

# 3) Fit a time feature to Store Sales



In [None]:
# YOUR CODE HERE
# (the blank below is presumably a placeholder to be replaced with your
# answer before running the check)
____

# Check your answer to question 3
q_3.check()

In [None]:
# Lines below will give you a hint or solution code for question 3
q_3.hint()
q_3.solution()

In [None]:
from sklearn.linear_model import LinearRegression

# One-column frame of daily sales, extended with a time dummy that numbers
# the rows 0, 1, 2, ... in index order.
df = total_sales.to_frame().assign(
    time=lambda frame: np.arange(len(frame.index)),
)

# Feature matrix (a single-column DataFrame) and target (a Series).
X = df[['time']]
y = df['sales']

# Fit a linear regression of sales on the time dummy.
model = LinearRegression().fit(X, y)

# Fitted values, carried on the training index so they line up with `y`
# when both are plotted.
y_pred = pd.Series(model.predict(X), index=X.index)

In [None]:
# Observed series with the fitted trend line on top. The axes are created
# explicitly (as in the other plotting cells) so that nothing is drawn onto
# a figure left open by earlier code.
fig, ax = plt.subplots()
y.plot(ax=ax, **plot_params)
y_pred.plot(ax=ax, linewidth=3)
ax.set_title('Time Plot of Total Store Sales');

-------------------------------------------------------------------------------

# 4) Fit a lag feature to Store Sales


In [None]:
# YOUR CODE HERE
# (the blank below is presumably a placeholder to be replaced with your
# answer before running the check)
____

# Check your answer to question 4
q_4.check()

In [None]:
# Lines below will give you a hint or solution code for question 4
q_4.hint()
q_4.solution()

In [None]:
# Pair each observation with the value one step earlier.
df = total_sales.to_frame().assign(
    lag_1=lambda frame: frame['sales'].shift(1),
)

# Keep only rows where the lag is present (the first row never has one),
# then trim the target to exactly the same index.
X = df[['lag_1']].dropna()
y = df['sales']
y, X = y.align(X, join='inner')

# Fit a linear regression of sales on its one-step lag.
model = LinearRegression().fit(X, y)

# Fitted values on the same index as the training rows.
y_pred = pd.Series(model.predict(X), index=X.index)

In [None]:
# Lag plot: each observation against its one-step lag, with the fitted
# regression line over the points.
fig, ax = plt.subplots()
lag_values = X['lag_1']
ax.plot(lag_values, y, '.', color='0.25')
ax.plot(lag_values, y_pred)
# Equal aspect so one unit of lag_1 spans the same length as one unit of sales.
ax.set(
    aspect='equal',
    xlabel='lag_1',
    ylabel='sales',
    title='Lag Plot of Total Store Sales',
);

# Keep Going #