# Introduction #

Run this cell to set everything up!

In [None]:
# Set up feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex6 import *

# Set up notebook
import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend)
from xgboost import XGBRegressor


# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use the new name when it is
# available and fall back to the legacy name on older installations.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)


def add_trend(X, freq="D", order=1):
    """Return ``X`` joined with deterministic time-trend columns.

    The trend terms come from a ``CalendarTimeTrend`` with the given ``freq``
    and ``order`` and no constant column, using the first entry of
    ``X.index`` as the base period.
    """
    trend_terms = CalendarTimeTrend(
        freq=freq,
        order=order,
        constant=False,
        base_period=X.index[0],
    ).in_sample(X.index)
    return X.join(trend_terms)


def add_seasonal(X, freq="D", period="W"):
    """Return ``X`` joined with calendar-seasonality indicator columns.

    Parameters
    ----------
    X : DataFrame
        Feature frame; its index is passed to ``CalendarSeasonality.in_sample``.
    freq : str, default "D"
        Frequency passed through to ``CalendarSeasonality``.
    period : str, default "W"
        Seasonal period passed through to ``CalendarSeasonality``.
    """
    # Forward the caller's arguments; the defaults reproduce the daily/weekly
    # setup, but other combinations are now honoured instead of being ignored.
    seasonality = CalendarSeasonality(freq=freq, period=period)
    return X.join(seasonality.in_sample(X.index))


def add_fourier(X, freq="A", order=52):
    """Return ``X`` with ``CalendarFourier`` terms concatenated column-wise.

    ``freq`` and ``order`` are passed straight to ``CalendarFourier``; the
    terms are generated in-sample for ``X.index``.
    """
    fourier_terms = CalendarFourier(freq=freq, order=order).in_sample(X.index)
    return pd.concat([X, fourier_terms], axis=1)


def add_lags(X, y, maxlag=1):
    """Join lagged copies of ``y`` onto ``X``.

    Parameters
    ----------
    X : DataFrame
        Feature frame to extend.
    y : Series
        Target series whose lags (1 through ``maxlag``) are added as columns.
    maxlag : int, default 1
        Largest lag to create.

    Returns
    -------
    (DataFrame, Series)
        ``X`` with the lag columns joined on, and ``y`` unchanged.
    """
    # `lagmat` is not among the names brought in by the visible setup
    # imports, so import it here to keep the helper self-contained.
    from statsmodels.tsa.tsatools import lagmat

    lags = lagmat(y, maxlag=maxlag, use_pandas=True, trim="both")
    # NOTE(review): `y` is returned untrimmed, so rows without lag values are
    # presumably left as NaN in X by the join — confirm callers handle this.
    return X.join(lags), y


# Read the 1C daily sales records and the item lookup table
data_dir = Path("../input/ts-course-data")
df = pd.read_csv(data_dir / "1c_train.csv", parse_dates=["date"])
df["date"] = df["date"].dt.to_period("D")
df_items = pd.read_csv(data_dir / "1c_items.csv", index_col="item_id")

# Wide table of summed daily counts: one row per date, one column per item
ts_cnt = pd.pivot_table(
    df, index="date", columns="item_id", values="item_cnt_day", aggfunc="sum"
)

# Hold out the final 90 dates for validation; everything earlier is training
ts_valid = ts_cnt.iloc[-90:]
ts_train = ts_cnt.iloc[:-90]
df_train = df.loc[df["date"] < ts_valid.index[0]]
df_valid = df.loc[df["date"] >= ts_valid.index[0]]

-------------------------------------------------------------------------------

# 0) Forecast with Only Global Model


In [None]:
# YOUR CODE HERE
#_UNCOMMENT_IF(PROD)_
#____

# Check your answer (question 0)
q_0.check()

In [None]:
# The lines below will give you a hint or the solution code
#_COMMENT_IF(PROD)_
q_0.hint()
#_COMMENT_IF(PROD)_
q_0.solution()

In [None]:
#%%RM_IF(PROD)%%
def get_time_features(X):
    """Add calendar columns derived from ``X.index`` and return ``X``.

    The columns ``Year``, ``Month``, ``DayOfMonth``, ``DayOfWeek`` and
    ``WeekOfYear`` are written onto ``X`` in place, in that order.
    """
    # Output column name -> index attribute that supplies its values
    calendar_attrs = {
        "Year": "year",
        "Month": "month",
        "DayOfMonth": "day",
        "DayOfWeek": "dayofweek",
        "WeekOfYear": "weekofyear",
    }
    index = X.index
    for column, attr in calendar_attrs.items():
        X[column] = getattr(index, attr)
    return X


def get_data_splits(df):
    """Build feature/target frames from ``df`` and split off the last 90 dates.

    Returns ``(X_train, y_train, X_valid, y_valid)``. The target is the
    ``item_cnt_day`` column clipped below at 0.0; the features are calendar
    columns from ``get_time_features`` plus the remaining columns of ``df``,
    with object/category columns replaced by integer codes.
    """
    df = df.sort_values(by="date")
    dates = df.date.unique()
    # One row per unique date; assumes `df.date` already holds periods so
    # that a PeriodIndex can be built — TODO confirm against the caller.
    X = pd.DataFrame(index=pd.PeriodIndex(dates, name="date"))
    X = get_time_features(X)
    # Re-index the records by (item_id, date) and attach them to the
    # per-date calendar features.
    ts = df.set_index(["item_id", "date"]).sort_index(level=0)
    X = X.join(ts)
    y = X.pop("item_cnt_day")
    # Floor negative daily counts at zero.
    y = y.clip(0.0)
    # Replace object/category columns with integer codes from `factorize`.
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # The 90th-from-last unique date is the first validation date.
    # NOTE(review): the boolean masks are computed from `df`'s row order but
    # applied to `X.index` after the join; this presumably relies on the
    # joined frame having the same length and ordering as the sorted `df` —
    # confirm.
    idx_train = X.index[df.date < dates[-90]]
    idx_valid = X.index[df.date >= dates[-90]]
    return (
        X.loc[idx_train],
        y.loc[idx_train],
        X.loc[idx_valid],
        y.loc[idx_valid],
    )


X_train, y_train, X_valid, y_valid = get_data_splits(df)

# `eval_set` is an argument of `fit`, not of the estimator constructor, and
# must be a list of (X, y) pairs. `verbose=False` keeps the cell output quiet
# now that per-round evaluation actually runs.
model = XGBRegressor(eval_metric="rmse")
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)


q_0.assert_check_passed()

Evaluate the global model's predictions on the training and validation sets.

In [None]:
# Predict on both splits; negative predictions are floored at zero
fit_values = model.predict(X_train)
fore_values = model.predict(X_valid)
y_pred = pd.Series(fit_values, index=y_train.index).clip(lower=0.0)
y_fore = pd.Series(fore_values, index=y_valid.index).clip(lower=0.0)

# Root-mean-squared error on each split
rmse_train = np.sqrt(y_train.sub(y_pred).pow(2).mean())
rmse_valid = np.sqrt(y_valid.sub(y_fore).pow(2).mean())
print(rmse_train, rmse_valid)

In [None]:
# Plot actual in-sample sales (dots) against the fitted values for the first
# 20 item ids. `y_pred` is indexed like `y_train`, so the observations come
# from `y_train`; no plain `y` has been defined at this point in the notebook.
# NOTE(review): assumes the first index level of `y_train` is the item id —
# confirm against `get_data_splits`.
for item in df.item_id.unique()[:20]:
    ax = y_train[item].plot(style=".", color="0.25")
    _ = y_pred[item].plot(ax=ax)
    plt.show()

-------------------------------------------------------------------------------

In [None]:
def make_local_features(ts):
    """Build the per-date feature frame used by the local models.

    Starts from an empty frame on ``ts.index`` and adds the default trend
    columns followed by the default seasonal columns.
    """
    features = pd.DataFrame(index=ts.index)
    return add_seasonal(add_trend(features))


X = make_local_features(ts_cnt)

# 1) Make local forecasts

Create local forecasts by fitting a separate `LinearRegression` model to each item's series.

In [None]:
forecast = {}
for item in ts_train.columns:
    # Full and training-only history for this item: missing days dropped,
    # negative counts floored at zero
    y = ts_cnt[item].dropna().clip(lower=0.0)
    y_train = ts_train[item].dropna().clip(lower=0.0)
    X_train = X.loc[y_train.index]
    # Fit on the training dates only, then predict over the item's full history
    model = LinearRegression().fit(X_train, y_train)
    forecast[item] = pd.Series(model.predict(X.loc[y.index]), index=y.index)

# One column of local predictions per item, split on the same dates as the data
ts_local = pd.DataFrame(forecast)
local_train = ts_local.loc[ts_train.index]
local_valid = ts_local.loc[ts_valid.index]

Run this cell to see the error using only the local models.

In [None]:
# Squared validation errors of the local models; NaN cells are ignored
squared_error = (ts_valid - local_valid).pow(2)
rmse = np.sqrt(np.nanmean(squared_error.to_numpy()))
print(f"RMSE: {rmse}")

Now run this cell to detrend the series by subtracting the local forecasts.

In [None]:
# Subtract each item's local-model prediction from its sales series
ts_detrended = ts_cnt.sub(ts_local)

Run this cell to see the result.

In [None]:
# Plot the detrended series for four randomly chosen items
items = ts_train.sample(4, axis=1)
for item in items.columns:
    fig, ax = plt.subplots(figsize=(11, 3))
    shop_name, item_name = df_items.loc[item, ["shop", "item"]]
    ts_detrended[item].plot(
        ax=ax,
        color="C0",
        title=f"{shop_name} {item_name} (Detrended)",
    )
    plt.show()

-------------------------------------------------------------------------------

# 3) Define Global Features


In [None]:
def get_data(X):
    """Add calendar columns to ``X`` and integer-encode its categorical columns.

    ``Year``, ``Month``, ``DayOfMonth``, ``DayOfWeek`` and ``WeekOfYear`` are
    read from ``X.index``; every object/category column is then replaced by
    its ``factorize`` codes. ``X`` is modified in place and returned.
    """
    # Output column name -> index attribute that supplies its values
    calendar_attrs = {
        "Year": "year",
        "Month": "month",
        "DayOfMonth": "day",
        "DayOfWeek": "dayofweek",
        "WeekOfYear": "weekofyear",
    }
    index = X.index
    for column, attr in calendar_attrs.items():
        X[column] = getattr(index, attr)

    for colname in X.select_dtypes(["object", "category"]).columns:
        codes, _ = X[colname].factorize()
        X[colname] = codes
    return X


# Feature frames: calendar + encoded id columns, indexed by (date, item_id)
feature_cols = ["date", "item_id", "shop", "item"]
X_train = (
    get_data(df_train[feature_cols].set_index("date"))
    .set_index("item_id", append=True)
    .sort_index()
)
X_valid = (
    get_data(df_valid[feature_cols].set_index("date"))
    .set_index("item_id", append=True)
    .sort_index()
)

# Target: the detrended series in long form, indexed by (date, item_id)
y = (
    ts_detrended
    .melt(ignore_index=False)
    .dropna()
    .reset_index()
    .set_index(["date", "item_id"])["value"]
    .rename("item_cnt_day")
)

# Align the target with the rows of each feature frame
y_train = y.loc[X_train.index].sort_index()
y_valid = y.loc[X_valid.index].sort_index()

# Add an exponentially weighted mean of the previous days' sales to each split.
# Each split's average is computed from that split's own wide sales table.
for features, sales in ((X_train, ts_train), (X_valid, ts_valid)):
    # Shift by one day so a row only sees sales from earlier dates
    smoothed = sales.shift(1).ewm(halflife=7).mean()
    # Long form indexed by (date, item_id), matching the feature frame
    long_form = smoothed.melt(ignore_index=False).set_index("item_id", append=True)
    features["Sales MA"] = long_form.fillna(0.0)


-------------------------------------------------------------------------------

# 4) Forecast with global model


In [None]:
model = XGBRegressor()
model.fit(X_train, y_train)

# In-sample: predict the detrended target, reshape long -> wide, add the
# local fit back, and floor at zero
resid_fit = pd.DataFrame(
    {"item_cnt_day": model.predict(X_train)}, index=y_train.index
)
resid_fit_wide = resid_fit.reset_index().pivot_table(
    index="date", columns="item_id", values="item_cnt_day"
)
y_pred = (resid_fit_wide + local_train).clip(0.0)

# Out-of-sample: same steps on the validation rows
resid_fore = pd.DataFrame(
    {"item_cnt_day": model.predict(X_valid)}, index=y_valid.index
)
resid_fore_wide = resid_fore.reset_index(level=1).pivot(
    columns="item_id", values="item_cnt_day"
)
forecast_wide = (resid_fore_wide + local_valid).clip(0.0)

# Back to a long Series indexed by (date, item_id), without empty cells
forecast_long = forecast_wide.melt(ignore_index=False)
forecast_long = forecast_long.rename(
    columns={"variable": "item_id", "value": "item_cnt_day"}
)
forecast_long = forecast_long.set_index("item_id", append=True)
forecast_long = forecast_long.dropna().sort_index(level=0)
y_forecast = forecast_long["item_cnt_day"]

Run this cell to see the in-sample and out-of-sample errors.

In [None]:
# In-sample error: actual training sales vs. the combined forecast
rmse_is = np.sqrt(np.nanmean((ts_train - y_pred) ** 2))
print(f"In-Sample RMSE: {rmse_is}")

# Out-of-sample error must also be measured against actual sales. `y_valid`
# holds the detrended target, whereas `y_forecast` already has the local
# forecast added back, so compare with `ts_valid` in long (date, item_id) form.
sales_valid = (
    ts_valid
    .melt(ignore_index=False, value_name="item_cnt_day")
    .set_index("item_id", append=True)["item_cnt_day"]
)
rmse_oos = np.sqrt(np.nanmean((sales_valid - y_forecast) ** 2))
print(f"Out-of-Sample RMSE: {rmse_oos}")

Run this to see some of the forecast series.


In [None]:
# Plot actual training sales (dots) and the combined forecast for four
# randomly chosen items
items = ts_train.sample(4, axis=1)
for item in items:
    fig, ax = plt.subplots(figsize=(11, 3))
    # Select the two label columns explicitly so the unpacking does not
    # depend on how many columns `df_items` has
    shop_name, item_name = df_items.loc[item, ["shop", "item"]]
    title = f"{shop_name} {item_name}"
    # Draw both series on the axes created above rather than relying on
    # the implicit current axes
    ts_train[item].plot(ax=ax, color="0.25", style=".")
    y_pred[item].plot(ax=ax, color="C0", title=title)
    plt.show()

# Next Steps #

# References #

Here are some great resources you might like to consult for more on time series and forecasting. They all played a part in shaping this course:

- *Learnings from Kaggle's forecasting competitions*, an article by Casper Solheim Bojer and Jens Peder Meldgaard.
- *Forecasting: Principles and Practice*, a book by Rob J Hyndman and George Athanasopoulos.
- *Practical Time Series Forecasting with R*, a book by Galit Shmueli and Kenneth C. Lichtendahl Jr.
- *Time Series Analysis and Its Applications*, a book by Robert H. Shumway and David S. Stoffer.
- *Machine learning strategies for time series forecasting*, an article by Gianluca Bontempi, Souhaib Ben Taieb, and Yann-Aël Le Borgne.
- *On the use of cross-validation for time series predictor evaluation*, an article by Christoph Bergmeir and José M. Benítez.
