# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex6 import *

# Setup notebook
import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend)
from xgboost import XGBRegressor


# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn style sheets with a "seaborn-v0_8-"
# prefix and newer releases no longer accept the old names. Prefer the new name
# when the installed Matplotlib offers it, otherwise fall back to the legacy one.
whitegrid_style = "seaborn-v0_8-whitegrid"
if whitegrid_style not in plt.style.available:
    whitegrid_style = "seaborn-whitegrid"
plt.style.use(whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)


def add_trend(X, freq="D", order=1):
    """Return ``X`` with deterministic time-trend columns joined on.

    The columns come from a ``CalendarTimeTrend`` built with the given ``freq``
    and ``order``, no constant term, and the first entry of ``X.index`` as its
    base period. ``X`` itself is not modified; the joined frame is returned.
    """
    trend_terms = CalendarTimeTrend(
        freq=freq,
        order=order,
        constant=False,
        base_period=X.index[0],
    ).in_sample(X.index)
    return X.join(trend_terms)


def add_seasonal(X, freq="D", period="W"):
    """Return ``X`` with calendar-seasonality columns joined on.

    Parameters
    ----------
    X : DataFrame
        Frame whose index supplies the dates the seasonal terms are generated for.
    freq : str, default "D"
        Passed to ``CalendarSeasonality`` as ``freq``.
    period : str, default "W"
        Passed to ``CalendarSeasonality`` as ``period``.

    Returns
    -------
    DataFrame
        A new frame: ``X`` joined with ``seasonality.in_sample(X.index)``.
    """
    # Forward the caller's arguments; with the defaults this is the daily/weekly
    # seasonality the notebook uses.
    seasonality = CalendarSeasonality(freq=freq, period=period)
    return X.join(seasonality.in_sample(X.index))


def add_fourier(X, freq="A", order=52):
    """Return ``X`` with calendar Fourier columns joined on.

    The columns are produced by ``CalendarFourier(freq=freq, order=order)``
    evaluated in-sample over ``X.index``. ``X`` itself is not modified.
    """
    fourier_terms = CalendarFourier(freq=freq, order=order).in_sample(X.index)
    return X.join(fourier_terms)


def add_lags(X, y, maxlag=1):
    """Join lagged copies of ``y`` onto ``X``.

    Parameters
    ----------
    X : DataFrame
        Feature frame to extend.
    y : Series or DataFrame
        Series whose lags are generated with ``lagmat``.
    maxlag : int, default 1
        Passed to ``lagmat`` as ``maxlag``.

    Returns
    -------
    tuple
        ``(X_with_lags, y)``; ``y`` is returned unchanged.
    """
    # Imported locally because the notebook's setup cell does not import `lagmat`.
    from statsmodels.tsa.tsatools import lagmat

    # NOTE(review): with trim="both" the lag frame presumably omits the leading
    # rows, so the join would leave NaN there; confirm against the lagmat docs.
    lags = lagmat(y, maxlag=maxlag, use_pandas=True, trim="both")
    return X.join(lags), y


# Read the 1C sales records and the item lookup table
data_dir = Path("../input/ts-course-data")
df_items = pd.read_csv(data_dir / "1c_items.csv", index_col="item_id")
df_train = pd.read_csv(data_dir / "1c_train.csv", parse_dates=["date"])

# Reshape to wide form: one row per date, one column per item, counts summed,
# with the date index converted to daily periods
ts_cnt = df_train.pivot_table(
    values="item_cnt_day", index="date", columns="item_id", aggfunc="sum"
).to_period("D")

# Hold out the final 90 rows for validation; everything before them is training data
ts_valid = ts_cnt.iloc[-90:]
ts_train = ts_cnt.iloc[:-90]

-------------------------------------------------------------------------------


In [None]:
def make_local_features(ts):
    """Build the feature frame used by the per-item (local) models.

    Only ``ts.index`` is used: an empty frame on that index is passed through
    ``add_trend`` and then ``add_seasonal``, both with their default arguments.
    """
    blank = pd.DataFrame(index=ts.index)
    return add_seasonal(add_trend(blank))


# Features over the full date range of `ts_cnt` (training and validation rows alike)
X = make_local_features(ts_cnt)

# 1) Make local forecasts

Create a local forecast for each item by fitting a separate `LinearRegression` model on that item's trend and seasonal features.

In [None]:
forecast = {}
for item in ts_train.columns:
    # Observed values over the whole span and over the training window only;
    # missing entries are dropped and negative values are floored at zero.
    y = ts_cnt[item].dropna().clip(lower=0.0)
    y_train = ts_train[item].dropna().clip(lower=0.0)
    X_train = X.loc[y_train.index]
    # Fit on the training rows, then predict for every date the item was observed.
    model = LinearRegression().fit(X_train, y_train)
    forecast[item] = pd.Series(model.predict(X.loc[y.index]), index=y.index)

# One column of local forecasts per item, then the train / validation slices
ts_local = pd.DataFrame(forecast)
local_train = ts_local.loc[ts_train.index]
local_valid = ts_local.loc[ts_valid.index]

Run this cell to see the error using only the local models.

In [None]:
# Root-mean-squared error over the validation window; NaN cells are left out by nanmean
squared_error = (ts_valid - local_valid).pow(2)
rmse = np.sqrt(np.nanmean(squared_error))
print(f"RMSE: {rmse}")

-------------------------------------------------------------------------------

# 2) Detrend with local forecasts


In [None]:
# Subtract each item's local forecast from its observed series, leaving the residuals
ts_detrended = ts_cnt.sub(ts_local)

Run this cell to see the result.

In [None]:
# Draw the detrended series for four randomly sampled items, one figure each
items = ts_train.sample(n=4, axis="columns")
for item in items.columns:
    fig, ax = plt.subplots(figsize=(11, 3))
    # The item's row in `df_items` unpacks into the two values shown in the title
    s, y = df_items.loc[item]
    title = f"{s} {y} (Detrended)"
    ts_detrended[item].plot(title=title, color="C0", ax=ax)
    plt.show()

-------------------------------------------------------------------------------

# 3) Define global features


In [None]:
def get_data(ts, features):
    """Assemble the feature matrix for the global model.

    Calendar columns (day of week, day of year, month) and order-52 Fourier
    terms are generated on ``ts.index``, converted to timestamps, and joined
    onto ``features`` through its ``"date"`` column. ``"date"`` then becomes
    the index, and every object/category column is replaced by integer codes.

    Parameters
    ----------
    ts : DataFrame
        Only its index is used; it must support ``dayofweek``, ``dayofyear``,
        ``month`` and ``to_timestamp``.
    features : DataFrame
        Row-level features; must contain a ``"date"`` column.

    Returns
    -------
    DataFrame
        ``features`` joined with the calendar terms, indexed by ``"date"``.
    """
    dates = ts.index
    calendar = pd.DataFrame(index=dates).assign(
        DayOfWeek=dates.dayofweek,
        DayOfYear=dates.dayofyear,
        Month=dates.month,
    )
    calendar = add_fourier(calendar, order=52)

    X = features.join(calendar.to_timestamp(), on="date").set_index("date")

    # Integer-encode the non-numeric columns; only the codes are kept.
    for colname in X.select_dtypes(["object", "category"]):
        X[colname] = X[colname].factorize()[0]
    return X


# No `df_valid` frame is created anywhere in this notebook, so both feature sets are
# cut from `df_train` by date: rows on or after the first validation period go to the
# validation set and all earlier rows to the training set, matching ts_train / ts_valid.
# NOTE(review): assumes `df_train` has "shop" and "item" columns; elsewhere only
# "date", "item_id" and "item_cnt_day" are used. Confirm the column names.
valid_start = ts_valid.index[0].to_timestamp()
in_valid = df_train["date"] >= valid_start
feature_cols = ["date", "shop", "item"]
features_train = df_train.loc[~in_valid, feature_cols]
features_valid = df_train.loc[in_valid, feature_cols]
X_train = get_data(ts_train, features_train)
X_valid = get_data(ts_valid, features_valid)

# Long-format target: one detrended value per (date, item_id) pair, NaN rows dropped.
# NOTE(review): `ts_detrended` spans every date in `ts_cnt`, and has one row per
# date/item rather than per raw record; check that it lines up with `X_train`.
y_train = ts_detrended.melt(ignore_index=False).dropna()
y_train = y_train.reset_index().set_index(["date", "item_id"])
y_train = y_train.value
y_train.name = "item_cnt_day"

-------------------------------------------------------------------------------

# 4) Forecast with global model


In [None]:


# Fit the global model on the detrended target and predict on the same rows
model = XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

# Reshape the flat predictions back to wide form: dates as rows, items as columns
y_pred = pd.DataFrame({"item_cnt_day": y_pred}, index=y_train.index)
y_pred = y_pred.reset_index().pivot_table(
    index="date", columns="item_id", values="item_cnt_day"
)

# Undo the detrending. The target was built as `ts_cnt - ts_local`, so `ts_local`
# is the frame to add back (no `ts_trend` exists in this notebook).
y_pred = y_pred + ts_local
# Floor the combined forecast at zero
y_pred = y_pred.clip(0.0)

Run this to see the result.


In [None]:
# Overlay the combined forecast on the training observations for four random items
items = ts_train.sample(n=4, axis="columns")
for item in items.columns:
    fig, ax = plt.subplots(figsize=(11, 3))
    # The item's row in `df_items` unpacks into the two values shown in the title
    s, y = df_items.loc[item]
    title = f"{s} {y}"
    # Observations as grey dots, forecast as a line on the same axes
    ax = ts_train[item].plot(ax=ax, style=".", color="0.25")
    y_pred[item].plot(title=title, color="C0", ax=ax)
    plt.show()

Run this cell to see the final error.

In [None]:
# RMSE between the observed counts and the combined (local + global) forecast.
# The subtraction aligns the two frames on date and item; cells missing from either
# side become NaN and `np.nanmean` leaves them out of the average.
# NOTE(review): this scores every date present in both frames; use `ts_valid` in
# place of `ts_cnt` if only the hold-out window should be scored.
rmse = np.sqrt(np.nanmean((ts_cnt - y_pred) ** 2))
print(f"RMSE: {rmse}")

# Keep Going #