In [None]:
from typing import Optional, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
plt.rcParams["font.size"] = 14
plt.rcParams['figure.figsize'] = (16, 4.0)

from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the book-sales CSV and convert its `date` column to datetimes in one chain.
books = pd.read_csv('data/bol_books.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
books

In [None]:
# First look at the raw series: sales count `n` against `date`.
plt.plot(books['date'], books['n'])

#### 1. Plot two centered rolling means with two different windows

In [None]:
# TODO: CREATE A ROLLING WINDOW OF THE PREVIOUS SALES OF THE LAST 10 MONTHS, THIS CAN BE A NICE INITIAL PREDICTOR (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html)
# Roll over the sales column only. Rolling over the whole frame would also try to
# average the datetime `date` column (an error in recent pandas versions) and would
# pick up any extra numeric columns that get added to `books`.
rolling_window_sales = books['n'].rolling(10, center=False).mean()
# TODO: END CODEBLOCK

plt.plot(books.date, books.n, label='Actual sales')
plt.plot(books.date, rolling_window_sales, label='Rolling window sales')
plt.legend()
plt.show()

#### 2. Fit and plot a linear model. Does it adequately represent the trend?

In [None]:
def fit_and_plot_model(
    X: pd.DataFrame,
    y_true: pd.Series,
    dates: List,
    sklearn_model: BaseEstimator,
    pre_processor: Optional[ColumnTransformer] = None,
    X_train: Optional[pd.DataFrame] = None,
    y_train: Optional[pd.Series] = None,
    X_test: Optional[pd.DataFrame] = None,
    y_test: Optional[pd.Series] = None,
) -> None:
    """Fit a model, print its mean absolute error and plot predictions vs. actuals.

    Args:
        X: Features for the whole period; always used for the plotted prediction line.
        y_true: Actual target values for the whole period.
        dates: Dates matching the rows of ``X`` / ``y_true`` (x-axis of the plot).
        sklearn_model: Estimator used as the final ``'model'`` step of the pipeline.
        pre_processor: Optional transformer placed before the model as ``'preprocess'``.
        X_train, y_train, X_test, y_test: Optional explicit split. Only used when
            ALL four are given; otherwise the model is fitted and evaluated on the
            full ``X`` / ``y_true`` (i.e. the reported error is in-sample).

    Returns:
        None. The error is printed and the figure is shown.
    """
    pipeline_steps = [('model', sklearn_model)]
    # Test against None explicitly instead of relying on the estimator's truthiness.
    if pre_processor is not None:
        pipeline_steps.insert(0, ('preprocess', pre_processor))

    pipeline = Pipeline(pipeline_steps)

    split_is_complete = all(
        part is not None for part in (X_train, y_train, X_test, y_test)
    )
    if split_is_complete:
        # Mark the position right after the last training row on the time axis.
        plt.axvline(x=dates[len(X_train)], label='Start of the test set', c='red')
    else:
        # No (complete) split supplied: fit and score on everything.
        X_train, y_train = X, y_true
        X_test, y_test = X, y_true

    pipeline = pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    print(f"Mean absolute error is {round(mean_absolute_error(y_test, y_pred), 3)}")

    # The prediction line always covers the full period, train and test alike.
    plt.plot(dates, y_true, label='Actual sales')
    plt.plot(dates, pipeline.predict(X), label='Predicted sales')
    plt.legend()
    plt.show()

In [None]:
# TODO: DEFINE X AS A NUMERIC SEQUENCE THAT INCREASES 1 BY 1 STARTING FROM 0 | DEFINE Y AS THE TARGET (THE ACTUAL SALES NUMBERS)
# `period_num` is a simple 0, 1, 2, ... time index used as the trend feature.
books['period_num'] = np.arange(len(books))
X = books.loc[:, ['period_num']]
y = books['n'].to_numpy()
# TODO: END CODEBLOCK

fit_and_plot_model(
    X=X,
    y_true=y,
    dates=books.date,
    sklearn_model=LinearRegression(),
)

#### 3. Are there any noticeable break points? If so, add the respective dummy variable(s) and interaction terms to the linear model.

In [None]:
def get_features_with_breakpoint(df_books: pd.DataFrame, year: str) -> pd.DataFrame:
    """Build trend features with a structural break at ``year``.

    Args:
        df_books: Frame with a datetime ``date`` column and a numeric ``period_num`` column.
        year: Break point; rows whose ``date`` is earlier than this get the dummy set to True.

    Returns:
        A new frame with the columns ``period_num``, ``before_year_x`` (boolean dummy)
        and ``interaction`` (``period_num * before_year_x``). ``df_books`` itself is
        left unmodified.
    """
    # TODO: CREATE A NEW COLUMN WHICH IS TRUE IF BEFORE YEAR X (WHICH WILL BE YOUR BREAKING POINT) AND FALSE IF AFTER YEAR X
    # Use the frame that was passed in (not a notebook-level global) so the function
    # works for any input, and keep the dummy out of the caller's frame.
    before_year_x = df_books['date'] < year
    # TODO: END CODEBLOCK

    return (
        df_books[['period_num']]
        .assign(before_year_x=before_year_x)
        .assign(interaction=lambda df: df.period_num * df.before_year_x)
    )
# Put the structural break at the start of 2006 and refit the linear model on the new features.
X_with_break = get_features_with_breakpoint(books, '2006')

fit_and_plot_model(
    X=X_with_break,
    y_true=y,
    dates=books.date,
    sklearn_model=LinearRegression(),
)

#### 4. Add seasonal dummies to the model. How frequent should they be to capture seasonality well?

In [None]:
# TODO: CREATE A NEW COLUMN WHICH INDICATES SEASONALITY (THE YEAR / THE QUARTER / THE SEASON / THE MONTH), WHATEVER YOU LIKE
# `assign` returns a new frame, so X_with_break does not silently gain a `season` column.
X_season = X_with_break.assign(season=books.date.dt.month)
# TODO: END CODEBLOCK

feature_transformer = ColumnTransformer(
    [('numeric', 'passthrough', ['period_num', 'before_year_x', 'interaction']),
     ('categorical', OneHotEncoder(drop='first'), ['season'])],
    # Force a dense output at the ColumnTransformer level. The encoder's own
    # dense-output keyword was renamed from `sparse` to `sparse_output` between
    # scikit-learn releases, so this form works regardless of the installed version.
    sparse_threshold=0,
)

fit_and_plot_model(X=X_season, y_true=y, dates=books.date, sklearn_model=LinearRegression(), pre_processor=feature_transformer)

#### 5. Use a better model to improve the predictive power

In [None]:
# TODO: FIND A BETTER MODEL ON THE SCIKIT LEARN WEBSITE AND TRY IT :-) (https://scikit-learn.org/stable/supervised_learning.html, tip: it is a regression problem)
# Pin the seed so that re-running the notebook reproduces the same fit and error.
better_model = GradientBoostingRegressor(random_state=42)
# TODO: END CODEBLOCK

fit_and_plot_model(X=X_season, y_true=y, dates=books.date, sklearn_model=better_model, pre_processor=feature_transformer)

#### 6. What will happen if we predict for the next year? Will it still perform well? Can you visualize this?

In [None]:
def create_train_test_split(X_to_split, y_to_split, train_fraction: float = 0.75) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split features and target chronologically into a train and a test part.

    The first ``train_fraction`` of the rows (rounded down) becomes the training
    set and the remaining rows the test set. Row order is preserved and the two
    parts do not overlap.

    Args:
        X_to_split: Features, sliceable by row position (e.g. a DataFrame).
        y_to_split: Target values with the same length and order as ``X_to_split``.
        train_fraction: Share of rows used for training; must be strictly between 0 and 1.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction}")

    # TODO: MAKE SURE WE HAVE A TRAIN TEST SPLIT OF THE DATASETS X_SEASON AND Y
    train_split_idx = int(len(X_to_split) * train_fraction)
    # Train stops right before the split index and test starts at it, so no row
    # ends up in both sets.
    X_train, X_test = X_to_split[:train_split_idx], X_to_split[train_split_idx:]
    y_train, y_test = y_to_split[:train_split_idx], y_to_split[train_split_idx:]
    # TODO: END CODEBLOCK

    return X_train, X_test, y_train, y_test

# Hold out the last part of the series and evaluate the model on it.
X_train, X_test, y_train, y_test = create_train_test_split(X_season, y)

fit_and_plot_model(
    X=X_season,
    y_true=y,
    dates=books.date,
    sklearn_model=better_model,
    pre_processor=feature_transformer,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### 7. Can you think of other features to improve the model?

In [None]:
improved_model = better_model

# TODO: HERE YOU CAN DO ANYTHING TO MAKE THE MODEL BETTER. ADD FEATURES, CHANGE THE MODEL TO SOMETHING FANCY, WHATEVER YOU LIKE :)
# Lagged sales as extra features. Rows without enough history get the sentinel -1.
# The series is indexed like X_season so the lags line up row by row, and `assign`
# builds a new frame instead of adding the columns to X_season itself.
sales_history = pd.Series(y, index=X_season.index)
X_improved = X_season.assign(**{
    'sales 1 month ago': sales_history.shift(1).fillna(-1),
    'sales 6 months ago': sales_history.shift(6).fillna(-1),
    'sales 12 months ago': sales_history.shift(12).fillna(-1),
})

improved_feature_transformer = ColumnTransformer(
    [('numeric', 'passthrough', ['period_num', 'before_year_x', 'interaction', 'sales 1 month ago', 'sales 6 months ago', 'sales 12 months ago']),
     ('categorical', OneHotEncoder(drop='first'), ['season'])],
    # Force a dense output at the ColumnTransformer level. The encoder's own
    # dense-output keyword was renamed from `sparse` to `sparse_output` between
    # scikit-learn releases, so this form works regardless of the installed version.
    sparse_threshold=0,
)
# TODO: END CODEBLOCK

X_train, X_test, y_train, y_test = create_train_test_split(X_improved, y)

# Plot with the frame that actually contains the lag columns the transformer selects.
fit_and_plot_model(X=X_improved, y_true=y, dates=books.date, sklearn_model=improved_model, pre_processor=improved_feature_transformer, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)