In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from copy import deepcopy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer

# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

# Number of leading characters of a submission `id` that form the store id
# (used below to split `id` into store id and date).
ID_SIZE = 20

In [2]:
import os

# The following cells read files through paths relative to the project root
# ("data/raw/...", "data/processed/..."). Detect the root by the presence of
# that directory instead of comparing against one machine's absolute path, so
# the notebook keeps working when the repository lives somewhere else.
# Re-running the cell is a no-op once the root has been reached.
if not os.path.isdir(os.path.join("data", "raw")):
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
# Store metadata, with the "air_" prefix stripped from the column names.
store_info = pd.read_csv("data/raw/air_store_info.csv").rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

# Calendar table: the calendar day is exposed as "date" and stored with the
# pandas string dtype.
date_info = (
    pd.read_csv("data/raw/date_info.csv")
    .rename(columns={"calendar_date": "date"})
    .astype({"date": "string"})
)

In [4]:
# Raw inputs: the submission template and the preprocessed training table.
submission = pd.read_csv("data/raw/sample_submission.csv")
data = pd.read_csv("data/processed/data.csv")
data["date"] = pd.to_datetime(data["date"])

# Work on a copy so that `submission` itself is left unmodified.
new_submission = submission.copy()
# Split `id`: the first ID_SIZE characters are the store id; the date starts
# one character further on, skipping the single separator character.
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

# Left joins keep every submission row (and its order) even if a lookup table
# has no matching date/store; an inner join would silently drop such rows.
new_submission = pd.merge(new_submission, date_info, on="date", how="left")
new_submission = pd.merge(new_submission, store_info, on="store_id", how="left")

# Parse the date only after the merge: the join key in `date_info` is a string.
new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

In [5]:
# Keep only the identifier, calendar and store features plus the target,
# in a fixed column order.
data = data.loc[
    :,
    [
        "store_id",
        "date",
        "day_of_week",
        "holiday_flg",
        "genre_name",
        "area_name",
        "latitude",
        "longitude",
        "year",
        "month",
        "day",
        "visitors",
    ],
]

# Target vector and feature matrix, both re-indexed from 0.
y = data["visitors"].reset_index(drop=True)
X = data.drop(columns=["visitors"]).reset_index(drop=True)

In [6]:
def to_category(data):
    """Return a copy of ``data`` with text-like and datetime columns as ``category``.

    A column is converted with ``astype("category")`` when its dtype is
    object, a string dtype, already categorical, or any datetime64 variant
    (any resolution, timezone-naive or timezone-aware). All other columns are
    left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    data = data.copy()
    for c in data.columns:
        col_type = data[c].dtype
        # Use the pandas dtype predicates rather than comparing against the
        # literal names "object" / "datetime64[ns]", which miss the dedicated
        # string dtype and datetimes with a timezone or another resolution.
        if (
            pd.api.types.is_object_dtype(col_type)
            or pd.api.types.is_string_dtype(col_type)
            or isinstance(col_type, pd.CategoricalDtype)
            or pd.api.types.is_datetime64_any_dtype(col_type)
        ):
            data[c] = data[c].astype("category")
    return data

In [7]:
from skforecast.plot import set_dark_theme
from skforecast.preprocessing import series_long_to_dict
from skforecast.preprocessing import exog_long_to_dict
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries

In [8]:
# Reshape the long table into a dict with one daily-frequency series of
# visitor counts per store id.
series_dict = series_long_to_dict(
    data=data,
    series_id="store_id",
    index="date",
    values="visitors",
    freq="D",
)

In [9]:
# Inspect one store's series as a sanity check on the reshaping above.
series_dict['air_00a91d42b08b08d9']

2016-07-01     24.0
2016-07-02     47.0
2016-07-03      NaN
2016-07-04      3.0
2016-07-05      7.0
              ...  
2017-04-18      4.0
2017-04-19     47.0
2017-04-20    110.0
2017-04-21     30.0
2017-04-22     18.0
Freq: D, Name: air_00a91d42b08b08d9, Length: 296, dtype: float64

In [None]:
# Gradient-boosted regressor trained with a squared-log-error objective and a
# fixed seed.
regressor = XGBRegressor(
    objective="reg:squaredlogerror",
    random_state=42,
    enable_categorical=True,
)

# Single forecaster fitted on all store series at once, using the previous
# 14 observations as lags and an ordinal encoding of the series identity.
forecaster = ForecasterAutoregMultiSeries(
    regressor=regressor,
    lags=14,
    encoding="ordinal",
    dropna_from_series=False,
)

forecaster.fit(series=series_dict, suppress_warnings=True)
forecaster

ForecasterAutoregMultiSeries 
Regressor: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, objective='reg:squaredlogerror', ...) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14] 
Transformer for series: None 
Transformer for exog: None 
Series encoding: ordinal 
Window size: 14 
Series levels (names): air_00a91d42b08b08d9, a

In [None]:
# Feature transformation followed by the gradient-boosted regressor.
pipeline = Pipeline(
    steps=[
        ("transformer", MyTransformer()),
        (
            "model",
            XGBRegressor(
                objective="reg:squaredlogerror",
                random_state=42,
                enable_categorical=True,
            ),
        ),
    ]
)


# Search space; the "model__" prefix routes each parameter to the pipeline's
# "model" step.
param_grid = {
    "model__n_estimators": [5, 10, 20, 30, 40, 50, 70],
    "model__learning_rate": [0.001, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
    "model__max_depth": np.arange(2, 20, 1),
}

# Size the validation window by the number of distinct dates in the submission.
# NOTE(review): presumably MySplitter interprets test_size as a number of days - confirm.
validation_size = new_submission["date"].nunique()
cv = MySplitter(test_size=validation_size)
rscv = RandomizedSearchCV(
    estimator=pipeline,
    cv=cv,
    param_distributions=param_grid,
    scoring="neg_root_mean_squared_log_error",
    # n_jobs=-1,
    verbose=10,
    # Only one candidate is sampled from the search space.
    n_iter=1,
    # Seed the candidate sampling so that re-running the notebook evaluates
    # the same parameter combination.
    random_state=42,
)

# Text-like and datetime columns are cast to category before fitting.
X = to_category(data).drop(columns=["visitors"]).reset_index(drop=True)
y = data["visitors"].reset_index(drop=True)

rscv.fit(X, y)