In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from copy import deepcopy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer

tqdm.pandas()

ID_SIZE = 20

In [2]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

In [4]:
submission = pd.read_csv("data/raw/sample_submission.csv")
data = pd.read_csv("data/processed/data.csv")
data["date"] = pd.to_datetime(data["date"])
data.head()

new_submission = deepcopy(submission)
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

new_submission = pd.merge(new_submission, date_info, on="date")
new_submission = pd.merge(new_submission, store_info, on="store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

In [5]:
data = data[
    [
        "store_id",
        "date",
        "day_of_week",
        "holiday_flg",
        "genre_name",
        "area_name",
        "latitude",
        "longitude",
        "year",
        "month",
        "day",
        "visitors",
    ]
]

In [11]:
validation_size = new_submission["date"].nunique()

regressor = XGBRegressor(
    objective="reg:squaredlogerror", random_state=42, enable_categorical=True
)

pipeline = Pipeline(steps=[
    ("transformer", MyTransformer()), 
    ("model", regressor)
])

param_grid = {
    "model__n_estimators": [5, 10, 20, 30, 40, 50, 70],
    "model__learning_rate": [0.001, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
    "model__max_depth": np.arange(2, 20, 1),
}

cv = MySplitter(test_size=1)
rscv = RandomizedSearchCV(
    estimator=pipeline,
    cv=cv,
    param_distributions=param_grid,
    scoring="neg_root_mean_squared_log_error",
    # n_jobs=-1,
    verbose=10,
    n_iter=1,
)


X = data.drop(columns=["visitors"]).reset_index(drop=True)
y = data["visitors"].reset_index(drop=True)

rscv.fit(X, y)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
splitter (247704,) (723,)
splitter (248344,) (734,)
splitter (249067,) (749,)
splitter (249801,) (784,)
splitter (250550,) (774,)
[CV 1/5; 1/1] START model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20
[CV 1/5; 1/1] END model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20;, score=-1.344 total time= 9.0min
[CV 2/5; 1/1] START model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20
[CV 2/5; 1/1] END model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20;, score=-1.356 total time= 8.1min
[CV 3/5; 1/1] START model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20
[CV 3/5; 1/1] END model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20;, score=-1.340 total time= 7.0min
[CV 4/5; 1/1] START model__learning_rate=0.1, model__max_depth=18, model__n_estimators=20
[CV 4/5; 1/1] END model__learning_rate=0.1, model__max_depth=18, model__n_estimators=

In [20]:
X = data.drop(columns=["visitors", "date"])

for c in X.columns:
    col_type = X[c].dtype
    if (
        col_type == "object"
        or col_type.name == "category"
        # or col_type.name == "datetime64[ns]"
    ):
        X[c] = X[c].astype("category")

y = data["visitors"]

In [25]:
from sktime.forecasting.compose import TransformedTargetForecaster, ForecastingPipeline, make_reduction
from sktime.forecasting.base import ForecastingHorizon
from sktime.split import ExpandingWindowSplitter


fh = ForecastingHorizon(pd.Timedelta(days=39))
# cv = MySplitter(test_size = validation_size, fh=fh)
cv = ExpandingWindowSplitter(fh=np.arange(39), initial_window=251100, step_length=1)

regressor = XGBRegressor(
    objective="reg:squaredlogerror", random_state=42, enable_categorical=True
)

forecaster = make_reduction(regressor, window_length=validation_size, strategy="recursive")

pipeline = ForecastingPipeline(
    steps=[
        # ("my_transformer", MyTransformer()),
        ("forecaster", forecaster),
    ]
)

param_grid = {
    "n_estimators": [20, 30, 40, 50, 70, 100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
    "max_depth": [3, 4, 5],
    # "eval_metric": ["rmsle"],
}

# param_grid = {
#     # 'forecast__estimator__max_depth': [3, 5, 6, 10, 15, 20],
#     # 'forecast__estimator__learning_rate': [0.01, 0.1, 0.2, 0.3],
#     # 'forecast__estimator__subsample': np.arange(0.5, 1.0, 0.1),
#     # 'forecast__estimator__colsample_bytree': np.arange(0.4, 1.0, 0.1),
#     # 'forecast__estimator__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
#     'forecaster__n_estimators': [100, 500, 1000],

#     "eval_metric": ["rmsle"],
#     'forecaster__enable_categorical': ['True'],
# }

# gscv = ForecastingRandomizedSearchCV(make_reduction(regressor, window_length=validation_size, strategy="recursive"), cv=cv, param_distributions=param_grid, n_iter=3, random_state=42)

pipeline.fit(y=y, X=X)

TypeError: Forecaster RecursiveTabularRegressionForecaster(estimator=XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, objective='reg:squaredlogerror', ...),
                                     window_length=39) does not support categorical features in exogeneous X.