In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from copy import deepcopy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer

# Number of leading characters of a submission `id` that hold the store id;
# the remainder (after one separator character) is the date.
ID_SIZE = 20

# Register tqdm's pandas integration so `.progress_apply` is available.
tqdm.pandas()

In [2]:
import os

# The data paths used in this notebook are relative, so step up one directory
# whenever the kernel is not already running from the expected project root.
if not os.getcwd() == "/root/restaurants":
    os.chdir(os.pardir)
os.getcwd()

'/root/restaurants'

In [3]:
# Store metadata, with the "air_" prefix removed from the column names.
store_info = pd.read_csv("data/raw/air_store_info.csv").rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

# Calendar table; `date` is cast to pandas' string dtype (it is later joined
# against the date text sliced out of the submission ids).
date_info = (
    pd.read_csv("data/raw/date_info.csv")
    .rename(columns={"calendar_date": "date"})
    .assign(date=lambda frame: frame["date"].astype("string"))
)

In [4]:
submission = pd.read_csv("data/raw/sample_submission.csv")
data = pd.read_csv("data/processed/data.csv")
data["date"] = pd.to_datetime(data["date"])

# Work on a copy so the raw sample submission stays untouched.
new_submission = submission.copy()

# The first ID_SIZE characters of `id` are the store id; after skipping one
# separator character, the rest is the date text.
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

# Left joins keep exactly one row per submission id even when a lookup key is
# missing (an inner join would silently drop those rows), and
# `validate="many_to_one"` fails loudly if a lookup table has duplicate keys
# (which would silently duplicate submission rows).
new_submission = pd.merge(
    new_submission, date_info, on="date", how="left", validate="many_to_one"
)
new_submission = pd.merge(
    new_submission, store_info, on="store_id", how="left", validate="many_to_one"
)

# Calendar features derived from the parsed date.
new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

In [5]:
# Keep only the modelling columns, in a fixed order; `visitors` is the target.
data = data.loc[
    :,
    [
        "store_id",
        "date",
        "day_of_week",
        "holiday_flg",
        "genre_name",
        "area_name",
        "latitude",
        "longitude",
        "year",
        "month",
        "day",
        "visitors",
    ],
]

In [10]:
# Separate the target from the features; both get a fresh 0..n-1 index.
y = data["visitors"].reset_index(drop=True)
X = data.drop(columns="visitors").reset_index(drop=True)

In [39]:
from sklearn.base import BaseEstimator, RegressorMixin


class RecursivePipeline(BaseEstimator, RegressorMixin):
    """Regressor wrapper that calls the wrapped pipeline's ``predict`` one date at a time.

    Fitting is delegated unchanged to ``pipeline``. Prediction splits ``X`` on
    its ``"date"`` column and calls the pipeline once per distinct date, in
    ascending date order.

    NOTE(review): presumably the per-date ordering lets a stateful transformer
    build on earlier dates -- confirm against ``MyTransformer``.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, pipeline):
        self.pipeline = pipeline

    def fit(self, X, y):
        """Fit the wrapped pipeline on ``X`` and ``y`` and return ``self``."""
        self.pipeline.fit(X, y)
        return self

    def predict_batch(self, group):
        """Return the wrapped pipeline's predictions for one date's rows."""
        return self.pipeline.predict(group)

    def predict(self, X=None):
        """Predict date by date and return the results in the row order of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing a ``"date"`` column. Required; the
            ``None`` default only exists to keep the original signature.

        Returns
        -------
        numpy.ndarray
            Predictions where element ``i`` belongs to row ``i`` of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is ``None``, or if the number of predictions differs from
            the number of rows (e.g. rows with a missing date are dropped by
            ``groupby``).
        """
        if X is None:
            raise ValueError("X is required: pass a DataFrame with a 'date' column.")
        if len(X) == 0:
            # np.concatenate cannot handle an empty sequence of batches.
            return np.empty(0)

        # A stable sort keeps the original relative order of rows sharing a
        # date, so the concatenated per-date batches line up exactly with
        # `X.iloc[order]` and can be mapped back to the caller's row order.
        order = np.argsort(X["date"].to_numpy(), kind="stable")
        X_sorted = X.iloc[order]

        # Selecting `X.columns` explicitly keeps the "date" column inside each
        # group handed to the pipeline. `progress_apply` is provided by
        # `tqdm.pandas()`, which must have been called beforehand.
        per_date = (
            X_sorted.groupby(by=["date"], group_keys=False)[X.columns]
            .progress_apply(
                lambda group: self.predict_batch(group), include_groups=False
            )
        )
        sorted_predictions = np.concatenate(per_date.to_numpy())

        if len(sorted_predictions) != len(X):
            raise ValueError(
                f"Got {len(sorted_predictions)} predictions for {len(X)} rows; "
                "check X['date'] for missing values."
            )

        # Undo the sort: position `order[k]` of X receives the k-th sorted prediction.
        predictions = np.empty_like(sorted_predictions)
        predictions[order] = sorted_predictions
        return predictions

In [None]:
# Feature transformer followed by an XGBoost regressor trained on squared log
# error, wrapped so that prediction runs one date at a time.
pipeline = RecursivePipeline(
    Pipeline(
        [
            ("transformer", MyTransformer()),
            (
                "model",
                XGBRegressor(
                    enable_categorical=True,
                    random_state=42,
                    objective="reg:squaredlogerror",
                ),
            ),
        ]
    )
)


# Search space; the keys address the XGBRegressor nested inside
# RecursivePipeline.pipeline -> Pipeline step "model".
param_grid = {
    "pipeline__model__n_estimators": [5, 10, 20, 30, 40, 50, 70],
    "pipeline__model__learning_rate": [0.001, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
    "pipeline__model__max_depth": np.arange(2, 20, 1),
}

# The splitter's test size is the number of distinct dates in the submission.
# NOTE(review): presumably MySplitter holds out that many dates per fold --
# confirm against my_splitter.
validation_size = new_submission["date"].nunique()
cv = MySplitter(test_size=validation_size)
rscv = RandomizedSearchCV(
    estimator=pipeline,
    cv=cv,
    param_distributions=param_grid,
    scoring="neg_root_mean_squared_log_error",
    # n_jobs=-1,
    verbose=10,
    n_iter=1,
    # Seed the parameter sampling so a re-run draws the same candidates.
    random_state=42,
)


# Rebuild features and target with a fresh 0..n-1 index, then run the search.
y = data["visitors"].reset_index(drop=True)
X = data.drop(columns="visitors").reset_index(drop=True)

rscv.fit(X, y)