In [None]:
import pandas as pd

# import numpy as np
import copy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer
from recursive_pipeline import RecursivePipeline

# Number of leading characters of a submission `id` that hold the store id;
# one separator character follows, and the remainder of the `id` is the date.
ID_SIZE = 20

In [2]:
import os

# Every path used below is relative to the project root, i.e. the directory
# that contains `data/`. When the kernel starts one level below it, step up
# exactly once. Probing for `data/` instead of comparing against one machine's
# absolute path keeps this cell portable and idempotent: re-running it never
# climbs above the project root.
if not os.path.isdir("data") and os.path.isdir(os.path.join(os.pardir, "data")):
    os.chdir(os.pardir)
os.getcwd()

'/root/restaurants'

In [3]:
# Read the processed dataset and turn the `date` column into timestamps.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like an
# index that was saved into the CSV — confirm downstream steps ignore it.
data = pd.read_csv("data/processed/data_clear.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
data.head()

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day,visitors
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1,24
1,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,3,28
2,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,1,9
3,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,1,10
4,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,1,12


In [None]:
# `MyTransformer` followed by an XGBoost regressor, wrapped in a
# `RecursivePipeline`; the search below reaches the inner estimator through
# the `pipeline__` parameter prefix. The seed is fixed for repeatability.
pipeline = RecursivePipeline(
    pipeline=Pipeline(
        [
            ("transformer", MyTransformer()),
            (
                "model",
                XGBRegressor(
                    enable_categorical=True,
                    objective="reg:squaredlogerror",
                    random_state=42,
                ),
            ),
        ]
    )
)


# Exactly one candidate value per hyper-parameter, so the "search" evaluates a
# single fixed configuration.
param_grid = {
    f"pipeline__model__{option}": [value]
    for option, value in (
        ("n_estimators", 1500),
        ("learning_rate", 0.005),
        ("max_depth", 1),
        ("subsample", 0.5),
        ("colsample_bytree", 0.5),
    )
}


# Features and target, each re-indexed from zero.
X = data.drop(columns="visitors").reset_index(drop=True)
y = data["visitors"].reset_index(drop=True)


validation_size = 39  # new_submission["date"].nunique()
cv = MySplitter(n_splits=3, test_size=validation_size)
rscv = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=1,
    scoring="neg_root_mean_squared_log_error",
    cv=cv,
    n_jobs=1,
    verbose=10,
)

rscv.fit(X, y)

In [None]:
import joblib

# Persist the whole search object with compression. The call's return value
# (the list of written file names) is displayed as the cell output.
joblib.dump(value=rscv, filename="xgb1500_1_005_5_5.pkl", compress=True)

['xgb1500_1_005_5_5.pkl']

In [None]:
def _attach_lookup(frame, lookup, key):
    """Left-join ``lookup`` onto ``frame`` by ``key`` without losing or reordering rows.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to enrich; the result has one row per input row, in the same order.
    lookup : pd.DataFrame
        Table expected to hold at most one row per ``key`` value.
    key : str
        Name of the join column, present in both frames.

    Returns
    -------
    pd.DataFrame
        ``frame`` with the columns of ``lookup`` appended.

    Raises
    ------
    pandas.errors.MergeError
        If ``key`` is duplicated in ``lookup`` (rows would be multiplied).
    ValueError
        If some ``key`` value of ``frame`` has no counterpart in ``lookup``.
    """
    merged = pd.merge(
        frame,
        lookup,
        on=key,
        how="left",
        validate="many_to_one",
        indicator=True,
    )
    unmatched = merged["_merge"] != "both"
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} submission rows have no match on {key!r}"
        )
    return merged.drop(columns="_merge")


# Store metadata, renamed to the column names used by the training data.
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

# Calendar information, keyed by the date kept as text so that it can be
# joined with the text date sliced out of the submission id.
date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

submission = pd.read_csv("data/raw/sample_submission.csv")

# Work on a copy so that `submission` keeps its original columns for export.
new_submission = submission.copy()
# The first ID_SIZE characters of `id` are the store id; after one separator
# character comes the date.
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

# The predictions are later written back onto `submission`, so the enriched
# frame must keep exactly one row per submission row, in the original order.
new_submission = _attach_lookup(new_submission, date_info, "date")
new_submission = _attach_lookup(new_submission, store_info, "store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

# Model input: everything except the row identifier and the placeholder target.
new_submission_ = new_submission.drop(columns=["id", "visitors"])


def to_category(data):
    """Return a copy of ``data`` with text-like and datetime columns as ``category``.

    Columns whose dtype is ``object``, ``string``, ``category`` or
    ``datetime64[ns]`` are cast to ``category``; every other column is left
    unchanged. The input frame itself is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to convert.

    Returns
    -------
    pd.DataFrame
        A deep copy of ``data`` with the selected columns converted.
    """
    converted = data.copy(deep=True)
    category_like_names = {"category", "datetime64[ns]", "string"}
    for column in converted.columns:
        dtype = converted[column].dtype
        if (
            dtype == "object"
            or dtype.name in category_like_names
            or dtype == "string"
        ):
            converted[column] = converted[column].astype("category")
    return converted


# Score the prepared submission rows with the search object fitted earlier.
# The bare name on the last line displays the result as the cell output.
predictions = rscv.predict(new_submission_)
predictions

In [None]:
# There must be exactly one prediction per submission row; fail here rather
# than export a file whose values are missing or misaligned.
if len(predictions) != len(submission):
    raise ValueError(
        f"got {len(predictions)} predictions for {len(submission)} submission rows"
    )
submission["visitors"] = predictions

# Create the target folder if needed so the export cannot fail after the
# expensive fit/predict steps merely because the directory is missing.
os.makedirs("data/submissions", exist_ok=True)
submission.to_csv(
    "data/submissions/xgb1500_1_005_5_5_no_transformer.csv", index=False
)