In [1]:
import pandas as pd

import numpy as np
import copy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer
from recursive_pipeline import RecursivePipeline

ID_SIZE = 20

In [2]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
data = pd.read_csv("data/processed/data_clear.csv")
data["date"] = pd.to_datetime(data["date"])
data.head()

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day,visitors
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1,24
1,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,3,28
2,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,1,9
3,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,1,10
4,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,1,12


In [4]:
data = data.drop(columns=["latitude", "longitude"])

In [None]:
pipeline = RecursivePipeline(
    pipeline=Pipeline(
        steps=[
            ("transformer", MyTransformer()),
            (
                "model",
                XGBRegressor(
                    objective="reg:squaredlogerror",
                    random_state=42,
                    enable_categorical=True,
                ),
            ),
        ]
    )
)


param_grid = {
    # "pipeline__model__n_estimators": [1500],
    # "pipeline__model__learning_rate": [0.005],
    # "pipeline__model__max_depth": [1],
    # "pipeline__model__subsample": [0.5],
    # "pipeline__model__colsample_bytree": [0.5],
    # "pipeline__model__grow_policy": ["depthwise", "lossguide"],
    "pipeline__model__booster": ["gbtree"],
}


validation_size = 39  # new_submission["date"].nunique()
cv = MySplitter(test_size=validation_size, n_splits=3)
rscv = GridSearchCV(
    estimator=pipeline,
    cv=cv,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_log_error",
    n_jobs=1,
    verbose=10,
)


X = data.drop(columns=["visitors"]).reset_index(drop=True)
y = data["visitors"].reset_index(drop=True)

rscv.fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START pipeline__model__booster=gbtree.............................
[CV 1/3; 1/1] END pipeline__model__booster=gbtree;, score=-0.810 total time=  39.7s
[CV 2/3; 1/1] START pipeline__model__booster=gbtree.............................
[CV 2/3; 1/1] END pipeline__model__booster=gbtree;, score=-0.807 total time=  48.0s
[CV 3/3; 1/1] START pipeline__model__booster=gbtree.............................
[CV 3/3; 1/1] END pipeline__model__booster=gbtree;, score=-0.813 total time=  52.1s


In [None]:
import joblib

# joblib.dump(rscv, "xgb1500_1_005_5_5.pkl", compress=True)

['xgb1500_1_005_5_5.pkl']

In [None]:
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

submission = pd.read_csv("data/raw/sample_submission.csv")

new_submission = copy.deepcopy(submission)
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

new_submission = pd.merge(new_submission, date_info, on="date")
new_submission = pd.merge(new_submission, store_info, on="store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

new_submission_ = new_submission.drop(columns=["id", "visitors"])

predictions = rscv.predict(new_submission_.drop(columns=["latitude", "longitude"]).head(50))
# predictions

In [10]:
predictions


array([15.081747 , 15.455475 , 15.844707 , 15.178842 , 16.865393 ,
       15.29513  , 15.218975 , 15.056982 , 16.646679 , 14.499445 ,
       15.230082 , 16.580437 , 15.114142 , 14.8917265, 14.828428 ,
       14.80629  , 14.741457 , 14.828428 , 14.80629  , 14.828428 ,
       14.7512455, 14.7512455, 14.732203 , 14.732203 , 14.732203 ,
       14.580511 , 14.580511 , 14.580511 , 14.616178 , 14.554672 ,
       14.554672 , 14.554672 , 14.554672 , 14.554672 , 14.536768 ,
       14.38622  , 14.38622  , 14.38622  , 14.38622  , 15.054222 ,
       16.02781  , 15.793083 , 16.145367 , 15.988049 , 16.299505 ,
       15.873187 , 15.619373 , 15.248919 , 15.161483 , 14.852431 ],
      dtype=float32)

In [None]:
for id in new_submission["store_id"].unique():
    temp = data[data["store_id"] == id]
    value = temp.loc[temp['date'].idxmax()]["visitors"]
    new_submission.loc[new_submission["store_id"] == id, "visitors"] = value
    
new_submission

Unnamed: 0,id,visitors,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day
0,air_00a91d42b08b08d9_2017-04-23,18,air_00a91d42b08b08d9,2017-04-23,Sunday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,23
1,air_00a91d42b08b08d9_2017-04-24,18,air_00a91d42b08b08d9,2017-04-24,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,24
2,air_00a91d42b08b08d9_2017-04-25,18,air_00a91d42b08b08d9,2017-04-25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,25
3,air_00a91d42b08b08d9_2017-04-26,18,air_00a91d42b08b08d9,2017-04-26,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,26
4,air_00a91d42b08b08d9_2017-04-27,18,air_00a91d42b08b08d9,2017-04-27,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32014,air_fff68b929994bfbd_2017-05-27,30,air_fff68b929994bfbd,2017-05-27,Saturday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,5,27
32015,air_fff68b929994bfbd_2017-05-28,30,air_fff68b929994bfbd,2017-05-28,Sunday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,5,28
32016,air_fff68b929994bfbd_2017-05-29,30,air_fff68b929994bfbd,2017-05-29,Monday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,5,29
32017,air_fff68b929994bfbd_2017-05-30,30,air_fff68b929994bfbd,2017-05-30,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,5,30


In [None]:
new_submission[["id", "visitors"]].to_csv("data/submissions/lag.csv", index=False)

In [None]:
submission["visitors"] = predictions
# submission.to_csv("data/submissions/xgb1500_1_005_5_5_no_transformer.csv", index=False)