In [6]:
import pandas as pd

import numpy as np
import joblib
import copy

from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from tqdm import tqdm

from my_splitter import MySplitter
from my_transformer import MyTransformer
from recursive_pipeline import RecursivePipeline

# Number of leading characters of a submission `id` that form the store id
# (the store ids shown in this notebook, e.g. "air_00a91d42b08b08d9", are 20
# characters long). Everything from position ID_SIZE + 1 onward is read as the date.
ID_SIZE = 20
# Columns passed to CatBoost as categorical features (`cat_features=CAT_COLS`).
CAT_COLS = ["store_id", "date", "day_of_week", "genre_name", "area_name"]

In [7]:
import os

# Step one directory up unless the kernel is already in the project root,
# presumably so that the relative "data/..." paths used below resolve.
# NOTE(review): this only works when the kernel starts exactly one level below
# /root/restaurants, and the absolute path is machine-specific — confirm, or
# derive the root from a configurable location instead.
if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [8]:
# Read the cleaned visits table and parse its "date" column into datetimes
# in a single chained expression.
data = pd.read_csv("data/processed/data_clear.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
data.head()

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day,visitors
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1,24
1,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,3,28
2,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,1,9
3,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,1,10
4,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,1,12


In [41]:
# Store metadata; the "air_" prefixes are dropped from the column names so they
# line up with the columns of the training table (store_id / genre_name / area_name).
store_info = pd.read_csv("data/raw/air_store_info.csv").rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

# Calendar table. "date" stays textual here because it is joined against the
# date text sliced out of the submission ids below.
date_info = pd.read_csv("data/raw/date_info.csv").rename(
    columns={"calendar_date": "date"}
)
date_info["date"] = date_info["date"].astype("string")

# Submission template; `visitors` is made float so predictions can be written into it.
submission = pd.read_csv("data/raw/sample_submission.csv")
submission["visitors"] = submission["visitors"].astype("float64")

# Work on an independent copy and split each id into its store id and date parts:
# the first ID_SIZE characters are the store id, one separator character is
# skipped, and the remainder is the date.
new_submission = submission.copy()
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

# Attach calendar and store attributes. `validate` makes pandas raise if a
# lookup table has duplicate keys, which would otherwise duplicate submission rows.
new_submission = pd.merge(
    new_submission, date_info, on="date", validate="many_to_one"
)
new_submission = pd.merge(
    new_submission, store_info, on="store_id", validate="many_to_one"
)

# The merges are inner joins, so an id without a matching date or store would
# silently disappear from the final submission file; fail loudly instead.
assert len(new_submission) == len(submission), (
    "merging date/store info dropped or duplicated submission rows"
)

# Calendar features derived from the parsed date.
new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

# Feature-only view used for prediction (no id, no target placeholder).
new_submission_ = new_submission.drop(columns=["id", "visitors"])

In [10]:
import math
class RMSLE(object):
    """Custom objective passed to CatBoost as ``loss_function=RMSLE()``.

    Works on the difference of ``log1p`` values between target and prediction,
    with predictions clamped at zero before the logarithm is taken.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Compute per-sample first and second derivative values.

        Parameters
        ----------
        approxes, targets : indexable containers of numbers, same length
            Current predictions and true values. Only ``len()`` and ``[]``
            are used on them.
        weights : indexable container of numbers or None
            Optional per-sample weights; when given, both derivatives of a
            sample are multiplied by its weight.

        Returns
        -------
        list of (der1, der2) tuples, one per sample.

        Raises
        ------
        AssertionError
            If the container lengths disagree.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            # Clamp once and reuse: negative predictions are treated as 0 so
            # log1p stays defined and the denominator below is at least 1.
            clipped = max(0, approxes[index])

            # NOTE(review): der1 is the plain log-space residual (no
            # 1/(approx + 1) chain-rule factor) and der2 is -1/(approx + 1);
            # presumably a deliberate approximation — confirm against the
            # CatBoost custom-objective documentation before changing.
            der1 = math.log1p(targets[index]) - math.log1p(clipped)
            der2 = -1 / (clipped + 1)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
class RMSLE_val(object):
    """Custom evaluation metric passed to CatBoost as ``eval_metric=RMSLE_val()``.

    Accumulates weighted squared differences of ``log1p`` values (both
    prediction and target clamped at zero) and reports their root mean.
    """

    def get_final_error(self, error, weight):
        """Convert the accumulated (error, weight) sums into the final score."""
        # The tiny constant keeps the division defined when the weight sum is 0.
        denominator = weight + 1e-38
        return np.sqrt(error / denominator)

    def is_max_optimal(self):
        """Report that smaller metric values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted squared-log-error sum, weight sum)``.

        ``approxes`` must hold exactly one container of predictions whose
        length equals ``len(target)``; ``weight`` may be None, in which case
        every sample counts with weight 1.0.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predictions = approxes[0]
        total_error, total_weight = 0.0, 0.0

        # Inputs are read by index only (len() and [] are the sole operations used).
        for idx in range(len(predictions)):
            sample_weight = weight[idx] if weight is not None else 1.0
            total_weight += sample_weight

            log_prediction = math.log1p(max(0, predictions[idx]))
            log_target = math.log1p(max(0, target[idx]))
            total_error += sample_weight * ((log_prediction - log_target) ** 2)

        return total_error, total_weight

In [45]:
def to_category(data):
    """Return a copy of ``data`` with text-like and datetime columns cast to "string".

    Despite its name, this function does not produce pandas ``category``
    columns: every column whose dtype is object, category, string or any
    datetime64 flavour (any resolution, tz-naive or tz-aware) is converted to
    the pandas "string" dtype. All other columns are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``data`` with the conversions applied.
    """
    data = copy.deepcopy(data)
    for c in data.columns:
        col_type = data[c].dtype
        if (
            col_type == "object"
            or col_type.name in ("category", "string")
            # Matching only the literal name "datetime64[ns]" would skip
            # tz-aware or non-nanosecond datetime columns, so use the dtype
            # predicate that covers every datetime64 variant.
            or pd.api.types.is_datetime64_any_dtype(col_type)
            or col_type == "string"
        ):
            data[c] = data[c].astype("string")

    return data

# Cast the text/datetime columns of both the prediction features and the
# training table to "string" dtype before they are fed to the model.
# Both names are rebound to the converted copies.
new_submission_ = to_category(new_submission_)
data = to_category(data)

In [None]:
from catboost import CatBoostRegressor

# File that receives the per-store predictions (checkpoints and final result).
SUBMISSION_PATH = "data/submissions/catboost_for_each_store_id_clear.csv"
# Write a checkpoint after this many newly fitted stores.
CHECKPOINT_EVERY = 10

# Objects for a hyper-parameter-search variant of this experiment. The training
# loop below fits a plain CatBoostRegressor and does not use them; to tune per
# store instead, wrap `pipeline` in
# RandomizedSearchCV(estimator=pipeline, cv=cv, param_distributions=param_grid,
#                    scoring="neg_root_mean_squared_log_error").
pipeline = RecursivePipeline(
    pipeline=Pipeline(
        steps=[
            ("transformer", MyTransformer()),
            (
                "model",
                CatBoostRegressor(
                    random_state=42,
                    logging_level="Silent",
                    cat_features=CAT_COLS,
                    loss_function=RMSLE(),
                    eval_metric=RMSLE_val(),
                ),
            ),
        ]
    )
)

param_grid = {
    "pipeline__model__n_estimators": [100, 250, 500, 750, 1000, 1200],
    "pipeline__model__learning_rate": [0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
    "pipeline__model__max_depth": [1, 2, 3, 4, 5, 6],
    # "pipeline__model__subsample": [0.5, 0.75, 1],
}
# One validation fold spans as many rows as there are distinct dates to forecast.
validation_size = new_submission["date"].nunique()
cv = MySplitter(test_size=validation_size, n_splits=2)


ids = new_submission["store_id"].unique()
i = 0  # number of stores fitted during this run of the cell

# Stores already handled. On a fresh kernel this starts empty (previously the
# name was undefined and the loop raised NameError); if an earlier, interrupted
# run of this cell left `fitted_ids` in the kernel, it is reused so the loop
# resumes where it stopped.
fitted_ids = set(globals().get("fitted_ids", ()))

for store_id in tqdm(ids):
    if store_id in fitted_ids:
        continue

    i += 1

    # Training rows of this store, filtered once and split into features/target.
    store_rows = data[data["store_id"] == store_id]
    X = store_rows.drop(columns=["visitors"]).reset_index(drop=True)
    y = store_rows["visitors"].reset_index(drop=True)

    # One independent model per store, trained with the custom objective/metric.
    model = CatBoostRegressor(
        random_state=42,
        logging_level="Silent",
        cat_features=CAT_COLS,
        loss_function=RMSLE(),
        eval_metric=RMSLE_val(),
    )

    model.fit(X, y)

    # Predict this store's submission rows and write them into the full frame.
    X_pred = new_submission_[new_submission_["store_id"] == store_id]
    new_submission.loc[new_submission["store_id"] == store_id, "visitors"] = model.predict(X_pred)

    # Periodic checkpoint so a long run can be inspected or recovered.
    if i % CHECKPOINT_EVERY == 0:
        new_submission[["id", "visitors"]].to_csv(SUBMISSION_PATH, index=False)

    fitted_ids.add(store_id)

new_submission[["id", "visitors"]].to_csv(SUBMISSION_PATH, index=False)

100%|██████████| 821/821 [33:51<00:00,  2.47s/it]
