In [17]:
import pandas as pd
from eval import my_grid_search_cv
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from tqdm.notebook import tqdm

# Register tqdm's pandas integration so `.progress_apply` (used by the
# inference cell further down) is available on groupby objects.
tqdm.pandas()

# Alternative kept for reference: the plain (non-notebook) tqdm front end.
# from tqdm import tqdm
# tqdm.pandas()

In [18]:
import os

# All data/model paths in this notebook are relative ("data/...", "models/..."),
# so move one directory up unless the kernel already runs from the project root.
# NOTE(review): the root is a hard-coded absolute path; on any other machine
# every re-run of this cell climbs one more level - confirm before sharing.
if not os.getcwd() == "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [22]:
# Number of leading characters of a submission id that form the store id;
# the character right after it is skipped and the remainder is the date.
ID_SIZE = 20

# Store metadata, with the "air_" prefixes stripped from the column names.
store_info = pd.read_csv("data/raw/air_store_info.csv").rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

# Calendar table; the date is kept as text here so it can be joined against
# the date string sliced out of the submission ids.
date_info = (
    pd.read_csv("data/raw/date_info.csv")
    .rename(columns={"calendar_date": "date"})
    .astype({"date": "string"})
)

In [124]:
# Look-back window lengths in days. The longer ones are expressed as
# fractions of a (non-leap) year; integer division truncates.
YEAR = 365
THREE_QUARTERS = YEAR * 3 // 4  # 273
HALF_YEAR = YEAR // 2  # 182
ONE_QUARTER = YEAR // 4  # 91
TWO_MONTH = 61
FIVE_WEEKS = 5 * 7  # 35
THREE_WEEKS = 3 * 7  # 21

# Shortest to longest. Each value is embedded in the generated feature names
# (e.g. "lag_day_of_week_21"), so neither values nor order may change.
LAGS = [
    THREE_WEEKS, FIVE_WEEKS, TWO_MONTH,
    ONE_QUARTER, HALF_YEAR, THREE_QUARTERS,
    YEAR,
]


def get_store_column_lag(row, data, column, start_date, lags=None):
    """Add trailing-window visitor means and missing-value flags to ``row``.

    For every look-back length ``lag`` (in days) the mean of
    ``data["visitors"]`` over the rows dated on or after
    ``start_date - lag`` is written to ``row[f"lag_{column}_{lag}"]``.
    If that mean is NaN (no rows in the window, or only missing values) the
    feature is stored as ``0`` and ``row[f"is_nan_lag_{column}_{lag}"]`` is
    set to ``1``; otherwise the flag is ``0``.

    Parameters
    ----------
    row : pandas.Series
        Record to enrich. It is modified in place and also returned.
    data : pandas.DataFrame
        History with a ``date`` column comparable to ``start_date`` and a
        ``visitors`` column. Any filtering (by store, weekday, ...) must
        already have been applied by the caller.
    column : str
        Only used to build the names of the generated fields.
    start_date : pandas.Timestamp
        Date the features are computed for.
    lags : iterable of int, optional
        Look-back lengths in days. Defaults to the module-level ``LAGS``.

    Returns
    -------
    pandas.Series
        The same ``row`` object with the new fields set.

    Notes
    -----
    The window has no upper bound: rows dated on or after ``start_date`` are
    averaged too if ``data`` contains them.
    """
    if lags is None:
        lags = LAGS

    for lag in lags:
        feature_name = f"lag_{column}_{lag}"
        window_start = start_date - pd.Timedelta(days=lag)

        mean_visitors = data.loc[data["date"] >= window_start, "visitors"].mean()
        is_missing = pd.isna(mean_visitors)

        # Missing windows are encoded as 0 plus an explicit indicator.
        row[feature_name] = 0 if is_missing else mean_visitors
        row[f"is_nan_{feature_name}"] = 1 if is_missing else 0

    return row


def get_store_lags(row, data, date_info):
    """Compute the per-store lag features for a single record.

    The history in ``data`` is narrowed to the store named by
    ``row["store_id"]`` and then split two ways: rows whose ``day_of_week``
    equals ``date_info["day_of_week"]`` and rows whose ``holiday_flg`` equals
    ``date_info["holiday_flg"]``. Each subset is passed to
    ``get_store_column_lag`` with ``date_info["date"]`` as the reference date.

    Parameters
    ----------
    row : pandas.Series
        Record to enrich; must contain ``store_id``.
    data : pandas.DataFrame
        History with ``store_id``, ``day_of_week``, ``holiday_flg``, ``date``
        and ``visitors`` columns.
    date_info : mapping
        Must provide the keys ``date``, ``day_of_week`` and ``holiday_flg``.

    Returns
    -------
    pandas.Series
        ``row`` with the day-of-week features added first, then the holiday
        features.
    """
    target_date = date_info["date"]
    store_history = data.loc[data["store_id"] == row["store_id"]]

    weekday_mask = store_history["day_of_week"] == date_info["day_of_week"]
    holiday_mask = store_history["holiday_flg"] == date_info["holiday_flg"]

    row = get_store_column_lag(
        row, store_history.loc[weekday_mask], "day_of_week", target_date
    )
    return get_store_column_lag(
        row, store_history.loc[holiday_mask], "holiday_flg", target_date
    )


def get_area_genre_column_lag(group, data, column, start_date, lags=None):
    """Add trailing-window area/genre visitor means to every row of ``group``.

    For each look-back length ``lag`` (in days) the rows of ``data`` dated on
    or after ``start_date - lag`` are averaged per date, and the mean of
    those daily means is broadcast into
    ``group[f"lag_area_genre_{column}_{lag}"]``. If the result is NaN the
    feature is stored as ``0`` and the companion
    ``is_nan_lag_area_genre_{column}_{lag}`` column is set to ``1``
    (otherwise ``0``).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to enrich. Modified in place and also returned.
    data : pandas.DataFrame
        History with ``date`` and ``visitors`` columns, already filtered by
        the caller.
    column : str
        Only used to build the names of the generated columns.
    start_date : pandas.Timestamp
        Date the features are computed for.
    lags : iterable of int, optional
        Look-back lengths in days. Defaults to the module-level ``LAGS``.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object with the new columns set.

    Notes
    -----
    The window has no upper bound: rows dated on or after ``start_date`` are
    included if ``data`` contains them.
    """
    if lags is None:
        lags = LAGS

    for lag in lags:
        feature_name = f"lag_area_genre_{column}_{lag}"
        window_start = start_date - pd.Timedelta(days=lag)

        window = data.loc[data["date"] >= window_start]
        # Average within each day first so busy days with many stores do not
        # dominate, then average the days. Aggregating only the visitors
        # column avoids reducing the datetime key column along the way.
        daily_mean = window.groupby("date")["visitors"].mean()
        value = daily_mean.mean()

        is_missing = pd.isna(value)
        group[feature_name] = 0 if is_missing else value
        group[f"is_nan_{feature_name}"] = 1 if is_missing else 0

    return group


def get_area_genre_lags(group, data, date_info, area_genre):
    """Compute the area/genre lag features for one (area, genre) group.

    ``area_genre`` is indexed as ``(area_name, genre_name)``. Both values are
    written back onto ``group`` as columns, then the history in ``data`` is
    restricted to that area and genre and split into rows matching
    ``date_info["day_of_week"]`` and rows matching
    ``date_info["holiday_flg"]``. Each subset is passed to
    ``get_area_genre_column_lag`` with ``date_info["date"]`` as the reference
    date.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one area/genre pair; modified in place.
    data : pandas.DataFrame
        History with ``area_name``, ``genre_name``, ``day_of_week``,
        ``holiday_flg``, ``date`` and ``visitors`` columns.
    date_info : mapping
        Must provide the keys ``date``, ``day_of_week`` and ``holiday_flg``.
    area_genre : sequence
        Element 0 is the area name, element 1 the genre name.

    Returns
    -------
    pandas.DataFrame
        ``group`` with ``genre_name``, ``area_name`` and the lag columns set.
    """
    area_name, genre_name = area_genre[0], area_genre[1]

    # Same insertion order as before (genre, then area) so the resulting
    # column layout does not change.
    group["genre_name"] = genre_name
    group["area_name"] = area_name

    in_area = data["area_name"] == area_name
    in_genre = data["genre_name"] == genre_name
    peers = data.loc[in_area & in_genre]

    weekday_peers = peers.loc[peers["day_of_week"] == date_info["day_of_week"]]
    holiday_peers = peers.loc[peers["holiday_flg"] == date_info["holiday_flg"]]

    target_date = date_info["date"]
    group = get_area_genre_column_lag(group, weekday_peers, "day_of_week", target_date)
    return get_area_genre_column_lag(group, holiday_peers, "holiday_flg", target_date)


def predict_batch(group, date):
    """Predict visitors for one date's rows and append them to the history.

    Steps:
      1. stamp ``date`` onto ``group``;
      2. add per-store lag features row by row (``get_store_lags``);
      3. add area/genre lag features per (area_name, genre_name) sub-group
         (``get_area_genre_lags``);
      4. run the global ``model`` on the result and floor predictions at 0;
      5. append the predicted rows (minus ``id``) to the global ``data`` so
         later calls can use them as history for their lag features.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single date. Must contain ``id``, ``store_id``,
        ``day_of_week``, ``holiday_flg``, ``area_name`` and ``genre_name``.
        The ``date`` column is (re)assigned on the passed object.
    date : pandas.Timestamp
        The date shared by all rows of ``group``.

    Returns
    -------
    pandas.DataFrame
        The feature-enriched rows with ``visitors`` holding the predictions.

    Side effects
    ------------
    Reads the globals ``model`` and ``data`` and rebinds ``data``; results
    therefore depend on the order in which dates are processed.
    """
    global data

    # Local import: the notebook's import cell does not bring numpy in.
    import numpy as np

    group["date"] = date

    # Calendar attributes are read from the first row; the caller groups by
    # date, so they are presumably identical across the batch.
    first_row = group.iloc[0]
    date_info = {
        "date": date,
        "day_of_week": first_row["day_of_week"],
        "holiday_flg": first_row["holiday_flg"],
    }

    group = group.apply(lambda row: get_store_lags(row, data, date_info), axis=1)
    group = group.groupby(by=["area_name", "genre_name"], group_keys=False).apply(
        lambda area_genre_group: get_area_genre_lags(
            area_genre_group, data, date_info, area_genre=area_genre_group.name
        ),
        include_groups=False,
    )

    # NOTE(review): every column of `group` (including id, store_id and date)
    # is handed to the model - confirm this matches the training feature set.
    pred = model.predict(group)
    # Element-wise floor at zero. The built-in max(0, pred) raises
    # "truth value of an array is ambiguous" for batches with more than one row.
    pred = np.maximum(pred, 0)

    group["visitors"] = pred

    data = pd.concat([data, group.drop(columns=["id"])]).reset_index(drop=True)

    return group

In [127]:
submission = pd.read_csv("data/raw/sample_submission.csv")
data = pd.read_csv("data/processed/data.csv")
data["date"] = pd.to_datetime(data["date"])

# Independent copy so the raw sample submission stays untouched.
# (`deepcopy` was never imported; DataFrame.copy() is deep by default.)
new_submission = submission.copy()

# Submission ids are sliced as "<store_id of ID_SIZE chars>" + one separator
# character + "<date>".
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

# Attach calendar attributes and store metadata (inner joins).
new_submission = pd.merge(new_submission, date_info, on="date")
new_submission = pd.merge(new_submission, store_info, on="store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

model = XGBRegressor()
model.load_model("models/xgboost/xgboost_more_estimators.json")

# Dates are processed in ascending order: predict_batch appends each day's
# predictions to `data`, and later days read them back as lag history.
new_submission = (
    new_submission.sort_values("date")
    # .head(10000)  # uncomment for a quick run on a subset
    .groupby(by=["date"], group_keys=False)
    .progress_apply(
        lambda group: predict_batch(group, date=group.name), include_groups=False
    )
)
data

  0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,...,is_nan_lag_holiday_flg_273,lag_area_genre_day_of_week_365,is_nan_lag_area_genre_day_of_week_365,visitors,lag_area_genre_holiday_flg_365,is_nan_lag_area_genre_holiday_flg_365,lag_day_of_week_365,is_nan_lag_day_of_week_365,lag_holiday_flg_365,is_nan_lag_holiday_flg_365
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,...,1,0.000000,1,24,0.000000,1,0.000000,1,0.000000,1
1,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,...,1,0.000000,1,28,0.000000,1,0.000000,1,0.000000,1
2,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,...,1,0.000000,1,9,0.000000,1,0.000000,1,0.000000,1
3,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,...,1,0.000000,1,10,0.000000,1,0.000000,1,0.000000,1
4,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,...,1,0.000000,1,12,0.000000,1,0.000000,1,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262103,air_9efaa7ded03c5a71,2017-05-05,Friday,1,Dining bar,Miyagi-ken Sendai-shi Kamisugi,38.269076,140.870403,2017,5,...,0,20.227564,0,0,18.798246,0,20.976744,0,15.083333,0
262104,air_fcfbdcf7b1f82c6e,2017-05-05,Friday,1,Italian/French,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,2017,5,...,0,22.924069,0,0,17.925552,0,17.829268,0,14.937500,0
262105,air_d44d210d2994f01b,2017-05-05,Friday,1,Bar/Cocktail,Tōkyō-to Setagaya-ku Setagaya,35.646572,139.653247,2017,5,...,0,20.767628,0,0,17.329861,0,21.784314,0,10.952381,0
262106,air_83db5aff8f50478e,2017-05-05,Friday,1,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,2017,5,...,0,21.078638,0,0,20.775353,0,23.000000,0,18.750000,0
