In [1]:
import pandas as pd
import math
import logging
from sklearn.model_selection import TimeSeriesSplit
from copy import deepcopy
from prophet import Prophet
from prophet.serialize import model_to_json


from eval import my_grid_search_cv

logging.getLogger("prophet").setLevel(logging.CRITICAL)
logging.getLogger("cmdstanpy").setLevel(logging.CRITICAL)

logging.getLogger("fbprophet.forecaster").propagate = False

# from tqdm import tqdm
# tqdm.pandas()

In [4]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [5]:
air_visit_data = pd.read_csv("data/raw/air_visit_data.csv")
data = deepcopy(air_visit_data)
data.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


In [6]:
date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.drop(columns=["day_of_week"])
date_info.head()

Unnamed: 0,calendar_date,holiday_flg
0,2016-01-01,1
1,2016-01-02,1
2,2016-01-03,1
3,2016-01-04,0
4,2016-01-05,0


In [7]:
holidays = (
    date_info[date_info["holiday_flg"] == 1]
    .rename(columns={"holiday_flg": "holiday", "calendar_date": "ds"})
    .reset_index(drop=True)
)
holidays = holidays[["holiday", "ds"]]
holidays["holiday"] = "holiday"
holidays.head()

Unnamed: 0,holiday,ds
0,holiday,2016-01-01
1,holiday,2016-01-02
2,holiday,2016-01-03
3,holiday,2016-01-11
4,holiday,2016-02-11


In [15]:
data

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6
...,...,...,...
252103,air_24e8414b9b07decb,2017-04-18,6
252104,air_24e8414b9b07decb,2017-04-19,6
252105,air_24e8414b9b07decb,2017-04-20,7
252106,air_24e8414b9b07decb,2017-04-21,8


In [8]:
# def fit_for_each_store(directory, model, data):

from tqdm.auto import tqdm

tqdm.pandas()

small = []

directory = "models/prophet_month_splitter"
os.makedirs(directory, exist_ok=True)

models = dict()

for store_id, store_data in tqdm(data.groupby("air_store_id")):

    if store_data.shape[0] <= 65:
        small.append(store_id)

    continue

    param_grid = {
        "changepoint_prior_scale": [0.001, 0.01, 0.1, 0.5],
        "seasonality_prior_scale": [0.01, 0.1, 1, 10],
        "seasonality_mode": ["additive", "multiplicative"],
        "holidays": [holidays],
    }

    if store_id == "air_17bed6dbf7c8b0fc":
        param_grid = {
            "changepoint_prior_scale": [0.001, 0.1, 0.5],
            "seasonality_prior_scale": [0.01, 0.1, 1, 10],
            "seasonality_mode": ["additive", "multiplicative"],
            "holidays": [holidays],
        }
    
    print(store_id, store_data.shape[0])

    file_path = os.path.join(directory, f"model_{store_id}.json")

    if os.path.exists(file_path):
        continue

    X = store_data[["visit_date"]].reset_index(drop=True)
    y = store_data[["visitors"]].reset_index(drop=True)

    n_splits = max(min(math.trunc((store_data.shape[0] - 5) / 30), 5), 2)

    # cv = TimeSeriesSplit(n_splits=n_splits, gap=3, test_size=30)

    if store_data.shape[0] <= 65:
        small.append(store_id)
        continue
        # cv = TimeSeriesSplit(
        #     n_splits=2, gap=1, test_size=int(store_data.shape[0] * 0.2)
        # )

    params = my_grid_search_cv(Prophet, X, y, param_grid, n_splits=n_splits, n_jobs=5)

    models[store_id] = Prophet(**params)

    df = pd.concat([X, y], axis=1)
    df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
    models[store_id].fit(df)

    with open(file_path, "w") as fout:
        fout.write(model_to_json(models[store_id]))

  0%|          | 0/829 [00:00<?, ?it/s]

In [9]:
small

['air_1c0b150f9e696a5f',
 'air_789103bf53b8096b',
 'air_8e492076a1179383',
 'air_8e8f42f047537154',
 'air_900d755ebd2f7bbd',
 'air_a17f0778617c76e2',
 'air_a9a380530c1e121f',
 'air_cf5ab75a0afb8af9']

In [54]:
from datetime import timedelta
import plotly.graph_objects as go
import plotly.express as px

for store_id in small:
    temp = deepcopy(data[data["air_store_id"] == store_id])
    temp["visit_date"] = pd.to_datetime(temp["visit_date"])
    fig = px.line(temp, x="visit_date", y="visitors")
    fig.show()
    last = temp['visit_date'].max()
    first = temp['visit_date'].min()
    len = last - first + timedelta(days=1)
    print("number of objects:", temp.shape[0])
    print("first: ", first)
    print("last: ", last)
    print("length of the period:", len)
    print("object every", len / temp.shape[0])

number of objects: 51
first:  2017-03-02 00:00:00
last:  2017-04-22 00:00:00
length of the period: 52 days 00:00:00
object every 1 days 00:28:14.117647058


number of objects: 41
first:  2016-07-06 00:00:00
last:  2017-04-22 00:00:00
length of the period: 291 days 00:00:00
object every 7 days 02:20:29.268292682


number of objects: 54
first:  2016-07-03 00:00:00
last:  2017-04-19 00:00:00
length of the period: 291 days 00:00:00
object every 5 days 09:20:00


number of objects: 62
first:  2017-02-08 00:00:00
last:  2017-04-22 00:00:00
length of the period: 74 days 00:00:00
object every 1 days 04:38:42.580645161


number of objects: 20
first:  2017-03-02 00:00:00
last:  2017-04-22 00:00:00
length of the period: 52 days 00:00:00
object every 2 days 14:24:00


number of objects: 47
first:  2017-03-07 00:00:00
last:  2017-04-22 00:00:00
length of the period: 47 days 00:00:00
object every 1 days 00:00:00


number of objects: 40
first:  2016-07-02 00:00:00
last:  2017-04-22 00:00:00
length of the period: 295 days 00:00:00
object every 7 days 09:00:00


number of objects: 53
first:  2017-02-15 00:00:00
last:  2017-04-22 00:00:00
length of the period: 67 days 00:00:00
object every 1 days 06:20:22.641509433


In [7]:
from prophet.serialize import model_from_json


def get_models(directory_path):
    models = dict()

    for file in os.listdir(directory_path):
        with open(f"{directory_path}/{file}", "r") as fin:
            store_id = file[6:]
            store_id = store_id[:-5]
            models[store_id] = model_from_json(fin.read())

    return models

In [8]:
models = get_models("models/prophet_for_each_store_with_holidays")

KeyboardInterrupt: 

In [45]:
from tqdm.auto import tqdm

submission = pd.read_csv("data/raw/sample_submission.csv")
submission = submission.rename(columns={"visit_date": "ds"})


def make_prediction(row):
    store_id = row["id"][:20]
    date = pd.DataFrame([row["id"][21:]])
    date = date.rename(columns={0: "ds"})

    pred = models[store_id].predict(date)
    pred = pred[["yhat"]]
    pred[pred < 0] = 0

    row["visitors"] = pred["yhat"][0]
    return row


tqdm.pandas()
submission = submission.progress_apply(make_prediction, axis=1)

  0%|          | 0/32019 [00:00<?, ?it/s]

In [46]:
directory = "data/submissions"
os.makedirs(directory, exist_ok=True)

submission.to_csv(directory + "/prophet_for_each_store_with_holidays.csv", index=False)