In [1]:
import pandas as pd
import math
import logging
from sklearn.model_selection import TimeSeriesSplit
from copy import deepcopy
from prophet import Prophet
from prophet.serialize import model_to_json


from eval import my_grid_search_cv

logging.getLogger("prophet").setLevel(logging.CRITICAL)
logging.getLogger("cmdstanpy").setLevel(logging.CRITICAL)

logging.getLogger("fbprophet.forecaster").propagate = False

# from tqdm import tqdm
# tqdm.pandas()

In [2]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
air_visit_data = pd.read_csv("data/raw/air_visit_data.csv")
data = deepcopy(air_visit_data)
data.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


In [4]:
date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.drop(columns=["day_of_week"])
date_info.head()

Unnamed: 0,calendar_date,holiday_flg
0,2016-01-01,1
1,2016-01-02,1
2,2016-01-03,1
3,2016-01-04,0
4,2016-01-05,0


In [5]:
holidays = (
    date_info[date_info["holiday_flg"] == 1]
    .rename(columns={"holiday_flg": "holiday", "calendar_date": "ds"})
    .reset_index(drop=True)
)
holidays = holidays[["holiday", "ds"]]
holidays["holiday"] = "holiday"
holidays.head()

Unnamed: 0,holiday,ds
0,holiday,2016-01-01
1,holiday,2016-01-02
2,holiday,2016-01-03
3,holiday,2016-01-11
4,holiday,2016-02-11


In [6]:
X = data[data["air_store_id"] == "air_ba937bf13d40fb24"][["visit_date"]]
y = data[data["air_store_id"] == "air_ba937bf13d40fb24"][["visitors"]]

In [7]:
X = data[data["air_store_id"] == "air_ba937bf13d40fb24"].drop(columns=["air_store_id"])
X["visit_date"] = pd.to_datetime(X["visit_date"])
# X = X.set_index("visit_date")
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 391 entries, 0 to 390
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   visit_date  391 non-null    datetime64[ns]
 1   visitors    391 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 9.2 KB


In [8]:
# X.groupby(pd.Grouper(key="visit_date", freq="1ME")).mean()
X.sort_values("visit_date")
for col_val, temp in X.groupby(pd.Grouper(key="visit_date", freq="1ME")):
    q = temp
temp.head()

Unnamed: 0,visit_date,visitors
370,2017-04-01,32
371,2017-04-02,4
372,2017-04-03,4
373,2017-04-04,32
374,2017-04-05,19


In [9]:
pd.Timestamp.now()

Timestamp('2024-10-02 16:46:54.111227')

In [10]:
import numpy as np

In [26]:
def split_data(data, k, period = "month"):
    latest_date = data['visit_date'].max()
    
    month = latest_date - pd.DateOffset(months=k)
    next_month = latest_date - pd.DateOffset(months=k-1)

    if period == "week":
        month = latest_date - pd.DateOffset(weeks=k)
        next_month = latest_date - pd.DateOffset(weeks=k-1)
    
    test = np.array(data[(data['visit_date'] >= month) & (data['visit_date'] < next_month)].index)
    train = np.array(data[data['visit_date'] < month].index)

    return train, test

In [34]:
X
X['year_month'] = X['visit_date'].dt.to_period('M')  # Creates a 'YYYY-MM' format
    
# Count unique year-month combinations
unique_months = X['visit_date'].dt.to_period('M').unique()
unique_months

<PeriodArray>
['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07',
 '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02',
 '2017-03', '2017-04']
Length: 16, dtype: period[M]

In [27]:
def split(X, n=5, type="standard"):
    if type == "standard":
        for i in range(n, 0, -1):
            yield split_data(X, i, "month")
    else: 
        for i in range(2, 0, -1):
            yield split_data(X, i, "week")

In [29]:
for train_index, test_index in split(X, "small"):

2


array([383, 384, 385, 386, 387, 388, 389])

In [None]:
# def fit_for_each_store(directory, model, data):

from tqdm.auto import tqdm

tqdm.pandas()

small = []

directory = "models/prophet_for_each_store_with_holidays"
os.makedirs(directory, exist_ok=True)

models = dict()

param_grid = {
    "changepoint_prior_scale": [0.001, 0.01, 0.1, 0.5],
    "seasonality_prior_scale": [0.01, 0.1, 1, 10],
    "seasonality_mode": ["additive", "multiplicative"],
    "holidays": [holidays],
}

for store_id, store_data in tqdm(data.groupby("air_store_id")):

    if store_data.shape[0] <= 65:
        small.append(store_id)

    if store_id == "air_17bed6dbf7c8b0fc":
        param_grid = {
            "changepoint_prior_scale": [0.001, 0.1, 0.5],
            "seasonality_prior_scale": [0.01, 0.1, 1, 10],
            "seasonality_mode": ["additive", "multiplicative"],
            "holidays": [holidays],
        }
    
    print(store_id, store_data.shape[0])

    file_path = os.path.join(directory, f"model_{store_id}.json")

    if os.path.exists(file_path):
        continue

    X = store_data[["visit_date"]].reset_index(drop=True)
    y = store_data[["visitors"]].reset_index(drop=True)

    n_splits = max(min(math.trunc((store_data.shape[0] - 5) / 30), 5), 2)

    cv = TimeSeriesSplit(n_splits=n_splits, gap=3, test_size=30)

    if store_data.shape[0] <= 65:
        small.append(store_id)
        cv = TimeSeriesSplit(
            n_splits=2, gap=1, test_size=int(store_data.shape[0] * 0.2)
        )

    params = my_grid_search_cv(Prophet, X, y, param_grid, cv=cv, n_jobs=5)

    models[store_id] = Prophet(**params)

    df = pd.concat([X, y], axis=1)
    df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
    models[store_id].fit(df)

    with open(file_path, "w") as fout:
        fout.write(model_to_json(models[store_id]))

In [None]:
param_grid = {
    "changepoint_prior_scale": [0.001, 0.01, 0.1, 0.5],
    "seasonality_prior_scale": [0.01, 0.1, 1, 10],
    "seasonality_mode": ["additive", "multiplicative"],
    "holidays": [holidays],
}

cv = split_ind(n=5)

# params = my_grid_search_cv(Prophet, X, y, param_grid, cv=cv, n_jobs=5)