In [1]:
import itertools
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_log_error
from copy import deepcopy
from prophet import Prophet
from joblib import Parallel, delayed

import logging

logging.getLogger("prophet").setLevel(logging.CRITICAL)
logging.getLogger("cmdstanpy").setLevel(logging.CRITICAL)

In [2]:
import os

os.chdir("../data/raw")
os.getcwd()

'/root/restaurants/data/raw'

In [3]:
def my_cross_validation(estimator, X, y, params, cv):
    scores = []
    pointer = 0

    for train_index, test_index in cv.split(X, y):
        model = estimator(**params)
        pointer += 1
        x_train, x_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        if isinstance(estimator(), Prophet):
            df = pd.concat([x_train, y_train], axis=1)
            df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
            model.fit(df)

            df = deepcopy(x_test)
            df = df.rename(columns={"visit_date": "ds"})
            pred = model.predict(df)
            pred = pred[["yhat"]]
            pred[pred < 0] = 0

        else:
            model.fit(x_train, y_train)

            pred = model.predict(x_test)

        score = root_mean_squared_log_error(y_test, pred)

        scores.append(score)

    model_score = np.mean(scores)

    return model_score

In [4]:
def my_grid_search_cv(estimator, X, y, param_grid, cv, n_jobs=1):
    keys, values = zip(*param_grid.items())
    permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

    scores = Parallel(n_jobs=n_jobs)(
        delayed(my_cross_validation)(
            estimator=estimator, X=X, y=y, params=params, cv=cv
        )
        for params in permutations_dicts
    )

    return permutations_dicts[np.argmin(scores)]

In [22]:
def my_nested_cv(
    estimator,
    X,
    y,
    param_grid,
    inner_splits=5,
    outer_splits=5,
    inner_gap=3,
    outer_gap=3,
    test_size=30,
):
    cv_inner = TimeSeriesSplit(
        n_splits=inner_splits, gap=inner_gap, test_size=test_size
    )
    cv_outer = TimeSeriesSplit(
        n_splits=outer_splits, gap=outer_gap, test_size=test_size
    )

    history = []

    pointer = 0
    for train_index, test_index in cv_outer.split(X, y):
        pointer += 1
        print("NestedCV: {} of outer fold {}".format(pointer, cv_outer.get_n_splits()))
        x_train, x_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        params = my_grid_search_cv(
            estimator=estimator, X=X, y=y, param_grid=param_grid, cv=cv_inner, n_jobs=3
        )

        model = estimator(**params)

        if isinstance(estimator(), Prophet):
            df = pd.concat([x_train, y_train], axis=1)
            df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
            model.fit(df)

            df = deepcopy(x_test)
            df = df.rename(columns={"visit_date": "ds"})
            pred = model.predict(df)
            pred = pred[["yhat"]]
            pred[pred < 0] = 0

        else:
            model.fit(x_train, y_train)

            pred = model.predict(x_test)

        score = root_mean_squared_log_error(y_test, pred)

        print("Score:", score, "\n")
        history.append(score)

    print("Overall test performance: {}".format(np.mean(history)))

In [14]:
air_visit_data = pd.read_csv("air_visit_data.csv")
# data = (
#     air_visit_data.where(air_visit_data["air_store_id"] == "air_ba937bf13d40fb24")
#     .dropna()
#     .reset_index(drop=True)
# )
data = air_visit_data.drop(columns=["air_store_id"])
data

Unnamed: 0,visit_date,visitors
0,2016-01-13,25
1,2016-01-14,32
2,2016-01-15,29
3,2016-01-16,22
4,2016-01-18,6
...,...,...
252103,2017-04-18,6
252104,2017-04-19,6
252105,2017-04-20,7
252106,2017-04-21,8


In [15]:
X = data[["visit_date"]]
y = data[["visitors"]]

In [28]:
reg = Prophet

param_grid = {
    "seasonality_prior_scale": [0.01, 0.1],
}

my_nested_cv(reg, X, y, param_grid)

NestedCV: 1 of outer fold 5


18:03:46 - cmdstanpy - INFO - Chain [1] start processing
18:03:46 - cmdstanpy - INFO - Chain [1] start processing
18:04:56 - cmdstanpy - INFO - Chain [1] done processing
18:05:01 - cmdstanpy - INFO - Chain [1] start processing
18:05:21 - cmdstanpy - INFO - Chain [1] done processing
18:05:28 - cmdstanpy - INFO - Chain [1] start processing
18:06:59 - cmdstanpy - INFO - Chain [1] done processing
18:07:03 - cmdstanpy - INFO - Chain [1] done processing
18:07:04 - cmdstanpy - INFO - Chain [1] start processing
18:07:08 - cmdstanpy - INFO - Chain [1] start processing
18:08:59 - cmdstanpy - INFO - Chain [1] done processing
18:09:01 - cmdstanpy - INFO - Chain [1] done processing
18:09:03 - cmdstanpy - INFO - Chain [1] start processing
18:09:06 - cmdstanpy - INFO - Chain [1] start processing
18:10:20 - cmdstanpy - INFO - Chain [1] done processing
18:10:26 - cmdstanpy - INFO - Chain [1] start processing
18:11:01 - cmdstanpy - INFO - Chain [1] done processing
18:11:06 - cmdstanpy - INFO - Chain [1]

Score: 1.2058502650958414 

NestedCV: 2 of outer fold 5


18:13:43 - cmdstanpy - INFO - Chain [1] start processing
18:13:43 - cmdstanpy - INFO - Chain [1] start processing
18:15:00 - cmdstanpy - INFO - Chain [1] done processing
18:15:07 - cmdstanpy - INFO - Chain [1] start processing
18:15:25 - cmdstanpy - INFO - Chain [1] done processing
18:15:32 - cmdstanpy - INFO - Chain [1] start processing
18:17:08 - cmdstanpy - INFO - Chain [1] done processing
18:17:13 - cmdstanpy - INFO - Chain [1] done processing
18:17:14 - cmdstanpy - INFO - Chain [1] start processing
18:17:18 - cmdstanpy - INFO - Chain [1] start processing
18:19:20 - cmdstanpy - INFO - Chain [1] done processing
18:19:24 - cmdstanpy - INFO - Chain [1] done processing
18:19:26 - cmdstanpy - INFO - Chain [1] start processing
18:19:29 - cmdstanpy - INFO - Chain [1] start processing
18:20:50 - cmdstanpy - INFO - Chain [1] done processing
18:20:56 - cmdstanpy - INFO - Chain [1] start processing
18:21:33 - cmdstanpy - INFO - Chain [1] done processing
18:21:38 - cmdstanpy - INFO - Chain [1]

Score: 1.1406339584639853 

NestedCV: 3 of outer fold 5


18:24:35 - cmdstanpy - INFO - Chain [1] start processing
18:24:36 - cmdstanpy - INFO - Chain [1] start processing
18:25:48 - cmdstanpy - INFO - Chain [1] done processing
18:25:53 - cmdstanpy - INFO - Chain [1] start processing
18:26:16 - cmdstanpy - INFO - Chain [1] done processing
18:26:21 - cmdstanpy - INFO - Chain [1] start processing
18:27:52 - cmdstanpy - INFO - Chain [1] done processing
18:27:52 - cmdstanpy - INFO - Chain [1] done processing
18:27:56 - cmdstanpy - INFO - Chain [1] start processing
18:27:57 - cmdstanpy - INFO - Chain [1] start processing
18:29:55 - cmdstanpy - INFO - Chain [1] done processing
18:29:57 - cmdstanpy - INFO - Chain [1] done processing
18:30:00 - cmdstanpy - INFO - Chain [1] start processing
18:30:02 - cmdstanpy - INFO - Chain [1] start processing
18:31:22 - cmdstanpy - INFO - Chain [1] done processing
18:31:27 - cmdstanpy - INFO - Chain [1] start processing
18:31:59 - cmdstanpy - INFO - Chain [1] done processing
18:32:03 - cmdstanpy - INFO - Chain [1]

Score: 1.1460366032344298 

NestedCV: 4 of outer fold 5


18:34:47 - cmdstanpy - INFO - Chain [1] start processing
18:34:47 - cmdstanpy - INFO - Chain [1] start processing
18:35:59 - cmdstanpy - INFO - Chain [1] done processing
18:36:04 - cmdstanpy - INFO - Chain [1] start processing
18:36:25 - cmdstanpy - INFO - Chain [1] done processing
18:36:30 - cmdstanpy - INFO - Chain [1] start processing
18:38:02 - cmdstanpy - INFO - Chain [1] done processing
18:38:03 - cmdstanpy - INFO - Chain [1] done processing
18:38:06 - cmdstanpy - INFO - Chain [1] start processing
18:38:07 - cmdstanpy - INFO - Chain [1] start processing
19:28:43 - cmdstanpy - INFO - Chain [1] done processing
19:28:46 - cmdstanpy - INFO - Chain [1] start processing
19:28:46 - cmdstanpy - INFO - Chain [1] done processing
19:28:49 - cmdstanpy - INFO - Chain [1] start processing
19:30:00 - cmdstanpy - INFO - Chain [1] done processing
19:30:03 - cmdstanpy - INFO - Chain [1] start processing
19:30:37 - cmdstanpy - INFO - Chain [1] done processing
19:30:40 - cmdstanpy - INFO - Chain [1]

Score: 1.2764789171842608 

NestedCV: 5 of outer fold 5


19:33:09 - cmdstanpy - INFO - Chain [1] start processing
19:33:09 - cmdstanpy - INFO - Chain [1] start processing
19:34:44 - cmdstanpy - INFO - Chain [1] done processing
19:34:49 - cmdstanpy - INFO - Chain [1] start processing
19:35:16 - cmdstanpy - INFO - Chain [1] done processing
19:35:19 - cmdstanpy - INFO - Chain [1] start processing
19:36:43 - cmdstanpy - INFO - Chain [1] done processing
19:36:46 - cmdstanpy - INFO - Chain [1] start processing
19:36:47 - cmdstanpy - INFO - Chain [1] done processing
19:36:49 - cmdstanpy - INFO - Chain [1] start processing
19:38:39 - cmdstanpy - INFO - Chain [1] done processing
19:38:43 - cmdstanpy - INFO - Chain [1] start processing
19:38:47 - cmdstanpy - INFO - Chain [1] done processing
19:38:50 - cmdstanpy - INFO - Chain [1] start processing
19:40:08 - cmdstanpy - INFO - Chain [1] done processing
19:40:11 - cmdstanpy - INFO - Chain [1] start processing
19:40:23 - cmdstanpy - INFO - Chain [1] done processing
19:40:26 - cmdstanpy - INFO - Chain [1]

Score: 1.3479582954972338 

Overall test performance: 1.2233916078951503


In [26]:
from sklearn.dummy import DummyRegressor
reg = DummyRegressor

p_grid = {"strategy": ["mean", "median"]}

my_nested_cv(reg, X, y, p_grid)

NestedCV: 1 of outer fold 5
Score: 1.0774397158466047 

NestedCV: 2 of outer fold 5
Score: 0.8881659348731861 

NestedCV: 3 of outer fold 5
Score: 1.031048058112854 

NestedCV: 4 of outer fold 5
Score: 1.1055348968047596 

NestedCV: 5 of outer fold 5
Score: 1.133640608555693 

Overall test performance: 1.0471658428386195
