In [40]:
import itertools
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_log_error
from copy import deepcopy
from prophet import Prophet

import logging

logging.getLogger("prophet").setLevel(logging.CRITICAL)
logging.getLogger("cmdstanpy").setLevel(logging.CRITICAL)

In [41]:
from joblib import Parallel, delayed

In [42]:
import os

os.chdir("../data/raw")
os.getcwd()

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw'

In [43]:
def my_cross_validation(estimator, X, y, params, cv):
    scores = []
    pointer = 0

    for train_index, test_index in cv.split(X, y):
        model = estimator(**params)
        pointer += 1
        x_train, x_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        if isinstance(estimator(), Prophet):
            df = pd.concat([x_train, y_train], axis=1)
            df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
            model.fit(df)

            df = deepcopy(x_test)
            df = df.rename(columns={"visit_date": "ds"})
            pred = model.predict(df)
            pred = pred[["yhat"]]
            pred[pred < 0] = 0

        else:
            model.fit(x_train, y_train)

            pred = model.predict(x_test)

        score = root_mean_squared_log_error(y_test, pred)

        scores.append(score)

    model_score = np.mean(scores)

    return model_score

In [44]:
def my_grid_search_cv(estimator, X, y, param_grid, cv, n_jobs=1):
    keys, values = zip(*param_grid.items())
    permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

    scores = Parallel(n_jobs=n_jobs)(
        delayed(my_cross_validation)(
            estimator=estimator, X=X, y=y, params=params, cv=cv
        )
        for params in permutations_dicts
    )

    return permutations_dicts[np.argmin(scores)]

In [45]:
def my_nested_cv(
    estimator,
    X,
    y,
    param_grid,
    inner_splits=5,
    outer_splits=5,
    inner_gap=3,
    outer_gap=3,
    test_size=30,
):
    cv_inner = TimeSeriesSplit(
        n_splits=inner_splits, gap=inner_gap, test_size=test_size
    )
    cv_outer = TimeSeriesSplit(
        n_splits=outer_splits, gap=outer_gap, test_size=test_size
    )

    history = []

    pointer = 0
    for train_index, test_index in cv_outer.split(X, y):
        pointer += 1
        print("NestedCV: {} of outer fold {}".format(pointer, cv_outer.get_n_splits()))
        x_train, x_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        params = my_grid_search_cv(
            estimator=estimator, X=X, y=y, param_grid=param_grid, cv=cv_inner, n_jobs=-1
        )

        model = estimator(**params)

        if isinstance(estimator(), Prophet):
            df = pd.concat([x_train, y_train], axis=1)
            df = df.rename(columns={"visit_date": "ds", "visitors": "y"})
            model.fit(df)

            df = deepcopy(x_test)
            df = df.rename(columns={"visit_date": "ds"})
            pred = model.predict(df)
            pred = pred[["yhat"]]
            pred[pred < 0] = 0

        else:
            model.fit(x_train, y_train)

            pred = model.predict(x_test)

        score = root_mean_squared_log_error(y_test, pred)

        print("Score:", score, "\n")
        history.append(score)

    print("Overall test performance: {}".format(np.mean(history)))

In [46]:
air_visit_data = pd.read_csv("air_visit_data.csv")
data = (
    air_visit_data.where(air_visit_data["air_store_id"] == "air_ba937bf13d40fb24")
    .dropna()
    .reset_index(drop=True)
)
data = data.drop(columns=["air_store_id"])
data

Unnamed: 0,visit_date,visitors
0,2016-01-13,25.0
1,2016-01-14,32.0
2,2016-01-15,29.0
3,2016-01-16,22.0
4,2016-01-18,6.0
...,...,...
386,2017-04-18,11.0
387,2017-04-19,11.0
388,2017-04-20,14.0
389,2017-04-21,40.0


In [47]:
X = data[["visit_date"]]
y = data[["visitors"]]

In [48]:
reg = Prophet

param_grid = {
    "changepoint_prior_scale": [0.001, 0.01, 0.1, 0.5],
    "seasonality_prior_scale": [0.01, 0.1, 1.0, 10.0],
}

my_nested_cv(reg, X, y, param_grid)

NestedCV: 1 of outer fold 5


15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] start processing
15:58:21 - cmdstanpy - INFO - Chain [1] done processing
15:58:21 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
15:58:21 - cmdstanpy - INFO - Chain [1] done processing
15:58:21 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:21 - cmdstanpy - INFO - Chain [1] done processing
15:58:21 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abno

Score: 0.5590467271383507 

NestedCV: 2 of outer fold 5


15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] done processing
15:58:28 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:28 - cmdstanpy - INFO - Chain [1] done processing
15:58:28 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] start processing
15:58:28 - cmdstanpy - INFO - Chain [1] done processing
15:58:2

Score: 0.5696546826206523 

NestedCV: 3 of outer fold 5


15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] done processing
15:58:35 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:35 - cmdstanpy - INFO - Chain [1] start processing
15:58:35 - cmdstanpy - INFO - Chain [1] done processing
15:58:35 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:35 - cmdstanpy - INFO - Chain [1] done processing
15:58:35 - cmdstanpy - INFO - Chain [1] done processing
15:58:35

Score: 0.3571747724269832 

NestedCV: 4 of outer fold 5


15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] done processing
15:58:41 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:41 - cmdstanpy - INFO - Chain [1] start processing
15:58:41 - cmdstanpy - INFO - Chain [1] done processing
15:58:41 - cmdstanpy - INFO - Chain [1] done processing
15:58:41 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
15:58:41 - cmdstanpy - INFO - Chain [1] done processing
15:58:41 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnor

Score: 0.49526026387310573 

NestedCV: 5 of outer fold 5


15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] done processing
15:58:48 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] done processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - INFO - Chain [1] start processing
15:58:48 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
Optimization terminated abnormally. Falling back to Newton.
15:58:48 - cmdstanpy - INFO - Chain [1] done processing
15:58:4

Score: 0.3881454795816534 

Overall test performance: 0.4738563851281491


In [49]:
from sklearn.dummy import DummyRegressor
reg = DummyRegressor

p_grid = {"strategy": ["mean", "median"]}

my_nested_cv(reg, X, y, p_grid)

NestedCV: 1 of outer fold 5
Score: 0.7774891990598184 

NestedCV: 2 of outer fold 5
Score: 0.6801533092102668 

NestedCV: 3 of outer fold 5
Score: 0.8549704479338557 

NestedCV: 4 of outer fold 5
Score: 0.8783354107953781 

NestedCV: 5 of outer fold 5
Score: 0.826168015493569 

Overall test performance: 0.8034232764985776
