In [None]:
# Enable IPython's autoreload extension in mode 2: all modules are reloaded
# before each cell runs, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import pandas as pd
from loguru import logger

# Replace loguru's existing handlers with a single stderr sink at DEBUG level
# and switch on log records emitted from the `probafcst` package.
logger.remove()
logger.enable("probafcst")
logger.add(sys.stderr, level="DEBUG")

In [None]:
# Energy data: reindex to an hourly frequency (gaps become NaN rows), drop
# every row containing a NaN, and keep observations from 2021 onwards.
data = (
    pd.read_parquet("../data/energy.parquet")
    .asfreq("h")
    .dropna()
    .loc["2021":]
)
# Target as a single-column frame; all remaining columns are the features.
y = data["load"].to_frame()
X = data.drop(columns="load")

In [None]:
# Peek at the five most recent rows of the energy load target.
y.tail(5)

In [None]:
from probafcst.utils.tabularization import create_lagged_features

# Bike-count data: reindex to a daily frequency (gaps become NaN rows) and
# drop every row containing a NaN.
bikes = (
    pd.read_parquet("../data/bikes.parquet")
    .asfreq("D")
    .dropna()
)
# NOTE: `y` and `X` are rebound here, replacing the energy data loaded earlier.
y = bikes["bike_count"].to_frame()
X = bikes.drop(columns="bike_count")

# Build the tabular feature frame for a single lag of 24 steps with seasonal
# dummies only; the second return value is discarded.
# NOTE(review): the exact effect of `is_training=False` is defined in
# probafcst.utils.tabularization -- confirm there.
result, _ = create_lagged_features(
    X,
    y["bike_count"],
    lags=[24],
    is_training=False,
    include_seasonal_dummies=True,
    include_rolling_stats=False,
    cyclical_encodings=False,
)

In [None]:
# Peek at the five most recent rows of the cleaned bikes frame.
bikes.tail(5)

In [None]:
# Summarise the lagged-feature output (assumes `result` is a DataFrame:
# prints its columns, dtypes and non-null counts).
result.info()

In [None]:
from probafcst.models.linear_qr import LinearQuantileForecaster

In [None]:
from sklearn.model_selection import train_test_split
from sktime.forecasting.base import ForecastingHorizon

from probafcst.models.xgboost import XGBQuantileForecaster

# Quantile levels to forecast: the median plus the bounds of the central
# 50% (0.25-0.75) and 95% (0.025-0.975) prediction intervals.
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]

# Chronological hold-out: shuffle=False keeps time order, so the final 168
# rows form the test window.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=168, shuffle=False)

# NOTE(review): `y`/`X` hold the daily bikes data at this point, while these
# lags look like hourly multiples (1 day / 2 days / 1 week). The alternative
# below looks like the daily variant -- confirm which set is intended.
# lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
lags = [24, 48, 168]

# Absolute forecasting horizon covering exactly the test-window timestamps.
fh = ForecastingHorizon(y_test.index, is_relative=False)

# Hyper-parameters handed to the forecaster through its `kwargs` argument.
xgb_kwargs = dict(
    n_jobs=-1,
    # early_stopping_rounds=50,
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
)
model = XGBQuantileForecaster(
    lags=lags,
    quantiles=quantiles,
    include_seasonal_dummies=True,
    cyclical_encodings=True,
    X_lag_cols=None,
    kwargs=xgb_kwargs,
)
# Fit here: the XGBoost feature-importance cell further down reads
# `model.model.feature_importances_` and `model.feature_names_in_`, which are
# presumably only available on a fitted forecaster. With the fit call
# commented out, that cell cannot run on a fresh kernel (Restart & Run All).
model.fit(y_train, X_train)

In [None]:
# Linear quantile-regression forecaster sharing the lags and quantile levels
# defined above; seasonal dummies are the only optional features switched on
# and an empty list is passed for the lagged exogenous columns.
qr_forecaster = LinearQuantileForecaster(
    quantiles=quantiles,
    lags=lags,
    X_lag_cols=[],
    include_seasonal_dummies=True,
    include_rolling_stats=False,
    cyclical_encodings=False,
)
qr_forecaster.fit(y_train, X_train)

In [None]:
from probafcst.plotting import plot_quantiles

# Quantile forecasts from the linear model over the test horizon `fh`.
# NOTE(review): the full `X` is passed rather than `X_test` -- presumably so
# the forecaster can build lagged features from history; confirm.
y_pred = qr_forecaster.predict_quantiles(fh, X=X, alpha=quantiles)

# Plot the hold-out observations against the predicted quantiles.
plot_quantiles(y_test, y_pred)

In [None]:
# Pair the underlying estimator's importances with the forecaster's feature
# names. `model` must already be fitted for these attributes to be readable
# (presumably they are populated by `fit` -- confirm).
feature_importances = pd.Series(
    data=model.model.feature_importances_,
    index=model.feature_names_in_,
)
# Horizontal bars for the 25 largest importances, sorted ascending.
feature_importances.nlargest(25).sort_values().plot.barh()

In [None]:
from probafcst.models.lgbm import LGBMQuantileForecaster

# Small LightGBM quantile forecaster (two lags, ten estimators, quiet logs).
# NOTE: this rebinds `model`, replacing the XGBoost forecaster defined above.
model = LGBMQuantileForecaster(
    quantiles=quantiles,
    lags=[1, 2],
    lgbm_kwargs={"verbose": 0, "n_estimators": 10},
)

# Train on the training window, then forecast the quantiles over `fh`.
model.fit(y_train, X_train)
y_pred = model.predict_quantiles(fh, X, alpha=quantiles)

In [None]:
# model.model.best_iteration, model.model.best_score

In [None]:
from probafcst.utils.time import get_next_wednesday

# Display the helper's result for the current run (presumably the upcoming
# Wednesday -- confirm in probafcst.utils.time).
get_next_wednesday()

In [None]:
# Importances of whichever forecaster `model` currently refers to, labelled
# with the forecaster's feature names.
feature_importances = pd.Series(
    data=model.model.feature_importances_,
    index=model.feature_names_in_,
)
# Horizontal bars for the 10 largest importances, sorted ascending.
feature_importances.nlargest(10).sort_values(ascending=True).plot.barh()

In [None]:
from sktime.performance_metrics.forecasting.probabilistic import (
    ConstraintViolation,
    EmpiricalCoverage,
    PinballLoss,
)

In [None]:
# Re-create the absolute test horizon and forecast with the current `model`.
fh = ForecastingHorizon(y_test.index, is_relative=False)
y_pred = model.predict_quantiles(fh, X, alpha=quantiles)

# Score the quantile forecasts against the hold-out window with each metric.
pinball_loss, empirical_coverage, constraint_violation = (
    metric(y_test, y_pred)
    for metric in (PinballLoss(), EmpiricalCoverage(), ConstraintViolation())
)
print(
    f"Pinball loss: {pinball_loss}",
    f"Empirical coverage: {empirical_coverage}",
    f"Constraint violation: {constraint_violation}",
    sep="\n",
)

In [None]:
from probafcst.backtest import backtest, get_window_params

# Window settings for daily data: 90-day steps and 7-day forecasts.
# NOTE(review): the meaning of the first positional argument (3) is defined
# by get_window_params in probafcst.backtest -- confirm there.
window_params = get_window_params(
    3,
    step_length_days=90,
    forecast_steps_days=7,
    freq="D",
)
# Backtest the current `model` on the full series with a sliding splitter,
# using the "loky" backend.
results = backtest(
    model,
    y,
    X=X,
    quantiles=quantiles,
    splitter_type="sliding",
    backend="loky",
    **window_params,
)

In [None]:
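# import shap

# NOTE: if this SHAP analysis is re-enabled, build `features` (the
# create_lagged_features call below) before constructing the explainer,
# because the explainer is given `data=features`.
# explainer = shap.TreeExplainer(
#     model.model, feature_names=model.feature_names_in_, data=features
# )
# features, labels = create_lagged_features(
#     X, y["bike_count"], lags=lags, include_seasonal_dummies=True, is_training=True
# )
# features = features.loc[y_test.index]
# labels = labels.loc[y_test.index]
# features.head()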

In [None]:
# Feature names recorded on the current forecaster (the LightGBM `model`
# when the cells are run top to bottom).
model.feature_names_in_

In [None]:
# shap_values = explainer(features, labels)

In [None]:
# averaged = shap_values.values.mean(axis=2)  # noqa: PD011
# averaged.shape

In [None]:
# shap.summary_plot(averaged, features)