### Conduct DM test comparing Naive forecast to each model

In [11]:
from dieboldmariano import dm_test
import pandas as pd
from pathlib import Path

### Helper function

In [None]:
def dm_against_naive(path_model_csv, path_naive_csv, loss="mae"):
    # load the two CSVs (must have timestamp index and columns actual,prediction)
    m = pd.read_csv(Path(path_model_csv), index_col="timestamp", parse_dates=True)
    n = pd.read_csv(Path(path_naive_csv), index_col="timestamp", parse_dates=True)

    # align on timestamps, rename for clarity
    df = (
        m[["actual", "prediction"]]
        .rename(columns={"prediction": "pred_model"})
        .join(
            n[["prediction"]]
            .rename(columns={"prediction": "pred_naive"}),
            how="inner",
        )
        .dropna()
    )

    # pull arrays
    y  = df["actual"].to_numpy() # actual value
    p1 = df["pred_naive"].to_numpy() # baseline value
    p2 = df["pred_model"].to_numpy() # model value

    # choose loss function
    if loss == "mae":
        loss_fn = lambda u, v: abs(u - v)
    else:
        loss_fn = lambda u, v: (u - v) ** 2

    # run Diebold–Mariano test
    stat, p_value = dm_test(
        y, p1, p2,
        loss=loss_fn,
        one_sided=True,
        harvey_correction=True,
        variance_estimator="bartlett"
    )

    return stat, p_value, len(df)

Ideal result:
stat ≈ 2.5,  p ≈ 0.006,  T = 52

P value tells how significant (confident) the differences is, while DM stats tell us which one is better (lower loss)

if stats > 2.5 and p value < 0.05 then strong evidence current model beat baseline

if stats is negative and p value < 0.05 then strong evidence current model does not beat baseline

### ARIMA

In [38]:
stat, p, T = dm_against_naive(
    "results/ARIMA_test_2017.csv", # current model
    "results/Baseline_test_2017.csv",
    loss="mae"
)
print(f"stat={stat:.3f}, p={p:.3g}, T={T}")

stat=-12.421, p=2.41e-17, T=52


In [39]:
stat, p, T = dm_against_naive(
    "results/ARIMA_test_2023.csv", # current model
    "results/Baseline_test_2023.csv",
    loss="mae"
)
print(f"stat={stat:.3f}, p={p:.3g}, T={T}")

InvalidParameterException: Forecast horizon cannot be greater than the number of predictions.

In [37]:
stat, p, T = dm_against_naive(
    "results/ARIMA_backtest.csv", # current model
    "results/Baseline_backtest.csv",
    loss="mae"
)
print(f"stat={stat:.3f}, p={p:.3g}, T={T}")

InvalidParameterException: Forecast horizon cannot be greater than the number of predictions.

### ETS

In [35]:
stat, p, T = dm_against_naive(
    "results/ETS_test_2017.csv", # baseline model
    "results/Baseline_test_2017.csv",
    loss="mae"
)
print(f"stat={stat:.3f}, p={p:.3g}, T={T}")

# If P value > 0.05, so we fail to reject the null hypothesis -> the model is not better than the baselines

stat=-38.879, p=7.11e-40, T=52


### Test code

In [2]:
T = [10, 20, 30, 40, 50]
F = [11, 21, 29, 42, 53]
G = [13, 26, 24, 40, 59]

dm_test(T, F, G, one_sided=True)
# (-2.2229922805746782, 0.04515565862099125)

(-2.2229922805746782, 0.04515565862099125)