In [2]:
from datasetsforecast.m4 import M4
# Only the first object returned by M4.load is used; the remaining ones are discarded.
df_total = M4.load('./data', 'Hourly')[0]
df_total.head()

Unnamed: 0,unique_id,ds,y
0,H1,1,605.0
1,H1,2,586.0
2,H1,3,586.0
3,H1,4,559.0
4,H1,5,511.0


In [3]:
# Work on a handful of series only: the full dataset is too large for this example.
n_series = 8

# Cast the time column to plain integers before subsetting.
df_total['ds'] = df_total['ds'].astype(int)

# Keep the first `n_series` ids, in order of first appearance, and their rows.
uids = df_total['unique_id'].unique()[:n_series]
df = df_total.query('unique_id in @uids')

In [4]:
from statsforecast import StatsForecast
# Plot the selected series; plot_random=False disables random selection of series.
StatsForecast.plot(
    df,
    plot_random=False,
)

  from tqdm.autonotebook import tqdm


To generate the forecast, we’ll use the MSTL model, which is well-suited for data with multiple seasonalities like the hourly series used here. We first need to import it from statsforecast.models and then instantiate it. Since we’re using hourly data, we have two seasonal periods: one every 24 hours (daily) and one every 24*7 hours (weekly). Hence, we need to set season_length = [24, 24*7].

In [5]:
from statsforecast.models import MSTL

# Models to fit, each created with its instantiation parameters.
# MSTL receives one season length per seasonal pattern, expressed in hours.
models = [
    MSTL(
        season_length=[
            24,      # daily cycle
            24 * 7,  # weekly cycle
        ],
    ),
]

In [6]:
# Bundle the data, the models and the series frequency into one forecaster.
# n_jobs=-1 asks for parallel processing on all available cores.
sf = StatsForecast(df=df, models=models, freq='H', n_jobs=-1)

In [7]:
# Forecast settings: number of future steps and prediction-interval levels (in percent).
horizon = 48
levels = [99]

# Use `horizon` rather than repeating the literal, so changing it above actually
# changes the forecast length. fitted=True stores the in-sample predictions, which
# are retrieved later with forecast_fitted_values().
fcst = sf.forecast(h=horizon, level=levels, fitted=True)
fcst = fcst.reset_index()
fcst.head()

Unnamed: 0,unique_id,ds,MSTL,MSTL-lo-99,MSTL-hi-99
0,H1,749,615.94397,597.66217,634.225708
1,H1,750,559.297791,531.31665,587.278931
2,H1,751,515.693542,479.151337,552.235718
3,H1,752,480.719269,436.241547,525.197021
4,H1,753,467.146484,415.199738,519.093262


In [8]:
# Plot the observed series together with their forecasts.
StatsForecast.plot(
    df,
    fcst,
    plot_random=False,
)

In this example, an anomaly is any observation that falls outside the prediction interval of the in-sample forecasts for a given confidence level (here we selected 99%). Hence, we first need to recover the in-sample forecasts using the forecast_fitted_values method, which is available because forecast was called with fitted = True.

In [9]:
# Retrieve the in-sample (fitted) forecasts, then turn the index into regular columns.
insample_forecasts = sf.forecast_fitted_values()
insample_forecasts = insample_forecasts.reset_index()
insample_forecasts.head()

Unnamed: 0,unique_id,ds,y,MSTL,MSTL-lo-99,MSTL-hi-99
0,H1,1,605.0,604.9245,588.010376,621.838623
1,H1,2,586.0,585.221802,568.307678,602.135925
2,H1,3,586.0,589.740723,572.826599,606.654846
3,H1,4,559.0,557.778076,540.863953,574.6922
4,H1,5,511.0,506.747009,489.832886,523.661133


In [10]:
# Flag every observation that lies on or beyond the bounds of the 99% prediction
# interval of the in-sample forecasts.
observed = insample_forecasts['y']
above = observed.ge(insample_forecasts['MSTL-hi-99'])  # at or over the upper bound
below = observed.le(insample_forecasts['MSTL-lo-99'])  # at or under the lower bound

anomalies = insample_forecasts.loc[above | below]
print(anomalies.shape)
anomalies.head()

(120, 6)


Unnamed: 0,unique_id,ds,y,MSTL,MSTL-lo-99,MSTL-hi-99
168,H1,169,813.0,779.849792,762.935669,796.763916
279,H1,280,692.0,672.638123,655.723999,689.552246
289,H1,290,770.0,792.015442,775.101318,808.929565
308,H1,309,844.0,867.809387,850.895203,884.723511
336,H1,337,853.0,822.427002,805.512878,839.341187


In [12]:
# Plot the in-sample forecasts with the anomalous observations marked.
StatsForecast.plot(
    insample_forecasts,
    plot_random=False,
    plot_anomalies=True,
)