# SARIMAX
- Autoregression model where the value at index t depends on the value at index value t - 96

In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from analysis.datasets import load_entsoe
from analysis.transformations import scale_power_data, add_lagged_features, add_interval_index
from analysis.splits import to_train_validation_test_data

In [2]:
feature_columns = ['power_t-96']
target_column='power'

entsoe = load_entsoe()
entsoe = scale_power_data(entsoe)
entsoe = add_lagged_features(entsoe)
entsoe = add_interval_index(entsoe)
entsoe.dropna(inplace=True)

train_end = "2022-12-31 23:45:00"
#train_end = "2016-01-31 23:45:00"
#train_end = "2016-12-31 23:45:00"


validation_end = "2023-12-31 23:45:00"
#validation_end = "2023-12-31 23:45:00"
#validation_end = "2017-12-31 23:45:00"

train, validation, test = to_train_validation_test_data(entsoe, train_end, validation_end)
X_train, y_train = train[feature_columns], train[target_column]
X_validation, y_validation = validation[feature_columns], validation[target_column]

Data loaded and transformed successfully. Shape of DataFrame: (78912, 22)
# of training observations: 245376 | 77.76%
# of validation observations: 35040 | 11.10%
# of test observations: 35133 | 11.13%


In [4]:
y_train_array = y_train.values
y_train_array = y_train_array.astype(np.float32)  # Downcast to float32


In [10]:
y_validation

time
2023-01-01 00:00:00   -0.539165
2023-01-01 00:15:00   -0.518601
2023-01-01 00:30:00   -0.555433
2023-01-01 00:45:00   -0.509552
2023-01-01 01:00:00   -0.552363
                         ...   
2023-12-31 22:45:00   -0.756651
2023-12-31 23:00:00   -0.740979
2023-12-31 23:15:00   -0.755649
2023-12-31 23:30:00   -0.747790
2023-12-31 23:45:00   -0.726035
Name: power, Length: 35040, dtype: float64

In [128]:
y_train_array

array([-3.3242364, -3.2381785, -3.171487 , ..., -1.1947993, -1.1830175,
       -1.1789933], dtype=float32)

vielleicht apply Methode von model für die prediction

In [None]:
model = SARIMAX(endog=y_train_array,
                order=(0,0,0),
                seasonal_order=(1,0,0,96), trend_offset=0)
model.ssm.memory_no_filtered = True
model.ssm.memory_no_gain = True
model.ssm.memory_no_smoothing = True
model.ssm.memory_no_std_forecast = True

result = model.fit(low_memory=True)

print(result.summary())

                                SARIMAX Results                                 
Dep. Variable:                        y   No. Observations:               245376
Model:             SARIMAX(1, 0, 0, 96)   Log Likelihood             -384385.614
Date:                  Wed, 19 Mar 2025   AIC                         768775.228
Time:                          16:15:40   BIC                         768796.049
Sample:                               0   HQIC                        768781.302
                               - 245376                                         
Covariance Type:                 approx                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L96       0.8663      0.001    860.802      0.000       0.864       0.868
sigma2         1.3426      0.004    350.272      0.000       1.335       1.350
Ljung-Box (L1) (Q):              243

In [8]:
result.apply(y_validation)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


<statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x2c6c550a6f0>

In [16]:
len(y_train) + 2880

248256

In [17]:
248256 - 245376 

2880

In [20]:
X_validation.index[2880]

Timestamp('2023-01-31 00:00:00')

In [37]:
X_validation.iloc[2880-96] * result.params[0]

power_t-96   -1.415892
Name: 2023-01-30 00:00:00, dtype: float64

In [35]:
exog_validation = X_validation.iloc[2880:2891].values  # 11 steps (from 2880 to 2890), one for each forecast


In [41]:
y_train_array


array([-3.3242364 , -3.2381785 , -3.171487  , ..., -0.54807967,
       -0.5207769 , -0.54981107], dtype=float32)

In [42]:
y_train.shape

(245376,)

In [53]:
-0.549*0.8663

-0.47559870000000004

In [46]:
result.predict?

[1;31mSignature:[0m
[0mresult[0m[1;33m.[0m[0mpredict[0m[1;33m([0m[1;33m
[0m    [0mstart[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mend[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdynamic[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0minformation_set[0m[1;33m=[0m[1;34m'predicted'[0m[1;33m,[0m[1;33m
[0m    [0msignal_only[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
predict(self, start=None, end=None, dynamic=False, information_set='predicted', signal_only=False, **kwargs)

In-sample prediction and out-of-sample forecasting

Parameters
----------
start : {int, str,datetime}, optional
    Zero-indexed observation number at which to start forecasting,
    i.e., the first forecast is start. Can also be a date string to
    parse or a datetime type. Default is the zeroth observation.
end : {int

In [60]:
result.forecast(steps=100000)

MemoryError: Unable to allocate 6.87 GiB for an array with shape (96, 96, 100001) and data type float64

In [None]:
result.predict(start=0, end=10, exog=y_train_array)

array([ 0.00000000e+00, -2.14080707e-13, -1.14485313e-13, -3.63602807e-13,
       -4.55942043e-13, -3.91198519e-13, -3.68881079e-13, -2.75452492e-13,
       -2.03141404e-13, -1.81327367e-13,  3.96616796e-14])

In [36]:
mus = result.predict(start=len(y_train) + 2880, end=len(y_train) + 2890, exog=exog_validation)
sigma = np.sqrt(result.params[1])
mus

array([-0.00615951, -0.00611883, -0.00652127, -0.00668328, -0.00688716,
       -0.00709087, -0.00741817, -0.00782718, -0.00773889, -0.00813743,
       -0.00804814])

In [None]:
from scipy.stats import norm
from scipy.stats import lognorm

In [7]:
z = (y_validation - mus) / sigma

crps_score = sigma * ( 
    z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi)
)

# Get mean CRPS
mean_crps = np.mean(crps_score)

print(f"Mean CRPS: {mean_crps:.4f}")

NameError: name 'norm' is not defined

In [None]:
nll_hand = []

for i in range(0, y_validation.shape[0]):
    nll= - norm.logpdf(y_validation[i], loc=mus[i], scale=sigma)
    nll_hand.append(nll)

np.mean(nll_hand)

  nll= - norm.logpdf(y_validation[i], loc=mus[i], scale=sigma)


3.1238171828569565

In [None]:
#y_validation_array = y_validation.values
#y_validation_array = y_validation_array.astype(np.float32)  # Downcast to float32
#y_validation_array[0]

-1.1723222

In [None]:
y_validation.index[0]

Timestamp('2017-01-01 00:00:00')

In [None]:
type(y_train.iloc[-96])

numpy.float64

In [None]:
len(y_train)

35040

In [None]:
y_train.index[-96]

Timestamp('2016-12-31 00:00:00')

In [None]:
y_train.index[len(y_train) - 1]

Timestamp('2016-12-31 23:45:00')

SARIMAX
1. "start" tells the model where the prediction begins. The model has been trained on training data and it knows what the last entry is. SARIMAX automatically infers that the next time step is 2017-01-01 00:00:00 based on the spacing of y_train.
2. Since the model was trained with seasonal_order=(1,0,0,96), it learned a relationship between y(t) and y(t-96).
3. SARIMAX looks back 96 time steps (24h ago) from the "start" parameter and multiplies that value by a learned coefficient (ϕ₁).
4. The "end" parameter tells the model where the prediction stops

e.g. if y_train ends 2016-12-31 23:45 and you want to make a single prediction of the power of the next 15-minute time interval i.e. 2017-01-01 00:00
- start = length of y_train
- end = len(y_train)
- Power_2017-01-01 00:00 = Power_shifted_by_96 (Power_2016-12-31 00:00) * ϕ₁

In [None]:
exog_validation = y_train.iloc[-96]  # Take the last value in y_train as the regressor corresponds to '2016-12-31 00:00:00'

# y_validation.index[0] = Timestamp('2017-01-01 00:00:00')

# Predict the first value in y_validation using the model
mu = result.predict(start=len(y_train), end=len(y_train) + 2)#, exog=[exog_validation])
    #start = predict the next value after training data
    #end = The time index (or date) where prediction stops

# Print the prediction
print(mu)

[-1.19257455 -1.18142727 -1.16496504]


In [None]:
y_train.index[-1]

Timestamp('2016-12-31 23:45:00')

In [None]:
y_train.index[-96]

Timestamp('2016-12-31 00:00:00')

In [None]:
y_train.index[-95]

Timestamp('2016-12-31 00:15:00')

In [None]:
y_train.index[-94]

Timestamp('2016-12-31 00:30:00')

In [None]:
mu_test = y_train.iloc[-96] * result.params[0] #ar.S.L96 = 0.8850
# power at 2017-01-01 00:00:00 = power value at 2016-12-31 00:00:00 * learned parameter
mu_test

-1.1925745941492316

In [None]:
print(y_train.iloc[-96] * result.params[0]) # power at 2016-12-31 00:00:00 * learned parameter
print(y_train.iloc[-95] * result.params[0]) # power at 2016-12-31 00:15:00 * learned parameter
print(y_train.iloc[-94] * result.params[0]) # power at 2016-12-31 00:30:00 * learned parameter

-1.1925745941492316
-1.1814272185858108
-1.1649650821269129


In [None]:
print(X_validation.iloc[0] * result.params[0]) # first entry in X validation = y_train.iloc[-96]
print(X_validation.iloc[1] * result.params[0])
print(X_validation.iloc[2] * result.params[0])

power_t-96   -1.192575
Name: 2017-01-01 00:00:00, dtype: float64
power_t-96   -1.181427
Name: 2017-01-01 00:15:00, dtype: float64
power_t-96   -1.164965
Name: 2017-01-01 00:30:00, dtype: float64


In [None]:
np.array(X_validation["power_t-96"])

-1.3475262390582907

In [None]:
np.array(X_validation).squeeze().squeeze()

array([-1.34752624, -1.33493048, -1.31632941, ..., -0.31229626,
       -0.3231264 , -0.33959475])

In [None]:
y_pred_mean = result.get_prediction(start=start, end=end).predicted_mean
y_pred_mean

MemoryError: Unable to allocate 2.41 GiB for an array with shape (96, 96, 35040) and data type float64

In [None]:
start_date = y_validation.index[0]
end_date = y_validation.index[-1]

interval_length = pd.Timedelta(weeks=12)

y_pred = []

for start in pd.date_range(start=start_date, end=end_date, freq=interval_length):

    end = min(start + interval_length, end_date) #prevent exceeding the total validation period (ensure the end date is capped at end_date)
    
    pred = result.predict(start=start, end=end)
    
    y_pred.extend(pred)

y_pred = np.array(y_pred)

MemoryError: Unable to allocate 567. MiB for an array with shape (96, 96, 8065) and data type float64

# CRPS

In [44]:
from scipy.stats import norm
from scipy.stats import lognorm

## per hand

In [46]:
y_validation[0]

  y_validation[0]


-1.1723221500332002

In [57]:
mu[0]

-1.3475262

In [60]:
#np.sqrt(result.params["sigma2"])

np.sqrt(result.params[1])

1.1730858396600932

In [62]:
mu = X_alidation_array[0]
single_nll = - norm.logpdf(y_validation[0], loc=mu[0], scale=np.sqrt(result.params[1]))
single_nll

  single_nll = - norm.logpdf(y_validation[0], loc=mu[0], scale=np.sqrt(result.params[1]))


1.0897294586014523

In [None]:
import properscoring as ps

crps_ps = ps.crps_gaussian(y_validation, mu=y_pred, sig=sigma)
mean_crps = np.mean(crps_ps)  # Average over all time steps

print(f"Mean CRPS: {mean_crps:.4f}")

Mean CRPS: 1.1446


In [29]:
mu = y_pred
sigma = np.sqrt(result.params["sigma2"])

z = (y_validation - y_pred) / sigma

crps_score = sigma * ( 
    z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi)
)

# Get mean CRPS
mean_crps = np.mean(crps_score)

print(f"Mean CRPS: {mean_crps:.4f}")

Mean CRPS: 1.1446


## with properscoring

In [33]:
import properscoring as ps

crps_ps = ps.crps_gaussian(y_validation, mu=y_pred, sig=sigma)
mean_crps = np.mean(crps_ps)  # Average over all time steps

print(f"Mean CRPS: {mean_crps:.4f}")

Mean CRPS: 1.1446


# NLL

In [34]:
nll = -result.llf  # llf = log-likelihood function value
print(f"NLL: {nll:.4f}")

NLL: 3867.9798


In [25]:
X_train.iloc[96+2]

power_t-96   -3.171487
Name: 2016-01-03 00:30:00, dtype: float64

In [18]:
X_train

Unnamed: 0_level_0,power_t-96
time,Unnamed: 1_level_1
2016-01-02 00:00:00,-2.465104
2016-01-02 00:15:00,-2.499602
2016-01-02 00:30:00,-2.485377
2016-01-02 00:45:00,-2.451358
2016-01-02 01:00:00,-2.405335
...,...
2016-12-31 22:45:00,-1.397653
2016-12-31 23:00:00,-1.400990
2016-12-31 23:15:00,-1.385354
2016-12-31 23:30:00,-1.372740


In [24]:
y_train.iloc[2]

-3.171487202624057

MemoryError: Unable to allocate 2.41 GiB for an array with shape (35040, 96, 96) and data type float64

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model_lin = LinearRegression(fit_intercept=False)
model_lin.fit(X_train, y_train)

y_pred = model_lin.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
print("ARIMA(1,0,0)")
print("Coefficients:", model_lin.coef_)
print("Intercept:", model_lin.intercept_)
print(f"p_t = {model_lin.intercept_} + {model_lin.coef_[0]} * p_t-1 + {model_lin.intercept_} * t")
print("Mean Squared Error (MSE):", mse)

ARIMA(1,0,0)
Coefficients: [0.83832727]
Intercept: 0.0
p_t = 0.0 + 0.8383272710424725 * p_t-1 + 0.0 * t
Mean Squared Error (MSE): 0.9543466840203708


In [35]:
print(model_lin.coef_)
print(model_lin.intercept_)
#print(model_lin.score())


[0.40095893]
-1.0539988781295424


In [8]:
y_train.shape

(2880,)

In [10]:
y_pred.shape

(2688,)

In [9]:
X_train.shape

(2880, 1)