In [88]:
# for reloading the editable module
%load_ext autoreload
%autoreload 2

from pathlib import Path
from dotenv import load_dotenv
import holidays
from datetime import timezone, datetime
import numpy as np
import pandas as pd



from probabilistic_load_forecast.application.services import (
    GetActualLoadData,
    GetERA5DataFromDB,
)



# Import adapters for data access
from probabilistic_load_forecast.adapters.db import (
    EntsoePostgreRepository,
    Era5PostgreRepository,
)

# Import configuration
from probabilistic_load_forecast import config

# Load environment variables from the .env file located two directories above
# the notebook's working directory. Abort immediately when load_dotenv reports
# a falsy result, so that later cells do not run without configuration.
env_file = Path("../..") / ".env"
env_loaded = load_dotenv(env_file)
if not env_loaded:
    raise FileNotFoundError("Could not open the .env file.")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
# EXPERIMENT_NAME = "fs_03_load_calendar_weather"

# Feature-group switches read by the cells below:
#   INCLUDE_WEATHER         - preprocess ERA5 and merge it with the load series
#   INCLUDE_FUTURE_WEATHER  - add "<var>_future" columns (weather shifted ahead)
#   INCLUDE_CALENDAR        - add the is_weekday / is_holiday flags
INCLUDE_WEATHER = True
INCLUDE_FUTURE_WEATHER = True
INCLUDE_CALENDAR = True

# Directory that receives the combined dataset; created (with parents) on demand.
OUTPUT_PATH = Path("../../data/processed")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [90]:
# Initialize database repositories.
# Both repositories are built from the PostgreSQL URI supplied by `config`.
# On failure a hint is printed and the exception is re-raised: swallowing it
# would leave `load_repo` / `era5_repo` undefined and make the next cell fail
# with an unrelated NameError.
try:
    # Load data repository
    load_repo = EntsoePostgreRepository(config.get_postgre_uri())

    # ERA5 weather data repository
    era5_repo = Era5PostgreRepository(config.get_postgre_uri())
except Exception as e:
    print(f"Error initializing repositories: {e}")
    print("Make sure your .env file contains the correct database credentials")
    raise
else:
    print("Database repositories initialized successfully")

Database repositories initialized successfully


In [91]:
# Wrap each repository in its application service. The resulting objects are
# called like functions in the data-loading cell.
get_era5_data = GetERA5DataFromDB(era5_repo)  # ERA5 weather variables
get_load_data = GetActualLoadData(load_repo)  # actual load time series

print("Services initialized successfully")

Services initialized successfully


In [92]:
# Query window, expressed as timezone-aware UTC datetimes (midnight on both ends).
start = datetime(2018, 10, 1, tzinfo=timezone.utc)
end = datetime(2025, 10, 10, tzinfo=timezone.utc)

# ERA5 variables requested for country code "AT". `era5_data` is consumed
# below as a mapping of variable name -> time series.
era5_variables = ["t2m", "u10", "v10", "ssrd", "tp"]
era5_query = {
    "variables": era5_variables,
    "country_code": "AT",
    "start": start,
    "end": end,
}
era5_data = get_era5_data(**era5_query)

# Actual load for the same window; the series itself lives in `.data`.
actual_load_ts = get_load_data(start, end)

# Empty placeholder; replaced by the merge step further down.
data_combined = pd.DataFrame()

## Preprocess Time Series

In [93]:
# Make sure the timestamps are all sorted
actual_load_ts.data = actual_load_ts.data.sort_index()
era5_data = {var: ts.sort_index() for var, ts in era5_data.items()}

# Replace the load's PeriodIndex with timezone-aware UTC timestamps taken at
# the start of each period. The isinstance guard mirrors the ERA5 handling and
# keeps this cell idempotent: once converted, the index is a DatetimeIndex
# (which has no `to_timestamp`), so an unguarded re-run would raise.
if isinstance(actual_load_ts.data.index, pd.PeriodIndex):
    actual_load_ts.data.index = (
        actual_load_ts.data.index
        .to_timestamp(how="start")
        .tz_localize("UTC")
    )

In [94]:
if INCLUDE_WEATHER:
    # NOTE(review): dividing by 3600 presumes the period-indexed variables
    # (ssrd and tp) are totals over one-hour periods - confirm.
    SECONDS_PER_HOUR = 3600
    load_index = actual_load_ts.data.index

    # Period-indexed series hold totals per period: turn them into a rate and
    # move them onto UTC timestamps at the period start. Series that are not
    # period-indexed pass through unchanged.
    era5_utc = {}
    for var, ts in era5_data.items():
        if isinstance(ts.index, pd.PeriodIndex):
            rate = ts / SECONDS_PER_HOUR
            utc_index = rate.index.to_timestamp(how="start").tz_localize("UTC")
            ts = rate.set_axis(utc_index)
        era5_utc[var] = ts
    era5_data = era5_utc

    # Forward-fill every weather series onto the 15-minute load index.
    era5_ffilled = {}
    for var, ts in era5_data.items():
        era5_ffilled[var] = ts.reindex(load_index, method="ffill")

    # One column per ERA5 variable, named after its key.
    era5_df = pd.concat(era5_ffilled.values(), axis=1)
    era5_df.columns = era5_ffilled.keys()

    # Wind speed is the magnitude of the (u10, v10) vector; the components
    # themselves are dropped afterwards.
    u10 = era5_df["u10"]
    v10 = era5_df["v10"]
    era5_df["wind_speed"] = np.sqrt(u10**2 + v10**2)
    era5_df = era5_df.drop(columns=["u10", "v10"])

## Merge ERA5 Weather and Load Time Series

In [95]:
# Assemble the modelling frame: the load series plus, optionally, the weather
# features aligned on the same index.
# pd.concat always returns a new object, so `data_combined` never aliases
# `actual_load_ts.data`; feature columns added in later cells therefore cannot
# leak back into the source series when weather is disabled.
frames = [actual_load_ts.data]
if INCLUDE_WEATHER:
    frames.append(era5_df)
data_combined = pd.concat(frames, axis=1)

In [96]:
if INCLUDE_FUTURE_WEATHER:
    # The "*_future" features are shifted copies of the weather columns, so
    # they cannot be built when the weather block is disabled. Fail with an
    # explicit message instead of a bare KeyError on the first missing column.
    if not INCLUDE_WEATHER:
        raise ValueError(
            "INCLUDE_FUTURE_WEATHER requires INCLUDE_WEATHER=True: the future "
            "weather features are derived from the weather columns."
        )

    # Look-ahead of one day on the 15-minute load grid -> 96 rows.
    lead_time = pd.Timedelta(days=1)
    load_resolution = pd.Timedelta(minutes=15)
    lead_steps = lead_time // load_resolution

    # NOTE(review): the shift is positional, so it only equals "+24 h" if the
    # index is a gap-free 15-minute grid - confirm.
    for col in ("t2m", "ssrd", "tp", "wind_speed"):
        data_combined[f"{col}_future"] = data_combined[col].shift(-lead_steps)

## Calendar Features

In [97]:
if INCLUDE_CALENDAR:
    # Weekdays and public holidays follow the Austrian civil day, while the
    # index is in UTC. Evaluating them on the UTC date would give the first
    # one (CET) or two (CEST) local hours of every day the previous day's
    # flags, so the calendar lookups use a Europe/Vienna view of the index.
    # The frame's own index stays in UTC.
    local_index = data_combined.index.tz_convert("Europe/Vienna")

    # Monday=0 ... Sunday=6, so values below 5 are Monday to Friday.
    data_combined["is_weekday"] = local_index.weekday < 5

    # Austrian public holidays for every local year covered by the data.
    years = range(local_index.min().year, local_index.max().year + 1)
    at_holidays = holidays.country_holidays("AT", years=years)

    # Membership is tested on the timestamp's local calendar date.
    data_combined["is_holiday"] = local_index.map(lambda ts: ts in at_holidays)

In [98]:
# Discard every row that has at least one missing value, then preview the
# first ten rows of the result.
data_combined = data_combined.dropna(axis=0, how="any")
data_combined.head(n=10)

Unnamed: 0_level_0,actual_load_mw,t2m,ssrd,tp,wind_speed,t2m_future,ssrd_future,tp_future,wind_speed_future,is_weekday,is_holiday
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-01 00:00:00+00:00,5256.0,279.426453,0.0,3.671894e-10,0.693914,277.421722,0.0,0.0,1.758537,True,False
2018-10-01 00:15:00+00:00,5204.0,279.426453,0.0,3.671894e-10,0.693914,277.421722,0.0,0.0,1.758537,True,False
2018-10-01 00:30:00+00:00,5178.0,279.426453,0.0,3.671894e-10,0.693914,277.421722,0.0,0.0,1.758537,True,False
2018-10-01 00:45:00+00:00,5151.0,279.426453,0.0,3.671894e-10,0.693914,277.421722,0.0,0.0,1.758537,True,False
2018-10-01 01:00:00+00:00,5187.0,279.108917,0.0,3.959738e-08,0.583597,277.25235,0.0,6.201566e-08,1.709355,True,False
2018-10-01 01:15:00+00:00,5116.0,279.108917,0.0,3.959738e-08,0.583597,277.25235,0.0,6.201566e-08,1.709355,True,False
2018-10-01 01:30:00+00:00,5076.0,279.108917,0.0,3.959738e-08,0.583597,277.25235,0.0,6.201566e-08,1.709355,True,False
2018-10-01 01:45:00+00:00,5106.0,279.108917,0.0,3.959738e-08,0.583597,277.25235,0.0,6.201566e-08,1.709355,True,False
2018-10-01 02:00:00+00:00,5158.0,278.721954,0.0,4.205152e-08,0.517766,277.095978,0.0,6.385283e-08,1.703105,True,False
2018-10-01 02:15:00+00:00,5190.0,278.721954,0.0,4.205152e-08,0.517766,277.095978,0.0,6.385283e-08,1.703105,True,False


In [99]:
# Persist the combined feature frame as Parquet inside OUTPUT_PATH.
parquet_file = OUTPUT_PATH / "data_combined.parquet"
data_combined.to_parquet(parquet_file)