In [1]:
# for reloading the editable module
%load_ext autoreload
%autoreload 2

from pathlib import Path
from dotenv import load_dotenv
from datetime import timezone, datetime
import numpy as np
import pandas as pd



from probabilistic_load_forecast.application.services import (
    GetActualLoadData,
    GetERA5DataFromDB,
)



# Import adapters for data access
from probabilistic_load_forecast.adapters.db import (
    EntsoePostgreRepository,
    Era5PostgreRepository,
)

# Import configuration
from probabilistic_load_forecast import config

# Read environment variables from the .env file located two directory levels
# above the current working directory; stop immediately when load_dotenv
# reports that nothing could be loaded.
env_file = Path("../..") / ".env"
dotenv_loaded = load_dotenv(env_file)
if not dotenv_loaded:
    raise FileNotFoundError("Could not open the .env file.")

In [2]:
# Initialize database repositories.
# Both repositories are constructed from the connection URI returned by
# config.get_postgre_uri().
try:
    # Load data repository
    load_repo = EntsoePostgreRepository(config.get_postgre_uri())

    # ERA5 weather data repository
    era5_repo = Era5PostgreRepository(config.get_postgre_uri())
except Exception as e:
    print(f"Error initializing repositories: {e}")
    print("Make sure your .env file contains the correct database credentials")
    # Re-raise so execution stops here. Swallowing the error would leave
    # load_repo / era5_repo undefined and the failure would only resurface in
    # a later cell as a misleading NameError.
    raise
else:
    print("Database repositories initialized successfully")

Database repositories initialized successfully


In [3]:
# Build the application services, one per repository
# (load service first, then the ERA5 service).
get_load_data, get_era5_data = (
    GetActualLoadData(load_repo),
    GetERA5DataFromDB(era5_repo),
)

print("Services initialized successfully")

Services initialized successfully


In [4]:
# Query window as timezone-aware UTC datetimes; the same bounds are passed to
# both the ERA5 query and the load query below.
start = datetime(year=2018, month=10, day=1, tzinfo=timezone.utc)
end = datetime(year=2025, month=10, day=10, tzinfo=timezone.utc)

# Variables requested from the ERA5 service
era5_variables = ["t2m", "u10", "v10", "ssrd", "tp"]

era5_data = get_era5_data(
    variables=era5_variables,
    country_code="AT",
    start=start,
    end=end,
)
actual_load_ts = get_load_data(start, end)

In [5]:
# Bring the load data and every ERA5 entry into sorted index order before any
# alignment takes place.
actual_load_ts.data = actual_load_ts.data.sort_index()

era5_data = dict(
    (name, values.sort_index()) for name, values in era5_data.items()
)

In [6]:
# Convert the load index from periods to timezone-aware UTC timestamps, taking
# the start of each period as its timestamp.
#
# The isinstance guard keeps this cell re-runnable: after the first run the
# index is a DatetimeIndex, which has no to_timestamp(), so an unguarded second
# run would raise AttributeError. The same guard is used for the ERA5 entries.
if isinstance(actual_load_ts.data.index, pd.PeriodIndex):
    actual_load_ts.data.index = (
        actual_load_ts.data.index
        .to_timestamp(how="start")
        .tz_localize("UTC")
    )
elif not isinstance(actual_load_ts.data.index, pd.DatetimeIndex):
    # Neither the raw PeriodIndex nor an already converted DatetimeIndex:
    # fail loudly instead of carrying an unexpected index downstream.
    raise TypeError(
        "Expected a PeriodIndex or DatetimeIndex on the load data, got "
        f"{type(actual_load_ts.data.index).__name__}"
    )

In [7]:
def _total_to_rate(ts):
    """Return ``ts / 3600`` when ``ts`` has a PeriodIndex, otherwise ``ts`` as is."""
    if not isinstance(ts.index, pd.PeriodIndex):
        return ts
    return ts / 3600


# Convert the total amount measurements per time (ssrd and tp) into a rate.
# Only period-indexed entries are rescaled; everything else passes through
# untouched.
# NOTE(review): 3600 is presumably seconds per hour, i.e. this assumes hourly
# periods - confirm against the data source.
era5_data = {var: _total_to_rate(ts) for var, ts in era5_data.items()}

In [8]:
def _period_start_utc(ts):
    """Swap a PeriodIndex for UTC timestamps at each period's start.

    Entries whose index is not a PeriodIndex are returned unchanged.
    """
    if not isinstance(ts.index, pd.PeriodIndex):
        return ts
    utc_index = ts.index.to_timestamp(how="start").tz_localize("UTC")
    return ts.set_axis(utc_index)


# Give every period-indexed ERA5 entry a timezone-aware UTC timestamp index
era5_data = {var: _period_start_utc(ts) for var, ts in era5_data.items()}

In [9]:
# Align every ERA5 entry to the load index. Load timestamps without an exact
# ERA5 match receive the last preceding ERA5 value (forward fill).
target_index = actual_load_ts.data.index
era5_ffilled = {
    var: ts.reindex(target_index, method="ffill")
    for var, ts in era5_data.items()
}

In [10]:
# Place the aligned ERA5 entries side by side and label each column with the
# key it is stored under in era5_ffilled.
era5_df = pd.concat(list(era5_ffilled.values()), axis=1).set_axis(
    list(era5_ffilled), axis=1
)

In [11]:
# Collapse the two wind columns u10 and v10 into a single magnitude column
# (square root of the sum of squares), then drop the two source columns.
wind_components = ["u10", "v10"]
era5_df = era5_df.assign(
    wind_speed=np.sqrt(era5_df["u10"].pow(2) + era5_df["v10"].pow(2))
).drop(columns=wind_components)

In [12]:
# Join the load data and the ERA5 feature frame column-wise (aligned on their
# index), then preview the first ten rows.
data_combined = pd.concat(objs=[actual_load_ts.data, era5_df], axis="columns")
data_combined.head(10)

Unnamed: 0_level_0,actual_load_mw,t2m,ssrd,tp,wind_speed
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-01 00:00:00+00:00,5256.0,279.426453,0.0,0.0,0.693914
2018-10-01 00:15:00+00:00,5204.0,279.426453,0.0,0.0,0.693914
2018-10-01 00:30:00+00:00,5178.0,279.426453,0.0,0.0,0.693914
2018-10-01 00:45:00+00:00,5151.0,279.426453,0.0,0.0,0.693914
2018-10-01 01:00:00+00:00,5187.0,279.108917,0.0,3.959739e-08,0.583597
2018-10-01 01:15:00+00:00,5116.0,279.108917,0.0,3.959739e-08,0.583597
2018-10-01 01:30:00+00:00,5076.0,279.108917,0.0,3.959739e-08,0.583597
2018-10-01 01:45:00+00:00,5106.0,279.108917,0.0,3.959739e-08,0.583597
2018-10-01 02:00:00+00:00,5158.0,278.721954,0.0,4.205153e-08,0.517766
2018-10-01 02:15:00+00:00,5190.0,278.721954,0.0,4.205153e-08,0.517766


In [13]:
# Persist the combined dataset as Parquet. The target directory is created
# first so the write does not fail when data/processed does not exist yet
# (for example on a fresh checkout).
output_path = Path("../../data/processed/data_combined.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
data_combined.to_parquet(output_path)