# Load Data
Load the data from https://www.nature.com/articles/s41597-022-01156-1.pdf

In [1]:
# imports
import requests
import h5py
import pandas as pd
from io import BytesIO
from tqdm.notebook import tqdm

In [2]:
# hdf5 1min data
hdf5_2018 = 'https://zenodo.org/records/5642902/files/2018_data_1min.zip?download=1'
hdf5_2019 = 'https://zenodo.org/records/5642902/files/2019_data_1min.zip?download=1'
hdf5_2020 = 'https://zenodo.org/records/5642902/files/2020_data_1min.zip?download=1'

# hdf5 1h data
hdf5_2018_1h = 'https://zenodo.org/records/5642902/files/2018_data_60min.hdf5?download=1'
hdf5_2019_1h = 'https://zenodo.org/records/5642902/files/2019_data_60min.hdf5?download=1'
hdf5_2020_1h = 'https://zenodo.org/records/5642902/files/2020_data_60min.hdf5?download=1'

hdf5_1h = [hdf5_2018_1h, hdf5_2019_1h, hdf5_2020_1h]

# weather data

In [3]:
def download_hdf5(url: str) -> BytesIO:
    """
    Downloads an HDF5 file from a URL and returns a BytesIO buffer.
    """
    response = requests.get(url)
    response.raise_for_status()
    return BytesIO(response.content)

In [4]:
data = []
for hdf_url in hdf5_1h:
    print(f"Downloading {hdf_url}...")
    hdf5_buffer = download_hdf5(hdf_url)
    data.append(hdf5_buffer)

Downloading https://zenodo.org/records/5642902/files/2018_data_60min.hdf5?download=1...
Downloading https://zenodo.org/records/5642902/files/2019_data_60min.hdf5?download=1...
Downloading https://zenodo.org/records/5642902/files/2020_data_60min.hdf5?download=1...


# Functions

In [5]:

def concat_dataframes_unique_index(df_list):
    # Combine all indices into a single Series
    all_indices = pd.concat([df.index.to_series() for df in df_list], ignore_index=True)

    # Convert to UTC datetime
    utc_indices = pd.to_datetime(all_indices, unit='s', utc=True)

    #
    # Checking a lot of stuff about the indices
    #

    # Sanity check for UTC conversion
    assert utc_indices.notnull().all(), "Some indices could not be converted to UTC timestamps"
    assert str(utc_indices.dt.tz) == 'UTC', "Indices are not in UTC timezone."
    # Sanity check for duplicates
    assert len(utc_indices) == utc_indices.nunique(), "Duplicate index values found across dataframes."
    # Check if all indices are sorted
    assert utc_indices.is_monotonic_increasing, "Indices are not sorted in increasing order."
    # Check if they are equally spaced
    time_diffs = utc_indices.diff().dropna()
    assert time_diffs.nunique() == 1, "Indices are not equally spaced."

    #
    # Index now contains all unique UTC timestamps from all dataframes
    #

    # Concatenate dataframes
    df_full = pd.concat(df_list, axis=0, ignore_index=True)

    # Replace the index with the UTC-converted version
    df_full.index = utc_indices

    return df_full

In [6]:
def get_data_from_buffer(file_group, data_list):
    """
    Reads HDF5 data from a BytesIO buffer and appends it to a list of DataFrames.
    """
    dfs = []
    for hdf_buf in data:
        with h5py.File(hdf_buf, "r") as h5file:
            # Navigate to the table
            table = h5file[file_group]


            # Or convert to structured array or DataFrame
            df = pd.DataFrame.from_records(table[:])
            df.set_index('index', inplace=True)
            dfs.append(df)

    df_full = concat_dataframes_unique_index(dfs)
    return df_full

In [13]:

def category_data(categorys, data):

    categorys_data = {}

    for category in categorys:
        print(f"Loading {category}...")
        # extract building name
        name = category.split('/')[1]

        try:
            df = get_data_from_buffer(category, data)
            categorys_data[name] = df
        except Exception as e:
            print(f"Error loading {category}: {e}")

    return categorys_data

# get all Buildings

In [14]:
hdf5_buffer = data[0]  # Use the first buffer for interfering building list

list_of_items = []
with h5py.File(hdf5_buffer, "r") as hdf:
    def print_structure(name, obj):
        print(name, dict(obj.attrs))
        list_of_items.append(name)
    hdf.visititems(print_structure)


MISC {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0')}
MISC/ES1 {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0')}
MISC/ES1/TRANSFORMER {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0'), 'data_columns': np.bytes_(b'(lp0\nVS_1\np1\naVS_2\np2\naVS_3\np3\naVS_TOT\np4\naVI_1\np5\naVI_2\np6\naVI_3\np7\naVPF_1\np8\naVPF_2\np9\naVPF_3\np10\naVPF_TOT\np11\naVP_1\np12\naVP_2\np13\naVP_3\np14\naVP_TOT\np15\naVQ_1\np16\naVQ_2\np17\naVQ_3\np18\naVQ_TOT\np19\naVU_1\np20\naVU_2\np21\naVU_3\np22\na.'), 'encoding': np.bytes_(b'UTF-8'), 'error_margin': np.float64(0.005), 'errors': np.bytes_(b'strict'), 'index_cols': np.bytes_(b'(lp0\n(I0\nVindex\np1\ntp2\na.'), 'info': np.bytes_(b'(dp0\nI1\n(dp1\nVnames\np2\n(lp3\nNasVtype\np4\nVIndex\np5\nssVindex\np6\n(dp7\nsVS_1\np8\n(dp9\nsVS_2\np10\n(dp11\nsVS_3\np12\n(dp13\nsVS_TOT\np14\n(dp15\nsVI_1\np16\n(dp17\nsVI_2\np18\n(dp1

In [15]:
# filter the list of items for everthing contain table
table_items = [item for item in list_of_items if 'table' in item]

# Categorize the items based on their type
heatpumps = [item for item in table_items if 'HEATPUMP' in item]
buildings = [item for item in table_items if 'HOUSEHOLD' in item]
misc = [item for item in table_items if 'PV1' in item]

# 

In [16]:
buidding_data = category_data(buildings, data)
heatpump_data = category_data(heatpumps, data)
misc_data = category_data(misc, data)

Loading NO_PV/SFH10/HOUSEHOLD/table...
Loading NO_PV/SFH11/HOUSEHOLD/table...
Loading NO_PV/SFH12/HOUSEHOLD/table...
Loading NO_PV/SFH14/HOUSEHOLD/table...
Loading NO_PV/SFH16/HOUSEHOLD/table...
Loading NO_PV/SFH17/HOUSEHOLD/table...
Loading NO_PV/SFH18/HOUSEHOLD/table...
Loading NO_PV/SFH19/HOUSEHOLD/table...
Loading NO_PV/SFH20/HOUSEHOLD/table...
Loading NO_PV/SFH21/HOUSEHOLD/table...
Loading NO_PV/SFH22/HOUSEHOLD/table...
Loading NO_PV/SFH23/HOUSEHOLD/table...
Loading NO_PV/SFH24/HOUSEHOLD/table...
Error loading NO_PV/SFH24/HOUSEHOLD/table: 'Unable to synchronously open object (component not found)'
Loading NO_PV/SFH25/HOUSEHOLD/table...
Error loading NO_PV/SFH25/HOUSEHOLD/table: 'Unable to synchronously open object (component not found)'
Loading NO_PV/SFH27/HOUSEHOLD/table...
Loading NO_PV/SFH28/HOUSEHOLD/table...
Loading NO_PV/SFH29/HOUSEHOLD/table...
Loading NO_PV/SFH3/HOUSEHOLD/table...
Loading NO_PV/SFH30/HOUSEHOLD/table...
Loading NO_PV/SFH31/HOUSEHOLD/table...
Loading NO_PV/S

In [17]:
heatpump_data

{'SFH10':                            S_TOT  PF_TOT  P_1  P_2  P_3  P_TOT  Q_1  Q_2  Q_3  \
 index                                                                           
 2018-01-01 00:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2018-01-01 01:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2018-01-01 02:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2018-01-01 03:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2018-01-01 04:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 ...                          ...     ...  ...  ...  ...    ...  ...  ...  ...   
 2020-12-31 19:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2020-12-31 20:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2020-12-31 21:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2020-12-31 22:00:00+00:00    NaN     NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN   
 2020-1

In [18]:
buidding_data

{'SFH10':                            S_1  S_2  S_3  S_TOT  I_1  I_2  I_3  PF_1  PF_2  \
 index                                                                        
 2018-01-01 00:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2018-01-01 01:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2018-01-01 02:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2018-01-01 03:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2018-01-01 04:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 ...                        ...  ...  ...    ...  ...  ...  ...   ...   ...   
 2020-12-31 19:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2020-12-31 20:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2020-12-31 21:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2020-12-31 22:00:00+00:00  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN   
 2020-12-31 23:00:00+00:00  NaN  NaN  NaN  