# Load Data
Load the data from https://www.nature.com/articles/s41597-022-01156-1.pdf

In [19]:
# imports
import requests
import h5py
import pandas as pd
from io import BytesIO
from tqdm.notebook import tqdm

In [20]:
# hdf5 1min data
hdf5_2018 = 'https://zenodo.org/records/5642902/files/2018_data_1min.zip?download=1'
hdf5_2019 = 'https://zenodo.org/records/5642902/files/2019_data_1min.zip?download=1'
hdf5_2020 = 'https://zenodo.org/records/5642902/files/2020_data_1min.zip?download=1'

# hdf5 1h data
hdf5_2018_1h = 'https://zenodo.org/records/5642902/files/2018_data_60min.hdf5?download=1'
hdf5_2019_1h = 'https://zenodo.org/records/5642902/files/2019_data_60min.hdf5?download=1'
hdf5_2020_1h = 'https://zenodo.org/records/5642902/files/2020_data_60min.hdf5?download=1'

hdf5_1h = [hdf5_2018_1h, hdf5_2019_1h, hdf5_2020_1h]

# weather data

In [21]:
def download_hdf5(url: str) -> BytesIO:
    """
    Downloads an HDF5 file from a URL and returns a BytesIO buffer.
    """
    response = requests.get(url)
    response.raise_for_status()
    return BytesIO(response.content)

In [22]:
data = []
for hdf_url in hdf5_1h:
    print(f"Downloading {hdf_url}...")
    hdf5_buffer = download_hdf5(hdf_url)
    data.append(hdf5_buffer)

Downloading https://zenodo.org/records/5642902/files/2018_data_60min.hdf5?download=1...
Downloading https://zenodo.org/records/5642902/files/2019_data_60min.hdf5?download=1...
Downloading https://zenodo.org/records/5642902/files/2020_data_60min.hdf5?download=1...


In [23]:
dfs = []
for hdf_buf in data:
    with h5py.File(hdf_buf, "r") as h5file:
        # Navigate to the table
        table = h5file['/NO_PV/SFH11/HOUSEHOLD/table']


        # Or convert to structured array or DataFrame
        df = pd.DataFrame.from_records(table[:])
        df.set_index('index', inplace=True)
        dfs.append(df)

In [24]:
display(dfs)

[                   S_1         S_2          S_3        S_TOT       I_1  \
 index                                                                    
 1514764800         NaN         NaN          NaN          NaN       NaN   
 1514768400         NaN         NaN          NaN          NaN       NaN   
 1514772000         NaN         NaN          NaN          NaN       NaN   
 1514775600         NaN         NaN          NaN          NaN       NaN   
 1514779200         NaN         NaN          NaN          NaN       NaN   
 ...                ...         ...          ...          ...       ...   
 1546282800   97.163694   99.381389  1283.907110  1480.471859  0.419528   
 1546286400  107.144694   87.332167   757.673417   952.143920  0.464222   
 1546290000  112.001528   94.295694   372.052055   578.365307  0.484778   
 1546293600  108.708583   84.389639   349.026112   542.151583  0.470028   
 1546297200  109.370003  197.649994   349.070007   656.080017  0.470000   
 
                  I_2  

In [25]:
dfs[0]

Unnamed: 0_level_0,S_1,S_2,S_3,S_TOT,I_1,I_2,I_3,PF_1,PF_2,PF_3,...,P_2,P_3,P_TOT,Q_1,Q_2,Q_3,Q_TOT,U_1,U_2,U_3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1514764800,,,,,,,,,,,...,,,,,,,,,,
1514768400,,,,,,,,,,,...,,,,,,,,,,
1514772000,,,,,,,,,,,...,,,,,,,,,,
1514775600,,,,,,,,,,,...,,,,,,,,,,
1514779200,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546282800,97.163694,99.381389,1283.907110,1480.471859,0.419528,0.428333,5.590833,0.704983,0.634511,0.970458,...,66.606195,1266.794363,1402.465164,20.948611,47.246111,-49.055639,19.156694,231.487500,232.211944,229.716389
1546286400,107.144694,87.332167,757.673417,952.143920,0.464222,0.376778,3.294444,0.786947,0.624700,0.936494,...,55.601389,732.726110,872.485861,20.171444,44.844583,-51.111167,13.910528,230.804722,231.809723,230.223056
1546290000,112.001528,94.295694,372.052055,578.365307,0.484778,0.405417,1.612778,0.793728,0.624783,0.933428,...,60.331889,347.867973,497.520361,20.264667,49.665472,-25.849306,44.071778,231.194444,232.398055,230.730555
1546293600,108.708583,84.389639,349.026112,542.151583,0.470028,0.362556,1.512278,0.780317,0.628350,0.922489,...,53.815416,321.975222,460.494389,20.437694,54.065444,-62.983472,11.512278,231.384722,232.779722,230.821111


In [26]:
dfs[1]

Unnamed: 0_level_0,S_1,S_2,S_3,S_TOT,I_1,I_2,I_3,PF_1,PF_2,PF_3,...,P_2,P_3,P_TOT,Q_1,Q_2,Q_3,Q_TOT,U_1,U_2,U_3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1546300800,103.353611,133.637945,263.652082,500.671639,0.450500,0.580361,1.152806,0.760106,0.691447,0.884256,...,99.964000,238.354888,419.451778,20.620917,48.193583,-26.323278,42.486583,229.474722,230.280277,228.875278
1546304400,77.478195,101.208139,83.069528,261.756667,0.335194,0.437222,0.360278,0.691194,0.605508,0.539561,...,63.142167,45.137722,161.803250,25.786056,41.974445,-49.179167,18.709333,231.155278,231.255278,230.694166
1546308000,77.498361,89.593417,114.869861,281.932195,0.334472,0.386889,0.497250,0.689950,0.613606,0.602125,...,56.459778,74.047833,183.959916,26.203806,40.235833,-8.106694,58.328278,231.566389,231.853333,230.934722
1546311600,76.155472,81.551028,84.528305,242.218444,0.330111,0.352333,0.367722,0.696917,0.617639,0.533100,...,51.379611,47.641861,152.068472,24.989306,36.977222,-32.195694,29.772222,230.475277,231.172500,230.006111
1546315200,77.101639,81.182805,90.155306,248.425834,0.333000,0.349444,0.390528,0.693894,0.616411,0.565536,...,50.903722,54.684194,159.077583,25.487028,37.704528,-22.160361,41.031611,231.263333,232.352500,230.700555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1577818800,336.499192,793.868888,325.813305,1456.179778,1.477306,3.471194,1.417278,0.828258,0.912322,0.950819,...,772.795996,309.397556,1403.816834,30.829639,94.671611,-32.150944,93.343278,228.906945,230.291666,229.784722
1577822400,131.387444,124.155556,1015.024561,1270.621531,0.571694,0.537500,4.427500,0.827514,0.711753,0.953286,...,86.296167,996.092113,1198.783089,26.886722,67.997833,-57.471917,37.405833,229.738889,230.942222,229.672500
1577826000,191.531889,133.539278,644.740638,970.739500,0.835944,0.580556,2.825333,0.959022,0.539244,0.941969,...,73.101889,628.154778,884.905414,26.571000,102.605806,-48.278250,80.898444,229.199167,230.263889,229.051111
1577829600,95.090639,89.025861,347.525806,531.653862,0.414111,0.386417,1.513861,0.741928,0.628397,0.930875,...,57.140083,323.701528,455.860528,27.304944,45.449250,-71.154667,1.590805,229.670000,230.599166,229.581111


In [64]:

def concat_dataframes_unique_index(df_list):
    # Combine all indices into a single Series
    all_indices = pd.concat([df.index.to_series() for df in df_list], ignore_index=True)

    # Convert to UTC datetime
    utc_indices = pd.to_datetime(all_indices, unit='s', utc=True)

    #
    # Checking a lot of stuff about the indices
    #

    # Sanity check for UTC conversion
    assert utc_indices.notnull().all(), "Some indices could not be converted to UTC timestamps"
    assert str(utc_indices.dt.tz) == 'UTC', "Indices are not in UTC timezone."
    # Sanity check for duplicates
    assert len(utc_indices) == utc_indices.nunique(), "Duplicate index values found across dataframes."
    # Check if all indices are sorted
    assert utc_indices.is_monotonic_increasing, "Indices are not sorted in increasing order."
    # Check if they are equally spaced
    time_diffs = utc_indices.diff().dropna()
    assert time_diffs.nunique() == 1, "Indices are not equally spaced."

    #
    # Index now contains all unique UTC timestamps from all dataframes
    #

    # Concatenate dataframes
    df_full = pd.concat(df_list, axis=0, ignore_index=True)

    # Replace the index with the UTC-converted version
    df_full.index = utc_indices

    return df_full

In [65]:
result = concat_dataframes_unique_index(dfs)


In [66]:
result


Unnamed: 0_level_0,S_1,S_2,S_3,S_TOT,I_1,I_2,I_3,PF_1,PF_2,PF_3,...,P_2,P_3,P_TOT,Q_1,Q_2,Q_3,Q_TOT,U_1,U_2,U_3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2018-01-01 01:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2018-01-01 02:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2018-01-01 03:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2018-01-01 04:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00+00:00,68.514889,88.162028,90.774917,247.454139,0.299528,0.385639,0.397056,0.670058,0.637150,0.708508,...,57.536583,65.846445,169.284722,29.993389,45.084667,20.765444,95.841917,228.626389,229.112500,228.769444
2020-12-31 20:00:00+00:00,67.845917,90.130805,63.662972,221.632055,0.297611,0.394417,0.279750,0.675228,0.638378,0.689681,...,59.607389,44.165778,149.593973,29.504000,45.778500,-7.423167,67.853306,228.243055,228.581945,227.779722
2020-12-31 21:00:00+00:00,67.959805,79.606972,94.706111,242.278833,0.297833,0.348139,0.415667,0.674733,0.642914,0.708181,...,52.921333,68.755417,167.531944,29.819222,36.524222,25.067972,91.410249,228.618611,228.651389,227.846945
2020-12-31 22:00:00+00:00,67.972139,63.908639,58.726750,190.586417,0.297250,0.279333,0.257417,0.674769,0.640089,0.687406,...,41.175750,40.366806,127.392750,29.923333,24.890389,-12.755861,42.054528,228.827500,228.640834,228.343334


In [32]:
with h5py.File(hdf5_buffer, "r") as hdf:
    def print_structure(name, obj):
        print(name, dict(obj.attrs))
    hdf.visititems(print_structure)

MISC {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0')}
MISC/ES1 {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0')}
MISC/ES1/TRANSFORMER {'CLASS': np.bytes_(b'GROUP'), 'TITLE': Empty(dtype=dtype('S1')), 'VERSION': np.bytes_(b'1.0'), 'data_columns': np.bytes_(b'(lp0\nVS_1\np1\naVS_2\np2\naVS_3\np3\naVS_TOT\np4\naVI_1\np5\naVI_2\np6\naVI_3\np7\naVPF_1\np8\naVPF_2\np9\naVPF_3\np10\naVPF_TOT\np11\naVP_1\np12\naVP_2\np13\naVP_3\np14\naVP_TOT\np15\naVQ_1\np16\naVQ_2\np17\naVQ_3\np18\naVQ_TOT\np19\naVU_1\np20\naVU_2\np21\naVU_3\np22\na.'), 'encoding': np.bytes_(b'UTF-8'), 'error_margin': np.float64(0.005), 'errors': np.bytes_(b'strict'), 'index_cols': np.bytes_(b'(lp0\n(I0\nVindex\np1\ntp2\na.'), 'info': np.bytes_(b'(dp0\nI1\n(dp1\nVnames\np2\n(lp3\nNasVtype\np4\nVIndex\np5\nssVindex\np6\n(dp7\nsVS_1\np8\n(dp9\nsVS_2\np10\n(dp11\nsVS_3\np12\n(dp13\nsVS_TOT\np14\n(dp15\nsVI_1\np16\n(dp17\nsVI_2\np18\n(dp1

In [33]:
with h5py.File(hdf5_buffer, "r") as h5file:
    # Navigate to the table
    table = h5file['/NO_PV/SFH11/HOUSEHOLD/table']

    # Print one row
    #print(table[0])  # First row as a numpy.void
    #print(table[0]['P_TOT'])  # Access specific field

    # Or convert to structured array or DataFrame
    df = pd.DataFrame.from_records(table[:])

    print(df.head())


        index  S_1  S_2  S_3  S_TOT  I_1  I_2  I_3  PF_1  PF_2  ...  P_2  P_3  \
0  1514764800  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN  ...  NaN  NaN   
1  1514768400  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN  ...  NaN  NaN   
2  1514772000  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN  ...  NaN  NaN   
3  1514775600  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN  ...  NaN  NaN   
4  1514779200  NaN  NaN  NaN    NaN  NaN  NaN  NaN   NaN   NaN  ...  NaN  NaN   

   P_TOT  Q_1  Q_2  Q_3  Q_TOT  U_1  U_2  U_3  
0    NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN  
1    NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN  
2    NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN  
3    NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN  
4    NaN  NaN  NaN  NaN    NaN  NaN  NaN  NaN  

[5 rows x 23 columns]


In [34]:
df

Unnamed: 0,index,S_1,S_2,S_3,S_TOT,I_1,I_2,I_3,PF_1,PF_2,...,P_2,P_3,P_TOT,Q_1,Q_2,Q_3,Q_TOT,U_1,U_2,U_3
0,1514764800,,,,,,,,,,...,,,,,,,,,,
1,1514768400,,,,,,,,,,...,,,,,,,,,,
2,1514772000,,,,,,,,,,...,,,,,,,,,,
3,1514775600,,,,,,,,,,...,,,,,,,,,,
4,1514779200,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,1546282800,97.163694,99.381389,1283.907110,1480.471859,0.419528,0.428333,5.590833,0.704983,0.634511,...,66.606195,1266.794363,1402.465164,20.948611,47.246111,-49.055639,19.156694,231.487500,232.211944,229.716389
8756,1546286400,107.144694,87.332167,757.673417,952.143920,0.464222,0.376778,3.294444,0.786947,0.624700,...,55.601389,732.726110,872.485861,20.171444,44.844583,-51.111167,13.910528,230.804722,231.809723,230.223056
8757,1546290000,112.001528,94.295694,372.052055,578.365307,0.484778,0.405417,1.612778,0.793728,0.624783,...,60.331889,347.867973,497.520361,20.264667,49.665472,-25.849306,44.071778,231.194444,232.398055,230.730555
8758,1546293600,108.708583,84.389639,349.026112,542.151583,0.470028,0.362556,1.512278,0.780317,0.628350,...,53.815416,321.975222,460.494389,20.437694,54.065444,-62.983472,11.512278,231.384722,232.779722,230.821111


In [35]:
# ## 5. Extract Relevant Datasets

def extract_relevant_data(hdf_buffer: BytesIO, datasets: dict) -> pd.DataFrame:
    """
    Reads specified datasets from the HDF5 buffer and returns a consolidated DataFrame.
    `datasets` should map column names to HDF5 paths.
    """
    with h5py.File(hdf_buffer, "r") as hdf:
        data = {}
        for col_name, path in datasets.items():
            data[col_name] = hdf[path][()]
    df = pd.DataFrame(data)
    return df

In [36]:
transformer_df = pd.read_hdf(hdf5_buffer, key='/MISC/ES1/TRANSFORMER/table')

NotImplementedError: Support for generic buffers has not been implemented.

In [None]:
# Example mapping: change to your actual paths
dataset_map = {
    'timestamp': 'sensor/time',
    'temperature': 'sensor/temperature',
    'humidity': 'sensor/humidity'
}

# Extract DataFrame
df = extract_relevant_data(hdf5_buffer, dataset_map)

df.head()