In [None]:
# default_exp io.hdf

# HDF

This module provides a common interface to access HDF files.

In [1]:
#export

import h5py
import numpy as np
import pandas as pd
import re
import os
import contextlib

Instead of relying directly on the `h5py` interface, we will use an HDF wrapper file to provide consistent access to only those specific HDF features we want. Since components of an HDF file come in three shapes `datasets`, `groups` and `attributes`, we will first define a generic HDF wrapper object to handle these components. Once this is done, the HDF wrapper file can be treated as such an object with additional features to open and close the initial connection.

In [2]:
#export

class HDF_Object_Wrapper(object):
    '''
    A generic class to access HDF components.
    '''

    @property
    def read_only(self):
        return self._read_only
    
    @property
    def truncate(self):
        return self._truncate
    
    @property
    def hdf_parent_file_name(self):
        return self._hdf_parent_file_name
    
    @property
    def hdf_parent_group_name(self):
        return self._hdf_parent_group_name
        
    @property
    def values(self):
        return self[...]
          
    @property
    def metadata(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            return dict(hdf_file[self.hdf_parent_group_name].attrs)

    @property
    def dtype(self):
        return self._dtype
        
    @property
    def shape(self):
        return self._shape
    
    def __init__(
        self,
        *,
        hdf_parent_file_name: str,
        hdf_parent_group_name: str,
        read_only: bool = True,
        truncate: bool = False,
    ):
        object.__setattr__(self, "_read_only", read_only)
        object.__setattr__(self, "_truncate", truncate)
        object.__setattr__(
            self,
            "_hdf_parent_file_name",
            hdf_parent_file_name
        )
        object.__setattr__(
            self,
            "_hdf_parent_group_name",
            hdf_parent_group_name
        )
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            for name, value in hdf_object.attrs.items():
                object.__setattr__(self, name, value)
            if isinstance(hdf_object, h5py.Dataset):
                object.__setattr__(self, "_dtype", hdf_object.dtype)
                object.__setattr__(self, "_shape", hdf_object.shape)
            else:              
                for name in hdf_object:
                    subobject = HDF_Object_Wrapper(
                        hdf_parent_file_name=self.hdf_parent_file_name,
                        hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
                        read_only=self.read_only,
                        truncate=self.truncate,
                    )
                    object.__setattr__(self, name, subobject)
                if "is_pd_dataframe" in hdf_object.attrs:
                    object.__setattr__(self, "_dtype", "dataframe")
                    object.__setattr__(
                        self,
                        "_shape",
                        (
                            subobject.shape,
                            len(hdf_object)
                        )
                    )
                else:
                    object.__setattr__(self, "_dtype", "group")
                    object.__setattr__(self, "_shape", len(hdf_object))

    def __iter__(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            if isinstance(self.dtype, str):
                for name in hdf_object:    
                    yield self.__getattribute__(name)
            else:
                for i in hdf_object:    
                    yield i
                    
    def __eq__(self, other):
        return (
            self.hdf_parent_file_name == other.self.hdf_parent_file_name
        ) and (
            self.hdf_parent_group_name == other.hdf_parent_group_name
        )
                    
    def set_read_only(self, read_only: bool = True):
        object.__setattr__(self, "_read_only", read_only)
        if isinstance(self.dtype, str):
            for subset in self:
                subset.set_read_only(read_only)
        
    def set_truncate(self, truncate: bool = True):
        object.__setattr__(self, "_truncate", truncate)
        if isinstance(self.dtype, str):
            for subset in self:
                subset.set_truncate(truncate)


    @contextlib.contextmanager
    def modify(self, read_only=False, truncate=True):
        original_read_only = self.read_only
        original_truncate = self.truncate
        try:
            self.set_read_only(read_only)
            self.set_truncate(truncate)
            yield self
        finally:
            self.set_read_only(original_read_only)
            self.set_truncate(original_truncate)
                
    def __setattr__(self, name, value):
        if self.read_only:
            raise AttributeError("Cannot set read-only attributes")
        elif not isinstance(name, str):
            raise KeyError(f"Attribute name '{name}' is not a string")
        elif not bool(re.match(r'^[a-zA-Z_][\w.-]*$', name)):
            raise KeyError(f"Invalid attribute name: {name}")
        with h5py.File(self.hdf_parent_file_name, "a") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            exists = (name in hdf_object) or (name in hdf_object.attrs)
            if exists:
                if not self.truncate:
                    raise KeyError(
                        f"Attribute name '{name}' cannot be truncated"
                    )
            if isinstance(value, (str, bool, int, float)):
                hdf_object.attrs[name] = value
                parsed_value = value
            elif isinstance(value, (np.ndarray, pd.core.series.Series)):
                parsed_value = self.__create_new_dataset(
                    name,
                    value,
                    hdf_object,
                    exists,
                )
            elif isinstance(value, (dict, pd.DataFrame)):
                parsed_value = self.__create_new_group(
                    name,
                    value,
                    hdf_object,
                    exists,
                )
            else:
                raise NotImplementedError(f"Invalid attribute value {value}")
            object.__setattr__(self, name, parsed_value)
    
    def __create_new_dataset(
        self,
        name:str,
        array: np.ndarray,
        hdf_object: h5py.Group,
        exists: bool,
    ):
        if exists:
            del hdf_object[name]
        if isinstance(array, (pd.core.series.Series)):
            array = array.values
        hdf_dataset = hdf_object.create_dataset(
            name,
            data=array,
#             TODO
            compression="lzf",
#             # compression="gzip" if compress else None, # TODO slower to make, faster to load?
            shuffle=True,
            chunks=True,
            maxshape=tuple([None for i in array.shape]),
        )
        new_array = HDF_Object_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        object.__setattr__(self, "_shape", self._shape + 1)
        return new_array
    
    def __create_new_group(
        self,
        name:str,
        new_dict: dict,
        hdf_object: h5py.Group,
        exists: bool,
    ):
        if exists:
            del hdf_object[name]
        new_group = hdf_object.create_group(name)
        if isinstance(new_dict, pd.DataFrame):
            new_dict = dict(new_dict)
            new_dict["is_pd_dataframe"] = True
            is_pd_dataframe = True
        else:
            is_pd_dataframe = False
        new_group = HDF_Object_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        for key, value in new_dict.items():
            new_group.__setattr__(key, value)
        if is_pd_dataframe:
            object.__setattr__(new_group, "_dtype", "dataframe")
        object.__setattr__(self, "_shape", self._shape + 1)
        return new_group
    
    def append(self, data):
        if isinstance(self.dtype, str):
            if self.dtype == "dataframe":
                for column_name in self:
                    array.append(data[column_name])
            else:
                raise NotImplementedError()
        else:
            with h5py.File(self.hdf_parent_file_name, "a") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                new_shape = tuple([i + j for i, j in zip(self.shape, data.shape)])
                hdf_object.resize(new_shape)
                hdf_object[self.shape[0]: self.shape[0] + data.shape[0]] = data
                object.__setattr__(self, "_shape", new_shape)
    
    def __getitem__(self, keys):
        if not isinstance(self.dtype, str):
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                return hdf_file[self.hdf_parent_group_name][keys]
        elif self.dtype == "dataframe":
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                arrays = {}
                for name in hdf_object:
                    if isinstance(hdf_object[name], h5py.Dataset):
                        arrays[name] = hdf_object[name]
                return pd.DataFrame(
                    {
                        name: array[keys] for name, array in arrays.items()
                    }
                )
        elif self.dtype == "group":
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                if keys is Ellipsis:
                    return {
                        name: self.__getattribute__(name) for name in hdf_object
                    }
                elif keys not in hdf_object:
                    raise KeyError(
                        f"No object with {keys} available in this group"
                    )
                return self.__getattribute__(keys)
            return object.self.__dict__[keys]  # TODO might be to generic?
        else:
            raise KeyError("Dtype not understood")
            
            
class HDF_File(HDF_Object_Wrapper):
    
    def __init__(
        self,
        hdf_parent_file_name: str,
        *,
        read_only: bool = True,
        truncate: bool = False,
        delete_existing: bool = False,
    ):
        if delete_existing:
            mode = "w"
        else:
            mode = "a"
        with h5py.File(hdf_parent_file_name, mode) as hdf_file:
            pass
        super().__init__(
            hdf_parent_file_name=hdf_parent_file_name,
            hdf_parent_group_name="/",
            read_only=read_only,
            truncate=truncate,
        )

In [40]:
#hide

import tempfile

def test_HDF_wrappers():
    hdf_file_name = os.path.join(tempfile.gettempdir(), "sandbox.hdf")
    hdf_file = HDF_File(
        hdf_file_name,
        read_only=False,
        truncate=True,
        delete_existing=True
    )
    np.testing.assert_equal(0, hdf_file.shape)
    file_size = os.path.getsize(hdf_file_name)
    hdf_file.attr1 = 1
    np.testing.assert_equal(1, hdf_file.attr1)
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    array = np.random.rand(10)
    hdf_file.array = array
    np.testing.assert_equal(array, hdf_file.array.values)
    np.testing.assert_equal(array[:3], hdf_file.array[:3])
    np.testing.assert_equal((10,), hdf_file.array.shape)
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    hdf_file.array.array_attr = "some attr"
    np.testing.assert_equal(hdf_file.array.array_attr, "some attr")
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    group = {
        "subgroup1": {
            "subsubgroup": {},
            "same_array": array,
            "a_bool": True
        },
        "subgroup2": {}
    }
    hdf_file.group = group
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    np.testing.assert_equal(hdf_file.group.subgroup1.a_bool, True)
    np.testing.assert_equal(hdf_file.group.subgroup1.shape, 2)
    df = pd.DataFrame(
        {
            "col2": np.arange(3),
            "col_str": ["str", "i", "ngs"],
        }
    )
    hdf_file.df = df
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    return hdf_file.df.values, df
#     assert hdf_file.df.values.equals(df)
    
test_HDF_wrappers()

(   col2 col_str
 0     0  b'str'
 1     1    b'i'
 2     2  b'ngs',
    col2 col_str
 0     0     str
 1     1       i
 2     2     ngs)

In [48]:
hdf_file_name = "sandbox.hdf"
hdf_file = HDF_File(
    hdf_file_name,
    read_only=False,
    truncate=True,
    delete_existing=True
)

In [49]:
hdf_file.attr1 = 1
hdf_file.name = "Mathhias"
hdf_file.group = {}
hdf_file.group.array = np.arange(10)

In [50]:
hdf_file.group.array.append(np.arange(5))

In [51]:
hdf_file.group.array.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])

In [52]:
hdf_file.df = pd.DataFrame(
    {
        "feng": np.arange(10),
        "sander": "my name"
    }
)

In [55]:
hdf_file.df.__dict__

{'_read_only': False,
 '_truncate': True,
 '_hdf_parent_file_name': 'sandbox.hdf',
 '_hdf_parent_group_name': '//df',
 '_dtype': 'dataframe',
 '_shape': 2,
 'feng': <__main__.HDF_Object_Wrapper at 0x7fbd00ce1190>,
 'sander': <__main__.HDF_Object_Wrapper at 0x7fbd00ce1460>}

In [18]:
df2 = pd.DataFrame(
    {
        "feng": np.arange(10),
        "sander": "your name"
    }
)

In [26]:
hdf_file_name = "sandbox.hdf"
hdf_file2 = HDF_File(
    hdf_file_name,
)

In [33]:
hdf_file2.feng = 2

In [28]:
hdf_file2.set_read_only(False)

In [32]:
hdf_file2.set_truncate(True)

In [37]:
hdf_file2.metadata

{'attr1': 1, 'feng': 2, 'name': 'Mathhias'}

In [39]:
%timeit hdf_file.group.array[[1,5,8]]
%timeit hdf_file.group.array.values[[1,5,8]]

502 µs ± 3.73 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
440 µs ± 25.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
