In [None]:
# default_exp hdf

# HDF

This module provides a common interface to access HDF files.

In [1]:
#export

import h5py
import numpy as np
import pandas as pd
import re
import os
import contextlib

Instead of relying directly on the `h5py` interface, we will use an HDF wrapper file to provide consistent access to only those specific HDF features we want. Since components of an HDF file come in three shapes `datasets`, `groups` and `attributes`, we will first define a generic HDF wrapper object to handle these components. Once this is done, the HDF wrapper file can be treated as such an object with additional features to open and close the initial connection.

In [2]:
#export

class HDF_Object_Wrapper(object):
    '''
    A generic class to access HDF components.
    '''

    @property
    def read_only(self):
        return self._read_only
    
    @property
    def truncate(self):
        return self._truncate
    
    @property
    def hdf_parent_file_name(self):
        return self._hdf_parent_file_name
    
    @property
    def hdf_parent_group_name(self):
        return self._hdf_parent_group_name
        
    @property
    def values(self):
        return self[...]
          
    @property
    def metadata(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            return dict(hdf_file[self.hdf_parent_group_name].attrs)

    @property
    def dtype(self):
        return self._dtype
        
    @property
    def shape(self):
        return self._shape
    
    def __init__(
        self,
        *,
        hdf_parent_file_name: str,
        hdf_parent_group_name: str,
        read_only: bool = True,
        truncate: bool = False,
    ):
        object.__setattr__(self, "_read_only", read_only)
        object.__setattr__(self, "_truncate", truncate)
        object.__setattr__(
            self,
            "_hdf_parent_file_name",
            hdf_parent_file_name
        )
        object.__setattr__(
            self,
            "_hdf_parent_group_name",
            hdf_parent_group_name
        )
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            for name, value in hdf_object.attrs.items():
                object.__setattr__(self, name, value)
            if isinstance(hdf_object, h5py.Dataset):
                object.__setattr__(self, "_dtype", hdf_object.dtype)
                object.__setattr__(self, "_shape", hdf_object.shape)
            else:              
                for name in hdf_object:
                    subobject = HDF_Object_Wrapper(
                        hdf_parent_file_name=self.hdf_parent_file_name,
                        hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
                        read_only=self.read_only,
                        truncate=self.truncate,
                    )
                    object.__setattr__(self, name, subobject)
                if "is_pd_dataframe" in hdf_object.attrs:
                    object.__setattr__(self, "_dtype", "dataframe")
                    object.__setattr__(
                        self,
                        "_shape",
                        (
                            subobject.shape,
                            len(hdf_object)
                        )
                    )
                else:
                    object.__setattr__(self, "_dtype", "group")
                    object.__setattr__(self, "_shape", len(hdf_object))

    def __iter__(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            if isinstance(self.dtype, str):
                for name in hdf_object:    
                    yield self.__getattribute__(name)
            else:
                for i in hdf_object:    
                    yield i
                    
    def __eq__(self, other):
        return (
            self.hdf_parent_file_name == other.self.hdf_parent_file_name
        ) and (
            self.hdf_parent_group_name == other.hdf_parent_group_name
        )
                    
    def set_read_only(self, read_only: bool = True):
        object.__setattr__(self, "_read_only", read_only)
        if isinstance(self.dtype, str):
            for subset in self:
                subset.set_read_only(read_only)
        
    def set_truncate(self, truncate: bool = True):
        object.__setattr__(self, "_truncate", truncate)
        if isinstance(self.dtype, str):
            for subset in self:
                subset.set_truncate(truncate)


    @contextlib.contextmanager
    def modify(self, read_only=False, truncate=True):
        original_read_only = self.read_only
        original_truncate = self.truncate
        try:
            self.set_read_only(read_only)
            self.set_truncate(truncate)
            yield self
        finally:
            self.set_read_only(original_read_only)
            self.set_truncate(original_truncate)
                
    def __setattr__(self, name, value):
        if self.read_only:
            raise AttributeError("Cannot set read-only attributes")
        elif not isinstance(name, str):
            raise KeyError(f"Attribute name '{name}' is not a string")
        elif not bool(re.match(r'^[a-zA-Z_][\w.-]*$', name)):
            raise KeyError(f"Invalid attribute name: {name}")
        with h5py.File(self.hdf_parent_file_name, "a") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            exists = (name in hdf_object) or (name in hdf_object.attrs)
            if exists:
                if not self.truncate:
                    raise KeyError(
                        f"Attribute name '{name}' cannot be truncated"
                    )
            if isinstance(value, (str, bool, int, float)):
                hdf_object.attrs[name] = value
                parsed_value = value
            elif isinstance(value, (np.ndarray, pd.core.series.Series)):
                parsed_value = self.__create_new_dataset(
                    name,
                    value,
                    hdf_object,
                    exists,
                )
            elif isinstance(value, (dict, pd.DataFrame)):
                parsed_value = self.__create_new_group(
                    name,
                    value,
                    hdf_object,
                    exists,
                )
            else:
                raise NotImplementedError(f"Invalid attribute value {value}")
            object.__setattr__(self, name, parsed_value)
    
    def __create_new_dataset(
        self,
        name:str,
        array: np.ndarray,
        hdf_object: h5py.Group,
        exists: bool,
    ):
        if exists:
            del hdf_object[name]
        if isinstance(array, (pd.core.series.Series)):
            array = array.values
        hdf_dataset = hdf_object.create_dataset(
            name,
            data=array,
#             TODO
            compression="lzf",
#             # compression="gzip" if compress else None, # TODO slower to make, faster to load?
            shuffle=True,
            chunks=True,
            maxshape=tuple([None for i in array.shape]),
        )
        new_array = HDF_Object_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        object.__setattr__(self, "_shape", self._shape + 1)
        return new_array
    
    def __create_new_group(
        self,
        name:str,
        new_dict: dict,
        hdf_object: h5py.Group,
        exists: bool,
    ):
        if exists:
            del hdf_object[name]
        new_group = hdf_object.create_group(name)
        if isinstance(new_dict, pd.DataFrame):
            new_dict = dict(new_dict)
            new_dict["is_pd_dataframe"] = True
            is_pd_dataframe = True
        else:
            is_pd_dataframe = False
        new_group = HDF_Object_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        for key, value in new_dict.items():
            new_group.__setattr__(key, value)
        if is_pd_dataframe:
            object.__setattr__(new_group, "_dtype", "dataframe")
        object.__setattr__(self, "_shape", self._shape + 1)
        return new_group
    
    def append(self, data):
        if isinstance(self.dtype, str):
            if self.dtype == "dataframe":
                for column_name in self:
                    array.append(data[column_name])
            else:
                raise NotImplementedError()
        else:
            with h5py.File(self.hdf_parent_file_name, "a") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                new_shape = tuple([i + j for i, j in zip(self.shape, data.shape)])
                hdf_object.resize(new_shape)
                hdf_object[self.shape[0]: self.shape[0] + data.shape[0]] = data
                object.__setattr__(self, "_shape", new_shape)
    
    def __getitem__(self, keys):
        if not isinstance(self.dtype, str):
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                return hdf_file[self.hdf_parent_group_name][keys]
        elif self.dtype == "dataframe":
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                arrays = {}
                for name in hdf_object:
                    if isinstance(hdf_object[name], h5py.Dataset):
                        arrays[name] = hdf_object[name]
                return pd.DataFrame(
                    {
                        name: array[keys] for name, array in arrays.items()
                    }
                )
        elif self.dtype == "group":
            with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
                hdf_object = hdf_file[self.hdf_parent_group_name]
                if keys is Ellipsis:
                    return {
                        name: self.__getattribute__(name) for name in hdf_object
                    }
                elif keys not in hdf_object:
                    raise KeyError(
                        f"No object with {keys} available in this group"
                    )
                return self.__getattribute__(keys)
            return object.self.__dict__[keys]  # TODO might be to generic?
        else:
            raise KeyError("Dtype not understood")
            
            
class HDF_File(HDF_Object_Wrapper):
    
    def __init__(
        self,
        hdf_parent_file_name: str,
        *,
        read_only: bool = True,
        truncate: bool = False,
        delete_existing: bool = False,
    ):
        if delete_existing:
            mode = "w"
        else:
            mode = "a"
        with h5py.File(hdf_parent_file_name, mode) as hdf_file:
            pass
        super().__init__(
            hdf_parent_file_name=hdf_parent_file_name,
            hdf_parent_group_name="/",
            read_only=read_only,
            truncate=truncate,
        )

In [40]:
#hide

import tempfile

def test_HDF_wrappers():
    hdf_file_name = os.path.join(tempfile.gettempdir(), "sandbox.hdf")
    hdf_file = HDF_File(
        hdf_file_name,
        read_only=False,
        truncate=True,
        delete_existing=True
    )
    np.testing.assert_equal(0, hdf_file.shape)
    file_size = os.path.getsize(hdf_file_name)
    hdf_file.attr1 = 1
    np.testing.assert_equal(1, hdf_file.attr1)
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    array = np.random.rand(10)
    hdf_file.array = array
    np.testing.assert_equal(array, hdf_file.array.values)
    np.testing.assert_equal(array[:3], hdf_file.array[:3])
    np.testing.assert_equal((10,), hdf_file.array.shape)
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    hdf_file.array.array_attr = "some attr"
    np.testing.assert_equal(hdf_file.array.array_attr, "some attr")
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    group = {
        "subgroup1": {
            "subsubgroup": {},
            "same_array": array,
            "a_bool": True
        },
        "subgroup2": {}
    }
    hdf_file.group = group
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    np.testing.assert_equal(hdf_file.group.subgroup1.a_bool, True)
    np.testing.assert_equal(hdf_file.group.subgroup1.shape, 2)
    df = pd.DataFrame(
        {
            "col2": np.arange(3),
            "col_str": ["str", "i", "ngs"],
        }
    )
    hdf_file.df = df
    file_size, old_file_size = os.path.getsize(hdf_file_name), file_size
    assert file_size > old_file_size, "Filesize not increased"
    return hdf_file.df.values, df
#     assert hdf_file.df.values.equals(df)
    
test_HDF_wrappers()

(   col2 col_str
 0     0  b'str'
 1     1    b'i'
 2     2  b'ngs',
    col2 col_str
 0     0     str
 1     1       i
 2     2     ngs)

In [48]:
hdf_file_name = "sandbox.hdf"
hdf_file = HDF_File(
    hdf_file_name,
    read_only=False,
    truncate=True,
    delete_existing=True
)

In [49]:
hdf_file.attr1 = 1
hdf_file.name = "Mathhias"
hdf_file.group = {}
hdf_file.group.array = np.arange(10)

In [50]:
hdf_file.group.array.append(np.arange(5))

In [51]:
hdf_file.group.array.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])

In [52]:
hdf_file.df = pd.DataFrame(
    {
        "feng": np.arange(10),
        "sander": "my name"
    }
)

In [55]:
hdf_file.df.__dict__

{'_read_only': False,
 '_truncate': True,
 '_hdf_parent_file_name': 'sandbox.hdf',
 '_hdf_parent_group_name': '//df',
 '_dtype': 'dataframe',
 '_shape': 2,
 'feng': <__main__.HDF_Object_Wrapper at 0x7fbd00ce1190>,
 'sander': <__main__.HDF_Object_Wrapper at 0x7fbd00ce1460>}

In [18]:
df2 = pd.DataFrame(
    {
        "feng": np.arange(10),
        "sander": "your name"
    }
)

In [26]:
hdf_file_name = "sandbox.hdf"
hdf_file2 = HDF_File(
    hdf_file_name,
)

In [33]:
hdf_file2.feng = 2

In [28]:
hdf_file2.set_read_only(False)

In [32]:
hdf_file2.set_truncate(True)

In [37]:
hdf_file2.metadata

{'attr1': 1, 'feng': 2, 'name': 'Mathhias'}

In [39]:
%timeit hdf_file.group.array[[1,5,8]]
%timeit hdf_file.group.array.values[[1,5,8]]

502 µs ± 3.73 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
440 µs ± 25.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
class HDF_Object_Wrapper(object):
    '''
    A generic class to access HDF components.
    '''
    
    def __init__(
        self,
        *,
        hdf_parent_file_name: str,
        hdf_parent_group_name: str,
        read_only: bool = True,
        truncate: bool = False,
    ):
        self.set_read_only(read_only)
        self.set_truncate(truncate)
        object.__setattr__(self, "_hdf_parent_file_name", hdf_parent_file_name)
        object.__setattr__(self, "_hdf_parent_group_name", hdf_parent_group_name)
        self._import_from_hdf_file()
    
    def _import_from_hdf_file(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            for name, value in hdf_object.attrs.items():
                object.__setattr__(self, name, value)
            for name in hdf_object:
                if isinstance(hdf_object[name], h5py.Dataset):
                    dataset = HDF_Dataset_Wrapper(
                        hdf_parent_file_name=self.hdf_parent_file_name,
                        hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
                        read_only=self.read_only,
                        truncate=self.truncate,
                    )
                    object.__setattr__(self, name, dataset)
                else:
                    subgroup = HDF_Object_Wrapper(
                        hdf_parent_file_name=self.hdf_parent_file_name,
                        hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
                        read_only=self.read_only,
                        truncate=self.truncate,
                    )
                    object.__setattr__(self, name, subgroup)
    
    def __setattr__(self, name, value):
        if self.read_only:
            raise AttributeError("Cannot set read-only attributes")
        elif not isinstance(name, str):
            raise KeyError(f"Attribute name '{name}' is not a string")
        elif not bool(re.match(r'^[a-zA-Z_][\w.-]*$', test_string)):
            raise KeyError(f"Invalid attribute name: {name}")
        with h5py.File(self.hdf_parent_file_name, "a") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            exists = name in [
                item for group in x.components for item in group
            ]
            if exists:
                if not self.truncate:
                    raise KeyError(
                        f"Attribute name '{name}' cannot be truncated"
                    )
            if isinstance(value, (str, bool, int, float)):
                hdf_object.attrs[name] = value
                parsed_value = value
            elif isinstance(value, (np.ndarray, pd.core.series.Series)):
                if exists:
                    del hdf_object[name]
                parsed_value = self.__create_new_dataset(
                    name,
                    value,
                    hdf_object
                )
            elif isinstance(value, (dict, pd.DataFrame)):
                if exists:
                    del hdf_object[name]
                parsed_value = self.__create_new_group(
                    name,
                    value,
                    hdf_object
                )
            else:
                raise NotImplementedError(f"Invalid attribute value {value}")
            object.__setattr__(self, name, parsed_value)
    
    def __create_new_dataset(
        self,
        name:str,
        array: np.ndarray,
        hdf_object: h5py.Group,
    ):    
        if isinstance(array, (pd.core.series.Series)):
            array = array.values
        hdf_dataset = hdf_object.create_dataset(
            name,
            data=array,
#             TODO
#             compression="lzf" if compress else None,
#             # compression="gzip" if compress else None, # TODO slower to make, faster to load?
#             shuffle=compress,
#             chunks=True if chunked else None,
#             dtype=dtype
        )
        new_array = HDF_Dataset_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        return new_array
    
    def __create_new_group(
        self,
        name:str,
        new_dict: dict,
        hdf_object: h5py.Group,
    ):
        if isinstance(new_dict, pd.DataFrame):
            new_dict = dict(new_dict)
            new_dict["is_pd_dataframe"] = True
        new_group = hdf_object.create_group(name)
        new_object = HDF_Object_Wrapper(
            hdf_parent_file_name=self.hdf_parent_file_name,
            hdf_parent_group_name=f"{self.hdf_parent_group_name}/{name}",
            read_only=self.read_only,
            truncate=self.truncate,
        )
        for key, value in new_dict.items():
            new_object.__setattr__(key, value)
        return new_object

    def set_read_only(self, read_only: bool = True):
        object.__setattr__(self, "_read_only", read_only)
        groups, datasets = self.components[:2]
        for group in groups.values():
            group.set_read_only(read_only)
        for dataset in datasets.values():
            dataset.set_read_only(read_only)
        
    def set_truncate(self, truncate: bool = True):
        object.__setattr__(self, "_truncate", truncate)
        groups, datasets = self.components[:2]
        for group in groups.values():
            group.set_truncate(truncate)
        for dataset in datasets.values():
            dataset.set_truncate(truncate)
            
    @property
    def read_only(self):
        return self._read_only
    
    @property
    def truncate(self):
        return self._truncate
    
    @property
    def hdf_parent_file_name(self):
        return self._hdf_parent_file_name
    
    @property
    def hdf_parent_group_name(self):
        return self._hdf_parent_group_name
        
    @property
    def values(self):
        if hasattr(self, "is_pd_dataframe") and self.is_pd_dataframe:
            return pd.DataFrame(
                {
                    name: array.values for name, array in self.components[1].items()
                }
            )
        else:
            return self.components[:2]
        
    @property
    def metadata(self):
        return self.components[2]
    
    @property
    def components(self):
        metadata = {}
        groups = {}
        datasets = {}
        for key, value in self.__dict__.items():
            if key[0] != "_":
                if isinstance(value, HDF_Object_Wrapper):
                    if isinstance(value, HDF_Dataset_Wrapper):
                        datasets[key] = value
                    else:
                        groups[key] = value
                else:
                    metadata[key] = value
        return (groups, datasets, metadata)
        
        
class HDF_Dataset_Wrapper(HDF_Object_Wrapper):
    '''
    A generic class to access HDF Datasets.
    '''
    
    def _import_from_hdf_file(self):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            hdf_object = hdf_file[self.hdf_parent_group_name]
            for name, value in hdf_object.attrs.items():
                object.__setattr__(self, name, value)
            object.__setattr__(self, "_dtype", hdf_object.dtype)
            object.__setattr__(self, "_shape", hdf_object.shape)
    
    def __getitem__(self, keys):
        with h5py.File(self.hdf_parent_file_name, "r") as hdf_file:
            return hdf_file[self.hdf_parent_group_name][keys]

    @property
    def values(self):
        return self[...]      
     
    @property
    def dtype(self):
        return self._dtype
        
    @property
    def shape(self):
        return self._shape
    
#     def set_read_only(self, read_only: bool = True):
#         if not read_only:
#             raise NotImplementedError("Datasets cannot be modified in-place.")

In [None]:
%%time
x = HDF_Object_Wrapper(
    hdf_parent_file_name="/Users/swillems/Data/alphatims_testing2/20201016_tims03_Evo03_PS_MA_HeLa_200ng_DDA_06-15_5_6min_4cm_S1-A1_1_21717.hdf",
    hdf_parent_group_name="/",
)

In [None]:
# %timeit x.raw._intensity_values
# %timeit x.raw._intensity_values[::10]
# %timeit x.raw._intensity_values.values
# %timeit x.raw._intensity_values.values[::10]

In [None]:
x.raw._frames.values

In [None]:
x.set_read_only(False)
x.set_truncate(True)
# x.attr1 = "YAR"
# x.dataset = np.arange(10)
x.df2 = df
x.components

In [None]:
x.df2.x[:2]

In [None]:
test_string = "sanderÖ_1"
bool(re.match(r'^[a-zA-Z_][\w.-]*$', test_string))

In [None]:
[item for group in x.components for item in group]

In [None]:
x.dataset[:]

In [None]:
df = pd.DataFrame(
    {
        "x": np.arange(10),
        "y": np.arange(10) + 10,
    }
)
df.values

In [None]:
h5py.Group

In [None]:
%timeit x.dataset[:3]
%timeit x.dataset.values[:3]

In [None]:
x._read_only = False
# x.group.dataset.dtype = 1

In [None]:
x._hdf_parent_file_name = "x"

In [None]:
x.__dict__

In [None]:
x.group.sub_group.hdf_parent_group_name

In [None]:
with h5py.File("sandbox.hdf", "w") as f:
    f.attrs["file_attribute"] = "some_value"
    dataset = np.arange(5)
    data = f.create_dataset("dataset", data=dataset)
#     help(data)
    xx = data.__dict__
    data.attrs["data_attribute"] = "some_value"
    group = f.create_group("group")
    group.attrs["group_attribute"] = "some_value"
    subdata = group.create_dataset("dataset", data=dataset)
    subdata.attrs["subdata_attribute"] = "some_value"
    subgroup = group.create_group("sub_group")
    subsubdata = subgroup.create_dataset("dataset", data=dataset)
    subsubdata.attrs["subsubdata_attribute"] = "some_value"
    subgroup.attrs["subgroup_attribute"] = "some_value"

xx

In [None]:
help(x)

In [None]:
# x._read_only=False
x.y = 1

In [None]:
x.__dict__

In [None]:
help(object)

In [None]:
h5py.Group

In [None]:
help(f)

In [None]:
import h5py
import numpy as np
import pandas as pd

  
class HDFGroup(h5py.Group):
    
    def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds):
        if isinstance(data, pd.DataFrame):
            print("dataframe")
            super().create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds) 
        else:
            super().create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds)
            
class HDFFile(h5py.File, HDFGroup): pass

In [None]:
with HDFFile("sandbox.hdf", "w") as f:
    x = np.arange(5)
    f.create_dataset("x", data=x)
    d = {
            "zz": x,
            "x": x.astype(np.int64),
            "xx": x.astype(np.float64),
    #         "z": ["sander", "2", "23", "4", "five"],
        }
    y=pd.DataFrame(
        d,
        copy=False
    )
    y["z"] = x
    f.create_dataset("y", data=y)
    yy = f["y"][...]
    g = f.create_group("g")
    g.create_dataset("y", data=y)

In [None]:
type(g)

In [None]:
x = np.arange(5)

In [None]:
f.create_dataset("test", data=x)

In [None]:
y=pd.DataFrame(
    {
        "x": x.astype(np.int64),
        "x": x.astype(np.float64),
#         "z": ["sander", "2", "23", "4", "five"],
    },
    copy=False
)

In [None]:
f.create_dataset("test2", data=y)

In [None]:
f.close()

In [None]:
x[0]=2
d["zz"][0] = 2
y

In [None]:
# f = h5py.File("sandbox2.hdf", "w")
f = HDFFile("sandbox5.hdf", "w")

In [None]:
help(f)

In [None]:
help(g)

In [None]:
#export

import h5py
import os
import time
from alphapept.__main__ import VERSION_NO


class HDF_File(object):
    '''
    A generic class to store and retrieve on-disk
    data with an HDF container.
    '''

    @property
    def original_file_name(self):
        return self.read(
            attr_name="original_file_name"
        )  # See below for function definition

    @property
    def file_name(self):
        return self.__file_name

    @property
    def directory(self):
        return os.path.dirname(self.file_name)

    @property
    def creation_time(self):
        return self.read(
            attr_name="creation_time"
        )  # See below for function definition

    @property
    def last_updated(self):
        return self.read(
            attr_name="last_updated"
        )  # See below for function definition

    @property
    def version(self):
        return self.read(
            attr_name="version"
        )  # See below for function definition

    @property
    def is_read_only(self):
        return self.__is_read_only

    @property
    def is_overwritable(self):
        return self.__is_overwritable

    def read(self):
        pass

    def write(self):
        pass

    def __init__(
        self,
        file_name: str,
        is_read_only: bool = True,
        is_new_file: bool = False,
        is_overwritable: bool = False,
    ):
        """Create/open a wrapper object to access HDF data.

        Args:
            file_name (str): The file_name of the HDF file.
            is_read_only (bool): If True, the HDF file cannot be modified. Defaults to True.
            is_new_file (bool): If True, an already existing file will be completely removed. Defaults to False.
            is_overwritable (bool): If True, already existing arrays will be overwritten. If False, only new data can be appended. Defaults to False.

        """
        self.__file_name = os.path.abspath(file_name)
        if is_new_file:
            is_read_only = False
            if not os.path.exists(self.directory):
                os.makedirs(self.directory)
            with h5py.File(self.file_name, "w") as hdf_file:
                current_time = time.asctime()
                hdf_file.attrs["creation_time"] = current_time
                hdf_file.attrs["original_file_name"] = self.__file_name
                hdf_file.attrs["version"] = VERSION_NO
                hdf_file.attrs["last_updated"] = current_time
        else:
            with h5py.File(self.file_name, "r") as hdf_file:
                self.check()
        if is_overwritable:
            is_read_only = False
        self.__is_read_only = is_read_only
        self.__is_overwritable = is_overwritable

    def __eq__(self, other):
        return self.file_name == other.file_name

    def __hash__(self):
        return hash(self.file_name)

    def __str__(self):
        return f"<HDF_File {self.file_name}>"

    def __repr__(self):
        return str(self)

    def check(
        self,
        version: bool = True,
        file_name: bool = True,
    ) -> list:
        """Check if the `version` or `file_name` of this HDF_File have changed.

        Args:
            version (bool): If False, do not check the version. Defaults to True.
            file_name (bool): If False, do not check the file_name. Defaults to True.

        Returns:
            list: A list of warning messages stating any issues.

        """
        warning_messages = []
        if version:
            current_version = VERSION_NO
            creation_version = self.version
            if creation_version != current_version:
                warning_messages.append(
                    f"{self} was created with version "
                    f"{creation_version} instead of {current_version}."
                )
        if file_name:
            if self.file_name != self.original_file_name:
                warning_messages.append(
                    f"The file name of {self} has been changed from"
                    f"{self.original_file_name} to {self.file_name}."
                )
        return warning_messages

In [None]:
# %%time
# x = HDF_Object_Wrapper(
#     hdf_parent_file_name="/Users/swillems/Data/alphatims_testing2/20201016_tims03_Evo03_PS_MA_HeLa_200ng_DDA_06-15_5_6min_4cm_S1-A1_1_21717.hdf",
#     hdf_parent_group_name="/",
# )

In [None]:
x = HDF_File("sanderbox.hdf", read_only=False, truncate=True, new=True)

In [None]:
x.group = {}

In [None]:
x.group_with_att = {
    "name": "sander",
    "number": 1,
    "is_true": False
}

In [None]:
x.group.data = np.arange(10) / 3

In [None]:
df = pd.DataFrame(
    {
        "col_str": ["str", "i", "ngs"],
        "col2": np.empty(3)
    }
)
x.df = df

In [None]:
x.df.values

In [None]:
x.set_read_only(True)
# x.set_truncate(True)
# x.raw.test = arr

In [None]:
isinstance(x.raw._mz_values.dtype, str)

In [None]:
for i in x.raw._mobility_values: print(i)

In [None]:
x.raw._mz_values.hdf_parent_group_name