In [1]:
%reload_ext autoreload
%autoreload 2

# HDF

This module provides a common interface to access HDF files. It can be imported as follows:

In [2]:
import alphabase.io.hdf

# Other packages used to demonstrate functionality
import numpy as np
import pandas as pd
import re
import os
import contextlib

Instead of relying directly on the `h5py` interface, we use an HDF wrapper file that provides consistent access to only those HDF features we need. Since the components of an HDF file come in three kinds (`datasets`, `groups` and `attributes`), we first define a generic HDF wrapper object to handle these components. Once this is done, the HDF wrapper file can be treated as such an object, with additional features to open and close the initial connection.

In [21]:
#hide

import tempfile

def test_HDF_wrappers():
    """Smoke-test write/read round-trips through `alphabase.io.hdf.HDF_File`.

    A fresh `sandbox.hdf` is created in the current working directory. An
    attribute, an array, an attribute on that array, a nested dict and a
    dataframe are then assigned to it in turn. After each assignment the file
    on disk must have grown, and the values that are read back must match
    what was written.

    Raises:
        AssertionError: If a value read back differs from what was written, or
            if the file size did not increase after a write.
    """
    hdf_file_name = "sandbox.hdf"
    hdf_file = alphabase.io.hdf.HDF_File(
        hdf_file_name,
        read_only=False,
        truncate=True,
        delete_existing=True
    )
    # NOTE(review): shape assertions on the file itself (`hdf_file.shape`) and
    # on `hdf_file.group.subgroup1.shape` were commented out in the original
    # test; re-enable them once it is confirmed that these are supported.
    last_size = os.path.getsize(hdf_file_name)

    def assert_file_grew(written):
        """Assert the file is larger than at the previous check, then remember its size."""
        nonlocal last_size
        current_size = os.path.getsize(hdf_file_name)
        assert current_size > last_size, (
            f"Filesize not increased after writing {written}"
        )
        last_size = current_size

    # Attribute directly on the file.
    hdf_file.attr1 = 1
    np.testing.assert_equal(1, hdf_file.attr1)
    assert_file_grew("attr1")

    # Array: full read-back, sliced read-back and shape.
    # A fixed seed keeps the test data identical between runs, so a failing
    # comparison can be reproduced exactly.
    array = np.random.default_rng(0).random(10)
    hdf_file.array = array
    np.testing.assert_equal(array, hdf_file.array.values)
    np.testing.assert_equal(array[:3], hdf_file.array[:3])
    np.testing.assert_equal((10,), hdf_file.array.shape)
    assert_file_grew("array")

    # Attribute attached to the array rather than to the file.
    hdf_file.array.array_attr = "some attr"
    np.testing.assert_equal(hdf_file.array.array_attr, "some attr")
    assert_file_grew("array.array_attr")

    # Nested dict containing empty dicts, an array and a bool.
    group = {
        "subgroup1": {
            "subsubgroup": {},
            "same_array": array,
            "a_bool": True
        },
        "subgroup2": {}
    }
    hdf_file.group = group
    assert_file_grew("group")
    np.testing.assert_equal(hdf_file.group.subgroup1.a_bool, True)

    # Dataframe with one integer and one string column.
    df = pd.DataFrame(
        {
            "col2": np.arange(3),
            "col_str": ["str", "i", "ngs"],
        }
    )
    hdf_file.df = df
    assert_file_grew("df")
    assert hdf_file.df.values.equals(df)
    
# Run the test right away so that a failure surfaces when this cell executes.
test_HDF_wrappers()

In [4]:
# Create the writable HDF file that the demonstration cells below operate on.
# NOTE(review): `truncate=True` / `delete_existing=True` presumably make this
# start from an empty file even if `sandbox.hdf` already exists — confirm
# against the `HDF_File` documentation.
hdf_file_name = "sandbox.hdf"
hdf_file = alphabase.io.hdf.HDF_File(
    hdf_file_name,
    read_only=False,
    truncate=True,
    delete_existing=True
)

In [5]:
# An integer and a string are assigned as plain Python attributes on the file object.
hdf_file.attr1 = 1
# NOTE(review): "Mathhias" looks like a typo for "Matthias"; left untouched
# here because it is data written to the file.
hdf_file.author = "Mathhias"
# An empty dict is assigned as `group`, after which an array is assigned inside it.
# NOTE(review): presumably the dict becomes an HDF group and the array an HDF
# dataset — confirm against the `HDF_File` implementation.
hdf_file.group = {}
hdf_file.group.array = np.arange(10)

In [6]:
# Append five more values (0..4) to the ten-element array stored above.
hdf_file.group.array.append(np.arange(5))

In [7]:
# Read the whole array back after the append.
# NOTE(review): the recorded output has 15 elements but ends in five zeros
# rather than 0..4, so the appended values do not appear to have been written.
# Confirm whether `append` is expected to only grow the dataset, or whether
# this output is stale.
hdf_file.group.array.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0])

In [8]:
# Ten-row frame: `feng` counts 0..9 and the scalar string for `sander` is
# broadcast by pandas to every row.
df = pd.DataFrame(
    {
        "feng": np.arange(10),
        "sander": "my name"
    }
)
# Assign the frame to the HDF file under the name `df`.
hdf_file.df = df

In [9]:
# Overwrite the `sander` column of `df` in place, then append the modified
# frame to the stored one. After this cell the in-memory `df` no longer
# matches the rows that were first written to `hdf_file.df`.
# NOTE(review): re-running this cell presumably appends another ten rows each
# time, so the row count shown below only holds for a single execution.
df["sander"] = "YAR"
hdf_file.df.append(df)

In [10]:
# Row count after the append: 10 rows written first plus 10 appended rows.
len(hdf_file.df)

20

In [11]:
# df2 = pd.DataFrame(
#     {
#         "feng": np.arange(10),
#         "sander": "your name"
#     }|
# )

In [12]:
# hdf_file_name = "sandbox.hdf"
# hdf_file2 = HDF_File(
#     hdf_file_name,
# )

In [13]:
# hdf_file2.feng = 2

In [14]:
# hdf_file2.set_read_only(False)

In [15]:
# hdf_file2.set_truncate(True)

In [16]:
# hdf_file2.metadata

In [17]:
# %timeit hdf_file.group.array[[1,5,8]]
# %timeit hdf_file.group.array.values[[1,5,8]]

In [18]:
# Build a three-element string array and convert it to `object` dtype before
# assigning it to the file as `array2`.
array = np.array(["str", "i", "ngs"])
array = array.astype(object)
hdf_file.array2 = array
# A separate integer array at the top level of the file; a later cell reads
# it back directly with h5py.
hdf_file.array = np.arange(10)

In [19]:
# Read the string array back; the recorded output keeps `dtype=object`.
hdf_file.array2.values

array(['str', 'i', 'ngs'], dtype=object)

In [20]:
# Read `hdf_file.array` straight through h5py instead of through the wrapper's
# own accessors, using only the wrapper's `file_name` and `name`.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import h5py
# NOTE(review): `self` is an ordinary global variable here, not a method
# argument — presumably this cell was lifted from a method body for debugging.
self = hdf_file.array
with h5py.File(self.file_name, "r") as hdf_file2:
    ds = hdf_file2[self.name]
    # `check_string_dtype` returns None for non-string dtypes; string datasets
    # are wrapped with `asstr()` so that they are read as `str`.
    if h5py.check_string_dtype(ds.dtype) is not None:
        ds = ds.asstr()
    # `[...]` reads the complete dataset into memory.
    x = ds[...]
        
    
# Display what was read; the recorded output is the integer array 0..9.
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])