In [1]:
import random
import numpy as np
USE_H5PY=False
if USE_H5PY:
    import h5py
    filepath = "./compound.h5"
else:
    import h5pyd as h5py
    filepath = "/home/test_user1/test/compound.h5"

In [2]:
# create a new domain/file
f = h5py.File(filepath, "w")

In [3]:
# create a numpy dtype with 260 Fields: 
# A0, A1, A2, ..., Z7, Z8, Z9
fields = []
for i in range(26):
    ch1 = chr(ord('A') + i)
    for j in range(10):
        ch2 = chr(ord('0') + j)
        fields.append((ch1+ch2, "S6"))
dt = np.dtype(fields)


In [4]:
# create a dataset using the dtype
NUM_ROWS = 10000
dset = f.create_dataset("dset", (NUM_ROWS,), dtype=dt)
dset


<HDF5 dataset "dset": shape (10000,), type "|V1560">

In [5]:
# write some values into the dataset
arr = np.zeros((NUM_ROWS,), dtype=dt)
for i in range(NUM_ROWS):
    row = arr[i]
    for name in dt.names:
        row[name] = f"{i:03d}_{name}".encode()
dset[:] = arr[:]

In [6]:
# get a random set of field names.
# k controls the max number of names returned
names = random.choices(dt.names, k=10)
names = list(set(names))
names

['C4', 'P6', 'V0', 'S8', 'P4', 'B5', 'L1', 'E7']

In [7]:
# Get the dataset values and then return the field selection
%time dset[:][names]

CPU times: user 27.1 ms, sys: 16.1 ms, total: 43.2 ms
Wall time: 93.8 ms


array([(b'000_C4', b'000_P6', b'000_V0', b'000_S8', b'000_P4', b'000_B5', b'000_L1', b'000_E7'),
       (b'001_C4', b'001_P6', b'001_V0', b'001_S8', b'001_P4', b'001_B5', b'001_L1', b'001_E7'),
       (b'002_C4', b'002_P6', b'002_V0', b'002_S8', b'002_P4', b'002_B5', b'002_L1', b'002_E7'),
       ...,
       (b'9997_C', b'9997_P', b'9997_V', b'9997_S', b'9997_P', b'9997_B', b'9997_L', b'9997_E'),
       (b'9998_C', b'9998_P', b'9998_V', b'9998_S', b'9998_P', b'9998_B', b'9998_L', b'9998_E'),
       (b'9999_C', b'9999_P', b'9999_V', b'9999_S', b'9999_P', b'9999_B', b'9999_L', b'9999_E')],
      dtype={'names': ['C4', 'P6', 'V0', 'S8', 'P4', 'B5', 'L1', 'E7'], 'formats': ['S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6'], 'offsets': [144, 936, 1260, 1128, 924, 90, 666, 282], 'itemsize': 1560})

In [8]:
# Have HSDS (or HDF5 lib) return just the values for the given set of field names
# Will return same values as above cell, but should be faster as less data needs 
# to be transferred 
%time dset.fields(names)[:]

CPU times: user 3.92 ms, sys: 0 ns, total: 3.92 ms
Wall time: 20.7 ms


array([(b'000_C4', b'000_P6', b'000_V0', b'000_S8', b'000_P4', b'000_B5', b'000_L1', b'000_E7'),
       (b'001_C4', b'001_P6', b'001_V0', b'001_S8', b'001_P4', b'001_B5', b'001_L1', b'001_E7'),
       (b'002_C4', b'002_P6', b'002_V0', b'002_S8', b'002_P4', b'002_B5', b'002_L1', b'002_E7'),
       ...,
       (b'9997_C', b'9997_P', b'9997_V', b'9997_S', b'9997_P', b'9997_B', b'9997_L', b'9997_E'),
       (b'9998_C', b'9998_P', b'9998_V', b'9998_S', b'9998_P', b'9998_B', b'9998_L', b'9998_E'),
       (b'9999_C', b'9999_P', b'9999_V', b'9999_S', b'9999_P', b'9999_B', b'9999_L', b'9999_E')],
      dtype=[('C4', 'S6'), ('P6', 'S6'), ('V0', 'S6'), ('S8', 'S6'), ('P4', 'S6'), ('B5', 'S6'), ('L1', 'S6'), ('E7', 'S6')])