In [1]:
import random
import numpy as np
USE_H5PY=False
if USE_H5PY:
    import h5py
    filepath = "./compound.h5"
else:
    import h5pyd as h5py
    filepath = "/home/test_user1/test/compound.h5"

In [2]:
# create a new domain/file
f = h5py.File(filepath, "w")

In [3]:
# create a numpy dtype with 260 Fields: 
# A0, A1, A2, ..., Z7, Z8, Z9
fields = []
for i in range(26):
    ch1 = chr(ord('A') + i)
    for j in range(10):
        ch2 = chr(ord('0') + j)
        fields.append((ch1+ch2, "S6"))
dt = np.dtype(fields)


In [4]:
# create a dataset using the dtype
NUM_ROWS = 10000
dset = f.create_dataset("dset", (NUM_ROWS,), dtype=dt)
dset


<HDF5 dataset "dset": shape (10000,), type "|V1560">

In [5]:
# write some values into the dataset
arr = np.zeros((NUM_ROWS,), dtype=dt)
for i in range(NUM_ROWS):
    row = arr[i]
    for name in dt.names:
        row[name] = f"{i:03d}_{name}".encode()
dset[:] = arr[:]

In [6]:
# get a random set of field names.
# k controls the max number of names returned
names = random.choices(dt.names, k=10)
names = list(set(names))
names

['G4', 'Z3', 'P5', 'A2', 'Q7', 'N9', 'C2', 'H6', 'A0', 'K3']

In [7]:
# Get the dataset values and then return the field selection
%time dset[:][names]

CPU times: user 45.6 ms, sys: 4.02 ms, total: 49.7 ms
Wall time: 146 ms


array([(b'000_G4', b'000_Z3', b'000_P5', b'000_A2', b'000_Q7', b'000_N9', b'000_C2', b'000_H6', b'000_A0', b'000_K3'),
       (b'001_G4', b'001_Z3', b'001_P5', b'001_A2', b'001_Q7', b'001_N9', b'001_C2', b'001_H6', b'001_A0', b'001_K3'),
       (b'002_G4', b'002_Z3', b'002_P5', b'002_A2', b'002_Q7', b'002_N9', b'002_C2', b'002_H6', b'002_A0', b'002_K3'),
       ...,
       (b'9997_G', b'9997_Z', b'9997_P', b'9997_A', b'9997_Q', b'9997_N', b'9997_C', b'9997_H', b'9997_A', b'9997_K'),
       (b'9998_G', b'9998_Z', b'9998_P', b'9998_A', b'9998_Q', b'9998_N', b'9998_C', b'9998_H', b'9998_A', b'9998_K'),
       (b'9999_G', b'9999_Z', b'9999_P', b'9999_A', b'9999_Q', b'9999_N', b'9999_C', b'9999_H', b'9999_A', b'9999_K')],
      dtype={'names': ['G4', 'Z3', 'P5', 'A2', 'Q7', 'N9', 'C2', 'H6', 'A0', 'K3'], 'formats': ['S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6', 'S6'], 'offsets': [384, 1518, 930, 12, 1002, 834, 132, 456, 0, 618], 'itemsize': 1560})

In [8]:
# Have HSDS (or HDF5 lib) return just the values for the given set of field names
# Will return same values as above cell, but should be faster as less data needs 
# to be transferred 
%time dset.fields(names)[:]

CPU times: user 1.11 ms, sys: 3.89 ms, total: 5 ms
Wall time: 25.6 ms


array([(b'000_G4', b'000_Z3', b'000_P5', b'000_A2', b'000_Q7', b'000_N9', b'000_C2', b'000_H6', b'000_A0', b'000_K3'),
       (b'001_G4', b'001_Z3', b'001_P5', b'001_A2', b'001_Q7', b'001_N9', b'001_C2', b'001_H6', b'001_A0', b'001_K3'),
       (b'002_G4', b'002_Z3', b'002_P5', b'002_A2', b'002_Q7', b'002_N9', b'002_C2', b'002_H6', b'002_A0', b'002_K3'),
       ...,
       (b'9997_G', b'9997_Z', b'9997_P', b'9997_A', b'9997_Q', b'9997_N', b'9997_C', b'9997_H', b'9997_A', b'9997_K'),
       (b'9998_G', b'9998_Z', b'9998_P', b'9998_A', b'9998_Q', b'9998_N', b'9998_C', b'9998_H', b'9998_A', b'9998_K'),
       (b'9999_G', b'9999_Z', b'9999_P', b'9999_A', b'9999_Q', b'9999_N', b'9999_C', b'9999_H', b'9999_A', b'9999_K')],
      dtype=[('G4', 'S6'), ('Z3', 'S6'), ('P5', 'S6'), ('A2', 'S6'), ('Q7', 'S6'), ('N9', 'S6'), ('C2', 'S6'), ('H6', 'S6'), ('A0', 'S6'), ('K3', 'S6')])

In [9]:
f.close()