# Compression and Chunking

Objectives
 * Explore the use of compression 
 * Learn about chunking
 * Understand how dataset data is allocated

In [1]:
USE_H5PY = True # set to False to use HSDS instead
if USE_H5PY:
    import h5py
    WORK_DIR="."  # this directory
else:
    import h5pyd as h5py
    WORK_DIR="hdf5://tutorial/"  # can be any folder where you have write access
import os.path as op
import numpy as np

In [2]:
filepath = op.join(WORK_DIR, "04.h5")
print(f"creating HDF5 file here: {filepath}")
f = h5py.File(filepath, 'w')
f.id.id

creating HDF5 file here: ./04.h5


72057594037927936

In [3]:
# there a different compression filters that can be used
# In HSDS, f.compressors will return a list of available filters
# For HDF5Lib, filters "gzip" and "lzf" are always available - others 
# may be sorted via a plugin
compressors = [None,]
if USE_H5PY:
    compressors.extend(("gzip", "lzf"))
else:
    compressors.extend(f.compressors)
compressors

[None, 'gzip', 'lzf']

In [4]:
# create a dataset with each type of compressor (+None for no compression)
shape = (80, 40)
dtype = 'i8'
for compression in compressors:
    dset_name = f"dset_{compression}"
    if dset_name in f:
        del f[dset_name] # delete the dataset if it already exists
    # create an array of random ints with a upper cap on the upper value
    # (otherwise compressor won't be able to much)
    arr = np.random.randint(0, 1000, shape)
    dset = f.create_dataset(dset_name, data=arr, compression=compression)


In [5]:
f.close()  # close file we can use h5ls on it (for h5py)

In [6]:
def get_dset_stats(filename, dsetname):
    if USE_H5PY:
        print(f"Running h5ls to get dataset info")
        ! h5ls -v {filename}/{dsetname}
    else:
        # H5PYD has a num_chunks attribute to tell you how many chunks
        # have been allocated
        # num_chunks is determined asynchronously by the HDF Server. 
        # If num_chunks is 0, wait a few seconds and re-run the cell.
        with h5py.File(filename) as f:
            dset = f[dsetname]
            logical_size = dset.dtype.itemsize
        for dim in dset.shape:
            logical_size *= dim
        print(f"logical size:   {logical_size}")
        if not dset.num_chunks:
            print("No chunks found, if something has been written to this dataset, wait a minute and try this again")
        else:
            obj_json = dset.id.obj_json
            layout = obj_json["layout"]
            chunk_shape = layout["dims"]
            chunk_size = dset.dtype.itemsize
            for dim in chunk_shape:
                chunk_size *= dim
            print(f"Chunks: {chunk_shape} {chunk_size} bytes")
            # allocated size is also determined asynchronously, but 
            # is show be updated if num_chunks is
            print(f"allocated size: {dset.allocated_size}")
            ratio = logical_size/dset.allocated_size
            ratio *= 100.0
            print(f"utilization: {ratio:.2f}%")
            print(f"num_chunks: {dset.num_chunks}")
            if dset.compression:
                print(f"Filter: {dset.compression} OPT: {dset.compression_opts}")
    print("-"*40)  

In [7]:
for compression in compressors:
    dset_name = f"dset_{compression}"
    print(dset_name)
    get_dset_stats(filepath, dset_name)

dset_None
Running h5ls to get dataset info
Opened "./04.h5" with sec2 driver.
dset_None                Dataset {80/80, 40/40}
    Location:  1:800
    Links:     1
    Storage:   25600 logical bytes, 25600 allocated bytes, 100.00% utilization
    Type:      native long
----------------------------------------
dset_gzip
Running h5ls to get dataset info
Opened "./04.h5" with sec2 driver.
dset_gzip                Dataset {80/80, 40/40}
    Location:  1:1400
    Links:     1
    Chunks:    {40, 20} 6400 bytes
    Storage:   25600 logical bytes, 7128 allocated bytes, 359.15% utilization
    Filter-0:  deflate-1 OPT {4}
    Type:      native long
----------------------------------------
dset_lzf
Running h5ls to get dataset info
Opened "./04.h5" with sec2 driver.
dset_lzf                 Dataset {80/80, 40/40}
    Location:  1:1672
    Links:     1
    Chunks:    {40, 20} 6400 bytes
    Storage:   25600 logical bytes, 10852 allocated bytes, 235.90% utilization
    Filter-0:  lzf-32000 OPT {4,

Problem: which filter gave the best compression?  Why do you think other filters would be used?

# Chunking

HDF datasets can be tiled into a a set of chunks.  This enables reading/writing data from storage to be done more efficiently.

With the HDF5 native library, each chunk is stored in a contiguous section of the file.

With HSDS, each chunk is stored as a seperate object. The chunk shape determines how many chunks will be used.  If not provided in create_dataset, the HDF5 library will store all the data for the file contiguously.  In HSDS a chunk layout will be determined automatically. 

In [8]:
# create a datatset using the chunks option
f = h5py.File(filepath, "a")  # re-open in append mode
dset4 = f.create_dataset("dset_chunks", (40,80), dtype='i1', chunks=(4,8))
dset4.id.id

360287970189639683

In [9]:
# The chunk property will return the chunk shape provided in the initializer
dset4.chunks

(4, 8)

In [10]:
# create a larger dataset using the chunks layout
dset5 = f.create_dataset("big_dset", (4000,8000), dtype='i1', chunks=(4,8))

In [11]:
# get the chunk layout for dsset5
dset5.chunks

(4, 8)

In [12]:
# internally, HSDS may alter the requested chunk shape for efficiency
if not USE_H5PY:
    obj_json = dset5.id.obj_json
    layout = obj_json["layout"]
    print(layout)

In [13]:
# no actual chunks have been stored since we haven't written anything to the dataset.
# write something to the dataset, this will initialize several chunks
dset5[2000,:] = 42

In [14]:
# close for use with h5stat
f.close()

In [15]:
# get info on what's been allocated
get_dset_stats(filepath, dset5.name)

Running h5ls to get dataset info
Opened "./04.h5" with sec2 driver.
None       **NOT FOUND** ----------------------------------------
