# Compression and Chunking

Objectives
 * Explore the use of compression 
 * Learn about chunking
 * Understand how dataset data is allocated

In [None]:
# Backend switch: every later cell refers to the selected package simply as `h5py`.
USE_H5PY = True # set to False to use HSDS instead
if USE_H5PY:
    import h5py
    WORK_DIR="."  # this directory
else:
    # h5pyd is imported under the name h5py so the remaining cells run unchanged
    import h5pyd as h5py
    # NOTE(review): presumably this HSDS folder must already exist and be writable -- confirm
    WORK_DIR="hdf5://home/test_user1/"
import os.path as op
import random

In [None]:
# Build the path of the file (or HSDS domain) used throughout this notebook
filepath = op.join(WORK_DIR, "04.h5")
print(f"creating HDF5 file here: {filepath}")
# mode 'w' creates the file, replacing any existing one of the same name
f = h5py.File(filepath, 'w')
# display the low-level identifier of the newly opened file object
f.id.id

In [None]:
# There are different compression filters that can be used.
# In HSDS, f.compressors returns the list of available filters.
# For HDF5Lib, the filters "gzip" and "lzf" are always available - others
# may be supported via a plugin.
# The leading None entry stands for "no compression".
if USE_H5PY:
    compressors = [None, "gzip", "lzf"]
else:
    compressors = [None, *f.compressors]
compressors

In [None]:
# Create one dataset per entry in `compressors`; the None entry yields an
# uncompressed dataset named "dset_None".
shape = (80, 40)
dtype = 'f8'
for compression in compressors:
    dset_name = f"dset_{compression}"
    if dset_name in f:
        # drop the dataset left over from an earlier run of this cell
        del f[dset_name]
    dset = f.create_dataset(
        dset_name,
        shape=shape,
        dtype=dtype,
        compression=compression,
    )
    # pick one random element (row first, then column) and store a single
    # random value in it
    i, j = (random.randint(0, extent - 1) for extent in shape)
    v = random.random()
    dset[i, j] = v


In [None]:
f.close()  # close the file so we can use h5ls on it (for h5py)

In [None]:
def get_dset_stats(filename, dsetname):
    if USE_H5PY:
        print(f"Running h5ls to get dataset info")
        ! h5ls -v {filename}/{dsetname}
    else:
        # H5PYD has a num_chunks attribute to tell you how many chunks
        # have been allocated
        # num_chunks is determined asynchronously by the HDF Server. 
        # If num_chunks is 0, wait a few seconds and re-run the cell.
        with h5py.File(filename) as f:
            dset = f[dsetname]
            logical_size = dset.dtype.itemsize
        for dim in dset.shape:
            logical_size *= dim
            print(f"logical size:   {logical_size}")
            if not dset.num_chunks:
                print("No chunks found, if something has been written to this dataset, wait a minute and try this again")
            else:
                chunk_size = dset.dtype.itemsize
                for dim in dset.chunks:
                    chunk_size *= dim
                print(f"Chunks: {dset.chunks} {chunk_size} bytes")
                # allocated size is also determined asynchronously, but 
                # is show be updated if num_chunks is
                print(f"allocated size: {dset.allocated_size}")
                ratio = logical_size/dset.allocated_size
                ratio *= 100.0
                print(f"utilization: {ratio:.2f}%")
                print(f"num_chunks: {dset.num_chunks}")
                if dset.compression:
                    print(f"Filter: {dset.compression} OPT: {dset.compression_opts}")
    print("-"*40)  

In [None]:
# Print the storage statistics of the "dset_<filter>" dataset for every
# entry in the compressors list
for compression in compressors:
    dset_name = f"dset_{compression}"
    get_dset_stats(filename=filepath, dsetname=dset_name)

Problem: which filter gave the best compression?  Why do you think other filters would be used?

# Chunking

HDF datasets can be tiled into a set of chunks.  This enables reading/writing data from storage to be done more efficiently.

With the HDF5 native library, each chunk is stored in a contiguous section of the file.

With HSDS, each chunk is stored as a separate object. The chunk shape determines how many chunks will be used.  If not provided in create_dataset, the HDF5 library will store all the data for the dataset contiguously.  In HSDS a chunk layout will be determined automatically. 

In [None]:
# create a dataset using the chunks option
f = h5py.File(filepath, "a")  # re-open in append mode
# remove a copy left over from an earlier run so that create_dataset does
# not fail on the existing name when this cell is re-run
if "dset_chunks" in f:
    del f["dset_chunks"]
dset4 = f.create_dataset("dset_chunks", (40,80), dtype='i1', chunks=(4,8))
# display the low-level identifier of the new dataset object
dset4.id.id

In [None]:
# The chunks property will return the chunk shape provided in the initializer
# (the chunks=(4,8) argument passed to create_dataset for "dset_chunks")
dset4.chunks

In [None]:
# A much larger dataset with the same small chunk shape:
# (4000/4) x (8000/8) = 1,000,000 possible chunks
# remove a copy left over from an earlier run so that create_dataset does
# not fail on the existing name when this cell is re-run
if "big_dset" in f:
    del f["big_dset"]
dset5 = f.create_dataset("big_dset", (4000,8000), dtype='i1', chunks=(4,8))

In [None]:
# show the chunk shape of the large dataset
dset5.chunks

In [None]:
# no actual chunks have been stored since we haven't written anything to the dataset.
# write something to the dataset, this will initialize several chunks
# (row 2000 spans all 8000 columns; with chunks 8 columns wide that touches 8000/8 = 1000 chunks)
dset5[2000,:] = 42

In [None]:
# display the low-level identifier of the dataset object
# NOTE(review): presumably an integer handle with h5py and a server-side id with h5pyd -- confirm
dset5.id.id