# Compression and Chunking

Objectives
 * Explore the use of compression 
 * Learn about chunking
 * Understand how dataset data is allocated

In [1]:
# Backend switch: 0 -> h5pyd (HDF Server / HSDS), 1 -> h5py (local HDF5 library)
USE_H5PY = 0

In [2]:
if USE_H5PY:
    import h5py
else:
    import h5pyd as h5py
import numpy as np
import os
import random

In [3]:
#
# Get folder/directory for HDF files we create  
#
def getMyFolder():
    """Return the folder used for the HDF files this tutorial creates.

    The folder is created if it does not exist yet.

    * With USE_H5PY set, this is the local directory "<home>/HDFLabTutorial/".
    * Otherwise it is the server folder "/home/<name>/HDFLabTutorial/", where
      "/home/<name>/" is the first folder under "/home/" whose name is a
      prefix of the JUPYTERHUB_USER environment variable and whose owner is
      that user (e.g. folder "/home/bob/" for username "bob@acme.com").

    Returns:
        The folder path (ending in "/"), or None when the user name is not
        set, no home folder owned by the user is found, or the server
        reports an unexpected error while probing the tutorial folder.
    """
    DIR_NAME = "HDFLabTutorial/"

    if USE_H5PY:
        # expanduser falls back to the password database when HOME is unset
        myfolder = os.path.expanduser("~") + "/" + DIR_NAME
        if not os.path.isdir(myfolder):
            # create a directory on the local disk if needed; report it only
            # after the directory was actually created
            os.mkdir(myfolder)
            print("created folder:", myfolder)
        return myfolder

    username = os.getenv("JUPYTERHUB_USER")
    if not username:
        # without a user name there is nothing to match folders against
        return None

    myfolder = None
    home_dir = h5py.Folder('/home/')  # folder object listing all home folders
    try:
        for name in home_dir:
            # only look at folders whose name matches the start of the username
            if not username.startswith(name):
                continue
            path = '/home/' + name + '/'
            candidate = h5py.Folder(path)
            try:
                if candidate.owner == username:
                    myfolder = path
            finally:
                candidate.close()
            if myfolder:
                break
    finally:
        home_dir.close()

    if myfolder is None:
        # no home folder owned by this user
        return None

    # create a workshop subfolder if not already present
    myfolder += DIR_NAME
    try:
        existing = h5py.Folder(myfolder)
    except IOError as ioe:
        if ioe.errno != 404:
            return None  # unexpected error
        # not present - create it now
        h5py.Folder(myfolder, mode='x').close()
        print("created folder:", myfolder)
    else:
        existing.close()

    return myfolder

In [10]:
# Pick the folder to work in:
# a POSIX directory if USE_H5PY is 1, or a server folder if it is 0.
# NOTE: "/home/test_user1/" is a hard-coded server folder - replace it with
# your own folder (getMyFolder() can look it up) when running as another user.
home = "./" if USE_H5PY else "/home/test_user1/"
home  # this is the folder where you have permission to write to

'/home/test_user1/'

In [11]:
# Create a file on the disk (h5py) or a domain on the server (h5pyd),
# depending on USE_H5PY, and keep it open for the following cells.
filename = home + "04.h5"
f = h5py.File(filename, mode='w')

In [12]:
# Display the low-level identifier of the open file object.
# With h5pyd this shows up as a UUID-style string such as 'g-...'
# NOTE(review): the value looks different with h5py - confirm before relying on it.
f.id.id

'g-3a59db18-8011c475-b2d1-539008-994cfd'

In [8]:
# There are different compression filters that can be used.
# In HSDS, f.compressors will return a list of available filters.
# For HDF5Lib, filters "gzip" and "lzf" are always available - others
# may be supported via a plugin.
# The leading None entry stands for "no compression".
compressors = [None] + list(("gzip", "lzf") if USE_H5PY else f.compressors)
compressors

[None, 'blosclz', 'lz4', 'lz4hc', 'gzip', 'zstd', 'deflate']

In [8]:
# Create one dataset per compressor (None means no compression).
# All datasets share the same shape and element type so their storage
# footprints can be compared.
shape = (80, 40)
dtype = 'f8'
for compression in compressors:
    dset_name = f"dset_{compression}"
    if dset_name in f:
        # remove a dataset left over from a previous run of this cell
        del f[dset_name]
    dset = f.create_dataset(dset_name, shape=shape, dtype=dtype, compression=compression)
    # write one random value at a random position in the dataset
    row = random.randrange(shape[0])
    col = random.randrange(shape[1])
    dset[row, col] = random.random()



In [9]:
f.close()  # close the file so that we can use h5ls on it (for h5py)

In [10]:
def get_dset_stats(filename, dsetname):
    if USE_H5PY:
        print(f"Running h5ls to get dataset info")
        ! h5ls -v {filename}/{dsetname}
    else:
        # H5PYD has a num_chunks attribute to tell you how many chunks
        # have been allocated
        # num_chunks is determined asynchronously by the HDF Server. 
        # If num_chunks is 0, wait a few seconds and re-run the cell.
        with h5py.File(filename) as f:
            dset = f[dsetname]
            logical_size = dset.dtype.itemsize
        for dim in dset.shape:
            logical_size *= dim
            print(f"logical size:   {logical_size}")
            if not dset.num_chunks:
                print("No chunks found, if something has been written to this dataset, wait a few seconds and try this again")
            else:
                chunk_size = dset.dtype.itemsize
                for dim in dset.chunks:
                    chunk_size *= dim
                print(f"Chunks: {dset.chunks} {chunk_size} bytes")
                # allocated size is also determined asynchronously, but 
                # is show be updated if num_chunks is
                print(f"allocated size: {dset.allocated_size}")
                ratio = logical_size/dset.allocated_size
                ratio *= 100.0
                print(f"utilization: {ratio:.2f}%")
                print(f"num_chunks: {dset.num_chunks}")
                if dset.compression:
                    print(f"Filter: {dset.compression} OPT: {dset.compression_opts}")
    print("-"*40)  

In [11]:
# Report the storage statistics of the dataset created for each compressor
for compression in compressors:
    get_dset_stats(filename, f"dset_{compression}")

logical size:   640
Chunks: (80, 40) 25600 bytes
allocated size: 25600
utilization: 2.50%
num_chunks: 1
logical size:   25600
Chunks: (80, 40) 25600 bytes
allocated size: 25600
utilization: 100.00%
num_chunks: 1
----------------------------------------
logical size:   640
Chunks: (80, 40) 25600 bytes
allocated size: 193
utilization: 331.61%
num_chunks: 1
Filter: blosclz OPT: None
logical size:   25600
Chunks: (80, 40) 25600 bytes
allocated size: 193
utilization: 13264.25%
num_chunks: 1
Filter: blosclz OPT: None
----------------------------------------
logical size:   640
Chunks: (80, 40) 25600 bytes
allocated size: 191
utilization: 335.08%
num_chunks: 1
Filter: lz4 OPT: None
logical size:   25600
Chunks: (80, 40) 25600 bytes
allocated size: 191
utilization: 13403.14%
num_chunks: 1
Filter: lz4 OPT: None
----------------------------------------
logical size:   640
Chunks: (80, 40) 25600 bytes
allocated size: 178
utilization: 359.55%
num_chunks: 1
Filter: lz4hc OPT: None
logical size:   2

Problem: which filter gave the best compression?  Why do you think other filters would be used?

# Chunking

HDF datasets are tiled into a set of chunks.  This enables reading/writing data from storage to be done more efficiently.

With the HDF5 native library, each chunk is stored in a contiguous section of the file.

With HSDS, each chunk is stored as a separate S3 object. The chunk shape determines how many chunks will be used.  If a chunk shape is not provided in create_dataset, the chunk layout will be determined automatically. 

In [12]:
# Create a dataset using the chunks option.
# The file was closed earlier, so re-open it in append mode first.
f = h5py.File(filename, mode="a")
dset4 = f.create_dataset("dset_chunks", shape=(40, 80), dtype='i1', chunks=(4, 8))

In [13]:
# The library will faithfully use the chunk layout provided, but
# the server will take the chunk layout as a hint.
# Display the chunk layout that was actually assigned.
dset4.chunks

(40, 80)

Unlike the HDF5 library, the HDF Server may alter the requested chunk layout so that each chunk is a reasonable size when stored in S3 (typically between 2 and 4 MB).

Try again with a larger dataset.

In [14]:
# Same requested chunk layout as before, but for a much larger dataset
dset5 = f.create_dataset("big_dset", shape=(4000, 8000), dtype='i1', chunks=(4, 8))

In [15]:
# display the chunk layout actually assigned to the larger dataset
dset5.chunks

(1024, 2048)

For HSDS, note that the aspect ratio (the "shape") of the requested chunk was preserved (4:8 became 1024:2048), but the chunk was scaled up to reach the desired chunk size.

In [16]:
# No actual chunks have been stored yet since we haven't written anything to the dataset.
# Write a value to every element of row 2000; this will initialize several chunks
# (every chunk that the row passes through).
dset5[2000,:] = 42

In [17]:
# close the file; get_dset_stats re-opens it by name when using h5pyd
f.close()

In [18]:
# print the storage statistics for the large dataset that was just written to
get_dset_stats(filename, "big_dset")

logical size:   4000
Chunks: (1024, 2048) 2097152 bytes
allocated size: 8388608
utilization: 0.05%
num_chunks: 4
logical size:   32000000
Chunks: (1024, 2048) 2097152 bytes
allocated size: 8388608
utilization: 381.47%
num_chunks: 4
----------------------------------------
