# Compression - my first tutorial

Objectives
 * Explore the use of compression 
 * Learn about chunking
 * Understand how dataset data is allocated

In [1]:
# Backend switch: 1 imports h5py (local HDF5 files), 0 imports h5pyd (HDF Server)
USE_H5PY=1  # set to 0 to use HDF Server instead

In [2]:
if USE_H5PY:
    import h5py
else:
    import h5pyd as h5py
import numpy as np
import os
import subprocess

In [3]:
#
# Get folder/directory for HDF files we create  
#
def getMyFolder():
    """Return the folder (with trailing "/") where the tutorial files are created.

    With USE_H5PY set, this is "<HOME>/HDFLabTutorial/" on the local disk.
    Otherwise it is the "HDFLabTutorial/" sub-folder of the folder under
    "/home/" on the HDF Server that is owned by the user named in the
    JUPYTERHUB_USER environment variable.  The folder is created if it
    does not exist yet.

    Returns:
        The folder path as a str, or None if the server home folder could
        not be determined or the server reported an unexpected error.
    """
    DIR_NAME = "HDFLabTutorial/"

    if USE_H5PY:
        # fall back to "~" expansion if HOME is not set in the environment
        home_dir = os.getenv("HOME") or os.path.expanduser("~")
        myfolder = home_dir + "/" + DIR_NAME
        if not os.path.isdir(myfolder):
            # create a directory on the local disk if needed
            os.makedirs(myfolder, exist_ok=True)
            # only report the folder once it has actually been created
            print("created folder:", myfolder)
        return myfolder

    username = os.getenv("JUPYTERHUB_USER")
    if not username:
        print("JUPYTERHUB_USER is not set, unable to determine the home folder")
        return None

    myfolder = None
    home_root = h5py.Folder('/home/')  # get folder object for root
    try:
        for name in home_root:
            # check any folders where the name matches at least part of the username
            # e.g. folder: "/home/bob/" for username "bob@acme.com"
            if not username.startswith(name):
                continue
            path = '/home/' + name + '/'
            candidate = h5py.Folder(path)
            try:
                if candidate.owner == username:
                    myfolder = path
            finally:
                candidate.close()
            if myfolder:
                break
    finally:
        # release the root folder even if the scan above fails
        home_root.close()

    if myfolder is None:
        # no folder under /home/ is owned by this user
        print("no home folder found for user:", username)
        return None

    # create a workshop subfolder if not already present
    myfolder += DIR_NAME
    try:
        # opening is only an existence check, so close the handle right away
        h5py.Folder(myfolder).close()
    except IOError as ioe:
        if ioe.errno != 404:
            print("unexpected error opening folder:", myfolder, ioe)
            return None  # unexpected error
        # not present - create it now
        h5py.Folder(myfolder, mode='x').close()
        print("created folder:", myfolder)

    return myfolder

In [4]:
# Get your home folder
# will be a posix directory if USE_H5PY is 1, or a server folder if 0
home = getMyFolder()
home  # this is the folder where you have permission to write to

'/home/jovyan/HDFLabTutorial/'

In [5]:
# create a file on the disk, or a domain on the server (based on USE_H5PY)
# filename is reused by later cells to re-open the file and to report dataset stats
filename = home + "04a.h5"
f = h5py.File(filename, 'w')

In [6]:
f.id.id

72057594037927936

In [7]:
# create some test data
# random values leave gzip very little redundancy to exploit, which shows up
# in the storage stats reported for 'dset_gzip1' further down
arr = np.random.rand(40, 80)
arr

array([[0.66684702, 0.42902893, 0.40111429, ..., 0.51002711, 0.95600509,
        0.51617687],
       [0.4909785 , 0.03470718, 0.67728895, ..., 0.31901017, 0.39192206,
        0.65180373],
       [0.04044594, 0.34854649, 0.45788198, ..., 0.29439028, 0.7946077 ,
        0.51152485],
       ...,
       [0.74341396, 0.91914145, 0.00918109, ..., 0.12316645, 0.83323736,
        0.6971266 ],
       [0.99814172, 0.74376858, 0.35049818, ..., 0.4942349 , 0.93584094,
        0.64501134],
       [0.88213715, 0.02006558, 0.45317104, ..., 0.19532329, 0.87759811,
        0.91487925]])

In [8]:
# non-compressed array: baseline to compare the compressed datasets against
dset1 = f.create_dataset('dset_nocompression', data=arr)

In [9]:
dset1.dtype

dtype('<f8')

In [10]:
dset1.shape

(40, 80)

In [11]:
# this will return None for h5py, since chunks are not auto created for small datasets
dset1.chunks

In [12]:
# create a dataset using the gzip filter
# (same data as 'dset_nocompression'; h5ls reports compression_opts as the filter's OPT value)
dset2 = f.create_dataset('dset_gzip1', data=arr, compression='gzip', compression_opts=9)

In [13]:
dset2.shape

(40, 80)

In [14]:
# create another compressed dataset, but initialize it with data that is 
# easily compressed
# shape, dtype and filter settings match 'dset_gzip1', so only the data differs
dset3 = f.create_dataset("dset_gzip2", (40,80), dtype='f8', compression='gzip', compression_opts=9)
dset3[...] = 42.0  # writes 42 to every element

In [15]:
f.close()  # close file so we can use h5ls on it (for h5py)

In [16]:
def get_dset_stats(filename, dsetname):
    """Print storage statistics for one dataset, followed by a separator line.

    With USE_H5PY set, the output of "h5ls -v <filename>/<dsetname>" is
    printed.  Otherwise the logical size, chunk layout, allocated size,
    utilization, chunk count and compression filter are read from the
    dataset object and printed.

    Args:
        filename: path of the HDF5 file (or domain on the server).
        dsetname: name of the dataset inside the file.
    """
    if USE_H5PY:
        print("Running h5ls to get dataset info")
        # Pass the arguments as a list (no shell involved) so that paths with
        # spaces or shell metacharacters are handled correctly.  The output is
        # captured and printed so that it shows up in the notebook cell.
        try:
            result = subprocess.run(
                ["h5ls", "-v", f"{filename}/{dsetname}"],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except FileNotFoundError:
            print("h5ls command not found - are the HDF5 command line tools installed?")
        else:
            print(result.stdout, end="")
    else:
        # H5PYD has a num_chunks attribute to tell you how many chunks
        # have been allocated
        # num_chunks is determined asynchronously by the HDF Server. 
        # If num_chunks is 0, wait a few seconds and re-run the cell.
        with h5py.File(filename) as f:
            dset = f[dsetname]
            logical_size = dset.dtype.itemsize
            for dim in dset.shape:
                logical_size *= dim
            print(f"logical size:   {logical_size}")
            if not dset.num_chunks:
                print("No chunks found, if something has been written to this dataset, wait a few seconds and try this again")
            else:
                chunk_size = dset.dtype.itemsize
                for dim in dset.chunks:
                    chunk_size *= dim
                print(f"Chunks: {dset.chunks} {chunk_size} bytes")
                # allocated size is also determined asynchronously, but
                # should be updated if num_chunks is
                allocated_size = dset.allocated_size
                print(f"allocated size: {allocated_size}")
                if allocated_size:
                    # same definition as h5ls: logical bytes / allocated bytes
                    ratio = logical_size / allocated_size
                    ratio *= 100.0
                    print(f"utilization: {ratio:.2f}%")
                else:
                    # avoid dividing by zero if the size has not been reported yet
                    print("utilization: n/a (allocated size not yet reported)")
                print(f"num_chunks: {dset.num_chunks}")
                if dset.compression:
                    print(f"Filter: {dset.compression} OPT: {dset.compression_opts}")
    print("-"*40)  

In [17]:
# compare storage for the uncompressed dataset and the two gzip datasets
get_dset_stats(filename, "dset_nocompression")
get_dset_stats(filename, "dset_gzip1")
get_dset_stats(filename, "dset_gzip2")

Running h5ls to get dataset info
Opened "/home/jovyan/HDFLabTutorial/04a.h5" with sec2 driver.
dset_nocompression       Dataset {40/40, 80/80}
    Location:  1:800
    Links:     1
    Storage:   25600 logical bytes, 25600 allocated bytes, 100.00% utilization
    Type:      native double
----------------------------------------
Running h5ls to get dataset info
Opened "/home/jovyan/HDFLabTutorial/04a.h5" with sec2 driver.
dset_gzip1               Dataset {40/40, 80/80}
    Location:  1:1400
    Links:     1
    Chunks:    {20, 40} 6400 bytes
    Storage:   25600 logical bytes, 24256 allocated bytes, 105.54% utilization
    Filter-0:  deflate-1 OPT {9}
    Type:      native double
----------------------------------------
Running h5ls to get dataset info
Opened "/home/jovyan/HDFLabTutorial/04a.h5" with sec2 driver.
dset_gzip2               Dataset {40/40, 80/80}
    Location:  1:1672
    Links:     1
    Chunks:    {20, 40} 6400 bytes
    Storage:   25600 logical bytes, 152 allocated byte

HDF datasets are tiled into a set of chunks.  This enables reading/writing data from storage to be done more efficiently.

With the HDF5 native library, each chunk is stored in a contiguous section of the file.

With HDF Server, each chunk is stored as a separate S3 object. The chunk shape determines how many chunks will be used.  If not provided in create_dataset, the chunk layout will be determined automatically. 

In [18]:
# create a dataset using the chunks option
f = h5py.File(filename, "a")  # re-open in append mode
dset4 = f.create_dataset("dset_chunks", (40,80), dtype='i1', chunks=(4,8))

In [19]:
# The library will faithfully use the chunk layout provided, but
# the server will take the chunk layout as a hint.
dset4.chunks

(4, 8)

Unlike the HDF5 library, the HDF Server may alter the requested chunk layout so that each chunk is a reasonable size when stored in S3 (typically between 2-4MB).

Try again with a larger dataset.

In [20]:
# same (4,8) chunk request as 'dset_chunks', but for a much larger (4000,8000) dataset
dset5 = f.create_dataset("big_dset", (4000,8000), dtype='i1', chunks=(4,8))

In [21]:
dset5.chunks

(4, 8)

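For HDF Server, note that the "shape" of the chunk is preserved, but scaled up to hit the desired chunk size. (The output shown above was produced with h5py, which keeps the requested (4, 8) layout unchanged.)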

In [22]:
# no actual chunks have been stored since we haven't written anything to the dataset.
# write something to the dataset, this will initialize several chunks
# (with (4,8) chunks, one full row of 8000 elements touches 1000 chunks)
dset5[2000,:] = 42

In [23]:
# close the file before reporting stats (h5ls is run on the closed file when using h5py)
f.close()
get_dset_stats(filename, "dset_chunks")
get_dset_stats(filename, "big_dset")

Running h5ls to get dataset info
Opened "/home/jovyan/HDFLabTutorial/04a.h5" with sec2 driver.
dset_chunks              Dataset {40/40, 80/80}
    Location:  1:57288
    Links:     1
    Chunks:    {4, 8} 32 bytes
    Storage:   3200 logical bytes, 0 allocated bytes
    Type:      native signed char
----------------------------------------
Running h5ls to get dataset info
Opened "/home/jovyan/HDFLabTutorial/04a.h5" with sec2 driver.
big_dset                 Dataset {4000/4000, 8000/8000}
    Location:  1:57736
    Links:     1
    Chunks:    {4, 8} 32 bytes
    Storage:   32000000 logical bytes, 32000 allocated bytes, 100000.00% utilization
    Type:      native signed char
----------------------------------------
