# Accessing HSDS from the HDF5 library

<div class="alert alert-block alert-danger">
This notebook will work only with a running HSDS instance. If you don't have one running, you can start one using the <a href="./05-HSDS.ipynb">HSDS notebook</a>.
</div>

## Introducing the HDF5 REST VOL connector

In our discussion of the Highly Scalable Data Service (HSDS), we showed that by merely swapping Python modules (`import h5pyd as h5py`), without further modifications, we were able to run a Python script against HSDS rather than a file system. This raises the question of whether something similar can be done with the HDF5 library. The answer is yes, and what makes this possible is an extension layer of the HDF5 library, the so-called __[Virtual Object Layer (VOL)](https://github.com/HDFGroup/arch-doc)__. Such extensions, or *VOL connectors*, are implemented as dynamically loadable plugins, and the connector implementing "HSDS connectivity" is the __[REST VOL connector](https://github.com/HDFGroup/vol-rest)__.

### Installing the HDF5 REST VOL connector

As a plugin, the REST VOL connector is not installed by default. It is also only available with HDF5 library versions 1.12 and newer. The library version pre-installed in this Docker container is pre-1.12, so we must install a newer library version and build the REST VOL connector from source (__[GitHub](https://github.com/HDFGroup/vol-rest)__).

<div class="alert alert-block alert-warning">
Cloning and compiling take about three minutes, depending on the machine type.
</div>

The script shown below clones the HDF5 library and REST VOL repositories and builds the binaries using CMake. The headers and binaries are installed in `$HOME/.local`.

In [None]:
%%bash
# Build the HDF5 library and the REST VOL connector from source and install
# both under PREFIX.
#
# Abort on the first failing step: without this, a failed clone or build is
# followed by `cd`/`make` commands that run in the wrong state.
set -e
PREFIX=/home/vscode/.local

# Reuse an existing checkout so that re-running the cell does not fail in `git clone`.
[ -d build/hdf5/.git ] || git clone https://github.com/HDFGroup/hdf5.git build/hdf5
mkdir -p build/hdf5/build
cd build/hdf5/build
# `2>&1 > /dev/null` discards regular output while keeping error output visible in the cell.
cmake -DCMAKE_INSTALL_PREFIX="$PREFIX" -DBUILD_STATIC_LIBS=OFF -DBUILD_TESTING=OFF -DHDF5_BUILD_EXAMPLES=OFF -DHDF5_BUILD_TOOLS=OFF -DHDF5_BUILD_UTILS=OFF ../ 2>&1 > /dev/null
make -j 4 2>&1 > /dev/null
make install 2>&1 > /dev/null
cd ../../..

# The connector is built against the HDF5 installation created above (-H)
# and installed next to it (-P).
[ -d build/rest-vol/.git ] || git clone https://github.com/HDFGroup/vol-rest.git build/rest-vol
cd build/rest-vol
./build_vol_cmake.sh -P "$PREFIX" -H "$PREFIX" -B ./build 2>&1 > /dev/null
cd build && make install 2>&1 > /dev/null

### Modifying the C++ HDF5 example to use the REST VOL connector

By adding five lines and changing a single line of code, we can make our original C++ example "talk" to HSDS instead of the file system. The line numbers below refer to the following cell, counting the `%%writefile` line as line 1.

- We must initialize the REST VOL connector (line 27)
- We must create a file access property list and select the REST VOL connector on it (lines 28, 29), pass it to `H5Fcreate` (line 30), and release it (line 31)
- We terminate the REST VOL connector (line 70)

And that's it!

In [None]:
%%writefile src/ou_restvol.cpp
#include "docstring.hpp"
#include "ou_sampler.hpp"
#include "rest_vol_public.h"
#include "hdf5.h"
#include <iostream>
#include <vector>

using namespace std;

int main()
{
    const size_t path_count = 100, step_count = 1000;
    const double dt = 0.01, theta = 1.0, mu = 0.0, sigma = 0.1;

    cout << "Running with parameters:"
         << " paths=" << path_count << " steps=" << step_count
         << " dt=" << dt << " theta=" << theta << " mu=" << mu << " sigma=" << sigma << endl;

    vector<double> ou_process;
    ou_sampler(ou_process, path_count, step_count, dt, theta, mu, sigma);
    
    //
    // Write the sample paths to an HDF5 file using the HDF5 REST VOL!
    //

    H5rest_init();
    auto fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_rest_vol(fapl);
    auto file = H5Fcreate("/home/vscode/ou_restvol.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
    H5Pclose(fapl);

    add_docstring(file, ".", "source", "https://github.com/HDFGroup/hdf5-tutorial");

    { // create & write the dataset
        hsize_t dimsf[] = {(hsize_t)path_count, (hsize_t)step_count};
        auto space = H5Screate_simple(2, dimsf, NULL);
        auto dataset = H5Dcreate(file, "/dataset", H5T_NATIVE_DOUBLE, space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
        H5Dwrite(dataset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, ou_process.data());
        H5Dclose(dataset);
        H5Sclose(space);
    }

    { // make the file self-describing by adding a few attributes to `dataset`
        add_docstring(file, "dataset", "comment", "This dataset contains sample paths of an Ornstein-Uhlenbeck process.");
        add_docstring(file, "dataset", "Wikipedia", "https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process");
        add_docstring(file, "dataset", "rows", "path");
        add_docstring(file, "dataset", "columns", "time");
        
        auto scalar = H5Screate(H5S_SCALAR);
        auto acpl = H5Pcreate(H5P_ATTRIBUTE_CREATE);
        H5Pset_char_encoding(acpl, H5T_CSET_UTF8);
        
        auto set_attribute = [&](const string& name, const double& value) {
            auto attr = H5Acreate_by_name(file, "dataset", name.c_str(), H5T_NATIVE_DOUBLE, scalar, acpl, H5P_DEFAULT, H5P_DEFAULT);
            H5Awrite(attr, H5T_NATIVE_DOUBLE, &value);
            H5Aclose(attr);
        };
        set_attribute("dt", dt);
        set_attribute("θ", theta);
        set_attribute("μ", mu);
        set_attribute("σ", sigma);

        H5Pclose(acpl);
        H5Sclose(scalar);
    }

    H5Fclose(file);

    H5rest_term();

    return 0;
}

In [None]:
%%bash
# Compile the REST VOL example and run it against HSDS.
# Stop if compilation fails so that a stale ./build/ou_restvol is never run by mistake.
set -e
g++ -I/home/vscode/.local/include -L/home/vscode/.local/lib \
    ./src/ou_restvol.cpp ./src/docstring.cpp ./src/ou_sampler.cpp \
    -o ./build/ou_restvol \
    -lhdf5 -lhdf5_vol_rest -lcurl -ldl -Wl,-rpath=/home/vscode/.local/lib
# HSDS connection settings for the program started below.
export HSDS_USERNAME=vscode
export HSDS_PASSWORD=vscode
export HSDS_ENDPOINT=http://localhost:5101
./build/ou_restvol

In [None]:
%%bash
# List the contents of the domain created by ou_restvol, including attributes.
domain=/home/vscode/ou_restvol.h5
hsls --showattrs "$domain"

## Improving performance with multi-dataset I/O

Sometimes we perform I/O (read or write) against multiple datasets. In the past, there was no API for "vectorizing" such access patterns. Since HDF5 library version 1.14.0, the VOL layer has `H5D[read,write]_multi` calls that support this particular use case. The REST VOL connector implements both calls by taking advantage of __[`libcurl`'s multi interface](https://curl.se/libcurl/c/libcurl-multi.html)__. The example below shows how to use the `H5Dwrite_multi` API by creating request vectors whose length is the number of datasets to be written in one call.

In [None]:
%%writefile src/multi_dataset.cpp
#include "hdf5.h"
#include "rest_vol_public.h"

#include <iostream>
#include <sstream>
#include <vector>

using namespace std;

// Number of datasets transferred by each multi-dataset call.
constexpr size_t NUM_DATASETS = 2;
// Elements per dataset: 100 million integers in total, split evenly across the datasets.
constexpr size_t NUM_INTEGERS = (100 * 1000 * 1000) / NUM_DATASETS;

int main()
{
    H5rest_init();

    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_rest_vol(fapl);
    hid_t file = H5Fcreate("/home/vscode/multi_dataset.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
    H5Pclose(fapl);
    
    vector<int> write_buf(NUM_DATASETS*NUM_INTEGERS), read_buf(NUM_DATASETS*NUM_INTEGERS);
    int *write_data[NUM_DATASETS], *read_data[NUM_DATASETS];
    for (size_t i = 0; i < NUM_DATASETS; ++i)
    {
        write_data[i] = &write_buf[i*NUM_INTEGERS];
        read_data[i] = &read_buf[i*NUM_INTEGERS];
    }

    hid_t dspace_ids[NUM_DATASETS], dset_ids[NUM_DATASETS], type_ids[NUM_DATASETS], sel_space_ids[NUM_DATASETS];
    
    ostringstream dset_name;
    hsize_t dims[] = { NUM_INTEGERS };

    for (size_t i = 0; i < NUM_DATASETS; ++i)
    {
        type_ids[i] = H5T_NATIVE_INT;
        dspace_ids[i] = H5Screate_simple(1, dims, NULL);
        sel_space_ids[i] = H5S_ALL;

        dset_name.str("");
        dset_name << "/dset_" << i;
        dset_ids[i] = H5Dcreate(file, dset_name.str().c_str(), type_ids[i], dspace_ids[i], H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

        for (size_t j = 0; j < NUM_INTEGERS; ++j)
            write_data[i][j] = i * 10000 + j;
    }

    H5Dwrite_multi(NUM_DATASETS, dset_ids, type_ids, sel_space_ids, sel_space_ids, H5P_DEFAULT, (const void**)write_data);

    H5Dread_multi(NUM_DATASETS, dset_ids, type_ids, sel_space_ids, sel_space_ids, H5P_DEFAULT, (void**) read_data);

    for (size_t i = 0; i < read_buf.size(); ++i)
    {
        if (read_buf[i] != write_buf[i])
        {
            cerr << "At read_buf[" << i << "], " << read_buf[i] << " != expected " << write_buf[i] << endl;
            exit(1);
        }
    }
    
    cout << "SUCCESS!" << endl;

    H5Fclose(file);
    H5rest_term();

    return 0;
}

In [None]:
%%bash
# Compile the multi-dataset example, run it against HSDS and list the resulting domain.
# Stop at the first failure so that a stale binary is never run and the
# listing is not shown after a failed run.
set -e
g++ -std=c++17 -Wall -pedantic -I/home/vscode/.local/include -L/home/vscode/.local/lib \
    ./src/multi_dataset.cpp \
    -o ./build/restvol \
    -lhdf5 -lhdf5_vol_rest -lcurl -ldl -Wl,-rpath=/home/vscode/.local/lib
# HSDS connection settings for the commands below.
export HSDS_USERNAME=vscode
export HSDS_PASSWORD=vscode
export HSDS_ENDPOINT=http://localhost:5101
./build/restvol
hsls /home/vscode/multi_dataset.h5