diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 602f49e..a5f346d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,6 +1,4 @@ -FROM mcr.microsoft.com/devcontainers/miniconda:0-3 - -RUN conda install -n base -c conda-forge mamba +FROM mcr.microsoft.com/devcontainers/miniconda:1.2.4-3 # Copy environment.yml (if found) to a temp location so we update the environment. Also # copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists. diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e3af3ee..9db5524 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -17,7 +17,7 @@ "settings": { "C_Cpp.default.cppStandard": "c++17", "C_Cpp.default.cStandard": "c99", - "python.defaultInterpreterPath": "/opt/conda/envs/hdf5-tutorial/python", + "python.defaultInterpreterPath": "/opt/conda/envs/hdf5-tutorial/bin/python", "python.languageServer": "Default", "terminal.integrated.shell.linux": "/bin/bash" } diff --git a/07-S3-and-the-Cloud.ipynb b/07-S3-and-the-Cloud.ipynb index d558d5b..adb5bc3 100644 --- a/07-S3-and-the-Cloud.ipynb +++ b/07-S3-and-the-Cloud.ipynb @@ -5,34 +5,74 @@ "id": "b0bf0f0a-b19e-4a50-8a13-820ec7439437", "metadata": {}, "source": [ + "
\n", + "Be sure to select the Python kernel from the hdf5-tutorial conda environment to run this notebook.\n", + "
\n", + "\n", + "---\n", + "\n", "# Reading Data Directly from the Cloud\n", "\n", - "In the previous notebook, we showed how the library can use a VOL connector to access files stored in a cloud backend through HSDS. But this is not the only way to use HDF5 in the cloud. HDF5 files stored in an S3 bucket can be accessed directly from the library without any external connector or drivers. This is possible through the Read-Only S3 VFD (ROS3 VFD). \n", + "We showed in the previous notebook how the library can use a VOL connector to access files stored in a cloud backend through HSDS. But this is not the only way to use HDF5 in the cloud. HDF5 files stored in an S3 bucket can be accessed directly from the library without any external connector or middleware. This is possible through the Read-Only S3 (ROS3) virtual file driver (VFD).\n", "\n", - "## Enabling the ROS3 VFD\n", + "## Where is ROS3 VFD?\n", "\n", - "The ROS3 VFD is packaged with the library, and only needs to be enabled during the build process. When using CMake to build HDF5, this is done by setting the variable `HDF5_ENABLE_ROS3_VFD` to `ON`.\n", + "The ROS3 VFD is part of the HDF5 library and only needs to be enabled during the build process. Or to install the library package from a repository with this VFD already enabled.\n", "\n", - "
\n", - "The cloning and compilation takes, depending on the machine type, about three minutes.\n", - "
" + "When building HDF5 yourself, this is done by setting the variable `HDF5_ENABLE_ROS3_VFD` to `ON`. The example below works in this notebook:\n", + "\n", + "```bash\n", + "git clone https://github.com/HDFGroup/hdf5.git build/hdf5\n", + "mkdir -p build/hdf5/build\n", + "cd build/hdf5/build\n", + "cmake -DCMAKE_INSTALL_PREFIX=/home/vscode/.local \\\n", + " -DHDF5_ENABLE_ROS3_VFD=ON \\\n", + " -DBUILD_STATIC_LIBS=OFF \\\n", + " -DBUILD_TESTING=OFF \\\n", + " -DHDF5_BUILD_EXAMPLES=OFF \\\n", + " -DHDF5_BUILD_TOOLS=ON \\\n", + " -DHDF5_BUILD_UTILS=OFF \\\n", + " ../ 2>&1 > /dev/null\n", + "make -j 4 2>&1 > /dev/null\n", + "make install 2>&1 > /dev/null\n", + "cd ../../..\n", + "```\n", + "\n", + "
\n", + "The cloning and compilation takes about three minutes, depending on the computer type.\n", + "
\n", + "\n", + "We are going to use the `hdf5` package from the conda-forge repostitory. It is available in the `hdf5-tutorial` conda virtual environment on this machine." ] }, { "cell_type": "code", "execution_count": null, - "id": "1a7406c3-96f8-4a65-8cdc-69090897543a", + "id": "8a0d9aa5", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "git clone https://github.com/HDFGroup/hdf5.git build/hdf5\n", - "mkdir -p build/hdf5/build\n", - "cd build/hdf5/build\n", - "cmake -DCMAKE_INSTALL_PREFIX=/home/vscode/.local -DHDF5_ENABLE_ROS3_VFD=ON -DBUILD_STATIC_LIBS=OFF -DBUILD_TESTING=OFF -DHDF5_BUILD_EXAMPLES=OFF -DHDF5_BUILD_TOOLS=ON -DHDF5_BUILD_UTILS=OFF ../ 2>&1 > /dev/null\n", - "make -j 4 2>&1 > /dev/null\n", - "make install 2>&1 > /dev/null\n", - "cd ../../.." + "mamba list | grep -w \"hdf5\"" + ] + }, + { + "cell_type": "markdown", + "id": "90a927b1", + "metadata": {}, + "source": [ + "To verify this package comes with ROS3 VFD enabled:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "275f74d8", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "grep \"S3 VFD\" ${CONDA_PREFIX}/lib/libhdf5.settings" ] }, { @@ -40,7 +80,11 @@ "id": "defa2db2-6da3-4057-ba43-5b5f904cdc9d", "metadata": {}, "source": [ - "For this demonstration, we will use a copy of the `ou_process.h5` file that has been uploaded to an S3 bucket. We can use the command line tool `h5dump` to examine this file and see that it is the same file we're familiar with." + "## With Command-Line Tools\n", + "\n", + "For this demonstration, we will use a copy of the `ou_process.h5` file that has been uploaded to an S3 bucket. 
We can use the command line tool `h5dump` to examine this file and see that it is the same file we're familiar with.\n", + "\n", + "First we will use the S3 URI of the file as this is a more compact and usual way to reference S3 objects:" ] }, { @@ -51,7 +95,55 @@ "outputs": [], "source": [ "%%bash\n", - "/home/vscode/.local/bin/h5dump -pBH --vfd-name ros3 --s3-cred=\"(,,)\" https://s3.us-east-2.amazonaws.com/docs.hdfgroup.org/hdf5/h5/ou_process.h5" + "AWS_REGION=us-west-2 h5dump -pBH s3://hdf5.sample/data/hdf5-tutorial/ou_process.h5" ] }, { "cell_type": "markdown", "id": "d745b78e", "metadata": {}, "source": [ "The file's full URL can be used, too:" ] }, { "cell_type": "code", "execution_count": null, "id": "9224a44c", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "AWS_REGION=us-west-2 h5dump -Ap --vfd-name ros3 https://s3.us-west-2.amazonaws.com/hdf5.sample/data/hdf5-tutorial/ou_process.h5" ] }, { "cell_type": "markdown", "id": "506647c6", "metadata": {}, "source": [ "Try with `h5ls` now:" ] }, { "cell_type": "code", "execution_count": null, "id": "2b445f4f", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "AWS_REGION=us-west-2 h5ls -r s3://hdf5.sample/data/hdf5-tutorial/ou_process.h5" ] }, { "cell_type": "markdown", "id": "05110b4f", "metadata": {}, "source": [ "Note how in the second case the ROS3 driver had to be specified due to the generic `https` URL of the file.\n", "\n", "The ROS3 driver supports all the documented methods of supplying AWS S3 connection configuration. We set `AWS_REGION` on the command-line because it is the easiest in this case and the recommended way when the S3 object's AWS region differs from those in the AWS configuration file." ] }, { @@ -59,6 +151,8 @@ "id": "4d8d3599-4f70-41d4-aff4-05842fd14efd", "metadata": {}, "source": [ + "## From a C Program\n", + "\n", "Now we will use the ROS3 VFD to read the file. 
Similarly to other VFDs, the ROS3 VFD is enabled at file access time with a File Access Property List (FAPL) modified via `H5Pset_fapl_ros3()`. This function also requires some S3-specific parameters in the form of the `H5FD_ros3_fapl_t` structure. Because this file is publicly accessible without any authentication, all we need to provide is the region it is located in." ] }, @@ -83,49 +177,57 @@ "\n", "int main(void)\n", "{\n", - "H5FD_ros3_fapl_t s3_params;\n", - "\n", - "// Set the AWS region and the S3 bucket name\n", - "strcpy(s3_params.aws_region, \"us-east-2\");\n", - "string file_uri = \"https://s3.us-east-2.amazonaws.com/docs.hdfgroup.org/hdf5/h5/ou_process.h5\";\n", - "// The version of this ROS3 FAPL parameter structure\n", - "s3_params.version = 1;\n", - "// This file permits anonymous access, so authentication is not needed\n", - "// secret_id and secret_key auth fields that would be required for protected S3 buckets\n", - "s3_params.authenticate = 0; \n", - "\n", - "// Open and read from the file\n", - "auto fapl = H5Pcreate(H5P_FILE_ACCESS);\n", - "H5Pset_fapl_ros3(fapl, &s3_params);\n", - "auto file = H5Fopen(file_uri.c_str(), H5F_ACC_RDONLY, fapl);\n", - "auto dset = H5Dopen(file, \"dataset\", H5P_DEFAULT);\n", - "\n", - "// Get the dataspace details from the file\n", - "auto dspace = H5Dget_space(dset);\n", - "assert(H5Sget_simple_extent_ndims(dspace) == 2);\n", - "array dims;\n", - "H5Sget_simple_extent_dims(dspace, dims.data(), NULL);\n", - "vector read_buffer(dims[0] * dims[1]);\n", - "\n", - "H5Dread(dset, H5T_IEEE_F64LE, H5S_ALL, H5S_ALL, H5P_DEFAULT, (void*) read_buffer.data());\n", - "\n", - "for (size_t i = 0; i < 10; i++) {\n", - " size_t row_idx = i * dims[0] / 10;\n", - " size_t col_idx = i * dims[1] / 10;\n", - " auto element = read_buffer[row_idx * dims[1] + col_idx];\n", - " cout << \"Element at index (\" << row_idx << \", \" << col_idx << \") = \"\n", - " << std::setprecision(16) << element << endl;\n", - "}\n", - "\n", - 
"H5Dclose(dset);\n", - "H5Sclose(dspace);\n", - "H5Pclose(fapl);\n", - "H5Fclose(file);\n", - "\n", - "return 0;\n", + " H5FD_ros3_fapl_t s3_params;\n", + "\n", + " // Set the AWS region and the S3 bucket name\n", + " strcpy(s3_params.aws_region, \"us-west-2\");\n", + " string file_uri = \"s3://hdf5.sample/data/hdf5-tutorial/ou_process.h5\";\n", + " // The version of this ROS3 FAPL parameter structure\n", + " s3_params.version = 1;\n", + " // This file permits anonymous access, so authentication is not needed\n", + " // secret_id and secret_key auth fields that would be required for protected S3 buckets\n", + " s3_params.authenticate = 0;\n", + "\n", + " // Open and read from the file\n", + " auto fapl = H5Pcreate(H5P_FILE_ACCESS);\n", + " H5Pset_fapl_ros3(fapl, &s3_params);\n", + " auto file = H5Fopen(file_uri.c_str(), H5F_ACC_RDONLY, fapl);\n", + " auto dset = H5Dopen(file, \"dataset\", H5P_DEFAULT);\n", + "\n", + " // Get the dataspace details from the file\n", + " auto dspace = H5Dget_space(dset);\n", + " assert(H5Sget_simple_extent_ndims(dspace) == 2);\n", + " array dims;\n", + " H5Sget_simple_extent_dims(dspace, dims.data(), NULL);\n", + " vector read_buffer(dims[0] * dims[1]);\n", + "\n", + " H5Dread(dset, H5T_IEEE_F64LE, H5S_ALL, H5S_ALL, H5P_DEFAULT, (void*) read_buffer.data());\n", + "\n", + " for (size_t i = 0; i < 10; i++) {\n", + " size_t row_idx = i * dims[0] / 10;\n", + " size_t col_idx = i * dims[1] / 10;\n", + " auto element = read_buffer[row_idx * dims[1] + col_idx];\n", + " cout << \"Element at index (\" << row_idx << \", \" << col_idx << \") = \"\n", + " << std::setprecision(16) << element << endl;\n", + " }\n", + "\n", + " H5Dclose(dset);\n", + " H5Sclose(dspace);\n", + " H5Pclose(fapl);\n", + " H5Fclose(file);\n", + "\n", + " return 0;\n", "}" ] }, + { + "cell_type": "markdown", + "id": "aaf39bd8", + "metadata": {}, + "source": [ + "Compile the above program to create executable `./build/ou_ros3`. 
We are using the conda environment's HDF5 library, hence the `$CONDA_PREFIX` shell variable which points to the top folder of the environment." ] }, { "cell_type": "code", "execution_count": null, @@ -134,29 +236,39 @@ "outputs": [], "source": [ "%%bash\n", - "g++ -Wall -pedantic -I/home/vscode/.local/include -L/home/vscode/.local/lib ./src/ou_ros3.c -o ./build/ou_ros3 -lhdf5 -Wl,-rpath=/home/vscode/.local/lib\n", - "./build/ou_ros3" + "g++ ./src/ou_ros3.c -o ./build/ou_ros3 \\\n", + "    -Wall -pedantic -Wl,-rpath,$CONDA_PREFIX/lib \\\n", + "    -I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib \\\n", + "    -lhdf5" ] }, { "cell_type": "markdown", - "id": "e0351df0-5336-416d-86e3-c40ee8a1b1c6", + "id": "a457878b", "metadata": {}, "source": [ - "The ROS3 VFD is also transparently accessible through h5py! (remember Tutorial 04?) The default installation of h5py does not have th ROS3 VFD enabled. To enable it, we build h5py from source against an HDF5 library that already has the ROS3 VFD enabled. Then, we can access our file in the cloud by providing the same ROS3 arguments as before to `h5py.File()`." + "Run the executable to read data from the file in S3:" ] }, { "cell_type": "code", "execution_count": null, - "id": "f8da438a-7df4-4860-90af-e031f86fed7e", + "id": "8f36235c", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "git clone https://github.com/h5py/h5py\n", - "cd h5py\n", - "HDF5_DIR=/home/vscode/.local pip install .\n" + "./build/ou_ros3" ] }, { "cell_type": "markdown", "id": "e0351df0-5336-416d-86e3-c40ee8a1b1c6", "metadata": {}, "source": [ "## From a Python Script\n", "\n", "The ROS3 VFD is also transparently accessible through h5py! (Remember Tutorial 04?) The h5py packages from PyPI (usually installed via `pip` or `uv`) bundle the HDF5 library without the ROS3 VFD enabled. This is why we opted here for the library from conda-forge. 
Another option is to build h5py from source with your local library built with ROS3, which is explained in its [documentation](https://docs.h5py.org/en/stable/build.html#source-installation)." ] }, { @@ -168,25 +280,23 @@ "source": [ "import h5py\n", "\n", - "dims = [100, 1000]\n", "\n", - "s3_uri = \"https://s3.us-east-2.amazonaws.com/docs.hdfgroup.org/hdf5/h5/ou_process.h5\"\n", + "s3_uri = \"s3://hdf5.sample/data/hdf5-tutorial/ou_process.h5\"\n", "kwargs = {}\n", - "kwargs['mode'] = 'r'\n", - "kwargs['driver'] = \"ros3\"\n", - "kwargs['aws_region'] = (\"us-east-2\").encode(\"utf-8\")\n", - "# kwargs['authenticate'] = 0\n", - "\n", - "f = h5py.File(s3_uri, **kwargs)\n", + "kwargs[\"mode\"] = \"r\"\n", + "kwargs[\"driver\"] = \"ros3\"\n", + "kwargs[\"aws_region\"] = (\"us-west-2\").encode(\"utf-8\")\n", "\n", - "dset = f[\"dataset\"]\n", + "with h5py.File(s3_uri, **kwargs) as f:\n", + "    dset = f[\"dataset\"]\n", + "    dims = dset.shape\n", "\n", - "data = dset[:]\n", + "    data = dset[:]\n", "\n", - "for i in range(10):\n", - "    row_idx = int(i * dims[0] / 10)\n", - "    col_idx = int(i * dims[1] / 10)\n", - "    print(f\"dataset[{row_idx}, {col_idx}] = {data[row_idx, col_idx]}\")\n" + "    for i in range(10):\n", + "        row_idx = int(i * dims[0] / 10)\n", + "        col_idx = int(i * dims[1] / 10)\n", + "        print(f\"dataset[{row_idx}, {col_idx}] = {data[row_idx, col_idx]}\")\n" ] }, { @@ -194,15 +304,48 @@ "id": "1c6994af-1120-4870-9c5d-c2e7c4bf0df8", "metadata": {}, "source": [ - "## Cloud-Optimized\n", + "# Cloud-Optimized HDF5 Files\n", + "\n", + "While it is possible to read HDF5 files directly from cloud object stores, as demoed here, reading from larger files introduces significant inefficiencies. The access patterns of the library were optimized for filesystem I/O, not cloud access and the delays it brings. 
These concerns motivated the concept of cloud-optimized HDF5 files.\n", + "\n", + "A cloud-optimized HDF5 file is a regular HDF5 file that has been restructured internally so that its internal metadata and data layout work well with how cloud storage actually behaves. On a local disk, reading small pieces of internal metadata from anywhere in the file is essentially free — the operating system can seek around in microseconds. On a cloud object store, every one of those small reads becomes an HTTP request with tens to hundreds of milliseconds of latency. A standard HDF5 file may scatter its internal metadata (the descriptors that tell a reader where the actual data lives) across many locations throughout the file, which means opening and navigating the file can trigger dozens or even hundreds of tiny HTTP range requests before a single byte of actual data is read.\n", + "\n", + "Cloud optimization addresses this by consolidating internal metadata into a contiguous block near the start of the file, choosing sensible chunk sizes (typically 1–8 MB) so each data read is large enough to amortize request latency, and avoiding free-space fragmentation in the file. The result is a file that still conforms to the HDF5 file format specification and can be read by any HDF5 tool, but from which a cloud-aware reader can extract needed data in just a couple of requests or fewer.\n", + "\n", + "The `h5stat` command can check whether a file is cloud optimized. For our file:" ] }, { "cell_type": "code", "execution_count": null, "id": "1ebd9a96", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "AWS_REGION=us-west-2 h5stat -S s3://hdf5.sample/data/hdf5-tutorial/ou_process.h5 | grep \"File space\"" ] }, { "cell_type": "markdown", "id": "da885f09", "metadata": {}, "source": [ "If this were a cloud-optimized file, the first line would show `H5F_FSPACE_STRATEGY_PAGE` and the second would display the file page size, typically 4-8 MB. 
Since our file is very small, there is no benefit in making it cloud optimized. The default configuration of the ROS3 driver since HDF5 v2.0 already maximizes the performance.\n", + "\n", + "One method of cloud optimizing files, besides creating them as such, is the `h5repack` command. It can rewrite a file with a new chunk layout and enable paged file-space management, which is the feature responsible for consolidating internal metadata together. Example:\n", + "\n", + "```bash\n", + "h5repack -S PAGE -G 4194304 -l CHUNK=1024x1024 -f GZIP=4 mydata.h5 mydata_cloud.h5\n", + "```\n", "\n", - "While it is possible to read HDF5 files in their native format directly from the Cloud, this introduces some inefficiencies. The access patterns of the library were optimized for filesystem I/O, not cloud access and the delays it brings. These concerns motivated the development of the [HDF5 Cloud-Optimized Read-Only Library (H5Coro)](https://github.com/ICESat2-SlideRule/h5coro). H5Coro a pure Python implementation of a subset of the HDF5 specification that has been optimized for reading data out of S3. For large files, H5Coro can speed up access times by [two orders of magnitude](https://www.hdfgroup.org/wp-content/uploads/2021/05/JPSwinski_H5Coro.pdf). This is achieved primarily through using a larger cache to avoid repeated small reads to the cloud." + "The `-S PAGE` flag selects paged aggregation so metadata and small objects are grouped into fixed-size pages, `-G` sets the page size in bytes, and `-l CHUNK=...` reshapes the datasets so each chunk is a worthwhile unit to fetch over the network. After repacking, the new file behaves identically to the original for any HDF5-compatible reader, but cloud-aware software can open and read data from it with a small, predictable number of HTTP requests." 
] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "hdf5-tutorial", "language": "python", "name": "python3" }, @@ -216,7 +359,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.14.4" } }, "nbformat": 4, diff --git a/environment.yml b/environment.yml index 36842e3..f95e4e2 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: hdf5-tutorial channels: - conda-forge dependencies: - - python ==3.10 + - python - jupyter - h5py - hdf5 diff --git a/src/ou_ros3.c b/src/ou_ros3.c deleted file mode 100644 index 68ca9d2..0000000 --- a/src/ou_ros3.c +++ /dev/null @@ -1,54 +0,0 @@ -#include "hdf5.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -int main(void) -{ -H5FD_ros3_fapl_t s3_params; - -// Set the AWS region and the S3 bucket name -strcpy(s3_params.aws_region, "us-east-2"); -string file_uri = "https://s3.us-east-2.amazonaws.com/docs.hdfgroup.org/hdf5/h5/ou_process.h5"; -// The version of this ROS3 FAPL parameter structure -s3_params.version = 1; -// This file permits anonymous access, so authentication is not needed -// secret_id and secret_key auth fields that would be required for protected S3 buckets -s3_params.authenticate = 0; - -// Open and read from the file -auto fapl = H5Pcreate(H5P_FILE_ACCESS); -H5Pset_fapl_ros3(fapl, &s3_params); -auto file = H5Fopen(file_uri.c_str(), H5F_ACC_RDONLY, fapl); -auto dset = H5Dopen(file, "dataset", H5P_DEFAULT); - -// Get the dataspace details from the file -auto dspace = H5Dget_space(dset); -assert(H5Sget_simple_extent_ndims(dspace) == 2); -array dims; -H5Sget_simple_extent_dims(dspace, dims.data(), NULL); -vector read_buffer(dims[0] * dims[1]); - -H5Dread(dset, H5T_IEEE_F64LE, H5S_ALL, H5S_ALL, H5P_DEFAULT, (void*) read_buffer.data()); - -for (size_t i = 0; i < 10; i++) { - size_t row_idx = i * dims[0] / 10; - size_t col_idx = i * dims[1] / 10; 
- auto element = read_buffer[row_idx * dims[1] + col_idx]; - cout << "Element at index (" << row_idx << ", " << col_idx << ") = " - << std::setprecision(16) << element << endl; -} - -H5Dclose(dset); -H5Sclose(dspace); -H5Pclose(fapl); -H5Fclose(file); - -return 0; -}