Merge pull request #230 from HDFGroup/kimmy/vfd_doc

Kimmy/vfd doc
HDFGroup · Oct 27, 2021 · ffe2444 · ffe2444
2 parents 2d51203 + d1e794e
commit ffe2444
Show file tree

Hide file tree

Showing 5 changed files with 215 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 build/
+benchmarks/HermesVFD
 
 GPATH
 GRTAGS

diff --git a/benchmarks/HermesVFD/README.md b/benchmarks/HermesVFD/README.md
@@ -0,0 +1,74 @@
+# Demonstrate how to use Hermes with hdf5_iotest
+
+## Build Hermes
+```bash
+git clone https://github.com/HDFGroup/hermes.git
+```
+build with `HERMES_ENABLE_WRAPPER=ON` following the instructions in hermes README file
+
+## Build hdf5 with Hermes VFD
+```bash
+git clone https://github.com/jya-kmu/hdf5.git
+git checkout hermes_vfd
+git checkout 30d224cec2ea21c41ce09083f96885ea4785bbdf .
+```
+build hdf5 with `HDF5_ENABLE_PARALLEL=ON` and `HDF5_ENABLE_HERMES_VFD=ON`
+
+## Build hdf5-iotest
+```bash
+git clone https://github.com/jya-kmu/hdf5-iotest.git
+git checkout hermes-vfd
+```
+We need two config files `hermes.conf` and `hdf5_iotest.ini` to start hdf5-io test with 
+Hermes buffering system. 
+
+To create the `hermes.conf` configuration file for the Hermes environment setup, users can start from
+the provided example (named `hermes.conf_4KB_128KB_example`) and modify it
+to match their system setup and test purposes. For example, users can modify
+`mount_points` and `swap_mount` in the config file according to the system configuration.
+Once the file is ready, place it in the top-level hdf5-iotest directory.
+
+Users also need the file `src/hdf5_iotest.ini` to start hdf5_iotest. For our test we
+only modify the `split` option, which controls whether to split metadata and data (`split = 1`)
+or not (`split = 0`). An example `hdf5_iotest.ini` file, configured not to split metadata
+and data, is also provided in this directory.
+
+If the user chooses not to split metadata and data by setting `split = 0`, the page
+size is the same for metadata and data (if choosing hermes) and can be set by the function call
+`status = H5Pset_fapl_hermes(fapl, false, 1048576)` (@line 176) in file
+`src/hdf5_iotest.c`. The last parameter in the function is the page size, 1MiB in
+this example. Users can modify it for performance tests.
+
+If the user chooses to split metadata and data by setting `split = 1`, they
+can use different VFD methods, or the same VFD method with a different parameter
+configuration, for metadata and data. In the example of hdf5-iotest, the function call
+`status = H5Pset_fapl_hermes(fapl, false, 1048576)` (@line 176) in file
+`src/hdf5_iotest.c` sets the page size for data. By default in this test
+we choose the HDF5 core VFD for metadata via the HDF5 API call
+`status = H5Pset_fapl_core(fapl_m, (size_t)0, 0)` (@line 334), which keeps
+the metadata in memory. As another option, users can choose the Hermes VFD for
+metadata with a smaller
+page size by using `status = H5Pset_fapl_hermes(fapl_m, false, 4096)` (@line 333)
+instead of the core VFD.
+
+We provide the performance results in hdf5-iotest_Hermes_VFD.csv for splitting and 
+not splitting metadata and data, and further using different methods for metadata
+in the splitting case.
+
+## Before the run
+Set up the Hermes environment variable by
+```bash
+export HERMES_CONF=/path/to/hermes.conf
+```
+
+## Run
+Run the test as
+```bash
+GLOG_minloglevel=2 /path/to/hdf5_iotest /path/to/hdf5_iotest.ini
+```
+
+## Run with Darshan
+Darshan output can be generated by running the command
+```bash
+LD_PRELOAD=/path/to/libdarshan.so GLOG_minloglevel=2 /path/to/hdf5_iotest /path/to/hdf5_iotest.ini
+```
diff --git a/benchmarks/HermesVFD/hdf5-iotest_Hermes_VFD.csv b/benchmarks/HermesVFD/hdf5-iotest_Hermes_VFD.csv
@@ -0,0 +1,11 @@
+20 time steps,,,,,,,,,,,,,,
+Baseline,posix,,,,,,,,,,,,,
+,PFS,21024,21024,21024,21024,21024,21024,,,,,,,
+,,,,,,,,,,,,,,
+Hermes,buffer,1MB (not split),"1KB, 1MB","4KB, 1MB","8KB, 1MB","16KB, 1MB","core, 1MB",,128KB (not split),"1KB, 128KB","4KB, 128KB","8KB, 128KB","16KB, 128KB","core, 128KB"
+random,NVMe,873,777,730,707,730,635,,570,731,625,600,586,519
+rr,,875,758,718,712,722,630,,556,716,611,589,585,515
+,,,,,,,,,,,,,,
+Hermes,buffer,1MB (not split),"1KB, 1MB","4KB, 1MB","8KB, 1MB","16KB, 1MB","core, 1MB",,128KB (not split),"1KB, 128KB","4KB, 128KB","8KB, 128KB","16KB, 128KB","core, 128KB"
+random,4 tiers,5374,4472,4054,4191,4627,3625,,3120,3152,3099,3196,3593,2572
+rr,,5325,4482,4083,4160,4621,3615,,3133,3053,3058,3173,3594,2557
diff --git a/benchmarks/HermesVFD/hdf5_iotest.ini b/benchmarks/HermesVFD/hdf5_iotest.ini
@@ -0,0 +1,22 @@
+[DEFAULT]
+version = 0
+steps = 20
+arrays = 500
+rows = 100
+columns = 200
+process-rows = 1
+process-columns = 1
+# [weak, strong]
+scaling = weak
+# align along increment [bytes] boundaries
+alignment-increment = 1
+# minimum object size [bytes] to force alignment (0=all objects)
+alignment-threshold = 0
+# minimum metadata block allocation size [bytes]
+meta-block-size = 2048
+# [posix, core, mpi-io-uni, hermes]
+single-process = hermes
+# 1 indicates to use the split file driver, and 0 indicates not.
+split = 0
+hdf5-file = hdf5_iotest.h5
+csv-file = hdf5_iotest.csv
diff --git a/benchmarks/HermesVFD/hermes.conf_4KB_128KB_example b/benchmarks/HermesVFD/hermes.conf_4KB_128KB_example
@@ -0,0 +1,107 @@
+# Example Hermes configuration file
+
+# TODO(chogan): Allow specifying capacity values in bytes, KiB, or GiB.
+
+# The number of buffering tiers available. For example, RAM, NVMe, burst
+# buffer, and parallel file system would be 4 tiers.
+num_devices = 4;
+# For now this should be the same as num_devices.
+num_targets = 4;
+
+# The maximum buffering capacity in MiB of each device.
+capacities_mb = {2876, 1024, 1536, 2048};
+# The size of the smallest available buffer in KiB. In general this should be
+# the page size of your system for byte addressable storage, and the block size
+# of the storage device for block addressable storage.
+block_sizes_kb = {4, 4, 4, 4};
+# The number of size categories for each device. Here we say that each of our 4
+# devices should have 2 different sizes of buffers.
+num_slabs = {2, 2, 2, 2};
+
+# The number of blocks (the size of which is chosen in block_sizes_kb) that each
+# device should contain for each slab (controlled by num_slabs). This allows for
+# precise control of the distribution of buffer sizes.
+slab_unit_sizes = {
+  {1, 32},
+  {1, 32},
+  {1, 32},
+  {1, 32},
+};
+
+# The percentage of buffering capacity per device to allocate for each slab.
+# Each row should add up to 1.
+desired_slab_percentages = {
+  {0.1, 0.9},
+  {0.1, 0.9},
+  {0.1, 0.9},
+  {0.1, 0.9},
+};
+
+# The maximum theoretical bandwidth (as advertised by the manufacturer) in
+# MiB/sec. of each device.
+bandwidths_mbps = {5000, 300, 150, 70};
+# The latency in microseconds of each device (as advertised by the manufacturer).
+latencies_us = {15, 250000, 500000, 1000000};
+
+# Hermes memory management. The following 4 values should add up to 1.
+# The percentage of Hermes memory to reserve for RAM buffers.
+buffer_pool_arena_percentage = 0.18;
+# The percentage of Hermes memory to reserve for metadata.
+metadata_arena_percentage = 0.71;
+# The percentage of Hermes memory to reserve for data transfers.
+transfer_window_arena_percentage = 0.08;
+# The percentage of Hermes memory to reserve for short term storage.
+transient_arena_percentage = 0.03;
+
+# The maximum number of buckets that can be created.
+max_buckets_per_node = 16;
+# The maximum number of virtual buckets that can be created.
+max_vbuckets_per_node = 8;
+# The interval in milliseconds at which to update the global system view.
+system_view_state_update_interval_ms = 1000;
+
+# The mount point of each device. RAM should be the empty string. For block
+# devices, this is the directory where Hermes will create buffering files. For
+# object storage or cloud targets, this will be a url.
+mount_points = {"", "./", "./", "./"};
+# The mount point of a PFS or object store for swap space, in the event that
+# Hermes buffers become full.
+swap_mount = "./";
+# The number of times the buffer organizer will attempt to place a blob from
+# swap space into the hierarchy before giving up.
+num_buffer_organizer_retries = 3;
+# Base hostname for the RPC servers.
+rpc_server_base_name = "localhost";
+# RPC server name suffix. This is appended to the base name plus host
+# number.
+rpc_server_suffix = "";
+# The RPC protocol. This must come from the documentation of the specific RPC
+# library in use.
+rpc_protocol = "ofi+sockets";
+# RPC domain name for verbs transport. Blank for tcp.
+rpc_domain = "";
+# Desired RPC port number.
+rpc_port = 8080;
+# Desired RPC port number for buffer organizer.
+buffer_organizer_port = 8081;
+# An inclusive range of the first and last server numbers. This is a convenience
+# feature for generating long lists of server names. For example, if your
+# servers are called server-1-40g, server-2-40g, server-3-40g, all the way to
+# server-100-40g, then you would set rpc_server_base_name to 'server',
+# rpc_server_suffix to '-40g', and rpc_host_number_range to {1, 100}.
+# TODO(chogan): Support reading server names from file.
+rpc_host_number_range = {0, 0};
+# The number of handler threads for each RPC server.
+rpc_num_threads = 1;
+# The shared memory prefix for the hermes shared memory segment. A user name
+# will be automatically appended.
+buffer_pool_shmem_name = "/hermes_buffer_pool_";
+
+# Choose Random, RoundRobin, or MinimizeIoTime
+default_placement_policy = "Random";
+#default_placement_policy = "RoundRobin";
+#default_placement_policy = "MinimizeIoTime";
+
+# If true (1) the RoundRobin placement policy algorithm will split each Blob
+# into a random number of smaller Blobs.
+default_rr_split = 0;