Skip to content

Commit

Permalink
relay: expand range of strings that can be read from hdf5 files (#452)
Browse files Browse the repository at this point in the history
* relay hdf5: add ability to read a wider set of strings from hdf5 files

* relay hdf5: add unit tests with different flavors of hdf5 strings
  • Loading branch information
cyrush committed Oct 15, 2019
1 parent a4ccb3c commit 05a193e
Show file tree
Hide file tree
Showing 4 changed files with 439 additions and 27 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ and this project aspires to adhere to [Semantic Versioning](https://semver.org/s

#### Relay
- Added optional ZFP support to relay, that enables wrapping and unwrapping zfp arrays into conduit Nodes.

- Extended relay HDF5 I/O support to read a wider range of HDF5 string representations including H5T_VARIABLE strings.

### Changed

Expand Down
204 changes: 179 additions & 25 deletions src/libs/relay/conduit_relay_io_hdf5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ join_ref_paths(const std::string &parent, const std::string &child)
// Data Type Helper methods that are a part of public conduit::relay::io
//
// conduit_dtype_to_hdf5_dtype
// conduit_dtype_to_hdf5_dtype_cleanup
// hdf5_dtype_to_conduit_dtype
//-----------------------------------------------------------------------------

Expand All @@ -488,8 +489,46 @@ conduit_dtype_to_hdf5_dtype(const DataType &dt,
const std::string &ref_path)
{
hid_t res = -1;
// first check endianness
if(dt.is_little_endian()) // we know we are little endian

// // This code path enables writing strings in a way that is friendlier
// // to hdf5 command line tools like h5dump and h5ls. However
// // using this path we *cannot* compress that string data, so
// // is currently disabled
//
// if(dt.is_string())
// {
//
// // modify the default hdf5 type to include string length info,
// // so hdf5 tools display the string contents in a human friendly way
//
// // create a copy of the default type
// res = H5Tcopy(H5T_C_S1);
// CONDUIT_CHECK_HDF5_ERROR_WITH_REF_PATH(res,
// ref_path,
// "Failed to copy HDF5 type for string");
//
// // set the size
// CONDUIT_CHECK_HDF5_ERROR_WITH_REF_PATH(
// H5Tset_size(res,
// // string size + null
// dt.number_of_elements()),
// ref_path,
// "Failed to set size in HDF5 string type");
//
// // set term
// CONDUIT_CHECK_HDF5_ERROR_WITH_REF_PATH(
// H5Tset_strpad(res, H5T_STR_NULLTERM),
// ref_path,
// "Failed to set strpad in HDF5 string type");
// }

// strings are special, check for them first
if( dt.is_string() )
{
res = H5T_C_S1;
}
// next check endianness
else if(dt.is_little_endian()) // we know we are little endian
{
switch(dt.id())
{
Expand All @@ -506,8 +545,13 @@ conduit_dtype_to_hdf5_dtype(const DataType &dt,
case DataType::FLOAT32_ID: res = H5T_IEEE_F32LE; break;
case DataType::FLOAT64_ID: res = H5T_IEEE_F64LE; break;

case DataType::CHAR8_STR_ID: res = H5T_C_S1; break;

case DataType::CHAR8_STR_ID:
CONDUIT_HDF5_ERROR(ref_path,
"conduit::DataType to HDF5 Leaf DataType "
<< "Conversion:"
<< dt.to_json()
<< " needs to be handled with string logic");
break;
default:
CONDUIT_HDF5_ERROR(ref_path,
"conduit::DataType to HDF5 Leaf DataType "
Expand All @@ -533,8 +577,13 @@ conduit_dtype_to_hdf5_dtype(const DataType &dt,
case DataType::FLOAT32_ID: res = H5T_IEEE_F32BE; break;
case DataType::FLOAT64_ID: res = H5T_IEEE_F64BE; break;

case DataType::CHAR8_STR_ID: res = H5T_C_S1; break;

case DataType::CHAR8_STR_ID:
CONDUIT_HDF5_ERROR(ref_path,
"conduit::DataType to HDF5 Leaf DataType "
<< "Conversion:"
<< dt.to_json()
<< " needs to be handled with string logic");
break;
default:
CONDUIT_HDF5_ERROR(ref_path,
"conduit::DataType to HDF5 Leaf DataType "
Expand All @@ -543,10 +592,35 @@ conduit_dtype_to_hdf5_dtype(const DataType &dt,
<< " is not a leaf data type");
};
}

return res;
}

//-----------------------------------------------------------------------------
// cleanup conduit created hdf5 dtype
// (effectively a noop, except for the string case)
// TODO: This could be a macro ... ?
//-----------------------------------------------------------------------------
void
conduit_dtype_to_hdf5_dtype_cleanup(hid_t hdf5_dtype_id,
                                    const std::string &ref_path)
{
    // Releases an hid_t obtained from conduit_dtype_to_hdf5_dtype().
    //
    // Only a conduit-created custom string type needs H5Tclose();
    // every other result is a built-in HDF5 type and must NOT be
    // closed, so this is a no-op for them.
    //
    // NOTE: with the current write path (plain H5T_C_S1 plus a data
    // space that encodes the number of elements) the close branch is
    // never reached, since the size-in-type string logic is disabled.

    // non-string types are library-owned: nothing to release
    if( H5Tget_class(hdf5_dtype_id) != H5T_STRING )
    {
        return;
    }

    // the default H5T_C_S1 type is also library-owned
    if( H5Tequal(hdf5_dtype_id, H5T_C_S1) )
    {
        return;
    }

    // custom string type created by conduit_dtype_to_hdf5_dtype:
    // we own it, so close it
    CONDUIT_CHECK_HDF5_ERROR_WITH_REF_PATH(H5Tclose(hdf5_dtype_id),
                                           ref_path,
                                           "Failed to close HDF5 string Type "
                                           << hdf5_dtype_id);
}


//-----------------------------------------------------------------------------
Expand Down Expand Up @@ -679,8 +753,25 @@ hdf5_dtype_to_conduit_dtype(hid_t hdf5_dtype_id,
//-----------------------------------------------
else if(H5Tequal(hdf5_dtype_id,H5T_C_S1))
{
// string as array case (old way of writing)
res = DataType::char8_str(num_elems);
}
// extended string reps
else if( H5Tget_class(hdf5_dtype_id) == H5T_STRING )
{
// for strings of this type, the length
// is encoded in the hdf5 type not the hdf5 data space
index_t hdf5_strlen = H5Tget_size(hdf5_dtype_id);
// check for variable type first
if( H5Tis_variable_str(hdf5_dtype_id) )
{
res = DataType::char8_str(-1);
}
else
{
res = DataType::char8_str(hdf5_strlen);
}
}
//-----------------------------------------------
// Unsupported
//-----------------------------------------------
Expand All @@ -691,9 +782,6 @@ hdf5_dtype_to_conduit_dtype(hid_t hdf5_dtype_id,
<< "Leaf Conversion");
}

// set proper number of elems from what was passed
res.set_number_of_elements(num_elems);

return res;
}

Expand Down Expand Up @@ -777,9 +865,23 @@ check_if_conduit_leaf_is_compatible_with_hdf5_obj(const DataType &dtype,
// we will check the 1d-properties of the hdf5 dataspace
hssize_t h5_test_num_ele = H5Sget_simple_extent_npoints(h5_test_dspace);

// make sure we have the write dtype and the 1d size matches
if( ! ( (H5Tequal(h5_dtype, h5_test_dtype) > 0) &&
(dtype.number_of_elements() == h5_test_num_ele) ) )

// string case is special, check it first

// if the dataset in the file is a custom string type
// check the type's size vs the # of elements
if( ( ! H5Tequal(h5_test_dtype, H5T_C_S1) &&
( H5Tget_class(h5_test_dtype) == H5T_STRING ) &&
( H5Tget_class(h5_dtype) == H5T_STRING ) ) &&
// if not shorted out, we have a string w/ custom type
// check length to see if compat
// note: both hdf5 and conduit dtypes include null term in string size
(dtype.number_of_elements() != (index_t)H5Tget_size(h5_test_dtype) ) )
{
res = false;
}
else if( ! ( (H5Tequal(h5_dtype, h5_test_dtype) > 0) &&
(dtype.number_of_elements() == h5_test_num_ele) ) )
{
res = false;
}
Expand All @@ -789,6 +891,8 @@ check_if_conduit_leaf_is_compatible_with_hdf5_obj(const DataType &dtype,
ref_path,
"Failed to close HDF5 Datatype "
<< h5_test_dtype);
// clean up when necessary
conduit_dtype_to_hdf5_dtype_cleanup(h5_dtype);
}

CONDUIT_CHECK_HDF5_ERROR_WITH_FILE_AND_REF_PATH(H5Sclose(h5_test_dspace),
Expand Down Expand Up @@ -951,8 +1055,7 @@ create_hdf5_dataset_for_conduit_leaf(const DataType &dtype,
hid_t h5_dtype = conduit_dtype_to_hdf5_dtype(dtype,ref_path);

hsize_t num_eles = (hsize_t) dtype.number_of_elements();



hid_t h5_cprops_id = H5P_DEFAULT;


Expand All @@ -970,11 +1073,27 @@ create_hdf5_dataset_for_conduit_leaf(const DataType &dtype,
CONDUIT_CHECK_HDF5_ERROR_WITH_FILE_AND_REF_PATH(h5_cprops_id,
hdf5_group_id,
ref_path,

"Failed to create HDF5 property list");
hid_t h5_dspace_id = -1;

// string a scalar with size embedded in type is disabled
// b/c this path undermines compression
// if(dtype.is_string())
// {
// h5_dspace_id = H5Screate(H5S_SCALAR);
// }
// else
// {
// h5_dspace_id = H5Screate_simple(1,
// &num_eles,
// NULL);
// }

h5_dspace_id = H5Screate_simple(1,
&num_eles,
NULL);

hid_t h5_dspace_id = H5Screate_simple(1,
&num_eles,
NULL);

CONDUIT_CHECK_HDF5_ERROR_WITH_FILE_AND_REF_PATH(h5_dspace_id,
hdf5_group_id,
Expand All @@ -997,6 +1116,9 @@ create_hdf5_dataset_for_conduit_leaf(const DataType &dtype,
<< hdf5_group_id << " "
<< hdf5_dset_name);

// cleanup if custom data type was used
conduit_dtype_to_hdf5_dtype_cleanup(h5_dtype);

// close plist used for compression
if(h5_cprops_id != H5P_DEFAULT)
{
Expand Down Expand Up @@ -1110,6 +1232,7 @@ write_conduit_leaf_to_hdf5_dataset(const Node &node,
"Failed to write to HDF5 Dataset "
<< hdf5_dset_id);

conduit_dtype_to_hdf5_dtype_cleanup(h5_dtype_id);
}

//---------------------------------------------------------------------------//
Expand Down Expand Up @@ -1752,17 +1875,20 @@ read_hdf5_dataset_into_conduit_node(hid_t hdf5_dset_id,
"Error reading HDF5 Datatype: "
<< hdf5_dset_id);



index_t nelems = H5Sget_simple_extent_npoints(h5_dspace_id);

// Note: string case is handed properly in hdf5_dtype_to_conduit_dtype
DataType dt = hdf5_dtype_to_conduit_dtype(h5_dtype_id,
nelems,
ref_path);

// if the endianness of the dset in the file doesn't
// match the current machine we always want to convert it
// on read.

// check endianness
// Note: string cases never land here b/c they are
// created with default endianness
if(!dt.endianness_matches_machine())
{
// if they don't match, modify the dt
Expand All @@ -1775,6 +1901,7 @@ read_hdf5_dataset_into_conduit_node(hid_t hdf5_dset_id,
ref_path,
"Error closing HDF5 Datatype: "
<< h5_dtype_id);

// get ref to standard variant of this dtype
h5_dtype_id = conduit_dtype_to_hdf5_dtype(dt,
ref_path);
Expand All @@ -1784,17 +1911,44 @@ read_hdf5_dataset_into_conduit_node(hid_t hdf5_dset_id,
ref_path,
"Error creating HDF5 Datatype");

// copy this handle, b/c clean up code later will close it
// copy since the logic after read will cleanup
h5_dtype_id = H5Tcopy(h5_dtype_id);
CONDUIT_CHECK_HDF5_ERROR_WITH_FILE_AND_REF_PATH(h5_dtype_id,
hdf5_dset_id,
ref_path,
"Error copying HDF5 Datatype");
// cleanup our ref from conduit_dtype_to_hdf5_dtype if necessary
conduit_dtype_to_hdf5_dtype_cleanup(h5_dtype_id);
}



hid_t h5_status = 0;

if(dest.dtype().is_compact() &&

// check for string special case, H5T_VARIABLE string
if( H5Tis_variable_str(h5_dtype_id) )
{
//special case for reading variable string data
// hdf5 reads the data onto its heap, and
// gives us a pointer to that location

char *read_ptr[1] = {NULL};
h5_status = H5Dread(hdf5_dset_id,
h5_dtype_id,
H5S_ALL,
H5S_ALL,
H5P_DEFAULT,
read_ptr);

// copy the data out to the conduit node
dest.set_string(read_ptr[0]);
}
// check for bad # of elements
else if( dt.number_of_elements() < 0 )
{
CONDUIT_HDF5_ERROR(ref_path,
"Cannot read dataset with # of elements < 0");
}
else if(dest.dtype().is_compact() &&
dest.dtype().compatible(dt) )
{
// we can read directly from hdf5 dataset if compact
Expand Down Expand Up @@ -1822,7 +1976,7 @@ read_hdf5_dataset_into_conduit_node(hid_t hdf5_dset_id,
n_tmp.data_ptr());

// copy out to our dest
dest.set(n_tmp);
dest.set(n_tmp);
}

CONDUIT_CHECK_HDF5_ERROR_WITH_FILE_AND_REF_PATH(h5_status,
Expand Down
19 changes: 18 additions & 1 deletion src/libs/relay/conduit_relay_io_hdf5_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,9 +263,21 @@ void CONDUIT_RELAY_API hdf5_read(hid_t hdf5_id,
///
/// Throughout the relay hdf5 implementation, we use DataType::Empty when
/// the hdf5 data space is H5S_NULL, regardless of what the hdf5 data type is.
/// That isn't reflected in these helper functions,they handle
/// That isn't reflected in these helper functions, they handle
/// mapping of endianness and leaf types other than empty.
///
/// conduit_dtype_to_hdf5_dtype uses default HDF5 datatypes except for
/// the string case. String case result needs to be cleaned up with
/// H5Tclose(). You can use conduit_dtype_to_hdf5_dtype_cleanup() to
/// properly cleanup in all cases.
///
/// You also can detect the custom string type case with:
///
/// if( ! H5Tequal(hdf5_dtype_id, H5T_C_S1) &&
/// ( H5Tget_class(hdf5_dtype_id) == H5T_STRING ) )
/// {
/// // custom string type case
/// }
///
/// Note: In these functions, ref_path is used to provide context about the
/// hdf5 tree when an error occurs. Using it is recommend but not required.
Expand All @@ -275,6 +287,11 @@ void CONDUIT_RELAY_API hdf5_read(hid_t hdf5_id,
hid_t CONDUIT_RELAY_API conduit_dtype_to_hdf5_dtype(const DataType &dt,
const std::string &ref_path="");

//-----------------------------------------------------------------------------
void conduit_dtype_to_hdf5_dtype_cleanup(
hid_t hdf5_dtype_id,
const std::string &ref_path="");

//-----------------------------------------------------------------------------
DataType CONDUIT_RELAY_API hdf5_dtype_to_conduit_dtype(hid_t hdf5_dtype_id,
index_t num_elems,
Expand Down

0 comments on commit 05a193e

Please sign in to comment.