From 888ae4d3e23108135f362278311b3b605b77a181 Mon Sep 17 00:00:00 2001 From: Cyrus Harrison Date: Fri, 18 Dec 2020 15:47:37 -0800 Subject: [PATCH 1/2] add Node::describe() --- src/libs/conduit/conduit_data_array.cpp | 204 ++++++++++++++++++++++++ src/libs/conduit/conduit_data_array.hpp | 19 ++- src/libs/conduit/conduit_node.cpp | 139 ++++++++++++++++ src/libs/conduit/conduit_node.hpp | 8 + src/tests/conduit/t_conduit_array.cpp | 54 +++++++ src/tests/conduit/t_conduit_node.cpp | 57 +++++++ 6 files changed, 479 insertions(+), 2 deletions(-) diff --git a/src/libs/conduit/conduit_data_array.cpp b/src/libs/conduit/conduit_data_array.cpp index 4cc9c7b2b..da34a978f 100644 --- a/src/libs/conduit/conduit_data_array.cpp +++ b/src/libs/conduit/conduit_data_array.cpp @@ -279,6 +279,81 @@ DataArray::diff_compatible(const DataArray &array, Node &info, const float return res; } +//---------------------------------------------------------------------------// +/// +/// Summary Stats Helpers +/// +//---------------------------------------------------------------------------// + +//---------------------------------------------------------------------------// +template +T +DataArray::min() const +{ + T res = std::numeric_limits::max(); + for(index_t i = 0; i < number_of_elements(); i++) + { + const T &val = element(i); + if(val < res) + { + res = val; + } + } + + return res; +} + +//---------------------------------------------------------------------------// +template +T +DataArray::max() const +{ + T res = std::numeric_limits::min(); + for(index_t i = 0; i < number_of_elements(); i++) + { + const T &val = element(i); + if(val > res) + { + res = val; + } + } + + return res; +} + + +//---------------------------------------------------------------------------// +template +T +DataArray::sum() const +{ + T res =0; + for(index_t i = 0; i < number_of_elements(); i++) + { + const T &val = element(i); + res += val; + } + + return res; +} + 
+//---------------------------------------------------------------------------// +template +float64 +DataArray::mean() const +{ + float64 res =0; + for(index_t i = 0; i < number_of_elements(); i++) + { + const T &val = element(i); + res += val; + } + + res = res / float64(number_of_elements()); + return res; +} + + //---------------------------------------------------------------------------// template std::string @@ -1436,6 +1511,135 @@ DataArray::compact_elements_to(uint8 *data) const } +//---------------------------------------------------------------------------// +template +std::string +DataArray::to_summary_string_default() const +{ + return to_summary_string(); +} + +//---------------------------------------------------------------------------// +template +std::string +DataArray::to_summary_string(index_t threshold) const +{ + std::ostringstream oss; + to_summary_string_stream(oss, threshold); + return oss.str(); +} + +//---------------------------------------------------------------------------// +template +void +DataArray::to_summary_string_stream(std::ostream &os, + index_t threshold) const +{ + // if we are less than or equal to threshold, we use to_yaml + index_t nele = number_of_elements(); + + if(nele <= threshold) + { + to_yaml_stream(os); + } + else + { + // if above threshold only show threshold # of values + int half = threshold / 2; + int bottom = half; + int top = half; + + // + // if odd, show 1/2 +1 first + // + + if( (threshold % 2) > 0) + { + bottom++; + } + + if(nele > 1) + os << "["; + + bool first = true; + bool done = false; + int idx = 0; + + while(!done) + { + if(!first) + os << ", "; + + switch(m_dtype.id()) + { + // ints + case DataType::INT8_ID: + case DataType::INT16_ID: + case DataType::INT32_ID: + case DataType::INT64_ID: + { + os << (int64) element(idx); + break; + } + // uints + case DataType::UINT8_ID: + case DataType::UINT16_ID: + case DataType::UINT32_ID: + case DataType::UINT64_ID: + { + os << (uint64) element(idx); + 
break; + } + // floats + case DataType::FLOAT32_ID: + case DataType::FLOAT64_ID: + { + std::string fs = utils::float64_to_string((float64)element(idx)); + //check for inf and nan + // looking for 'n' covers inf and nan + bool inf_or_nan = fs.find('n') != std::string::npos; + + if(inf_or_nan) + os << "\""; + + os << fs; + + if(inf_or_nan) + os << "\""; + break; + } + default: + { + CONDUIT_ERROR("Leaf type \"" + << m_dtype.name() + << "\"" + << "is not supported in conduit::DataArray.") + } + } + + first=false; + + idx++; + + if(idx == bottom) + { + idx = nele - top; + os << ", ..."; + } + + if(idx == nele) + { + done = true; + } + } + + if(nele > 1) + os << "]"; + } +} + + + //----------------------------------------------------------------------------- // // -- conduit::DataArray explicit instantiations for supported array types -- diff --git a/src/libs/conduit/conduit_data_array.hpp b/src/libs/conduit/conduit_data_array.hpp index 5a240ec26..170165fb2 100644 --- a/src/libs/conduit/conduit_data_array.hpp +++ b/src/libs/conduit/conduit_data_array.hpp @@ -103,6 +103,14 @@ class CONDUIT_API DataArray Node &info, const float64 epsilon = CONDUIT_EPSILON) const; + /// + /// Summary Stats Helpers + /// + T min() const; + T max() const; + T sum() const; + float64 mean() const; + //----------------------------------------------------------------------------- // Setters //----------------------------------------------------------------------------- @@ -207,7 +215,7 @@ class CONDUIT_API DataArray #endif #ifndef CONDUIT_USE_DOUBLE - void setconst std::initializer_list &values); + void set(const std::initializer_list &values); #endif //------------------------------------------------------------------------- @@ -311,7 +319,14 @@ class CONDUIT_API DataArray void to_yaml_stream(std::ostream &os) const; void compact_elements_to(uint8 *data) const; - + + /// Creates a string representation for printing that limits + /// the number of elements shown to a max number + std::string
to_summary_string_default() const; + std::string to_summary_string(index_t threshold=5) const; + void to_summary_string_stream(std::ostream &os, + index_t threshold=5) const; + //----------------------------------------------------------------------------- // -- stdout print methods --- //----------------------------------------------------------------------------- diff --git a/src/libs/conduit/conduit_node.cpp b/src/libs/conduit/conduit_node.cpp index e625d77b5..2b99bbfe3 100644 --- a/src/libs/conduit/conduit_node.cpp +++ b/src/libs/conduit/conduit_node.cpp @@ -12700,6 +12700,145 @@ Node::to_pure_yaml(std::ostream &os, // //----------------------------------------------------------------------------- +//---------------------------------------------------------------------------// +void +Node::describe(Node &res) const +{ + Node opts; + describe(opts,res); +} + +//---------------------------------------------------------------------------// +void +Node::describe(const Node &opts, Node &res) const +{ + res.reset(); + index_t dtype_id = dtype().id(); + if(dtype_id == DataType::OBJECT_ID) + { + NodeConstIterator itr = children(); + while(itr.has_next()) + { + const Node &cld = itr.next(); + std::string cld_name = itr.name(); + Node &cld_des = res[cld_name]; + cld.describe(opts,cld_des); + } + } + else if(dtype_id == DataType::LIST_ID) + { + NodeConstIterator itr = children(); + while(itr.has_next()) + { + const Node &cld = itr.next(); + Node &cld_des = res.append(); + cld.describe(opts,cld_des); + } + } + else // leaves! + { + index_t thresh = 5; + + if(opts.has_child("threshold")) + { + thresh = (index_t) opts["threshold"].to_int(); + } + + res["dtype"] = DataType::id_to_name(dtype_id); + // TODO: + // Should we use `number_of_elements` instead of `count` + // I think so, but count also matches r and pandas ... 
+ // + res["count"] = dtype().number_of_elements(); + + if(dtype().is_int8()) + { + int8_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_int16()) + { + int16_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_int32()) + { + int32_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_int64()) + { + int64_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_uint8()) + { + uint8_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_uint16()) + { + uint16_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_uint32()) + { + uint32_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_uint64()) + { + uint64_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_float32()) + { + float32_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + 
} + else if(dtype().is_float64()) + { + float64_array t_array = value(); + res["mean"] = t_array.mean(); + res["min"] = t_array.min(); + res["max"] = t_array.max(); + res["values"] = t_array.to_summary_string(thresh); + } + else if(dtype().is_char8_str()) + { + res["values"].set_external(*this); + } + } +} + + // NOTE: several other Node information methods are inlined in Node.h //---------------------------------------------------------------------------// diff --git a/src/libs/conduit/conduit_node.hpp b/src/libs/conduit/conduit_node.hpp index 3f1d2a758..8308d69bc 100644 --- a/src/libs/conduit/conduit_node.hpp +++ b/src/libs/conduit/conduit_node.hpp @@ -3708,10 +3708,18 @@ class CONDUIT_API Node /// info() creates a node that contains metadata about the current /// node's memory properties void info(Node &nres) const; + /// TODO: this is inefficient w/o move semantics, but is very /// convenient for testing and example programs. Node info() const; + /// + /// describe() creates a node that replaces each leaf with + /// descriptive statistics (count, mean, min, max) and a truncated string + /// summary of its values + void describe(Node &nres) const; + void describe(const Node &opts, Node &nres) const; + //----------------------------------------------------------------------------- // -- stdout print methods --- //----------------------------------------------------------------------------- diff --git a/src/tests/conduit/t_conduit_array.cpp b/src/tests/conduit/t_conduit_array.cpp index a69c3ecb7..cda59337f 100644 --- a/src/tests/conduit/t_conduit_array.cpp +++ b/src/tests/conduit/t_conduit_array.cpp @@ -609,6 +609,60 @@ TEST(conduit_array, print_bells_and_whistles) #ifdef CONDUIT_USE_CXX11 //----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +TEST(conduit_array, summary_stats) +{ + std::vector v_int64(3,-64); + std::vector v_uint64(3,64); + std::vector
v_float64(3,64.0); + + int64_array va_int64(&v_int64[0],DataType::int64(3)); + uint64_array va_uint64(&v_uint64[0],DataType::uint64(3)); + float64_array va_float64(&v_float64[0],DataType::float64(3)); + + va_int64.set({-1,0,1}); + va_uint64.set({1,2,3}); + va_float64.set({-1.0,0.0,1.0}); + + EXPECT_EQ(va_int64.min(),-1); + EXPECT_EQ(va_int64.max(),1); + EXPECT_EQ(va_int64.mean(),0); + EXPECT_EQ(va_int64.sum(),0); + + EXPECT_EQ(va_uint64.min(),1); + EXPECT_EQ(va_uint64.max(),3); + EXPECT_EQ(va_uint64.mean(),2); + EXPECT_EQ(va_uint64.sum(),6); + + EXPECT_EQ(va_float64.min(),-1.0); + EXPECT_EQ(va_float64.max(),1.0); + EXPECT_EQ(va_float64.mean(),0.0); + EXPECT_EQ(va_float64.sum(),0.0); + +} +//----------------------------------------------------------------------------- +TEST(conduit_array, summary_print) +{ + std::vector v_int64(5,-64); + int64_array va_int64(&v_int64[0],DataType::int64(5)); + + va_int64.set({1,2,3,4,5}); + + std::string v = va_int64.to_summary_string(); + std::cout << v << std::endl; + EXPECT_EQ(v,"[1, 2, 3, 4, 5]"); + + v = va_int64.to_summary_string(2); + std::cout << v << std::endl; + EXPECT_EQ(v,"[1, ..., 5]"); + + v = va_int64.to_summary_string(3); + std::cout << v << std::endl; + EXPECT_EQ(v,"[1, 2, ..., 5]"); + +} + + //----------------------------------------------------------------------------- TEST(conduit_array, cxx_11_init_lists) { diff --git a/src/tests/conduit/t_conduit_node.cpp b/src/tests/conduit/t_conduit_node.cpp index 385083eec..8d9003086 100644 --- a/src/tests/conduit/t_conduit_node.cpp +++ b/src/tests/conduit/t_conduit_node.cpp @@ -1162,3 +1162,60 @@ TEST(conduit_node, add_child) +//----------------------------------------------------------------------------- +TEST(conduit_node, describe) +{ + Node n; + + n["a"] = {1,2,3,4,5}; + n["b"] = {1,2,3}; + n["c"] = {1,2,3,4,5,6}; + n["d"] = {1,2,3,4,5,6,7}; + n["e"] = {1,2,3,4,5,6,7,8,9,10,11,12}; + n["f"] = {1.0,2.0,3.0,4.0,5.0,6.0,7.0}; + n["g"] = {2.0,4.0}; + + Node d; + 
n.describe(d); + d.print(); + + EXPECT_EQ(d["a/count"].to_int(),5); + EXPECT_EQ(d["b/count"].to_int(),3); + EXPECT_EQ(d["c/count"].to_int(),6); + EXPECT_EQ(d["d/count"].to_int(),7); + EXPECT_EQ(d["e/count"].to_int(),12); + EXPECT_EQ(d["f/count"].to_int(),7); + + EXPECT_EQ(d["a/min"].to_int(),1); + EXPECT_EQ(d["b/min"].to_int(),1); + EXPECT_EQ(d["c/min"].to_int(),1); + EXPECT_EQ(d["d/min"].to_int(),1); + EXPECT_EQ(d["e/min"].to_int(),1); + EXPECT_EQ(d["f/min"].to_float(),1.0); + + EXPECT_EQ(d["a/max"].to_int(),5); + EXPECT_EQ(d["b/max"].to_int(),3); + EXPECT_EQ(d["c/max"].to_int(),6); + EXPECT_EQ(d["d/max"].to_int(),7); + EXPECT_EQ(d["e/max"].to_int(),12); + EXPECT_EQ(d["f/max"].to_float(),7.0); + + EXPECT_EQ(d["g/mean"].to_float(),3.0); + + + n["a"] = {1,2,3,4,5}; + n["b"] = {1,2,3}; + n["c"] = {1,2,3,4,5,6}; + n["d"] = {1,2,3,4,5,6,7}; + n["e"] = {1,2,3,4,5,6,7,8,9,10,11,12}; + n["f"] = {1.0,2.0,3.0,4.0,5.0,6.0,7.0}; + + Node opts; + opts["threshold"] = 10; + n.describe(opts,d); + d.print(); + +} + + + From 5b722d8a97be0fee6e045c28d04aa9bc62abcf24 Mon Sep 17 00:00:00 2001 From: Cyrus Harrison Date: Fri, 18 Dec 2020 15:55:21 -0800 Subject: [PATCH 2/2] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04e0c177b..2c106acbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and this project aspires to adhere to [Semantic Versioning](https://semver.org/s #### General - Added a builtin sandboxed header-only version of fmt. The namespace and directory paths were changed to `conduit_fmt` to avoid potential symbol collisions with other codes using fmt. Downstream software can use by including `conduit_fmt/conduit_fmt.h`. - Added support for using C++11 initializer lists to set Node and DataArray values from numeric arrays. See C++ tutorial docs (https://llnl-conduit.readthedocs.io/en/latest/tutorial_cpp_numeric.html#c-11-initializer-lists) for more details. +- Added a Node::describe() method. 
This method creates a new node that mirrors the current Node; however, each leaf is replaced by summary stats and a truncated display of the values. For use cases with large leaves, printing the describe() output Node is much more helpful for debugging and understanding than the wall of text produced by the other to_string() methods. + ## [0.6.0] - Released 2020-11-02