Consolidate all sites that can hash a value inside a vector (facebook…

…incubator#9963) Summary: Pull Request resolved: facebookincubator#9963 This commit aims to guarantee consistent evaluation and hashing of NaN (Not-a-Number) values for floating-point types across multiple sites, including SimpleVector, VectorHasher, RowContainer, and ContainerRowSerde. Unit tests will be added incrementally to make it easier to review and to include additional changes that might be required for it. The changes to these classes would affect the following: - Multiple aggregates that have 'distinct' like semantics. Only for the codepaths that consume complex types as inputs. Aggregates include: multimap_agg, map_union_sum, map_union, count(distinct) - In-predicate - Map_subscript, only one optimization code path for constant/dictionary encoded map - Hash Partitioning operator (HashPartition) - Hash Join operator (HashProbe, HashTable) - Local partitioning operator (LocalPartition) - Hash Aggregation operator (GroupingSet) Reviewed By: kagamiori Differential Revision: D57892204 fbshipit-source-id: 3729aec02c597379c8fbb1d2eec190614aa57d43
Joe-Abraham · Jun 7, 2024 · b8fe29c · b8fe29c
1 parent e878d7a
commit b8fe29c
Show file tree

Hide file tree

Showing 9 changed files with 140 additions and 9 deletions.
diff --git a/velox/exec/ContainerRowSerde.cpp b/velox/exec/ContainerRowSerde.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "velox/exec/ContainerRowSerde.h"
+#include "velox/type/FloatingPointUtil.h"
 #include "velox/vector/ComplexVector.h"
 #include "velox/vector/FlatVector.h"
 
@@ -728,7 +729,11 @@ uint64_t hashSwitch(ByteInputStream& stream, const Type* type);
 template <TypeKind Kind>
 uint64_t hashOne(ByteInputStream& stream, const Type* /*type*/) {
   using T = typename TypeTraits<Kind>::NativeType;
-  return folly::hasher<T>()(stream.read<T>());
+  if constexpr (std::is_floating_point_v<T>) {
+    return util::floating_point::NaNAwareHash<T>()(stream.read<T>());
+  } else {
+    return folly::hasher<T>()(stream.read<T>());
+  }
 }
 
 template <>

diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp
@@ -20,6 +20,7 @@
 #include "velox/exec/Aggregate.h"
 #include "velox/exec/ContainerRowSerde.h"
 #include "velox/exec/Operator.h"
+#include "velox/type/FloatingPointUtil.h"
 
 namespace facebook::velox::exec {
 namespace {
@@ -866,6 +867,8 @@ void RowContainer::hashTyped(
           Kind == TypeKind::MAP) {
         auto in = prepareRead(row, offset);
         hash = ContainerRowSerde::hash(in, type);
+      } else if constexpr (std::is_floating_point_v<T>) {
+        hash = util::floating_point::NaNAwareHash<T>()(valueAt<T>(row, offset));
       } else {
         hash = folly::hasher<T>()(valueAt<T>(row, offset));
       }

diff --git a/velox/exec/VectorHasher.cpp b/velox/exec/VectorHasher.cpp
@@ -19,6 +19,7 @@
 #include "velox/common/base/Portability.h"
 #include "velox/common/base/SimdUtil.h"
 #include "velox/common/memory/HashStringAllocator.h"
+#include "velox/type/FloatingPointUtil.h"
 
 namespace facebook::velox::exec {
 
@@ -61,7 +62,11 @@ uint64_t hashOne(DecodedVector& decoded, vector_size_t index) {
   }
   // Inlined for scalars.
   using T = typename KindToFlatVector<Kind>::HashRowType;
-  return folly::hasher<T>()(decoded.valueAt<T>(index));
+  if constexpr (std::is_floating_point_v<T>) {
+    return util::floating_point::NaNAwareHash<T>()(decoded.valueAt<T>(index));
+  } else {
+    return folly::hasher<T>()(decoded.valueAt<T>(index));
+  }
 }
 } // namespace
 

diff --git a/velox/exec/tests/ContainerRowSerdeTest.cpp b/velox/exec/tests/ContainerRowSerdeTest.cpp
@@ -571,5 +571,38 @@ TEST_F(ContainerRowSerdeTest, fuzzCompare) {
   }
 }
 
+TEST_F(ContainerRowSerdeTest, nans) {
+  // Verify that the NaNs with different representations are considered equal
+  // and have the same hash value.
+  auto vector = makeNullableFlatVector<double>(
+      {std::nan("1"),
+       std::nan("2"),
+       std::numeric_limits<double>::quiet_NaN(),
+       std::numeric_limits<double>::signaling_NaN()});
+
+  // Compare with the same NaN value
+  auto expected = makeConstant(std::nan("1"), 4, vector->type());
+
+  auto positions = serializeWithPositions(vector);
+
+  CompareFlags compareFlags =
+      CompareFlags::equality(CompareFlags::NullHandlingMode::kNullAsValue);
+
+  DecodedVector decodedVector(*expected);
+
+  for (auto i = 0; i < positions.size(); ++i) {
+    auto stream = HashStringAllocator::prepareRead(positions.at(i).header);
+    ASSERT_EQ(
+        0, ContainerRowSerde::compare(stream, decodedVector, i, compareFlags))
+        << "at " << i << ": " << vector->toString(i);
+
+    stream = HashStringAllocator::prepareRead(positions.at(i).header);
+    ASSERT_EQ(
+        expected->hashValueAt(i),
+        ContainerRowSerde::hash(stream, vector->type().get()))
+        << "at " << i << ": " << vector->toString(i);
+  }
+}
+
 } // namespace
 } // namespace facebook::velox::exec
diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp
@@ -1690,6 +1690,45 @@ TEST_F(RowContainerTest, unknown) {
       }));
 }
 
+TEST_F(RowContainerTest, nans) {
+  const static auto kNaN = std::numeric_limits<double>::quiet_NaN();
+  const static auto kSNaN = std::numeric_limits<double>::signaling_NaN();
+  static const auto kNaNHash = folly::hasher<double>{}(kNaN);
+  std::vector<TypePtr> types = {DOUBLE()};
+  auto rowContainer = std::make_unique<RowContainer>(types, pool_.get());
+
+  auto data = makeRowVector({
+      makeFlatVector<double>({kNaN, kSNaN}),
+  });
+
+  auto size = data->size();
+  DecodedVector decoded(*data->childAt(0));
+  auto rows = store(*rowContainer, decoded, size);
+
+  // Verify that the hashes are equal.
+  std::vector<uint64_t> hashes(size, 0);
+  rowContainer->hash(
+      0, folly::Range(rows.data(), rows.size()), false /*mix*/, hashes.data());
+  for (auto hash : hashes) {
+    ASSERT_EQ(kNaNHash, hash);
+  }
+
+  // Fill in hashes with sequential numbers: 0, 1, 2,..
+  std::iota(hashes.begin(), hashes.end(), 0);
+  rowContainer->hash(
+      0, folly::Range(rows.data(), rows.size()), true /*mix*/, hashes.data());
+  for (auto i = 0; i < size; ++i) {
+    ASSERT_EQ(bits::hashMix(i, kNaNHash), hashes[i]);
+  }
+
+  // Verify that they are considered equal.
+  for (size_t row = 0; row < size; ++row) {
+    ASSERT_TRUE(rowContainer->equals<false>(
+        rows[row], rowContainer->columnAt(0), decoded, row));
+  }
+  ASSERT_EQ(rowContainer->compare(rows[0], rows[1], 0, {}), 0);
+}
+
 TEST_F(RowContainerTest, toString) {
   std::vector<TypePtr> keyTypes = {BIGINT(), VARCHAR()};
   std::vector<TypePtr> dependentTypes = {TINYINT(), REAL(), ARRAY(BIGINT())};

diff --git a/velox/exec/tests/VectorHasherTest.cpp b/velox/exec/tests/VectorHasherTest.cpp
@@ -224,6 +224,38 @@ TEST_F(VectorHasherTest, flat) {
   }
 }
 
+TEST_F(VectorHasherTest, nans) {
+  // Sanity check to ensure the NaNs are correctly hashed, that is, all NaNs are
+  // considered equal and therefore should have the same hash.
+  static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
+  static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();
+  folly::hasher<double> hasher;
+  static const auto kNaNHash = hasher(kNaN);
+  auto vectorHasher = exec::VectorHasher::create(DOUBLE(), 1);
+  ASSERT_EQ(vectorHasher->channel(), 1);
+  ASSERT_EQ(vectorHasher->typeKind(), TypeKind::DOUBLE);
+
+  // Using two different binary representations of NaN.
+  std::vector<double> inputValues = {1.0, -1.0, kNaN, kSNaN, 0.0, -0.0};
+
+  auto vector = BaseVector::create(DOUBLE(), inputValues.size(), pool());
+  auto flatVector = vector->asFlatVector<double>();
+  for (int32_t i = 0; i < inputValues.size(); i++) {
+    flatVector->set(i, inputValues[i]);
+  }
+
+  raw_vector<uint64_t> hashes(inputValues.size());
+  std::fill(hashes.begin(), hashes.end(), 0);
+  SelectivityVector allRows = SelectivityVector(inputValues.size());
+  vectorHasher->decode(*vector, allRows);
+  vectorHasher->hash(allRows, false, hashes);
+  std::vector<uint64_t> expected = {
+      hasher(1.0), hasher(-1.0), kNaNHash, kNaNHash, hasher(0.0), hasher(0.0)};
+  for (int32_t i = 0; i < inputValues.size(); i++) {
+    EXPECT_EQ(hashes[i], expected[i]) << "at " << i;
+  }
+}
+
 TEST_F(VectorHasherTest, nonNullConstant) {
   auto hasher = exec::VectorHasher::create(INTEGER(), 1);
   auto vector = BaseVector::createConstant(INTEGER(), 123, 100, pool());

diff --git a/velox/type/FloatingPointUtil.h b/velox/type/FloatingPointUtil.h
@@ -76,11 +76,11 @@ template <
 struct NaNAwareHash {
   std::size_t operator()(const FLOAT& val) const noexcept {
     static const std::size_t kNanHash =
-        std::hash<FLOAT>{}(std::numeric_limits<FLOAT>::quiet_NaN());
+        folly::hasher<FLOAT>{}(std::numeric_limits<FLOAT>::quiet_NaN());
     if (std::isnan(val)) {
       return kNanHash;
     }
-    return std::hash<FLOAT>{}(val);
+    return folly::hasher<FLOAT>{}(val);
   }
 };
 

diff --git a/velox/vector/SimpleVector.h b/velox/vector/SimpleVector.h
@@ -29,6 +29,7 @@
 
 #include "velox/functions/lib/string/StringCore.h"
 #include "velox/type/DecimalUtil.h"
+#include "velox/type/FloatingPointUtil.h"
 #include "velox/type/Type.h"
 #include "velox/vector/BaseVector.h"
 #include "velox/vector/TypeAliases.h"
@@ -171,8 +172,14 @@ class SimpleVector : public BaseVector {
    * @return the hash of the value at the given index in this vector
    */
   uint64_t hashValueAt(vector_size_t index) const override {
-    return isNullAt(index) ? BaseVector::kNullHash
-                           : folly::hasher<T>{}(valueAt(index));
+    if constexpr (std::is_floating_point_v<T>) {
+      return isNullAt(index)
+          ? BaseVector::kNullHash
+          : util::floating_point::NaNAwareHash<T>{}(valueAt(index));
+    } else {
+      return isNullAt(index) ? BaseVector::kNullHash
+                             : folly::hasher<T>{}(valueAt(index));
+    }
   }
 
   std::optional<bool> isSorted() const {

diff --git a/velox/vector/tests/SimpleVectorTest.cpp b/velox/vector/tests/SimpleVectorTest.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <algorithm>
+#include <functional>
 
 #include <fmt/core.h>
 #include <glog/logging.h>
@@ -25,6 +26,7 @@
 #include <folly/dynamic.h>
 
 #include "velox/common/base/tests/GTestUtils.h"
+#include "velox/type/FloatingPointUtil.h"
 #include "velox/vector/BuilderTypeUtils.h"
 #include "velox/vector/SimpleVector.h"
 #include "velox/vector/tests/SimpleVectorTestHelper.h"
@@ -649,12 +651,17 @@ VELOX_TYPED_TEST_SUITE(SimpleVectorUnaryTypedTest, SimpleTypes);
 
 TYPED_TEST(SimpleVectorUnaryTypedTest, hashAll) {
   LOG(INFO) << "hashAll: " << type_name<TypeParam>();
-  folly::hasher<TypeParam> hasher;
+  std::function<size_t(TypeParam)> hasher;
+  if constexpr (std::is_floating_point_v<TypeParam>) {
+    hasher = util::floating_point::NaNAwareHash<TypeParam>{};
+  } else {
+    hasher = folly::hasher<TypeParam>{};
+  }
   auto hashTest = [&hasher](SimpleVector<TypeParam>* vector) {
     auto hashes = vector->hashAll();
     for (size_t i = 0; i < vector->size(); ++i) {
-      auto expected = vector->isNullAt(i) ? BaseVector::kNullHash
-                                          : hasher(vector->valueAt(i));
+      uint64_t expected = vector->isNullAt(i) ? BaseVector::kNullHash
+                                              : hasher(vector->valueAt(i));
       EXPECT_EQ(hashes->valueAt(i), expected);
     }
   };