From 47d55803a47f8ea90b4e432e86c53240df4b7463 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 6 Apr 2023 12:23:48 -0300 Subject: [PATCH] GH-34361: [C++] Fix the handling of logical nulls for types without bitmaps like Unions and Run-End Encoded (#34408) Bonus: add `ArrayData::IsValid()` to make it consistent with `Array` and `ArraySpan`. ### Rationale for this change This is the proposed fix to #34361 plus the addition of more APIs dealing with validity/nullity. ### What changes are included in this PR? This PR changes the behavior of `IsNull` and `IsValid` in `Array`, `ArrayData`, and `ArraySpan`. It preserves the behavior of `MayHaveNulls`, `GetNullCount` and introduces new member functions to `Array`, `ArrayData`, and `ArraySpan`: - `bool HasValidityBitmap() const` - `bool MayHaveLogicalNulls() const` - `int64_t ComputeLogicalNullCount() const` ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. See above. Breakage with these changes can only happen if users rely on `IsNull(i)` always returning `true` for union types, but we have users reporting that the current behavior or broken #34315. This is why the behavior of `IsNull` and `IsValid` is changing., **This PR contains a "Critical Fix".** * Closes: #34361 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Matt Topol --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_base.cc | 8 ++ cpp/src/arrow/array/array_base.h | 37 ++++-- cpp/src/arrow/array/array_test.cc | 83 +++++++++--- cpp/src/arrow/array/builder_nested.h | 16 ++- cpp/src/arrow/array/data.cc | 93 +++++++++++++ cpp/src/arrow/array/data.h | 191 +++++++++++++++++++++++++-- cpp/src/arrow/util/ree_util.cc | 36 +++++ cpp/src/arrow/util/ree_util.h | 3 + cpp/src/arrow/util/union_util.cc | 58 ++++++++ cpp/src/arrow/util/union_util.h | 31 +++++ 11 files changed, 517 insertions(+), 40 deletions(-) create mode 100644 cpp/src/arrow/util/union_util.cc create mode 100644 cpp/src/arrow/util/union_util.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 143fb13ddcca6..22b2bf6e4439c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -230,6 +230,7 @@ set(ARROW_SRCS util/time.cc util/tracing.cc util/trie.cc + util/union_util.cc util/unreachable.cc util/uri.cc util/utf8.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 1ccde5222f31a..f7b8d7954e1cf 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -52,6 +52,10 @@ class ExtensionArray; int64_t Array::null_count() const { return data_->GetNullCount(); } +int64_t Array::ComputeLogicalNullCount() const { + return data_->ComputeLogicalNullCount(); +} + namespace internal { struct ScalarFromArraySlotImpl { @@ -175,6 +179,10 @@ struct ScalarFromArraySlotImpl { array_.length()); } + // Skip checking for nulls in RUN_END_ENCODED arrays to avoid potentially + // making two O(log n) searches for the physical index of the slot -- one + // here and another in Visit(const RunEndEncodedArray&) in case the values + // is not null. if (array_.type()->id() != Type::RUN_END_ENCODED && array_.IsNull(index_)) { auto null = MakeNullScalar(array_.type()); if (is_dictionary(array_.type()->id())) { diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 2333a0c06fb8d..48d65e30de8d9 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -55,18 +55,28 @@ class ARROW_EXPORT Array { virtual ~Array() = default; /// \brief Return true if value at index is null. Does not boundscheck - bool IsNull(int64_t i) const { - return null_bitmap_data_ != NULLPTR - ? !bit_util::GetBit(null_bitmap_data_, i + data_->offset) - : data_->null_count == data_->length; - } + bool IsNull(int64_t i) const { return !IsValid(i); } /// \brief Return true if value at index is valid (not null). Does not /// boundscheck bool IsValid(int64_t i) const { - return null_bitmap_data_ != NULLPTR - ? bit_util::GetBit(null_bitmap_data_, i + data_->offset) - : data_->null_count != data_->length; + if (null_bitmap_data_ != NULLPTR) { + return bit_util::GetBit(null_bitmap_data_, i + data_->offset); + } + // Dispatching with a few conditionals like this makes IsNull more + // efficient for how it is used in practice. Making IsNull virtual + // would add a vtable lookup to every call and prevent inlining + + // a potential inner-branch removal. + if (type_id() == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*data_, i); + } + if (type_id() == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*data_, i); + } + if (type_id() == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*data_, i); + } + return data_->null_count != data_->length; } /// \brief Return a Scalar containing the value of this array at i @@ -85,6 +95,17 @@ class ARROW_EXPORT Array { /// function int64_t null_count() const; + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// null_count(). For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + std::shared_ptr type() const { return data_->type; } Type::type type_id() const { return data_->type->id(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index d18b16ba78d5e..602a468fafb05 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -37,6 +37,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" +#include "arrow/array/builder_run_end.h" #include "arrow/array/builder_time.h" #include "arrow/array/data.h" #include "arrow/array/util.h" @@ -83,15 +84,48 @@ TEST_F(TestArray, TestNullCount) { auto data = std::make_shared(nullptr, 0); auto null_bitmap = std::make_shared(nullptr, 0); - std::unique_ptr arr(new Int32Array(100, data, null_bitmap, 10)); + std::shared_ptr arr(new Int32Array(100, data, null_bitmap, 10)); + ASSERT_EQ(10, arr->ComputeLogicalNullCount()); ASSERT_EQ(10, arr->null_count()); + ASSERT_TRUE(arr->data()->MayHaveNulls()); + ASSERT_TRUE(arr->data()->MayHaveLogicalNulls()); - std::unique_ptr arr_no_nulls(new Int32Array(100, data)); + std::shared_ptr arr_no_nulls(new Int32Array(100, data)); + ASSERT_EQ(0, arr_no_nulls->ComputeLogicalNullCount()); ASSERT_EQ(0, arr_no_nulls->null_count()); + ASSERT_FALSE(arr_no_nulls->data()->MayHaveNulls()); + ASSERT_FALSE(arr_no_nulls->data()->MayHaveLogicalNulls()); - std::unique_ptr arr_default_null_count( + std::shared_ptr arr_default_null_count( new Int32Array(100, data, null_bitmap)); ASSERT_EQ(kUnknownNullCount, arr_default_null_count->data()->null_count); + ASSERT_TRUE(arr_default_null_count->data()->MayHaveNulls()); + ASSERT_TRUE(arr_default_null_count->data()->MayHaveLogicalNulls()); + + RunEndEncodedBuilder ree_builder(pool_, std::make_shared(pool_), + std::make_shared(pool_), + run_end_encoded(int32(), int32())); + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(2), 2)); + ASSERT_OK(ree_builder.AppendNull()); + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(4), 3)); + ASSERT_OK(ree_builder.AppendNulls(2)); + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(8), 5)); + ASSERT_OK(ree_builder.AppendNulls(7)); + ASSERT_OK_AND_ASSIGN(auto ree, ree_builder.Finish()); + + ASSERT_EQ(0, ree->null_count()); + ASSERT_EQ(10, ree->ComputeLogicalNullCount()); + ASSERT_FALSE(ree->data()->MayHaveNulls()); + ASSERT_TRUE(ree->data()->MayHaveLogicalNulls()); + + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(2), 2)); + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(4), 3)); + ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(8), 5)); + ASSERT_OK_AND_ASSIGN(auto ree_no_nulls, ree_builder.Finish()); + ASSERT_EQ(0, ree_no_nulls->null_count()); + ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount()); + ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls()); + ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls()); } TEST_F(TestArray, TestSlicePreservesAllNullCount) { @@ -377,20 +411,23 @@ TEST_F(TestArray, TestMakeArrayOfNull) { ASSERT_EQ(array->length(), length); if (is_union(type->id())) { ASSERT_EQ(array->null_count(), 0); + ASSERT_EQ(array->ComputeLogicalNullCount(), length); const auto& union_array = checked_cast(*array); for (int i = 0; i < union_array.num_fields(); ++i) { ASSERT_EQ(union_array.field(i)->null_count(), union_array.field(i)->length()); } } else if (type->id() == Type::RUN_END_ENCODED) { ASSERT_EQ(array->null_count(), 0); + ASSERT_EQ(array->ComputeLogicalNullCount(), length); const auto& ree_array = checked_cast(*array); ASSERT_EQ(ree_array.values()->null_count(), ree_array.values()->length()); } else { ASSERT_EQ(array->null_count(), length); - for (int64_t i = 0; i < length; ++i) { - ASSERT_TRUE(array->IsNull(i)); - ASSERT_FALSE(array->IsValid(i)); - } + ASSERT_EQ(array->ComputeLogicalNullCount(), length); + } + for (int64_t i = 0; i < length; ++i) { + ASSERT_TRUE(array->IsNull(i)); + ASSERT_FALSE(array->IsValid(i)); } } } @@ -482,12 +519,14 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) std::unique_ptr builder; auto null_scalar = MakeNullScalar(scalar->type); ASSERT_OK(MakeBuilderExactIndex(pool, scalar->type, &builder)); - ASSERT_OK(builder->AppendScalar(*scalar)); - ASSERT_OK(builder->AppendScalar(*scalar)); - ASSERT_OK(builder->AppendScalar(*null_scalar)); - ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); - ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2)); - ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); + ASSERT_OK(builder->AppendScalar(*scalar)); // [0] = scalar + ASSERT_OK(builder->AppendScalar(*scalar)); // [1] = scalar + ASSERT_OK(builder->AppendScalar(*null_scalar)); // [2] = NULL + ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); // [3, 4] = {scalar, NULL} + ASSERT_OK( + builder->AppendScalar(*scalar, /*n_repeats=*/2)); // [5, 6] = {scalar, scalar} + ASSERT_OK( + builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); // [7, 8] = {NULL, NULL} std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); @@ -495,22 +534,30 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) AssertTypeEqual(scalar->type, out->type()); ASSERT_EQ(out->length(), 9); - const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id()); + auto out_type_id = out->type()->id(); + const bool has_validity = internal::HasValidityBitmap(out_type_id); // For a dictionary builder, the output dictionary won't necessarily be the same - const bool can_check_values = !is_dictionary(out->type()->id()); + const bool can_check_values = !is_dictionary(out_type_id); - if (can_check_nulls) { + if (has_validity) { ASSERT_EQ(out->null_count(), 4); + } else { + ASSERT_EQ(out->null_count(), 0); + } + if (scalar->is_valid) { + ASSERT_EQ(out->ComputeLogicalNullCount(), 4); + } else { + ASSERT_EQ(out->ComputeLogicalNullCount(), 9); } for (const auto index : {0, 1, 3, 5, 6}) { - ASSERT_FALSE(out->IsNull(index)); + ASSERT_NE(out->IsNull(index), scalar->is_valid); ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); ASSERT_OK(scalar_i->ValidateFull()); if (can_check_values) AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true); } for (const auto index : {2, 4, 7, 8}) { - ASSERT_EQ(out->IsNull(index), can_check_nulls); + ASSERT_TRUE(out->IsNull(index)); ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); ASSERT_OK(scalar_i->ValidateFull()); AssertScalarsEqual(*null_scalar, *scalar_i, /*verbose=*/true); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 3e9328bfdf0ff..74f7c04d6ad24 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -131,9 +131,13 @@ class BaseListBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + const bool all_valid = !array.MayHaveLogicalNulls(); + const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - if (!validity || bit_util::GetBit(validity, array.offset + row)) { + const bool is_valid = + all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || + array.IsValid(row); + if (is_valid) { ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], @@ -301,9 +305,13 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const int32_t* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + const bool all_valid = !array.MayHaveLogicalNulls(); + const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - if (!validity || bit_util::GetBit(validity, array.offset + row)) { + const bool is_valid = + all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || + array.IsValid(row); + if (is_valid) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; // Add together the inner StructArray offset to the Map/List offset diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index e23facce6395b..18c9eb720a016 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -34,7 +34,9 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/ree_util.h" #include "arrow/util/slice_util_internal.h" +#include "arrow/util/union_util.h" namespace arrow { @@ -60,6 +62,38 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length, } } +namespace internal { + +bool IsNullSparseUnion(const ArrayData& data, int64_t i) { + auto* union_type = checked_cast(data.type.get()); + const auto* types = reinterpret_cast(data.buffers[1]->data()); + const int child_id = union_type->child_ids()[types[data.offset + i]]; + return data.child_data[child_id]->IsNull(i); +} + +bool IsNullDenseUnion(const ArrayData& data, int64_t i) { + auto* union_type = checked_cast(data.type.get()); + const auto* types = reinterpret_cast(data.buffers[1]->data()); + const int child_id = union_type->child_ids()[types[data.offset + i]]; + const auto* offsets = reinterpret_cast(data.buffers[2]->data()); + const int64_t child_offset = offsets[data.offset + i]; + return data.child_data[child_id]->IsNull(child_offset); +} + +bool IsNullRunEndEncoded(const ArrayData& data, int64_t i) { + return ArraySpan(data).IsNullRunEndEncoded(i); +} + +bool UnionMayHaveLogicalNulls(const ArrayData& data) { + return ArraySpan(data).MayHaveLogicalNulls(); +} + +bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { + return ArraySpan(data).MayHaveLogicalNulls(); +} + +} // namespace internal + std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, std::vector> buffers, int64_t null_count, int64_t offset) { @@ -132,6 +166,13 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } +int64_t ArrayData::ComputeLogicalNullCount() const { + if (this->buffers[0]) { + return GetNullCount(); + } + return ArraySpan(*this).ComputeLogicalNullCount(); +} + // ---------------------------------------------------------------------- // Methods for ArraySpan @@ -407,6 +448,20 @@ int64_t ArraySpan::GetNullCount() const { return precomputed; } +int64_t ArraySpan::ComputeLogicalNullCount() const { + const auto t = this->type->id(); + if (t == Type::SPARSE_UNION) { + return union_util::LogicalSparseUnionNullCount(*this); + } + if (t == Type::DENSE_UNION) { + return union_util::LogicalDenseUnionNullCount(*this); + } + if (t == Type::RUN_END_ENCODED) { + return ree_util::LogicalNullCount(*this); + } + return GetNullCount(); +} + int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); } std::shared_ptr ArraySpan::ToArrayData() const { @@ -445,6 +500,44 @@ std::shared_ptr ArraySpan::ToArray() const { return MakeArray(this->ToArrayData()); } +bool ArraySpan::IsNullSparseUnion(int64_t i) const { + auto* union_type = checked_cast(this->type); + const auto* types = reinterpret_cast(this->buffers[1].data); + const int child_id = union_type->child_ids()[types[this->offset + i]]; + return this->child_data[child_id].IsNull(i); +} + +bool ArraySpan::IsNullDenseUnion(int64_t i) const { + auto* union_type = checked_cast(this->type); + const auto* types = reinterpret_cast(this->buffers[1].data); + const auto* offsets = reinterpret_cast(this->buffers[2].data); + const int64_t child_id = union_type->child_ids()[types[this->offset + i]]; + const int64_t child_offset = offsets[this->offset + i]; + return this->child_data[child_id].IsNull(child_offset); +} + +bool ArraySpan::IsNullRunEndEncoded(int64_t i) const { + const auto& values = ree_util::ValuesArray(*this); + if (values.MayHaveLogicalNulls()) { + const int64_t physical_offset = ree_util::FindPhysicalIndex(*this, i, this->offset); + return ree_util::ValuesArray(*this).IsNull(physical_offset); + } + return false; +} + +bool ArraySpan::UnionMayHaveLogicalNulls() const { + for (auto& child : this->child_data) { + if (child.MayHaveLogicalNulls()) { + return true; + } + } + return false; +} + +bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const { + return ree_util::ValuesArray(*this).MayHaveLogicalNulls(); +} + // ---------------------------------------------------------------------- // Implement internal::GetArrayView diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e024483f66551..27a30ade0bb83 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -33,6 +33,19 @@ namespace arrow { class Array; +struct ArrayData; + +namespace internal { +// ---------------------------------------------------------------------- +// Null handling for types without a validity bitmap + +ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); + +ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); +} // namespace internal // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count @@ -167,9 +180,23 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr Copy() const { return std::make_shared(*this); } - bool IsNull(int64_t i) const { - return ((buffers[0] != NULLPTR) ? !bit_util::GetBit(buffers[0]->data(), i + offset) - : null_count.load() == length); + bool IsNull(int64_t i) const { return !IsValid(i); } + + bool IsValid(int64_t i) const { + if (buffers[0] != NULLPTR) { + return bit_util::GetBit(buffers[0]->data(), i + offset); + } + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*this, i); + } + if (type == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*this, i); + } + if (type == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*this, i); + } + return null_count.load() != length; } // Access a buffer's data as a typed C pointer @@ -229,15 +256,88 @@ struct ARROW_EXPORT ArrayData { void SetNullCount(int64_t v) { null_count.store(v); } - /// \brief Return null count, or compute and set it if it's not known + /// \brief Return physical null count, or compute and set it if it's not known int64_t GetNullCount() const; + /// \brief Return true if the data has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known. + /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls bool MayHaveNulls() const { // If an ArrayData is slightly malformed it may have kUnknownNullCount set // but no buffer return null_count.load() != 0 && buffers[0] != NULLPTR; } + /// \brief Return true if the data has a validity bitmap + bool HasValidityBitmap() const { return buffers[0] != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls + /// + /// This is not a drop-in replacement for MayHaveNulls, as historically + /// MayHaveNulls() has been used to check for the presence of a validity + /// bitmap that needs to be checked. + /// + /// Code that previously used MayHaveNulls() and then dealt with the validity + /// bitmap directly can be fixed to handle all types correctly without + /// performance degradation when handling most types by adopting + /// HasValidityBitmap and MayHaveLogicalNulls. + /// + /// Before: + /// + /// uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// if (validity && !bit_util::GetBit(validity, i)) { + /// continue; // skip a NULL + /// } + /// ... + /// } + /// + /// After: + /// + /// bool all_valid = !array.MayHaveLogicalNulls(); + /// uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// bool is_valid = all_valid || + /// (validity && bit_util::GetBit(validity, i)) || + /// array.IsValid(i); + /// if (!is_valid) { + /// continue; // skip a NULL + /// } + /// ... + /// } + bool MayHaveLogicalNulls() const { + if (buffers[0] != NULLPTR) { + return null_count.load() != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return internal::UnionMayHaveLogicalNulls(*this); + } + if (t == Type::RUN_END_ENCODED) { + return internal::RunEndEncodedMayHaveLogicalNulls(*this); + } + return null_count.load() != 0; + } + + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + std::shared_ptr type; int64_t length = 0; mutable std::atomic null_count{0}; @@ -329,14 +429,26 @@ struct ARROW_EXPORT ArraySpan { return GetValues(i, this->offset); } + inline bool IsNull(int64_t i) const { return !IsValid(i); } + inline bool IsValid(int64_t i) const { - return ((this->buffers[0].data != NULLPTR) - ? bit_util::GetBit(this->buffers[0].data, i + this->offset) - : this->null_count != this->length); + if (this->buffers[0].data != NULLPTR) { + return bit_util::GetBit(this->buffers[0].data, i + this->offset); + } else { + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !IsNullSparseUnion(i); + } + if (type == Type::DENSE_UNION) { + return !IsNullDenseUnion(i); + } + if (type == Type::RUN_END_ENCODED) { + return !IsNullRunEndEncoded(i); + } + return this->null_count != this->length; + } } - inline bool IsNull(int64_t i) const { return !IsValid(i); } - std::shared_ptr ToArrayData() const; std::shared_ptr ToArray() const; @@ -363,14 +475,73 @@ struct ARROW_EXPORT ArraySpan { } } - /// \brief Return null count, or compute and set it if it's not known + /// \brief Return physical null count, or compute and set it if it's not known int64_t GetNullCount() const; + /// \brief Return true if the array has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known + /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls bool MayHaveNulls() const { // If an ArrayData is slightly malformed it may have kUnknownNullCount set // but no buffer return null_count != 0 && buffers[0].data != NULLPTR; } + + /// \brief Return true if the array has a validity bitmap + bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls + /// + /// \see ArrayData::MayHaveLogicalNulls + bool MayHaveLogicalNulls() const { + if (buffers[0].data != NULLPTR) { + return null_count != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return UnionMayHaveLogicalNulls(); + } + if (t == Type::RUN_END_ENCODED) { + return RunEndEncodedMayHaveLogicalNulls(); + } + return null_count != 0; + } + + /// \brief Compute the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. For types that have no validity bitmap, this function will + /// recompute the logical null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + private: + friend bool internal::IsNullRunEndEncoded(const ArrayData& span, int64_t i); + + bool IsNullSparseUnion(int64_t i) const; + bool IsNullDenseUnion(int64_t i) const; + + /// \brief Return true if the value at logical index i is null + /// + /// This function uses binary-search, so it has a O(log N) cost. + /// Iterating over the whole array and calling IsNull is O(N log N), so + /// for better performance it is recommended to use a + /// ree_util::RunEndEncodedArraySpan to iterate run by run instead. + bool IsNullRunEndEncoded(int64_t i) const; + + bool UnionMayHaveLogicalNulls() const; + bool RunEndEncodedMayHaveLogicalNulls() const; }; namespace internal { diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 9324a1adbca49..11da64313a811 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -19,12 +19,48 @@ #include #include "arrow/builder.h" +#include "arrow/util/bit_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" namespace arrow { namespace ree_util { +namespace { + +template +int64_t LogicalNullCount(const ArraySpan& span) { + const auto& values = ValuesArray(span); + const auto& values_bitmap = values.buffers[0].data; + int64_t null_count = 0; + + RunEndEncodedArraySpan ree_span(span); + auto end = ree_span.end(); + for (auto it = ree_span.begin(); it != end; ++it) { + const bool is_null = + values_bitmap && + !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); + if (is_null) { + null_count += it.run_length(); + } + } + return null_count; +} + +} // namespace + +int64_t LogicalNullCount(const ArraySpan& span) { + const auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { + return LogicalNullCount(span); + } + if (type_id == Type::INT32) { + return LogicalNullCount(span); + } + DCHECK_EQ(type_id, Type::INT64); + return LogicalNullCount(span); +} + int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) { const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index 1eb4d5a4db27e..83d828bbe5353 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -53,6 +53,9 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, const std::shared_ptr& values_data, int64_t null_count, int64_t logical_offset); +/// \brief Compute the logical null count of an REE array +int64_t LogicalNullCount(const ArraySpan& span); + namespace internal { /// \brief Uses binary-search to find the physical offset given a logical offset diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc new file mode 100644 index 0000000000000..ca3ecf286441e --- /dev/null +++ b/cpp/src/arrow/util/union_util.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/union_util.h" + +#include + +#include "arrow/array/data.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" + +namespace arrow::union_util { + +int64_t LogicalSparseUnionNullCount(const ArraySpan& span) { + const auto* sparse_union_type = + internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + int64_t null_count = 0; + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]]; + + null_count += span.child_data[child_id].IsNull(i); + } + return null_count; +} + +int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { + const auto* dense_union_type = internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + const int32_t* offsets = span.GetValues(2); // NOLINT + int64_t null_count = 0; + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]]; + const int32_t offset = offsets[span.offset + i]; + null_count += span.child_data[child_id].IsNull(offset); + } + return null_count; +} + +} // namespace arrow::union_util diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h new file mode 100644 index 0000000000000..0f30d5a327819 --- /dev/null +++ b/cpp/src/arrow/util/union_util.h @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "arrow/array/data.h" + +namespace arrow { +namespace union_util { + +/// \brief Compute the number of of logical nulls in a sparse union array +int64_t LogicalSparseUnionNullCount(const ArraySpan& span); + +/// \brief Compute the number of of logical nulls in a dense union array +int64_t LogicalDenseUnionNullCount(const ArraySpan& span); + +} // namespace union_util +} // namespace arrow