Skip to content

Commit

Permalink
apacheGH-34361: [C++] Fix the handling of logical nulls for types wit…
Browse files Browse the repository at this point in the history
…hout bitmaps like Unions and Run-End Encoded (apache#34408)

Bonus: add `ArrayData::IsValid()` to make it consistent with `Array` and `ArraySpan`.

### Rationale for this change

This is the proposed fix to apache#34361 plus the addition of more APIs dealing with validity/nullity.

### What changes are included in this PR?

This PR changes the behavior of `IsNull` and `IsValid` in `Array`, `ArrayData`, and `ArraySpan`.

It preserves the behavior of `MayHaveNulls`, `GetNullCount` and introduces new member functions to `Array`, `ArrayData`, and `ArraySpan`:

 - `bool HasValidityBitmap() const`
 - `bool MayHaveLogicalNulls() const`
 - `int64_t ComputeLogicalNullCount() const`

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes. See above.

Breakage with these changes can only happen if users rely on `IsNull(i)` always returning `true` for union types, but we have users reporting that the current behavior or broken apache#34315. This is why the behavior of `IsNull` and `IsValid` is changing.,

**This PR contains a "Critical Fix".**
* Closes: apache#34361

Authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
felipecrv authored and liujiacheng777 committed May 11, 2023
1 parent 08bb756 commit 47d5580
Show file tree
Hide file tree
Showing 11 changed files with 517 additions and 40 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ set(ARROW_SRCS
util/time.cc
util/tracing.cc
util/trie.cc
util/union_util.cc
util/unreachable.cc
util/uri.cc
util/utf8.cc
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ class ExtensionArray;

int64_t Array::null_count() const { return data_->GetNullCount(); }

int64_t Array::ComputeLogicalNullCount() const {
return data_->ComputeLogicalNullCount();
}

namespace internal {

struct ScalarFromArraySlotImpl {
Expand Down Expand Up @@ -175,6 +179,10 @@ struct ScalarFromArraySlotImpl {
array_.length());
}

// Skip checking for nulls in RUN_END_ENCODED arrays to avoid potentially
// making two O(log n) searches for the physical index of the slot -- one
// here and another in Visit(const RunEndEncodedArray&) in case the values
// is not null.
if (array_.type()->id() != Type::RUN_END_ENCODED && array_.IsNull(index_)) {
auto null = MakeNullScalar(array_.type());
if (is_dictionary(array_.type()->id())) {
Expand Down
37 changes: 29 additions & 8 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,28 @@ class ARROW_EXPORT Array {
virtual ~Array() = default;

/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count == data_->length;
}
bool IsNull(int64_t i) const { return !IsValid(i); }

/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count != data_->length;
if (null_bitmap_data_ != NULLPTR) {
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
}
// Dispatching with a few conditionals like this makes IsNull more
// efficient for how it is used in practice. Making IsNull virtual
// would add a vtable lookup to every call and prevent inlining +
// a potential inner-branch removal.
if (type_id() == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*data_, i);
}
if (type_id() == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*data_, i);
}
if (type_id() == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*data_, i);
}
return data_->null_count != data_->length;
}

/// \brief Return a Scalar containing the value of this array at i
Expand All @@ -85,6 +95,17 @@ class ARROW_EXPORT Array {
/// function
int64_t null_count() const;

/// \brief Computes the logical null count for arrays of all types including
/// those that do not have a validity bitmap like union and run-end encoded
/// arrays
///
/// If the array has a validity bitmap, this function behaves the same as
/// null_count(). For types that have no validity bitmap, this function will
/// recompute the null count every time it is called.
///
/// \see GetNullCount
int64_t ComputeLogicalNullCount() const;

std::shared_ptr<DataType> type() const { return data_->type; }
Type::type type_id() const { return data_->type->id(); }

Expand Down
83 changes: 65 additions & 18 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_dict.h"
#include "arrow/array/builder_run_end.h"
#include "arrow/array/builder_time.h"
#include "arrow/array/data.h"
#include "arrow/array/util.h"
Expand Down Expand Up @@ -83,15 +84,48 @@ TEST_F(TestArray, TestNullCount) {
auto data = std::make_shared<Buffer>(nullptr, 0);
auto null_bitmap = std::make_shared<Buffer>(nullptr, 0);

std::unique_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
std::shared_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
ASSERT_EQ(10, arr->ComputeLogicalNullCount());
ASSERT_EQ(10, arr->null_count());
ASSERT_TRUE(arr->data()->MayHaveNulls());
ASSERT_TRUE(arr->data()->MayHaveLogicalNulls());

std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
std::shared_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
ASSERT_EQ(0, arr_no_nulls->ComputeLogicalNullCount());
ASSERT_EQ(0, arr_no_nulls->null_count());
ASSERT_FALSE(arr_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(arr_no_nulls->data()->MayHaveLogicalNulls());

std::unique_ptr<Int32Array> arr_default_null_count(
std::shared_ptr<Int32Array> arr_default_null_count(
new Int32Array(100, data, null_bitmap));
ASSERT_EQ(kUnknownNullCount, arr_default_null_count->data()->null_count);
ASSERT_TRUE(arr_default_null_count->data()->MayHaveNulls());
ASSERT_TRUE(arr_default_null_count->data()->MayHaveLogicalNulls());

RunEndEncodedBuilder ree_builder(pool_, std::make_shared<Int32Builder>(pool_),
std::make_shared<Int32Builder>(pool_),
run_end_encoded(int32(), int32()));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
ASSERT_OK(ree_builder.AppendNull());
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
ASSERT_OK(ree_builder.AppendNulls(2));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
ASSERT_OK(ree_builder.AppendNulls(7));
ASSERT_OK_AND_ASSIGN(auto ree, ree_builder.Finish());

ASSERT_EQ(0, ree->null_count());
ASSERT_EQ(10, ree->ComputeLogicalNullCount());
ASSERT_FALSE(ree->data()->MayHaveNulls());
ASSERT_TRUE(ree->data()->MayHaveLogicalNulls());

ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
ASSERT_OK_AND_ASSIGN(auto ree_no_nulls, ree_builder.Finish());
ASSERT_EQ(0, ree_no_nulls->null_count());
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());
}

TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Expand Down Expand Up @@ -377,20 +411,23 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
ASSERT_EQ(array->length(), length);
if (is_union(type->id())) {
ASSERT_EQ(array->null_count(), 0);
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
const auto& union_array = checked_cast<const UnionArray&>(*array);
for (int i = 0; i < union_array.num_fields(); ++i) {
ASSERT_EQ(union_array.field(i)->null_count(), union_array.field(i)->length());
}
} else if (type->id() == Type::RUN_END_ENCODED) {
ASSERT_EQ(array->null_count(), 0);
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
const auto& ree_array = checked_cast<const RunEndEncodedArray&>(*array);
ASSERT_EQ(ree_array.values()->null_count(), ree_array.values()->length());
} else {
ASSERT_EQ(array->null_count(), length);
for (int64_t i = 0; i < length; ++i) {
ASSERT_TRUE(array->IsNull(i));
ASSERT_FALSE(array->IsValid(i));
}
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
}
for (int64_t i = 0; i < length; ++i) {
ASSERT_TRUE(array->IsNull(i));
ASSERT_FALSE(array->IsValid(i));
}
}
}
Expand Down Expand Up @@ -482,35 +519,45 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar)
std::unique_ptr<arrow::ArrayBuilder> builder;
auto null_scalar = MakeNullScalar(scalar->type);
ASSERT_OK(MakeBuilderExactIndex(pool, scalar->type, &builder));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*null_scalar));
ASSERT_OK(builder->AppendScalars({scalar, null_scalar}));
ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2));
ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2));
ASSERT_OK(builder->AppendScalar(*scalar)); // [0] = scalar
ASSERT_OK(builder->AppendScalar(*scalar)); // [1] = scalar
ASSERT_OK(builder->AppendScalar(*null_scalar)); // [2] = NULL
ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); // [3, 4] = {scalar, NULL}
ASSERT_OK(
builder->AppendScalar(*scalar, /*n_repeats=*/2)); // [5, 6] = {scalar, scalar}
ASSERT_OK(
builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); // [7, 8] = {NULL, NULL}

std::shared_ptr<Array> out;
FinishAndCheckPadding(builder.get(), &out);
ASSERT_OK(out->ValidateFull());
AssertTypeEqual(scalar->type, out->type());
ASSERT_EQ(out->length(), 9);

const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id());
auto out_type_id = out->type()->id();
const bool has_validity = internal::HasValidityBitmap(out_type_id);
// For a dictionary builder, the output dictionary won't necessarily be the same
const bool can_check_values = !is_dictionary(out->type()->id());
const bool can_check_values = !is_dictionary(out_type_id);

if (can_check_nulls) {
if (has_validity) {
ASSERT_EQ(out->null_count(), 4);
} else {
ASSERT_EQ(out->null_count(), 0);
}
if (scalar->is_valid) {
ASSERT_EQ(out->ComputeLogicalNullCount(), 4);
} else {
ASSERT_EQ(out->ComputeLogicalNullCount(), 9);
}

for (const auto index : {0, 1, 3, 5, 6}) {
ASSERT_FALSE(out->IsNull(index));
ASSERT_NE(out->IsNull(index), scalar->is_valid);
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
ASSERT_OK(scalar_i->ValidateFull());
if (can_check_values) AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
}
for (const auto index : {2, 4, 7, 8}) {
ASSERT_EQ(out->IsNull(index), can_check_nulls);
ASSERT_TRUE(out->IsNull(index));
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
ASSERT_OK(scalar_i->ValidateFull());
AssertScalarsEqual(*null_scalar, *scalar_i, /*verbose=*/true);
Expand Down
16 changes: 12 additions & 4 deletions cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,13 @@ class BaseListBuilder : public ArrayBuilder {
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
int64_t length) override {
const offset_type* offsets = array.GetValues<offset_type>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
if (is_valid) {
ARROW_RETURN_NOT_OK(Append());
int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0],
Expand Down Expand Up @@ -301,9 +305,13 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
int64_t length) override {
const int32_t* offsets = array.GetValues<int32_t>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
if (is_valid) {
ARROW_RETURN_NOT_OK(Append());
const int64_t slot_length = offsets[row + 1] - offsets[row];
// Add together the inner StructArray offset to the Map/List offset
Expand Down
93 changes: 93 additions & 0 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ree_util.h"
#include "arrow/util/slice_util_internal.h"
#include "arrow/util/union_util.h"

namespace arrow {

Expand All @@ -60,6 +62,38 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length,
}
}

namespace internal {

bool IsNullSparseUnion(const ArrayData& data, int64_t i) {
auto* union_type = checked_cast<const SparseUnionType*>(data.type.get());
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
const int child_id = union_type->child_ids()[types[data.offset + i]];
return data.child_data[child_id]->IsNull(i);
}

bool IsNullDenseUnion(const ArrayData& data, int64_t i) {
auto* union_type = checked_cast<const DenseUnionType*>(data.type.get());
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
const int child_id = union_type->child_ids()[types[data.offset + i]];
const auto* offsets = reinterpret_cast<const int32_t*>(data.buffers[2]->data());
const int64_t child_offset = offsets[data.offset + i];
return data.child_data[child_id]->IsNull(child_offset);
}

bool IsNullRunEndEncoded(const ArrayData& data, int64_t i) {
return ArraySpan(data).IsNullRunEndEncoded(i);
}

bool UnionMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

} // namespace internal

std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count, int64_t offset) {
Expand Down Expand Up @@ -132,6 +166,13 @@ int64_t ArrayData::GetNullCount() const {
return precomputed;
}

int64_t ArrayData::ComputeLogicalNullCount() const {
if (this->buffers[0]) {
return GetNullCount();
}
return ArraySpan(*this).ComputeLogicalNullCount();
}

// ----------------------------------------------------------------------
// Methods for ArraySpan

Expand Down Expand Up @@ -407,6 +448,20 @@ int64_t ArraySpan::GetNullCount() const {
return precomputed;
}

int64_t ArraySpan::ComputeLogicalNullCount() const {
const auto t = this->type->id();
if (t == Type::SPARSE_UNION) {
return union_util::LogicalSparseUnionNullCount(*this);
}
if (t == Type::DENSE_UNION) {
return union_util::LogicalDenseUnionNullCount(*this);
}
if (t == Type::RUN_END_ENCODED) {
return ree_util::LogicalNullCount(*this);
}
return GetNullCount();
}

int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); }

std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
Expand Down Expand Up @@ -445,6 +500,44 @@ std::shared_ptr<Array> ArraySpan::ToArray() const {
return MakeArray(this->ToArrayData());
}

bool ArraySpan::IsNullSparseUnion(int64_t i) const {
auto* union_type = checked_cast<const SparseUnionType*>(this->type);
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
const int child_id = union_type->child_ids()[types[this->offset + i]];
return this->child_data[child_id].IsNull(i);
}

bool ArraySpan::IsNullDenseUnion(int64_t i) const {
auto* union_type = checked_cast<const DenseUnionType*>(this->type);
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
const auto* offsets = reinterpret_cast<const int32_t*>(this->buffers[2].data);
const int64_t child_id = union_type->child_ids()[types[this->offset + i]];
const int64_t child_offset = offsets[this->offset + i];
return this->child_data[child_id].IsNull(child_offset);
}

bool ArraySpan::IsNullRunEndEncoded(int64_t i) const {
const auto& values = ree_util::ValuesArray(*this);
if (values.MayHaveLogicalNulls()) {
const int64_t physical_offset = ree_util::FindPhysicalIndex(*this, i, this->offset);
return ree_util::ValuesArray(*this).IsNull(physical_offset);
}
return false;
}

bool ArraySpan::UnionMayHaveLogicalNulls() const {
for (auto& child : this->child_data) {
if (child.MayHaveLogicalNulls()) {
return true;
}
}
return false;
}

bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
}

// ----------------------------------------------------------------------
// Implement internal::GetArrayView

Expand Down
Loading

0 comments on commit 47d5580

Please sign in to comment.