Skip to content

Commit

Permalink
apacheGH-32439: [Python] Fix off by one bug when chunking nested stru…
Browse files Browse the repository at this point in the history
…cts (apache#37376)

### Rationale for this change

See: apache#32439 

### What changes are included in this PR?

During conversion from Python to Arrow, when a struct's child hits a capacity error and chunking is triggered, this can leave the Finish'd chunk in an invalid state, since the struct's length does not match the lengths of its children.

This change simply tries to Append the children first, and only if successful will Append the struct. This is safe because the order of Append'ing between the struct and its child is not specified. It is only specified that they must be consistent with each other.

This is per: 

https://github.com/apache/arrow/blob/86b7a84c9317fa08222eb63f6930bbb54c2e6d0b/cpp/src/arrow/array/builder_nested.h#L507-L508

### Are these changes tested?

A unit test is added that would previously have raised an invalid data error.

```
>       tab = pa.Table.from_pandas(df)

pyarrow/tests/test_pandas.py:4970: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
pyarrow/table.pxi:3788: in pyarrow.lib.Table.from_pandas
    return cls.from_arrays(arrays, schema=schema)
pyarrow/table.pxi:3890: in pyarrow.lib.Table.from_arrays
    result.validate()
pyarrow/table.pxi:3170: in pyarrow.lib.Table.validate
    check_status(self.table.Validate())

# ...

FAILED pyarrow/tests/test_pandas.py::test_nested_chunking_valid - pyarrow.lib.ArrowInvalid: Column 0: In chunk 0: Invalid: List child array invalid: Invalid: Struct child array #0 has length smaller than expected for struct array (2 < 3)
```

NOTE: This unit test uses about 7GB of memory (max RSS) on my macbook pro. This might make CI challenging; I'm open to suggestions to limit it.

### Are there any user-facing changes?

No
* Closes: apache#32439

Lead-authored-by: Mike Lui <mikelui@meta.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
2 people authored and Jeremy Aguilon committed Oct 23, 2023
1 parent 58678f8 commit 2c70793
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 6 deletions.
16 changes: 10 additions & 6 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendDict(value);
RETURN_NOT_OK(AppendDict(value));
return this->struct_builder_->Append();
case InputKind::TUPLE:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendTuple(value);
RETURN_NOT_OK(AppendTuple(value));
return this->struct_builder_->Append();
case InputKind::ITEMS:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendItems(value);
RETURN_NOT_OK(AppendItems(value));
return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
Expand All @@ -944,6 +944,10 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
Status Init(MemoryPool* pool) override {
RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));

// This implementation will check the child values before appending itself,
// so no rewind is necessary
this->rewind_on_overflow_ = false;

// Store the field names as a PyObjects for dict matching
num_fields_ = this->struct_type_->num_fields();
bytes_field_names_.reset(PyList_New(num_fields_));
Expand Down
44 changes: 44 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1691,6 +1691,7 @@ def test_auto_chunking_pandas_series_of_strings(self, char):
'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
})
arr = pa.array(df['strings'], from_pandas=True)
arr.validate(full=True)
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks == 2
assert len(arr.chunk(0)) == 21
Expand Down Expand Up @@ -2381,6 +2382,7 @@ def test_auto_chunking_on_list_overflow(self):
"b": range(n)
})
table = pa.Table.from_pandas(df)
table.validate(full=True)

column_a = table[0]
assert column_a.num_chunks == 2
Expand Down Expand Up @@ -2623,6 +2625,7 @@ def test_from_numpy_large(self):
ty = pa.struct([pa.field('x', pa.float64()),
pa.field('y', pa.binary())])
arr = pa.array(data, type=ty, from_pandas=True)
arr.validate(full=True)
assert arr.num_chunks == 2

def iter_chunked_array(arr):
Expand Down Expand Up @@ -2655,6 +2658,7 @@ def check(arr, data, mask=None):
# Now with explicit mask
mask = np.random.random_sample(n) < 0.2
arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
arr.validate(full=True)
assert arr.num_chunks == 2

check(arr, data, mask)
Expand Down Expand Up @@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():

def assert_roundtrip(series: pd.Series, data) -> None:
array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty))
array_roundtrip.validate(full=True)
assert data.equals(array_roundtrip)

assert_roundtrip(series_default, chunked_array)
Expand Down Expand Up @@ -4944,3 +4949,42 @@ def test_array_conversion_for_datetime():

result = arr.to_pandas()
tm.assert_series_equal(result, series)


@pytest.mark.large_memory
def test_nested_chunking_valid():
    # GH-32439: Chunking can cause arrays to be in invalid state
    # when nested types are involved.
    # Here we simply ensure we validate correctly.

    def roundtrip(df, schema=None):
        # Convert to Arrow and run full validation; before the GH-32439 fix
        # this raised ArrowInvalid (struct child shorter than the struct).
        tab = pa.Table.from_pandas(df, schema=schema)
        tab.validate(full=True)
        # we expect to trigger chunking internally
        # an assertion failure here may just mean this threshold has changed
        num_chunks = tab.column(0).num_chunks
        assert num_chunks > 1
        # self_destruct frees Arrow memory during conversion to keep peak
        # RSS down; maps_as_pydicts="strict" makes map values round-trip as
        # plain dicts so the frame compares equal to the input.
        tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
                                            maps_as_pydicts="strict"), df)

    # ~720 MB payload; three copies exceed the 2 GiB binary-array capacity
    # so conversion must chunk. NOTE(review): peak RSS is ~7GB per the PR
    # description — sized to force chunking, confirm before shrinking.
    x = b"0" * 720000000
    roundtrip(pd.DataFrame({"strings": [x, x, x]}))

    # Struct whose child overflows mid-append (the original bug scenario).
    struct = {"struct_field": x}
    roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))

    # List whose values overflow.
    lists = [x]
    roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))

    # List-of-structs nesting.
    los = [struct]
    roundtrip(pd.DataFrame({"los": [los, los, los]}))

    # Struct-of-lists nesting.
    sol = {"struct_field": lists}
    roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))

    # Map of list-of-structs; an explicit schema is required since map
    # types are not inferred from Python dicts here.
    map_of_los = {"a": los}
    map_type = pa.map_(pa.string(),
                       pa.list_(pa.struct([("struct_field", pa.binary())])))
    schema = pa.schema([("maps", map_type)])
    roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
              schema=schema)

0 comments on commit 2c70793

Please sign in to comment.