Skip to content

Commit

Permalink
apacheGH-36882: [C++][Parquet] Use RLE as BOOLEAN default encoding wh…
Browse files Browse the repository at this point in the history
…en both data page and version is V2 (apache#38163)

### Rationale for this change

Only use RLE as BOOLEAN default encoding when data page is V2.

Previous patch ( apache#36955 ) set RLE encoding for Boolean type by default.  However, parquet-cpp might write format v2 file with page v1 by default. This might cause parquet-cpp generating RLE encoding for boolean type by default. As https://issues.apache.org/jira/browse/PARQUET-2222 says, we still need some talks about that. So, we:

1. Still allow writing RLE on DataPage V2. This keeps same as parquet rust
2. If DataPage V1 is used, don't use RLE as default Boolean encoding.

### What changes are included in this PR?

Only use RLE as BOOLEAN default encoding when both data page and version is V2.

### Are these changes tested?

Yes

### Are there any user-facing changes?

RLE encoding change for Boolean.

* Closes: apache#36882

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
2 people authored and Jeremy Aguilon committed Oct 23, 2023
1 parent 554d663 commit 21ed8a3
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 39 deletions.
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
Encoding::type encoding = properties->encoding(descr->path());
if (encoding == Encoding::UNKNOWN) {
encoding = (descr->physical_type() == Type::BOOLEAN &&
properties->version() != ParquetVersion::PARQUET_1_0)
properties->version() != ParquetVersion::PARQUET_1_0 &&
properties->data_page_version() == ParquetDataPageVersion::V2)
? Encoding::RLE
: Encoding::PLAIN;
}
Expand Down
78 changes: 53 additions & 25 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,11 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
int64_t output_size = SMALL_SIZE,
const ColumnProperties& column_properties = ColumnProperties(),
const ParquetVersion::type version = ParquetVersion::PARQUET_1_0,
const ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1,
bool enable_checksum = false) {
sink_ = CreateOutputStream();
WriterProperties::Builder wp_builder;
wp_builder.version(version);
wp_builder.version(version)->data_page_version(data_page_version);
if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY ||
column_properties.encoding() == Encoding::RLE_DICTIONARY) {
wp_builder.enable_dictionary();
Expand Down Expand Up @@ -176,7 +177,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum));
}

void TestDictionaryFallbackEncoding(ParquetVersion::type version) {
void TestDictionaryFallbackEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version) {
this->GenerateData(VERY_LARGE_SIZE);
ColumnProperties column_properties;
column_properties.set_dictionary_enabled(true);
Expand All @@ -187,7 +189,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
column_properties.set_encoding(Encoding::RLE_DICTIONARY);
}

auto writer = this->BuildWriter(VERY_LARGE_SIZE, column_properties, version);
auto writer =
this->BuildWriter(VERY_LARGE_SIZE, column_properties, version, data_page_version);

writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
writer->Close();
Expand All @@ -204,13 +207,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
std::set<Encoding::type> expected;
if (version == ParquetVersion::PARQUET_1_0) {
if (version != ParquetVersion::PARQUET_1_0 &&
data_page_version == ParquetDataPageVersion::V2) {
// There is only 1 encoding (RLE) in a fallback case for version 2.0 and data page
// v2 enabled.
expected = {Encoding::RLE};
} else {
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for
// version 1.0. Note that RLE is used for DL/RL.
// version 1.0 or data page v1. Note that RLE is used for DL/RL.
expected = {Encoding::PLAIN, Encoding::RLE};
} else {
// There is only 1 encoding (RLE) in a fallback case for version 2.0
expected = {Encoding::RLE};
}
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
Expand All @@ -231,7 +236,10 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
this->metadata_encoding_stats();
if (this->type_num() == Type::BOOLEAN) {
ASSERT_EQ(encoding_stats[0].encoding,
version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE);
version != ParquetVersion::PARQUET_1_0 &&
data_page_version == ParquetDataPageVersion::V2
? Encoding::RLE
: Encoding::PLAIN);
ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
} else if (version == ParquetVersion::PARQUET_1_0) {
std::vector<Encoding::type> expected(
Expand Down Expand Up @@ -262,8 +270,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
writer->Close();
Expand All @@ -281,8 +290,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
enable_statistics);
column_properties.set_codec_options(
std::make_shared<CodecOptions>(compression_level));
std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(
num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1, enable_checksum);
writer->WriteBatchSpaced(this->values_.size(), nullptr, nullptr, valid_bits.data(), 0,
this->values_ptr_);
// The behaviour should be independent from the number of Close() calls
Expand Down Expand Up @@ -693,12 +703,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) {

// Test cases for dictionary fallback encoding
TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0,
ParquetDataPageVersion::V1);
}

TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) {
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4,
ParquetDataPageVersion::V1);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4,
ParquetDataPageVersion::V2);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6,
ParquetDataPageVersion::V1);
this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6,
ParquetDataPageVersion::V2);
}

TEST(TestWriter, NullValuesBuffer) {
Expand Down Expand Up @@ -767,10 +784,13 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) {

class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
public:
void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) {
void TestWithEncoding(ParquetVersion::type version,
ParquetDataPageVersion data_page_version,
const std::set<Encoding::type>& expected_encodings) {
this->SetUpSchema(Repetition::REQUIRED);
auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
/*enable_checksum*/ false);
auto writer =
this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, data_page_version,
/*enable_checksum*/ false);
for (int i = 0; i < SMALL_SIZE; i++) {
bool value = (i % 2 == 0) ? true : false;
writer->WriteBatch(1, nullptr, nullptr, &value);
Expand All @@ -780,21 +800,29 @@ class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
for (int i = 0; i < SMALL_SIZE; i++) {
ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
}
const auto& encodings = this->metadata_encodings();
auto iter = std::find(encodings.begin(), encodings.end(), encoding);
ASSERT_TRUE(iter != encodings.end());
auto metadata_encodings = this->metadata_encodings();
std::set<Encoding::type> metadata_encodings_set{metadata_encodings.begin(),
metadata_encodings.end()};
EXPECT_EQ(expected_encodings, metadata_encodings_set);
}
};

// PARQUET-764
// Correct bitpacking for boolean write at non-byte boundaries
TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
for (auto data_page_version :
{ParquetDataPageVersion::V1, ParquetDataPageVersion::V2}) {
TestWithEncoding(ParquetVersion::PARQUET_1_0, data_page_version,
{Encoding::PLAIN, Encoding::RLE});
}
}

// Default encoding for boolean is RLE when using V2 pages
// Default encoding for boolean is RLE when both V2 format and V2 pages enabled.
TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V1,
{Encoding::PLAIN, Encoding::RLE});
TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V2,
{Encoding::RLE});
}

// PARQUET-979
Expand Down
22 changes: 10 additions & 12 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <algorithm>
#include <cinttypes>
#include <memory>
#include <ostream>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -1501,8 +1502,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
thrift_encoding_stats.push_back(data_enc_stat);
add_encoding(data_encoding);
}
column_chunk_->meta_data.__set_encodings(thrift_encodings);
column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings));
column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats));

const auto& encrypt_md =
properties_->column_encryption_properties(column_->path()->ToDotString());
Expand All @@ -1521,7 +1522,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
}
column_chunk_->__set_crypto_metadata(ccmd);
column_chunk_->__set_crypto_metadata(std::move(ccmd));

bool encrypted_footer =
properties_->file_encryption_properties()->encrypted_footer();
Expand Down Expand Up @@ -1601,16 +1602,13 @@ std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(

ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
: impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
: impl_{std::make_unique<ColumnChunkMetaDataBuilderImpl>(std::move(props), column)} {}

ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
void* contents)
: impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
new ColumnChunkMetaDataBuilderImpl(
std::move(props), column,
reinterpret_cast<format::ColumnChunk*>(contents)))} {}
: impl_{std::make_unique<ColumnChunkMetaDataBuilderImpl>(
std::move(props), column, reinterpret_cast<format::ColumnChunk*>(contents))} {}

ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;

Expand Down Expand Up @@ -1782,7 +1780,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
key_value_metadata_(std::move(key_value_metadata)) {
if (properties_->file_encryption_properties() != nullptr &&
properties_->file_encryption_properties()->encrypted_footer()) {
crypto_metadata_.reset(new format::FileCryptoMetaData());
crypto_metadata_ = std::make_unique<format::FileCryptoMetaData>();
}
}

Expand Down Expand Up @@ -1956,8 +1954,8 @@ std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
FileMetaDataBuilder::FileMetaDataBuilder(
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
std::shared_ptr<const KeyValueMetadata> key_value_metadata)
: impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
schema, std::move(props), std::move(key_value_metadata)))} {}
: impl_{std::make_unique<FileMetaDataBuilderImpl>(schema, std::move(props),
std::move(key_value_metadata))} {}

FileMetaDataBuilder::~FileMetaDataBuilder() = default;

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_parquet_metadata_api():
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
assert set(col_meta.encodings) == {'RLE'}
assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0
Expand Down

0 comments on commit 21ed8a3

Please sign in to comment.