Skip to content

Commit ed65e24

Browse files
committed
Add clustered Iceberg writer mode.
# Conflicts: # velox/connectors/hive/HiveConfig.cpp # velox/connectors/hive/HiveConfig.h
1 parent 8953067 commit ed65e24

File tree

8 files changed

+389
-40
lines changed

8 files changed

+389
-40
lines changed

velox/connectors/hive/HiveConfig.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,4 +273,9 @@ std::string HiveConfig::schema(const config::ConfigBase* session) const {
273273
kSchema, config_->get<std::string>(kSchema, ""));
274274
}
275275

276+
bool HiveConfig::fanoutEnabled(const config::ConfigBase* session) const {
277+
return session->get<bool>(
278+
kFanoutEnabledSession, config_->get<bool>(kFanoutEnabled, true));
279+
}
280+
276281
} // namespace facebook::velox::connector::hive

velox/connectors/hive/HiveConfig.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,12 @@ class HiveConfig {
210210
static constexpr const char* kSource = "source";
211211
static constexpr const char* kSchema = "schema";
212212

213+
/// Controls the writer mode: whether the fanout mode writer is enabled.
214+
/// The default value is true; setting it to false selects clustered mode.
215+
/// Currently applies only to the Iceberg writer.
216+
static constexpr const char* kFanoutEnabled = "fanout-enabled";
217+
static constexpr const char* kFanoutEnabledSession = "fanout_enabled";
218+
213219
InsertExistingPartitionsBehavior insertExistingPartitionsBehavior(
214220
const config::ConfigBase* session) const;
215221

@@ -307,6 +313,9 @@ class HiveConfig {
307313
/// Schema of the query. Used for storage logging.
308314
std::string schema(const config::ConfigBase* session) const;
309315

316+
/// Returns true if the fanout writer mode is enabled.
317+
bool fanoutEnabled(const config::ConfigBase* session) const;
318+
310319
HiveConfig(std::shared_ptr<const config::ConfigBase> config) {
311320
VELOX_CHECK_NOT_NULL(
312321
config, "Config is null for HiveConfig initialization");

velox/connectors/hive/iceberg/IcebergDataSink.cpp

Lines changed: 143 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ namespace facebook::velox::connector::hive::iceberg {
3131

3232
namespace {
3333

34+
constexpr std::string_view kNotClusteredRowsErrorMsg =
35+
"Incoming records violate the writer assumption that records are clustered by spec and \n by partition within each spec. Either cluster the incoming records or switch to fanout writers.\nEncountered records that belong to already closed files:\n";
36+
3437
#define WRITER_NON_RECLAIMABLE_SECTION_GUARD(index) \
3538
memory::NonReclaimableSectionGuard nonReclaimableGuard( \
3639
writerInfo_[(index)]->nonReclaimableSectionHolder.get())
@@ -208,7 +211,10 @@ IcebergDataSink::IcebergDataSink(
208211
insertTableHandle->columnTransforms(),
209212
hiveConfig->isPartitionPathAsLowerCase(
210213
connectorQueryCtx->sessionProperties()))
211-
: nullptr) {
214+
: nullptr),
215+
fanoutEnabled_(
216+
hiveConfig_->fanoutEnabled(connectorQueryCtx_->sessionProperties())),
217+
currentWriterId_(0) {
212218
if (isPartitioned()) {
213219
partitionData_.resize(maxOpenWriters_);
214220
}
@@ -325,8 +331,6 @@ std::vector<std::string> IcebergDataSink::commitMessage() const {
325331
}
326332

327333
void IcebergDataSink::splitInputRowsAndEnsureWriters(RowVectorPtr input) {
328-
VELOX_CHECK(isPartitioned());
329-
330334
std::fill(partitionSizes_.begin(), partitionSizes_.end(), 0);
331335

332336
const auto numRows = partitionIds_.size();
@@ -339,26 +343,7 @@ void IcebergDataSink::splitInputRowsAndEnsureWriters(RowVectorPtr input) {
339343
if (!partitionData_[index].empty()) {
340344
continue;
341345
}
342-
343-
std::vector<folly::dynamic> partitionValues(partitionChannels_.size());
344-
auto icebergPartitionIdGenerator =
345-
dynamic_cast<const IcebergPartitionIdGenerator*>(
346-
partitionIdGenerator_.get());
347-
VELOX_CHECK_NOT_NULL(icebergPartitionIdGenerator);
348-
const RowVectorPtr transformedValues =
349-
icebergPartitionIdGenerator->partitionValues();
350-
for (auto i = 0; i < partitionChannels_.size(); ++i) {
351-
auto block = transformedValues->childAt(i);
352-
if (block->isNullAt(index)) {
353-
partitionValues[i] = nullptr;
354-
} else {
355-
DecodedVector decoded(*block);
356-
partitionValues[i] = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
357-
extractPartitionValue, block->typeKind(), &decoded, index);
358-
}
359-
}
360-
361-
partitionData_[index] = partitionValues;
346+
buildPartitionData(index);
362347
}
363348

364349
for (auto i = 0; i < partitionSizes_.size(); ++i) {
@@ -369,6 +354,11 @@ void IcebergDataSink::splitInputRowsAndEnsureWriters(RowVectorPtr input) {
369354
}
370355
}
371356

357+
void IcebergDataSink::computePartition(const RowVectorPtr& input) {
358+
VELOX_CHECK(isPartitioned());
359+
partitionIdGenerator_->run(input, partitionIds_);
360+
}
361+
372362
void IcebergDataSink::appendData(RowVectorPtr input) {
373363
checkRunning();
374364
if (!isPartitioned()) {
@@ -377,22 +367,79 @@ void IcebergDataSink::appendData(RowVectorPtr input) {
377367
return;
378368
}
379369

380-
// Compute partition and bucket numbers.
381-
computePartitionAndBucketIds(input);
370+
computePartition(input);
382371

383-
splitInputRowsAndEnsureWriters(input);
372+
if (fanoutEnabled_) {
373+
splitInputRowsAndEnsureWriters(input);
384374

385-
for (auto index = 0; index < writers_.size(); ++index) {
386-
const vector_size_t partitionSize = partitionSizes_[index];
387-
if (partitionSize == 0) {
388-
continue;
375+
for (auto index = 0; index < writers_.size(); ++index) {
376+
const vector_size_t partitionSize = partitionSizes_[index];
377+
if (partitionSize == 0) {
378+
continue;
379+
}
380+
381+
const RowVectorPtr writerInput = partitionSize == input->size()
382+
? input
383+
: exec::wrap(partitionSize, partitionRows_[index], input);
384+
write(index, writerInput);
385+
}
386+
} else { // Clustered mode.
387+
std::fill(partitionSizes_.begin(), partitionSizes_.end(), 0);
388+
const auto numRows = input->size();
389+
uint32_t index = 0;
390+
for (auto row = 0; row < numRows; ++row) {
391+
auto id = getIcebergWriterId(row);
392+
index = ensureWriter(id);
393+
if (currentWriterId_ != index) {
394+
clusteredWrite(input, currentWriterId_);
395+
closeWriter(currentWriterId_);
396+
completedWriterIds_.insert(currentWriterId_);
397+
VELOX_USER_CHECK_EQ(
398+
completedWriterIds_.count(index),
399+
0,
400+
"{}",
401+
kNotClusteredRowsErrorMsg);
402+
currentWriterId_ = index;
403+
}
404+
updatePartitionRows(index, numRows, row);
405+
buildPartitionData(index);
389406
}
407+
clusteredWrite(input, index);
408+
}
409+
}
390410

391-
const RowVectorPtr writerInput = partitionSize == input->size()
392-
? input
393-
: exec::wrap(partitionSize, partitionRows_[index], input);
394-
write(index, writerInput);
411+
void IcebergDataSink::buildPartitionData(int32_t index) {
412+
std::vector<folly::dynamic> partitionValues(partitionChannels_.size());
413+
auto icebergPartitionIdGenerator =
414+
dynamic_cast<const IcebergPartitionIdGenerator*>(
415+
partitionIdGenerator_.get());
416+
VELOX_CHECK_NOT_NULL(icebergPartitionIdGenerator);
417+
const RowVectorPtr transformedValues =
418+
icebergPartitionIdGenerator->partitionValues();
419+
for (auto i = 0; i < partitionChannels_.size(); ++i) {
420+
auto block = transformedValues->childAt(i);
421+
if (block->isNullAt(index)) {
422+
partitionValues[i] = nullptr;
423+
} else {
424+
DecodedVector decoded(*block);
425+
partitionValues[i] = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
426+
extractPartitionValue, block->typeKind(), &decoded, index);
427+
}
395428
}
429+
partitionData_[index] = partitionValues;
430+
}
431+
432+
void IcebergDataSink::clusteredWrite(RowVectorPtr input, int32_t writerIdx) {
433+
if (partitionSizes_[writerIdx] != 0) {
434+
VELOX_CHECK_NOT_NULL(partitionRows_[writerIdx]);
435+
partitionRows_[writerIdx]->setSize(
436+
partitionSizes_[writerIdx] * sizeof(vector_size_t));
437+
}
438+
const vector_size_t partitionSize = partitionSizes_[writerIdx];
439+
const RowVectorPtr writerInput = partitionSize == input->size()
440+
? input
441+
: exec::wrap(partitionSize, partitionRows_[writerIdx], input);
442+
write(writerIdx, writerInput);
396443
}
397444

398445
HiveWriterId IcebergDataSink::getIcebergWriterId(size_t row) const {
@@ -463,9 +510,11 @@ void IcebergDataSink::closeInternal() {
463510

464511
if (state_ == State::kClosed) {
465512
for (int i = 0; i < writers_.size(); ++i) {
466-
WRITER_NON_RECLAIMABLE_SECTION_GUARD(i);
467-
writers_[i]->close();
468-
dataFileStats_.push_back(writers_[i]->dataFileStats());
513+
if (writers_[i]) {
514+
WRITER_NON_RECLAIMABLE_SECTION_GUARD(i);
515+
writers_[i]->close();
516+
dataFileStats_.push_back(writers_[i]->dataFileStats());
517+
}
469518
}
470519
} else {
471520
for (int i = 0; i < writers_.size(); ++i) {
@@ -475,6 +524,63 @@ void IcebergDataSink::closeInternal() {
475524
}
476525
}
477526

527+
void IcebergDataSink::closeWriter(int32_t index) {
528+
common::testutil::TestValue::adjust(
529+
"facebook::velox::connector::hive::iceberg::IcebergDataSink::closeWriter",
530+
this);
531+
532+
if (writers_[index]) {
533+
WRITER_NON_RECLAIMABLE_SECTION_GUARD(index);
534+
if (sortWrite()) {
535+
finishWriter(index);
536+
}
537+
writers_[index]->close();
538+
dataFileStats_.push_back(writers_[index]->dataFileStats());
539+
writers_[index] = nullptr;
540+
}
541+
}
542+
543+
bool IcebergDataSink::finishWriter(int32_t index) {
544+
if (!sortWrite()) {
545+
return true;
546+
}
547+
548+
if (writers_[index]) {
549+
const uint64_t startTimeMs = getCurrentTimeMs();
550+
if (!writers_[index]->finish()) {
551+
return false;
552+
}
553+
if (getCurrentTimeMs() - startTimeMs > sortWriterFinishTimeSliceLimitMs_) {
554+
return false;
555+
}
556+
}
557+
return true;
558+
}
559+
560+
bool IcebergDataSink::finish() {
561+
// Finishing is a re-entrant state.
562+
setState(State::kFinishing);
563+
564+
// For now, only the sorted writer needs to flush buffered data. For non-sorted
565+
// writers, data is written directly to the underlying file writer.
566+
if (!sortWrite()) {
567+
return true;
568+
}
569+
570+
// TODO: we might refactor to move the data sorting logic into hive data sink.
571+
const uint64_t startTimeMs = getCurrentTimeMs();
572+
for (auto i = 0; i < writers_.size(); ++i) {
573+
WRITER_NON_RECLAIMABLE_SECTION_GUARD(i);
574+
if (writers_[i] && !writers_[i]->finish()) {
575+
return false;
576+
}
577+
if (getCurrentTimeMs() - startTimeMs > sortWriterFinishTimeSliceLimitMs_) {
578+
return false;
579+
}
580+
}
581+
return true;
582+
}
583+
478584
std::unique_ptr<facebook::velox::dwio::common::Writer>
479585
IcebergDataSink::maybeCreateBucketSortWriter(
480586
std::unique_ptr<facebook::velox::dwio::common::Writer> writer) {

velox/connectors/hive/iceberg/IcebergDataSink.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ class IcebergDataSink : public HiveDataSink {
9191
return dataFileStats_;
9292
}
9393

94+
bool finish() override;
95+
9496
private:
9597
IcebergDataSink(
9698
RowTypePtr inputType,
@@ -103,6 +105,8 @@ class IcebergDataSink : public HiveDataSink {
103105

104106
void splitInputRowsAndEnsureWriters(RowVectorPtr input) override;
105107

108+
void computePartition(const RowVectorPtr& input);
109+
106110
std::vector<std::string> commitMessage() const override;
107111

108112
HiveWriterId getIcebergWriterId(size_t row) const;
@@ -116,8 +120,16 @@ class IcebergDataSink : public HiveDataSink {
116120
std::unique_ptr<dwio::common::Writer> maybeCreateBucketSortWriter(
117121
std::unique_ptr<dwio::common::Writer> writer) override;
118122

123+
void buildPartitionData(int32_t index);
124+
125+
void clusteredWrite(RowVectorPtr input, int32_t writerIdx);
126+
119127
void closeInternal() override;
120128

129+
void closeWriter(int32_t index);
130+
131+
bool finishWriter(int32_t index);
132+
121133
// Below are structures for partitions from all inputs. partitionData_
122134
// is indexed by partitionId.
123135
std::vector<std::vector<folly::dynamic>> partitionData_;
@@ -127,6 +139,11 @@ class IcebergDataSink : public HiveDataSink {
127139
std::vector<std::unique_ptr<dwio::common::DataFileStatsSettings>>>
128140
statsSettings_;
129141
std::unique_ptr<DataFileStatsCollector> icebergStatsCollector_;
142+
143+
// Below are structures for the clustered-mode writer.
144+
const bool fanoutEnabled_;
145+
uint32_t currentWriterId_;
146+
std::unordered_set<uint32_t> completedWriterIds_;
130147
};
131148

132149
} // namespace facebook::velox::connector::hive::iceberg

velox/connectors/hive/iceberg/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ if(NOT VELOX_DISABLE_GOOGLETEST)
4949
IcebergTestBase.cpp
5050
IcebergTransformE2ETest.cpp
5151
IcebergTransformUnitTest.cpp
52+
IcebergWriterModeTest.cpp
5253
Main.cpp
5354
Murmur3Test.cpp
5455
)

velox/connectors/hive/iceberg/tests/IcebergTestBase.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ class IcebergTestBase : public exec::test::HiveConnectorTestBase {
6262
const std::vector<PartitionField>& transformSpecs,
6363
const RowTypePtr& rowType);
6464

65+
void setupMemoryPools(const std::string& name);
66+
6567
private:
6668
std::shared_ptr<IcebergInsertTableHandle> createIcebergInsertTableHandle(
6769
const RowTypePtr& rowType,
@@ -72,11 +74,10 @@ class IcebergTestBase : public exec::test::HiveConnectorTestBase {
7274
std::vector<std::string> listPartitionDirectories(
7375
const std::string& dataPath);
7476

75-
void setupMemoryPools(const std::string& name);
76-
7777
protected:
7878
RowTypePtr rowType_;
7979
std::shared_ptr<memory::MemoryPool> opPool_;
80+
std::shared_ptr<config::ConfigBase> connectorSessionProperties_;
8081

8182
private:
8283
static constexpr const char* kHiveConnectorId = "test-hive";
@@ -86,7 +87,6 @@ class IcebergTestBase : public exec::test::HiveConnectorTestBase {
8687

8788
std::shared_ptr<memory::MemoryPool> root_;
8889
std::shared_ptr<memory::MemoryPool> connectorPool_;
89-
std::shared_ptr<config::ConfigBase> connectorSessionProperties_;
9090
std::shared_ptr<HiveConfig> connectorConfig_;
9191
std::unique_ptr<ConnectorQueryCtx> connectorQueryCtx_;
9292
VectorFuzzer::Options fuzzerOptions_;

0 commit comments

Comments
 (0)