diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index d7e094f79841..142f1017287f 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -619,12 +619,18 @@ void PageReader::makeDecoder() { pageData_, pageData_ + encodedDataSize_); break; case thrift::Type::FIXED_LEN_BYTE_ARRAY: - directDecoder_ = std::make_unique>( - std::make_unique( - pageData_, encodedDataSize_), - false, - type_->typeLength_, - true); + if (type_->type()->isVarbinary()) { + stringDecoder_ = std::make_unique( + pageData_, pageData_ + encodedDataSize_, type_->typeLength_); + } else { + directDecoder_ = + std::make_unique>( + std::make_unique( + pageData_, encodedDataSize_), + false, + type_->typeLength_, + true); + } break; default: { directDecoder_ = std::make_unique>( diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 818d5004c8ff..0cfd4b9882e2 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -646,7 +646,7 @@ TypePtr ReaderBase::convertType( case thrift::Type::type::INT64: return BIGINT(); case thrift::Type::type::INT96: - return DOUBLE(); // TODO: Lose precision + return TIMESTAMP(); // INT96 only maps to a timestamp case thrift::Type::type::FLOAT: return REAL(); case thrift::Type::type::DOUBLE: diff --git a/velox/dwio/parquet/reader/StringDecoder.h b/velox/dwio/parquet/reader/StringDecoder.h index f85a1ba1e11f..cfbb1589ba51 100644 --- a/velox/dwio/parquet/reader/StringDecoder.h +++ b/velox/dwio/parquet/reader/StringDecoder.h @@ -20,11 +20,11 @@ namespace facebook::velox::parquet { class StringDecoder { public: - StringDecoder(const char* start, const char* end) + StringDecoder(const char* start, const char* end, int fixedLength = -1) : bufferStart_(start), bufferEnd_(end), - - lastSafeWord_(end - simd::kPadding) {} + lastSafeWord_(end - simd::kPadding), + fixedLength_(fixedLength) {} void skip(uint64_t numValues) { skip(numValues, 0, nullptr); @@ -62,7 +62,8 @@ class StringDecoder { } // We are at a non-null value on a row to visit. - toSkip = visitor.process(readString(), atEnd); + toSkip = visitor.process( + fixedLength_ > 0 ? readFixedString() : readString(), atEnd); } ++current; if (toSkip) { @@ -85,9 +86,16 @@ class StringDecoder { bufferStart_ += length + sizeof(int32_t); return folly::StringPiece(bufferStart_ - length, length); } + + folly::StringPiece readFixedString() { + bufferStart_ += fixedLength_; + return folly::StringPiece(bufferStart_ - fixedLength_, fixedLength_); + } + const char* bufferStart_; const char* bufferEnd_; const char* const lastSafeWord_; + const int fixedLength_; }; } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/tests/examples/varbinary_flba.parquet b/velox/dwio/parquet/tests/examples/varbinary_flba.parquet new file mode 100644 index 000000000000..befd342fc2a0 Binary files /dev/null and b/velox/dwio/parquet/tests/examples/varbinary_flba.parquet differ diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index 43975343681a..5488c0c7d3d9 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -1128,3 +1128,31 @@ TEST_F(ParquetReaderTest, testEnumType) { assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_); } + +TEST_F(ParquetReaderTest, readVarbinaryFromFLBA) { + const std::string filename("varbinary_flba.parquet"); + const std::string sample(getExampleFilePath(filename)); + + facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto reader = createReader(sample, readerOptions); + + auto type = reader->typeWithId(); + EXPECT_EQ(type->size(), 8ULL); + auto flbaCol = + std::static_pointer_cast(type->childAt(6)); + EXPECT_EQ(flbaCol->name_, "flba_field"); + EXPECT_EQ(flbaCol->parquetType_, thrift::Type::FIXED_LEN_BYTE_ARRAY); + + auto selectedType = ROW({"flba_field"}, {VARBINARY()}); + auto rowReaderOpts = getReaderOpts(selectedType); + rowReaderOpts.setScanSpec(makeScanSpec(selectedType)); + auto rowReader = reader->createRowReader(rowReaderOpts); + + auto expected = std::string(1024, '*'); + VectorPtr result = BaseVector::create(selectedType, 0, &(*leafPool_)); + rowReader->next(1, result); + EXPECT_EQ( + expected, + result->as()->childAt(0)->asFlatVector()->valueAt( + 0)); +}