From aa3e7cb495304e8e1e92f562e55b8a5feb83aba7 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Oct 2019 13:15:23 +0300 Subject: [PATCH 1/8] Replace Array with ChunkedArray in hpat_parquet_reader.cpp --- parquet_reader/hpat_parquet_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index 665e1429e..ddde7541a 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -86,7 +86,7 @@ int64_t pq_get_size_single_file(std::shared_ptr arrow_reader, int64_ int64_t pq_read_single_file(std::shared_ptr arrow_reader, int64_t column_idx, uint8_t* out_data, int out_dtype) { - std::shared_ptr<::arrow::Array> arr; + std::shared_ptr<::arrow::ChunkedArray> arr; arrow_reader->ReadColumn(column_idx, &arr); if (arr == NULL) return 0; From 2fd340f695c93f47891058a7dc759f05d325a34b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Oct 2019 13:47:02 +0300 Subject: [PATCH 2/8] Use one Array from ChunkedArray. 
--- parquet_reader/hpat_parquet_reader.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index ddde7541a..1691193dd 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -86,11 +86,13 @@ int64_t pq_get_size_single_file(std::shared_ptr arrow_reader, int64_ int64_t pq_read_single_file(std::shared_ptr arrow_reader, int64_t column_idx, uint8_t* out_data, int out_dtype) { - std::shared_ptr<::arrow::ChunkedArray> arr; - arrow_reader->ReadColumn(column_idx, &arr); - if (arr == NULL) + std::shared_ptr<::arrow::ChunkedArray> chunked_array; + arrow_reader->ReadColumn(column_idx, &chunked_array); + if (chunked_array == NULL) return 0; + auto arr = chunked_array->chunk(0); + int64_t num_values = arr->length(); // std::cout << "arr: " << arr->ToString() << std::endl; std::shared_ptr arrow_type = get_arrow_type(arrow_reader, column_idx); From 24e970f026338ace5fd3d5b73910b8ddb4543ce7 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Oct 2019 14:08:29 +0300 Subject: [PATCH 3/8] Remove arrow::Column class --- parquet_reader/hpat_parquet_reader.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index 1691193dd..641587b9c 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -156,8 +156,7 @@ int pq_read_parallel_single_file(std::shared_ptr arrow_reader, /* -------- read row group ---------- */ std::shared_ptr<::arrow::Table> table; arrow_reader->ReadRowGroup(row_group_index, column_indices, &table); - std::shared_ptr<::arrow::Column> column = table->column(0); - std::shared_ptr<::arrow::ChunkedArray> chunked_arr = column->data(); + std::shared_ptr<::arrow::ChunkedArray> chunked_arr = table->column(0); // std::cout << chunked_arr->num_chunks() << std::endl; if 
(chunked_arr->num_chunks() != 1) { @@ -511,8 +510,7 @@ int pq_read_string_parallel_single_file(std::shared_ptr arrow_reader /* -------- read row group ---------- */ std::shared_ptr<::arrow::Table> table; arrow_reader->ReadRowGroup(row_group_index, column_indices, &table); - std::shared_ptr<::arrow::Column> column = table->column(0); - std::shared_ptr<::arrow::ChunkedArray> chunked_arr = column->data(); + std::shared_ptr<::arrow::ChunkedArray> chunked_arr = table->column(0); // std::cout << chunked_arr->num_chunks() << std::endl; if (chunked_arr->num_chunks() != 1) { From d3b1c87094a8f097e902973eccf1315f6d6022c8 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Oct 2019 14:12:55 +0300 Subject: [PATCH 4/8] Replace Array with ChunkedArray in hpat_parquet_reader.cpp --- parquet_reader/hpat_parquet_reader.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index 641587b9c..a1b771a20 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -391,10 +391,11 @@ int64_t pq_read_string_single_file(std::shared_ptr arrow_reader, { // std::cout << "string read file" << '\n'; // - std::shared_ptr<::arrow::Array> arr; - arrow_reader->ReadColumn(column_idx, &arr); - if (arr == NULL) + std::shared_ptr<::arrow::ChunkedArray> chunked_arr; + arrow_reader->ReadColumn(column_idx, &chunked_arr); + if (chunked_arr == NULL) return -1; + auto arr = chunked_arr->chunk(0); int64_t num_values = arr->length(); // std::cout << arr->ToString() << std::endl; std::shared_ptr arrow_type = get_arrow_type(arrow_reader, column_idx); From a0b430bf58f4e3b430c518ebb90d95463dcc19b6 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Oct 2019 15:55:46 +0300 Subject: [PATCH 5/8] Use FileReader::Make() to create std::unique_ptr --- parquet_reader/hpat_parquet_reader.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index a1b771a20..0a0c994b2 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -635,11 +635,15 @@ void pq_init_reader(const char* file_name, std::shared_ptr<FileReader>* a_reader ::arrow::io::HadoopFileSystem::Connect(&hfs_config, &fs); std::shared_ptr<::arrow::io::HdfsReadableFile> file; fs->OpenReadable(f_name, &file); - a_reader->reset(new FileReader(pool, ParquetFileReader::Open(file))); + std::unique_ptr<FileReader> arrow_reader; + FileReader::Make(pool, ParquetFileReader::Open(file), &arrow_reader); + *a_reader = std::move(arrow_reader); } else // regular file system { - a_reader->reset(new FileReader(pool, ParquetFileReader::OpenFile(f_name, false))); + std::unique_ptr<FileReader> arrow_reader; + FileReader::Make(pool, ParquetFileReader::OpenFile(f_name, false), &arrow_reader); + *a_reader = std::move(arrow_reader); } // printf("file open for arrow reader done\n"); // fflush(stdout); From 58c26569c38c9b3021c7ac28bd17570e006011f0 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 23 Oct 2019 09:15:44 +0300 Subject: [PATCH 6/8] Use FileReader::GetSchema() because FromParquetSchema(..., column_indices, ...) was removed. 
--- parquet_reader/hpat_parquet_reader.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index 0a0c994b2..94a16be74 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -655,15 +655,18 @@ void pq_init_reader(const char* file_name, std::shared_ptr* a_reader std::shared_ptr get_arrow_type(std::shared_ptr arrow_reader, int64_t column_idx) { // TODO: error checking - std::vector column_indices; - column_indices.push_back(column_idx); + // std::vector column_indices; + // column_indices.push_back(column_idx); std::shared_ptr<::arrow::Schema> col_schema; - auto descr = arrow_reader->parquet_reader()->metadata()->schema(); - auto parquet_key_value_metadata = arrow_reader->parquet_reader()->metadata()->key_value_metadata(); - parquet::arrow::FromParquetSchema(descr, column_indices, parquet_key_value_metadata, &col_schema); + // auto descr = arrow_reader->parquet_reader()->metadata()->schema(); + // auto parquet_key_value_metadata = arrow_reader->parquet_reader()->metadata()->key_value_metadata(); + // parquet::arrow::FromParquetSchema(descr, column_indices, parquet_key_value_metadata, &col_schema); + arrow_reader->GetSchema(&col_schema); // std::cout<< col_schema->ToString() << std::endl; - std::shared_ptr<::arrow::DataType> arrow_dtype = col_schema->field(0)->type(); + // std::shared_ptr<::arrow::DataType> arrow_dtype = col_schema->field(0)->type(); + std::shared_ptr<::arrow::DataType> arrow_dtype = col_schema->field(column_idx)->type(); + return arrow_dtype; } From b24513b6bee67b3054df2df0415af5bba6d25f52 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 23 Oct 2019 09:43:58 +0300 Subject: [PATCH 7/8] Update PyArrow version in docs and meta.yaml files --- README.rst | 4 ++-- buildscripts/hpat-conda-recipe/meta.yaml | 8 ++++---- buildscripts/parquet-reader-conda-recipe/meta.yaml | 8 ++++---- 3 files 
changed, 10 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 0a4a11b84..353c85a00 100644 --- a/README.rst +++ b/README.rst @@ -100,7 +100,7 @@ Building on Linux with setuptools :: PYVER=<3.6 or 3.7> - conda create -n HPAT -q -y -c numba -c conda-forge -c defaults numba mpich pyarrow=0.14.1 arrow-cpp=0.14.1 gcc_linux-64 gxx_linux-64 gfortran_linux-64 scipy pandas boost python=$PYVER + conda create -n HPAT -q -y -c numba -c conda-forge -c defaults numba mpich pyarrow=0.15.0 arrow-cpp=0.15.0 gcc_linux-64 gxx_linux-64 gfortran_linux-64 scipy pandas boost python=$PYVER source activate HPAT git clone https://github.com/IntelPython/hpat cd hpat @@ -136,7 +136,7 @@ Building on Windows with setuptools ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :: - conda create -n HPAT -c numba -c defaults -c intel python=<3.6 or 3.7> numba impi-devel pyarrow=0.14.1 arrow-cpp=0.14.1 scipy pandas boost + conda create -n HPAT -c numba -c defaults -c intel -c conda-forge python=<3.6 or 3.7> numba impi-devel pyarrow=0.15.0 arrow-cpp=0.15.0 scipy pandas boost conda activate HPAT git clone https://github.com/IntelPython/hpat.git cd hpat diff --git a/buildscripts/hpat-conda-recipe/meta.yaml b/buildscripts/hpat-conda-recipe/meta.yaml index a3877c641..7a24472d5 100644 --- a/buildscripts/hpat-conda-recipe/meta.yaml +++ b/buildscripts/hpat-conda-recipe/meta.yaml @@ -26,8 +26,8 @@ requirements: - numba ==0.45 - numpy - pandas >=0.23 - - pyarrow ==0.14.1 - - arrow-cpp ==0.14.1 + - pyarrow ==0.15.0 + - arrow-cpp ==0.15.0 - boost - hdf5 - h5py @@ -41,8 +41,8 @@ requirements: - python - {{ pin_compatible('numpy') }} - pandas >=0.23 - - pyarrow ==0.14.1 - - arrow-cpp ==0.14.1 + - pyarrow ==0.15.0 + - arrow-cpp ==0.15.0 - boost - numba ==0.45 - mpich # [not win] diff --git a/buildscripts/parquet-reader-conda-recipe/meta.yaml b/buildscripts/parquet-reader-conda-recipe/meta.yaml index 0ec886e4c..60e592a9f 100644 --- a/buildscripts/parquet-reader-conda-recipe/meta.yaml +++ 
b/buildscripts/parquet-reader-conda-recipe/meta.yaml @@ -14,11 +14,11 @@ requirements: - {{ compiler('cxx') }} - cmake >=3.2 - python 3.6.* - - pyarrow ==0.14.1 - - arrow-cpp ==0.14.1 + - pyarrow ==0.15.0 + - arrow-cpp ==0.15.0 run: - - pyarrow ==0.14.1 - - arrow-cpp ==0.14.1 + - pyarrow ==0.15.0 + - arrow-cpp ==0.15.0 about: From bfe95d120b750fc58a68467c019b841b378f9a7c Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 24 Oct 2019 10:22:26 +0300 Subject: [PATCH 8/8] Remove commented code. --- parquet_reader/hpat_parquet_reader.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/parquet_reader/hpat_parquet_reader.cpp b/parquet_reader/hpat_parquet_reader.cpp index 94a16be74..571617f81 100644 --- a/parquet_reader/hpat_parquet_reader.cpp +++ b/parquet_reader/hpat_parquet_reader.cpp @@ -654,20 +654,10 @@ void pq_init_reader(const char* file_name, std::shared_ptr* a_reader // TODO: handle more complex types std::shared_ptr get_arrow_type(std::shared_ptr arrow_reader, int64_t column_idx) { - // TODO: error checking - // std::vector column_indices; - // column_indices.push_back(column_idx); - + // TODO: error checking (column_idx out of bounds) std::shared_ptr<::arrow::Schema> col_schema; - // auto descr = arrow_reader->parquet_reader()->metadata()->schema(); - // auto parquet_key_value_metadata = arrow_reader->parquet_reader()->metadata()->key_value_metadata(); - // parquet::arrow::FromParquetSchema(descr, column_indices, parquet_key_value_metadata, &col_schema); arrow_reader->GetSchema(&col_schema); - // std::cout<< col_schema->ToString() << std::endl; - // std::shared_ptr<::arrow::DataType> arrow_dtype = col_schema->field(0)->type(); - std::shared_ptr<::arrow::DataType> arrow_dtype = col_schema->field(column_idx)->type(); - - return arrow_dtype; + return col_schema->field(column_idx)->type(); } bool arrowPqTypesEqual(std::shared_ptr arrow_type, ::parquet::Type::type pq_type)