Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the new tiledb_array_evaluate_cell to apply filter expressions for the SingleCellTileDBIterator #285

Merged
merged 4 commits into from
May 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
# THE SOFTWARE.
#

project(GenomicsDB)
cmake_minimum_required(VERSION 3.17)

project(GenomicsDB)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules")

if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
Expand Down
2 changes: 1 addition & 1 deletion dependencies/TileDB
7 changes: 7 additions & 0 deletions src/main/cpp/include/genomicsdb/genomicsdb_iterators.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,9 @@ class SingleCellTileDBIterator {
auto coords_query_idx = m_fields.size()-1u;
return reinterpret_cast<const int64_t*>(get_raw_field_pointer_for_query_idx(coords_query_idx));
}
inline const bool evaluate_cell() const {
return m_cell_evaluated_with_filter_expression;
}
void print(const int query_idx, std::ostream& fptr=std::cout) const;
void print_ALT(const int query_idx, std::ostream& fptr=std::cout) const;
void print_csv(const int query_idx, std::ostream& fptr=std::cout) const;
Expand Down Expand Up @@ -350,6 +353,7 @@ class SingleCellTileDBIterator {
const VariantQueryConfig* m_query_config;
uint64_t m_query_column_interval_idx;
GenomicsDBColumnarCell* m_cell;
bool m_cell_evaluated_with_filter_expression;
//Buffers for fields
std::vector<GenomicsDBColumnarField> m_fields;
//Cell markers for handling the sweep operation
Expand Down Expand Up @@ -444,6 +448,9 @@ class GenomicsDBGVCFIterator : public SingleCellTileDBIterator {
inline const GenomicsDBGVCFCell& operator*() const {
return *m_cell;
}
inline const bool evaluate_cell() const {
return m_cell_evaluated_with_filter_expression;
}
inline bool end() const {
return SingleCellTileDBIterator::end() && m_end_set.empty();
}
Expand Down
3 changes: 3 additions & 0 deletions src/main/cpp/include/genomicsdb/variant_cell.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ class GenomicsDBColumnarCell {
inline int get_field_size_in_bytes(const int query_idx) const {
return m_iterator->get_field_size_in_bytes(query_idx);
}
//Forwards to the owning iterator's evaluate_cell() and returns its result unchanged
inline bool evaluate_cell() const {
  const bool iterator_result = m_iterator->evaluate_cell();
  return iterator_result;
}
inline bool is_valid(const int query_idx) const {
return m_iterator->is_valid(query_idx);
}
Expand Down
6 changes: 4 additions & 2 deletions src/main/cpp/src/genomicsdb/genomicsdb_columnar_field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -326,16 +326,18 @@ void GenomicsDBColumnarField::set_valid_vector_in_live_buffer_list_tail_ptr() {
auto buffer_ptr = get_live_buffer_list_tail();
assert(buffer_ptr);
auto& valid_vector = buffer_ptr->get_valid_vector();
if (m_length_descriptor == BCF_VL_FIXED)
if (m_length_descriptor == BCF_VL_FIXED) {
for (auto i=0ull; i<buffer_ptr->get_num_live_entries(); ++i) {
assert(i < valid_vector.size());
valid_vector[i] = m_check_tiledb_valid_element(buffer_ptr->get_raw_pointer() + (m_fixed_length_field_size*i),
m_fixed_length_field_num_elements);
} else
}
} else {
for (auto i=0ull; i<buffer_ptr->get_num_live_entries(); ++i) {
assert(i < valid_vector.size());
valid_vector[i] = (buffer_ptr->get_size_of_variable_length_field(i) > 0u);
}
}
}

void GenomicsDBColumnarField::print_data_in_buffer_at_index(std::ostream& fptr,
Expand Down
29 changes: 27 additions & 2 deletions src/main/cpp/src/genomicsdb/genomicsdb_iterators.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,31 @@ bool SingleCellTileDBIterator::advance_to_next_useful_cell(const uint64_t min_nu
#endif
advance_fields_other_than_coords_END(num_cells_incremented);
}

// Evaluate the query filter expression, if one is configured, against the current cell.
// The flag starts out true so that every cell is accepted when there is no filter.
m_cell_evaluated_with_filter_expression = true;
if (m_query_config->get_query_filter().size() > 0) {
  // Parallel arrays handed to tiledb_array_evaluate_cell(): entry k of each vector
  // describes the same buffer (pointer, size in bytes, position of the current cell).
  std::vector<void *> buffers;
  std::vector<size_t> buffer_sizes;
  std::vector<int64_t> positions;
  // A variable length field contributes two entries (offsets + data), others one
  const auto max_num_buffers = 2u*m_fields.size();
  buffers.reserve(max_num_buffers);
  buffer_sizes.reserve(max_num_buffers);
  positions.reserve(max_num_buffers);
  for (auto i=0u; i<m_fields.size(); ++i) {
    auto& genomicsdb_columnar_field = m_fields[i];
    auto buffer = get_buffer_and_index(i);
    const auto position = static_cast<int64_t>(buffer.second);
    if (genomicsdb_columnar_field.is_variable_length_field()) {
      // Offsets buffer goes immediately before the data buffer of the same field
      buffers.push_back(const_cast<GenomicsDBBuffer *>(buffer.first)->get_offsets_pointer());
      buffer_sizes.push_back(buffer.first->get_offsets_size_in_bytes());
      positions.push_back(position);
    }
    buffers.push_back(const_cast<uint8_t *>(buffer.first->get_raw_pointer()));
    buffer_sizes.push_back(buffer.first->get_data_vector_size_in_bytes());
    positions.push_back(position);
  }
  // Any return value other than TILEDB_OK marks the cell as not passing the filter.
  // NOTE(review): this also covers evaluation errors, if the API reports them the same way - confirm
  if (tiledb_array_evaluate_cell(m_tiledb_array, buffers.data(), buffer_sizes.data(), positions.data()) != TILEDB_OK) {
    m_cell_evaluated_with_filter_expression = false;
  }
}

return true;
}

Expand Down Expand Up @@ -646,7 +671,7 @@ void SingleCellTileDBIterator::advance_fields_other_than_coords_END(const uint64
assert(m_END_query_idx == 0u);
for (auto i=0u; i<m_query_attribute_idx_vec.size(); ++i)
m_query_attribute_idx_vec[i] = i+1u; //END is the first field - ignore
//-2 - ignore coords and END
//-2 - ignore coords as well
m_query_attribute_idx_num_cells_to_increment_vec.resize(m_fields.size()-2u);
//For all fields, #cells to skip == num_cells_to_increment initially
m_query_attribute_idx_num_cells_to_increment_vec.assign(
Expand All @@ -660,7 +685,7 @@ void SingleCellTileDBIterator::advance_fields_other_than_coords_END(const uint64
assert(!m_done_reading_from_TileDB
&& m_query_column_interval_idx == curr_query_column_interval_idx);
#ifdef DEBUG
//After the skip cells API is implemented, there shouldn't be any attributes
//After the TileDB read with skip cells, there shouldn't be any attributes
//whose cells need to be skipped after a call to read_from_TileDB()
for (auto i=0u; i<m_query_attribute_idx_vec.size(); ++i)
assert(m_query_attribute_idx_num_cells_to_increment_vec[i] == 0u);
Expand Down
9 changes: 6 additions & 3 deletions src/main/cpp/src/genomicsdb/query_variants.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,8 +532,10 @@ void VariantQueryProcessor::iterate_over_cells(
for (; !(columnar_forward_iter->end()); ++(*columnar_forward_iter)) {
auto& cell = **columnar_forward_iter;
auto coords = cell.get_coordinates();
if (query_config.is_queried_array_row_idx(coords[0])) //If row is part of query, process cell
//If row is part of query and is not filtered away, process cell
if (query_config.is_queried_array_row_idx(coords[0]) && cell.evaluate_cell()) {
variant_operator.operate_on_columnar_cell(cell, query_config, get_array_schema());
}
}
variant_operator.finalize();
delete columnar_forward_iter;
Expand All @@ -550,8 +552,9 @@ void VariantQueryProcessor::iterate_over_gvcf_entries(
use_common_array_object);
for (; !(columnar_gvcf_iter->end()); ++(*columnar_gvcf_iter)) {
auto& cell = **columnar_gvcf_iter;

variant_operator.operate_on_columnar_cell(cell);
if (cell.get_iterator()->evaluate_cell()) {
variant_operator.operate_on_columnar_cell(cell);
}
}
//variant_operator.finalize();
delete columnar_gvcf_iter;
Expand Down
70 changes: 58 additions & 12 deletions src/test/cpp/src/test_genomicsdb_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* src/test/cpp/src/test_genomicsdb_api.cc
*
* The MIT License (MIT)
* Copyright (c) 2019-2022 Omics Data Automation, Inc.
* Copyright (c) 2019-2023 Omics Data Automation, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
Expand Down Expand Up @@ -280,6 +280,26 @@ class NullVariantCallProcessor : public GenomicsDBVariantCallProcessor {
};
};

// Test helper: counts the variant calls delivered to process() and remembers the
// coordinates of the most recent call so that tests can check them afterwards.
class CountCellsProcessor : public GenomicsDBVariantCallProcessor {
 public:
  CountCellsProcessor() {
  }

  // Interval callback - nothing to record, only variant calls are counted
  void process(const interval_t& interval) {
  }

  // Variant call callback - bumps the counter and snapshots the coordinates
  void process(const std::string& sample_name,
               const int64_t* coordinates,
               const genomic_interval_t& genomic_interval,
               const std::vector<genomic_field_t>& genomic_fields) {
    m_count++;
    // Copy the values instead of keeping the pointer: the tests read the coordinates
    // after the query has returned, and nothing here guarantees that the buffer behind
    // the pointer is still alive by then.
    // Assumes at least two entries, matching the [0] and [1] reads in the tests - TODO confirm
    if (coordinates) {
      m_coordinates[0] = coordinates[0];
      m_coordinates[1] = coordinates[1];
    }
  }

  // Number of variant calls seen so far
  int m_count = 0;
  // Coordinates of the last variant call seen; zero until the first call arrives
  int64_t m_coordinates[2] = {0, 0};
};

class OneQueryIntervalProcessor : public GenomicsDBVariantCallProcessor {
public:
OneQueryIntervalProcessor(const std::vector<std::string> attributes = {}, bool is_PP=false) {
Expand Down Expand Up @@ -540,17 +560,43 @@ TEST_CASE("api query_variant_calls with protobuf", "[query_variant_calls_with_pr
gdb->query_variant_calls(one_query_interval_processor);
delete gdb;

// try query with contig intervals instead of tiledb column intervals
ContigInterval* contig_interval = new ContigInterval();
contig_interval->set_contig("1");
contig_interval->set_begin(1);
contig_interval->set_end(249250621);
column_interval->Clear();
column_interval->set_allocated_contig_interval(contig_interval);
CHECK(config->SerializeToString(&config_string));
gdb = new GenomicsDB(config_string, GenomicsDB::PROTOBUF_BINARY_STRING, loader_json, 0);
gdb->query_variant_calls();
delete gdb;
SECTION("Try query with contig intervals instead of tiledb column intervals") {
ContigInterval* contig_interval = new ContigInterval();
contig_interval->set_contig("1");
contig_interval->set_begin(1);
contig_interval->set_end(249250621);
column_interval->Clear();
column_interval->set_allocated_contig_interval(contig_interval);
CHECK(config->SerializeToString(&config_string));
gdb = new GenomicsDB(config_string, GenomicsDB::PROTOBUF_BINARY_STRING, loader_json, 0);
gdb->query_variant_calls();
delete gdb;
}

SECTION("Try query with a filter") {
config->set_query_filter("REF == \"G\" && GT &= \"1/1\" && ALT |= \"T\"");
CHECK(config->SerializeToString(&config_string));
gdb = new GenomicsDB(config_string, GenomicsDB::PROTOBUF_BINARY_STRING, loader_json, 0);
CountCellsProcessor count_cells_processor;
gdb->query_variant_calls(count_cells_processor);
CHECK(count_cells_processor.m_count == 1);
CHECK(count_cells_processor.m_coordinates[0] == 1);
CHECK(count_cells_processor.m_coordinates[1] == 17384);
delete gdb;
}

SECTION("Try query with a filter and a small segment size to force TileDB buffers overflow") {
config->set_query_filter("REF == \"G\" && GT &= \"11\" && ALT |= \"T\"");
config->set_segment_size(40);
CHECK(config->SerializeToString(&config_string));
gdb = new GenomicsDB(config_string, GenomicsDB::PROTOBUF_BINARY_STRING, loader_json, 0);
CountCellsProcessor count_cells_processor;
gdb->query_variant_calls(count_cells_processor);
CHECK(count_cells_processor.m_count == 1);
CHECK(count_cells_processor.m_coordinates[0] == 1);
CHECK(count_cells_processor.m_coordinates[1] == 17384);
delete gdb;
}
}

TEST_CASE("api generate_vcf direct", "[query_generate_vcf_direct]") {
Expand Down