Skip to content

Commit

Permalink
GenomicsDB api performance changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Nalini Ganapati committed May 9, 2024
1 parent f6fc8e8 commit 7ebb514
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 9 deletions.
6 changes: 5 additions & 1 deletion src/main/cpp/include/genomicsdb/query_variants.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* The MIT License (MIT)
* Copyright (c) 2016-2017 Intel Corporation
* Copyright (c) 2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
Expand Down Expand Up @@ -217,6 +218,9 @@ class VariantQueryProcessor {
const VidMapper& vid_mapper);

~VariantQueryProcessor() {
if (m_storage_manager && m_ad > -1) {
m_storage_manager->close_array(m_ad);
}
if (m_array_schema)
delete m_array_schema;
m_array_schema = 0;
Expand Down Expand Up @@ -384,7 +388,7 @@ class VariantQueryProcessor {
/*
* VariantStorage manager
*/
const VariantStorageManager* m_storage_manager;
VariantStorageManager* m_storage_manager;
/**
* Map the known field enum to cell attribute idx for the given schema
*/
Expand Down
23 changes: 23 additions & 0 deletions src/main/cpp/include/utils/memory_measure.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
* The MIT License (MIT)
* Copyright (c) 2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef MEMORY_MEASURE_H
#define MEMORY_MEASURE_H

Expand All @@ -9,5 +31,6 @@ typedef struct {
} statm_t;

/**
 * Fills result with the memory statistics of the calling process, read from
 * /proc/self/statm. The body of the implementation is compiled only when
 * __linux__ is defined; on other platforms result is left untouched.
 * @param result receives the statistics; the implementation multiplies the
 *        fields it reads by page_size (presumably converting pages to bytes - confirm)
 * @param page_size multiplier applied to the values read, defaults to 4096
 */
void read_off_memory_status(statm_t& result, const size_t page_size=4096u);
/**
 * Logs the resident memory of the calling process through the GenomicsDB logger
 * as "Mem usage <msg> rss=<value>M". Does nothing unless __linux__ is defined.
 * @param msg free-form tag inserted into the log line to identify the call site
 */
void print_memory_usage(const std::string& msg);

#endif
14 changes: 10 additions & 4 deletions src/main/cpp/src/api/genomicsdb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* The MIT License (MIT)
*
* Copyright (c) 2019-2020,2022 Omics Data Automation, Inc.
* Copyright (c) 2023 dātma, inc™
* Copyright (c) 2023-2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
Expand Down Expand Up @@ -36,6 +36,10 @@
#include <iostream>
#include <string>

#ifdef __linux__
# include <malloc.h>
#endif

#include "annotation_service.h"
#include "broad_combined_gvcf.h"
#include "query_variants.h"
Expand Down Expand Up @@ -166,8 +170,6 @@ GenomicsDB::GenomicsDB(const std::string& query_configuration,
}

GenomicsDB::~GenomicsDB() {
// TODO: delete variant_query_config per array

if (m_annotation_service != nullptr) {
delete TO_ANNOTATION_SERVICE(m_annotation_service);
}
Expand All @@ -180,6 +182,9 @@ GenomicsDB::~GenomicsDB() {
if (m_storage_manager != nullptr) {
delete TO_VARIANT_STORAGE_MANAGER(m_storage_manager);
}
#ifdef __linux__
malloc_trim(0);
#endif
}

std::map<std::string, genomic_field_type_t> create_genomic_field_types(const VariantQueryConfig &query_config,
Expand Down Expand Up @@ -542,7 +547,8 @@ std::vector<VariantCall>* GenomicsDB::query_variant_calls(const std::string& arr
#endif

// Perform Query over all the intervals
std::vector<VariantCall> *pvariant_calls = new std::vector<VariantCall>;
// This route is not being exercised - std::vector<VariantCall> *pvariant_calls = new std::vector<VariantCall>;
std::vector<VariantCall> *pvariant_calls = NULL;

GatherVariantCalls gather_variant_calls(processor, *query_config, m_annotation_service);
query_processor->iterate_over_cells(query_processor->get_array_descriptor(), *query_config, gather_variant_calls, true);
Expand Down
5 changes: 2 additions & 3 deletions src/main/cpp/src/genomicsdb/variant_storage_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* The MIT License (MIT)
* Copyright (c) 2016-2017 Intel Corporation
* Copyright (c) 2018-2023 Omics Data Automation, Inc.
* Copyright (c) 2023 dātma, inc™
* Copyright (c) 2023-2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
Expand Down Expand Up @@ -580,8 +580,7 @@ int VariantStorageManager::open_array(const std::string& array_name, const VidMa

/**
 * Closes the open array identified by the descriptor ad.
 * The work is delegated to the entry at index ad of m_open_arrays_info_vector,
 * which receives the consolidation flag, m_segment_size and the consolidation
 * batch size unchanged.
 * @param ad array descriptor, used as an index into m_open_arrays_info_vector;
 *        a negative value turns into a very large size_t after the cast and
 *        therefore fails the bound check below
 * @param consolidate_tiledb_array forwarded to the array info's close_array
 * @param consolidation_batch_size forwarded to the array info's close_array
 * NOTE(review): VERIFY_OR_THROW presumably throws when its condition is false -
 * confirm against the macro definition.
 */
void VariantStorageManager::close_array(const int ad, const bool consolidate_tiledb_array,
const int consolidation_batch_size) {
// NOTE(review): this listing was taken from a diff view without +/- markers. The
// check below (bound check plus non-empty array name) looks like the pre-change
// statement and the bound-only check after it like its replacement - confirm
// which of the two the file really contains.
VERIFY_OR_THROW(static_cast<size_t>(ad) < m_open_arrays_info_vector.size() &&
m_open_arrays_info_vector[ad].get_array_name().length());
// Bound check only: accepts entries whose array name is empty as well.
VERIFY_OR_THROW(static_cast<size_t>(ad) < m_open_arrays_info_vector.size());
m_open_arrays_info_vector[ad].close_array(consolidate_tiledb_array, m_segment_size, consolidation_batch_size);
}

Expand Down
34 changes: 34 additions & 0 deletions src/main/cpp/src/utils/memory_measure.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
/**
* The MIT License (MIT)
* Copyright (c) 2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include "genomicsdb_logger.h"
#include "memory_measure.h"
#include <string>

void read_off_memory_status(statm_t& result, const size_t page_size) {
#ifdef __linux__
const char* statm_path = "/proc/self/statm";

FILE *f = fopen(statm_path,"r");
Expand All @@ -21,4 +46,13 @@ void read_off_memory_status(statm_t& result, const size_t page_size) {
result.data *= page_size;
result.dt *= page_size;
fclose(f);
#endif
}

/**
 * Logs the resident memory of the calling process, tagged with msg, as
 * "Mem usage <msg> rss=<value>M". The resident value reported by
 * read_off_memory_status() is divided by 1,000,000 before logging.
 * Compiled to a no-op unless __linux__ is defined.
 * @param msg free-form tag identifying the call site in the log line
 */
void print_memory_usage(const std::string& msg) {
#ifdef __linux__
  // Value-initialize the struct so that the logged figure is well defined (0)
  // even if read_off_memory_status() returns without filling every field.
  statm_t mem_result{};
  read_off_memory_status(mem_result);
  logger.info("Mem usage {} rss={}M", msg, mem_result.resident/1000000);
#endif
}
4 changes: 3 additions & 1 deletion src/test/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# The MIT License
#
# Copyright (c) 2019 Omics Data Automation, Inc.
# Copyright (c) 2023 da̅tma, inc.
# Copyright (c) 2023-2024 da̅tma, inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -83,3 +83,5 @@ add_custom_command(TARGET api_tests
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../inputs/test.tgz)

add_custom_target(all_ctests COMMAND ${CMAKE_CTEST_COMMAND} -V DEPENDS ctests api_tests)

build_GenomicsDB_executable(test_genomicsdb_demo)
164 changes: 164 additions & 0 deletions src/test/cpp/src/test_genomicsdb_demo.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/**
* src/test/cpp/src/test_genomicsdb_demo.cc
*
* The MIT License (MIT)
* Copyright (c) 2024 dātma, inc™
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Test the GenomicsDB query api with the genomicsdb demo workspace
*/

#include "genomicsdb.h"
#include "genomicsdb_config_base.h"
#include "genomicsdb_logger.h"
#include "memory_measure.h"
#include "tiledb_utils.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <limits>
#include <string>

#ifdef __linux__
#include <malloc.h>
#endif

#ifdef USE_GPERFTOOLS_HEAP
#include "gperftools/heap-profiler.h"
#endif

#include "genomicsdb_export_config.pb.h"

/**
* Instructions to run this test...
* Set the GENOMICSDB_DEMO_WS env variable (the test exits immediately without it). Optional env variables are
* NUM_ITERATIONS (default=1) and USE_SINGLE_HANDLE (set to true to reuse one GenomicsDB handle for all iterations)
* GENOMICSDB_DEMO_WS=<genomicsdb_demo_ws_path> NUM_ITERATIONS=10 ./test_genomicsdb_demo
* with valgrind...
* GENOMICSDB_DEMO_WS=<genomicsdb_demo_ws_path> valgrind --leak-check=full --suppressions=<GenomicsDB>/src/test/inputs/valgrind.supp ./test_genomicsdb_demo
* To attach a debugger with valgrind, invoke valgrind --vgdb=yes --vgdb-error=0 --suppressions=<GenomicsDB>/src/test/inputs/valgrind.supp ./test_genomicsdb_demo
*/
/**
 * Variant call processor for the demo test that only keeps counters: it
 * tallies how many times each process() overload is invoked and remembers the
 * coordinates passed to the most recent per-call invocation.
 * NOTE(review): the process() overloads presumably override virtuals of
 * GenomicsDBVariantCallProcessor - the base declaration is not visible here,
 * so `override` is deliberately not added; confirm and add it if applicable.
 */
class CountCellsProcessor : public GenomicsDBVariantCallProcessor {
 public:
  CountCellsProcessor() = default;

  /** Counts one more interval; the interval itself is not inspected. */
  void process(const interval_t& /*interval*/) {
    m_intervals++;
  }

  /**
   * Counts one more call and records its coordinates.
   * @param coordinates must point to at least two values; both are copied
   */
  void process(const std::string& /*sample_name*/,
               const int64_t* coordinates,
               const genomic_interval_t& /*genomic_interval*/,
               const std::vector<genomic_field_t>& /*genomic_fields*/) {
    m_count++;
    m_coordinates[0] = coordinates[0];
    m_coordinates[1] = coordinates[1];
  }

  /** Number of invocations of process(const interval_t&). */
  int m_intervals = 0;
  /** Number of invocations of the per-call process() overload. */
  int m_count = 0;
  /** Coordinates of the last call seen; zero until the first call arrives. */
  int64_t m_coordinates[2] = {0, 0};
};

/**
 * Parses the value of the NUM_ITERATIONS environment variable.
 * @param text NUL-terminated text to parse, must not be null
 * @param num_iterations receives the parsed value; untouched on failure
 * @return true on success, false when text is not a number or does not fit
 *         into an unsigned
 */
static bool parse_num_iterations(const char* text, unsigned& num_iterations) {
  try {
    const unsigned long parsed = std::stoul(text);
    // std::stoul yields an unsigned long; refuse values that would be
    // silently truncated by the narrowing to unsigned.
    if (parsed > std::numeric_limits<unsigned>::max()) return false;
    num_iterations = static_cast<unsigned>(parsed);
    return true;
  } catch (const std::exception&) {
    // std::stoul throws std::invalid_argument or std::out_of_range
    return false;
  }
}

/**
 * Runs the same GenomicsDB query NUM_ITERATIONS times against the demo
 * workspace named by GENOMICSDB_DEMO_WS, logging memory usage around each
 * query and printing the number of calls seen.
 *
 * Environment variables:
 *   GENOMICSDB_DEMO_WS - workspace path; when unset the program exits with 0
 *   NUM_ITERATIONS     - number of queries to run, defaults to 1
 *   USE_SINGLE_HANDLE  - when it starts with "true" one GenomicsDB handle is
 *                        shared by all iterations, otherwise a fresh handle is
 *                        created and destroyed per iteration
 *
 * @return 0 on success (or when the workspace is not configured),
 *         EXIT_FAILURE when NUM_ITERATIONS is invalid or the query
 *         configuration cannot be serialized
 */
int main(int argc, char** argv) {
  using namespace genomicsdb_pb;

  const char *genomicsdb_demo_workspace = std::getenv("GENOMICSDB_DEMO_WS");
  if (!genomicsdb_demo_workspace) return 0;

  unsigned num_iterations = 1;
  const char *num_iterations_str = std::getenv("NUM_ITERATIONS");
  if (num_iterations_str && !parse_num_iterations(num_iterations_str, num_iterations)) {
    std::fprintf(stderr, "Invalid NUM_ITERATIONS value '%s'\n", num_iterations_str);
    return EXIT_FAILURE;
  }

  std::printf("num_iterations=%u\n", num_iterations);

  bool use_single_handle = false;
  const char *use_single_handle_str = std::getenv("USE_SINGLE_HANDLE");
  if (use_single_handle_str && std::strncmp(use_single_handle_str, "true", 4) == 0) {
    use_single_handle = true;
  }

  ExportConfiguration config;

  std::string ws(genomicsdb_demo_workspace);
  config.set_workspace(ws);
  config.set_array_name("allcontigs$1$3095677412");
  config.set_callset_mapping_file(ws+"/callset.json");
  config.set_vid_mapping_file(ws+"/vidmap.json");

  // query_contig_intervals
  auto* contig_interval = config.add_query_contig_intervals();
  contig_interval->set_contig("17");
  contig_interval->set_begin(7571719);
  contig_interval->set_end(7590868);

  // query_row_ranges
  auto* row_range = config.add_query_row_ranges()->add_range_list();
  row_range->set_low(0);
  row_range->set_high(200000);

  // query_attributes
  config.add_attributes()->assign("REF");
  config.add_attributes()->assign("ALT");
  config.add_attributes()->assign("GT");

  // other
  config.set_bypass_intersecting_intervals_phase(true);
  config.set_enable_shared_posixfs_optimizations(true);

  std::string config_string;
  if (!config.SerializeToString(&config_string)) {
    std::fprintf(stderr, "Could not serialize the query configuration\n");
    return EXIT_FAILURE;
  }

  print_memory_usage("before loop");

#ifdef USE_GPERFTOOLS_HEAP
  HeapProfilerStart("test_genomicsdb_demo.gperf.heap");
#endif

  if (use_single_handle) {
    GenomicsDB gdb(config_string, GenomicsDB::PROTOBUF_BINARY_STRING);
    for (auto i=0u; i<num_iterations; i++) {
      CountCellsProcessor count_cells_processor;
      print_memory_usage("before query");
#ifdef USE_GPERFTOOLS_HEAP
      HeapProfilerDump("before_query");
#endif
      gdb.query_variant_calls(count_cells_processor, "", GenomicsDB::NONE);
#ifdef USE_GPERFTOOLS_HEAP
      HeapProfilerDump("after_query_before_trim");
#endif
      print_memory_usage("after query before trim");
#ifdef __linux__
      // Explicit trim is intentionally left disabled here; enable it to compare
      // the "before trim" and "after query" figures.
      // malloc_trim(0);
#endif
#ifdef USE_GPERFTOOLS_HEAP
      HeapProfilerDump("after_query");
#endif
      print_memory_usage("after query");
      std::printf("count=%d\n", count_cells_processor.m_count);
    }
  } else {
    for (auto i=0u; i<num_iterations; i++) {
      print_memory_usage("before gdb and query");
      CountCellsProcessor count_cells_processor;
      {
        // Scoped handle: destroyed at the closing brace, also when the query
        // throws, so "after query" is measured once the handle is gone.
        GenomicsDB gdb(config_string, GenomicsDB::PROTOBUF_BINARY_STRING);
        print_memory_usage("before query");
        gdb.query_variant_calls(count_cells_processor, "", GenomicsDB::NONE);
      }
      print_memory_usage("after query");
      std::printf("count=%d\n", count_cells_processor.m_count);
    }
  }

#ifdef USE_GPERFTOOLS_HEAP
  HeapProfilerStop();
#endif
  return 0;
}
24 changes: 24 additions & 0 deletions src/test/inputs/valgrind.supp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
libgomp_init_reachable_x86_64_linux_gnu
Memcheck:Leak
match-leak-kinds: reachable
fun:malloc
obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
fun:call_init.part.0
fun:call_init
fun:_dl_init
obj:/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
}
{
libgomp_init_reachable_lib64
Memcheck:Leak
match-leak-kinds: reachable
fun:malloc
obj:/usr/lib64/libgomp.so.1.0.0
obj:/usr/lib64/libgomp.so.1.0.0
obj:/usr/lib64/libgomp.so.1.0.0
fun:_dl_init
obj:/usr/lib64/ld-2.17.so
}

0 comments on commit 7ebb514

Please sign in to comment.