Skip to content

Commit

Permalink
[index] instrument merger with stats and make interface more like compiler (#41)
Browse files Browse the repository at this point in the history

instrument dictionary merger to retrieve stats about number of deleted/overwritten keys
  • Loading branch information
Hendrik Muhs committed Feb 19, 2018
1 parent 7fc055b commit 7159787
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 15 deletions.
3 changes: 1 addition & 2 deletions keyvi/include/keyvi/dictionary/dictionary_compiler.h
Expand Up @@ -248,8 +248,7 @@ class DictionaryCompiler final {
generator_->Write(stream);
}

template <typename StringType>
void WriteToFile(StringType filename) {
void WriteToFile(const std::string& filename) {
if (!generator_) {
throw compiler_exception("not compiled yet");
}
Expand Down
56 changes: 47 additions & 9 deletions keyvi/include/keyvi/dictionary/dictionary_merger.h
Expand Up @@ -49,8 +49,24 @@
namespace keyvi {
namespace dictionary {

/**
 * Exception type raised by the dictionary merger, for example when output is
 * requested before a merge has been run.
 *
 * All constructors of std::runtime_error are inherited, so it is created from
 * a message string.
 */
struct merger_exception : std::runtime_error {
  using std::runtime_error::runtime_error;
};

/**
 * Counters filled in by the dictionary merger while merging.
 */
struct MergeStats {
  /// incremented for every key that is added to the generator
  size_t number_of_keys_{0};
  /// incremented whenever the current key matches an entry of a deleted-keys list
  size_t deleted_keys_{0};
  /// incremented for every additional segment entry that carries the same key as the current one
  size_t updated_keys_{0};
};

template <class PersistenceT, class ValueStoreT = fsa::internal::NullValueStore>
class DictionaryMerger final {
using GeneratorAdapter = fsa::GeneratorAdapterInterface<PersistenceT, ValueStoreT>;

private:
class SegmentIterator {
using EntryIteratorPtr = std::shared_ptr<fsa::EntryIterator>;
Expand Down Expand Up @@ -106,7 +122,7 @@ class DictionaryMerger final {
* @params params merger parameters
*/
explicit DictionaryMerger(const keyvi::util::parameters_t& params = keyvi::util::parameters_t())
: dicts_to_merge_(), params_(params) {
: dicts_to_merge_(), params_(params), stats_() {
params_[TEMPORARY_PATH_KEY] = keyvi::util::mapGetTemporaryPath(params);

append_merge_ = MERGE_APPEND == keyvi::util::mapGet<std::string>(params_, MERGE_MODE, "");
Expand Down Expand Up @@ -150,16 +166,20 @@ class DictionaryMerger final {
/**
 * Store the manifest; Merge(filename) hands it to the generator before writing.
 *
 * @param manifest manifest content as string
 */
void SetManifestFromString(const std::string& manifest) {
  manifest_ = manifest;
}

/**
 * Merge the added dictionaries and write the result to a file.
 *
 * Runs Merge(), passes the stored manifest to the generator and lets the
 * generator write its output to the given file.
 *
 * @param filename path of the file to write
 */
void Merge(const std::string& filename) {
  // no local GeneratorAdapter alias needed here: nothing in this function uses it
  Merge();
  generator_->SetManifestFromString(manifest_);
  generator_->WriteToFile(filename);
}

void Merge() {
size_t sparse_array_size_sum = 0;
for (auto fsa : dicts_to_merge_) {
sparse_array_size_sum += fsa->SparseArraySize();
}

ValueStoreT* value_store = append_merge_ ? new ValueStoreT(inputFiles_) : new ValueStoreT(params_);

auto generator = GeneratorAdapter::CreateGenerator(sparse_array_size_sum, params_, value_store);
generator_ = GeneratorAdapter::CreateGenerator(sparse_array_size_sum, params_, value_store);

std::string top_key;

Expand All @@ -171,6 +191,7 @@ class DictionaryMerger final {

// check for same keys and merge only the most recent one
while (!segments_pqueue_.empty() && segments_pqueue_.top().entryIterator().operator==(top_key)) {
++stats_.updated_keys_;
auto to_inc = segments_pqueue_.top();

segments_pqueue_.pop();
Expand All @@ -183,10 +204,11 @@ class DictionaryMerger final {
if (!deleted_keys_[segment_it.segmentIndex()].empty() &&
top_key == deleted_keys_[segment_it.segmentIndex()].back()) {
deleted_keys_[segment_it.segmentIndex()].pop_back();

++stats_.deleted_keys_;
// check the other deleted_keys for duplicates
for (auto& deleted_keys : deleted_keys_) {
if (!deleted_keys.empty() && top_key == deleted_keys.back()) {
++stats_.deleted_keys_;
deleted_keys.pop_back();
}
}
Expand All @@ -209,7 +231,8 @@ class DictionaryMerger final {
}

TRACE("Add key: %s", top_key.c_str());
generator->Add(std::move(top_key), handle);
++stats_.number_of_keys_;
generator_->Add(std::move(top_key), handle);
}
if (++segment_it) {
segments_pqueue_.push(segment_it);
Expand All @@ -220,21 +243,36 @@ class DictionaryMerger final {

TRACE("finished iterating, do final compile.");

generator->CloseFeeding();
generator_->CloseFeeding();
}

void Write(std::ostream& stream) {
if (!generator_) {
throw merger_exception("not merged yet");
}

generator_->Write(stream);
}

generator->SetManifestFromString(manifest_);
generator->WriteToFile(filename);
void WriteToFile(const std::string& filename) {
if (!generator_) {
throw merger_exception("not merged yet");
}
generator_->WriteToFile(filename);
}

/**
 * Access the counters that Merge() updates.
 *
 * @return read-only reference to the merger's statistics
 */
const MergeStats& GetStats() const {
  return stats_;
}

private:
typename GeneratorAdapter::AdapterPtr generator_;
bool append_merge_ = false;
std::vector<fsa::automata_t> dicts_to_merge_;
std::vector<std::vector<std::string>> deleted_keys_;
std::vector<std::string> inputFiles_;
std::priority_queue<SegmentIterator> segments_pqueue_;

keyvi::util::parameters_t params_;
std::string manifest_ = std::string();
MergeStats stats_;

/**
* Load a file with deleted keys if it exists
Expand Down
3 changes: 1 addition & 2 deletions keyvi/include/keyvi/dictionary/fsa/generator.h
Expand Up @@ -323,8 +323,7 @@ class Generator final {
value_store_->Write(stream);
}

template <typename StringType>
void WriteToFile(StringType filename) {
void WriteToFile(const std::string& filename) {
std::ofstream out_stream(filename, std::ios::binary);
Write(out_stream);
out_stream.close();
Expand Down
45 changes: 43 additions & 2 deletions keyvi/tests/keyvi/dictionary/dictionary_merger_test.cpp
Expand Up @@ -73,6 +73,10 @@ BOOST_AUTO_TEST_CASE(MergeKeyOnlyDicts) {
BOOST_CHECK(!d->Contains("a"));
BOOST_CHECK(!d->Contains("cde"));

BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_);
BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_);
BOOST_CHECK_EQUAL(10, merger.GetStats().number_of_keys_);

std::remove(filename.c_str());
}

Expand Down Expand Up @@ -361,9 +365,12 @@ BOOST_AUTO_TEST_CASE(MergeJsonDicts) {
BOOST_CHECK(d->Contains("abc"));
BOOST_CHECK(d->Contains("abbc"));
BOOST_CHECK(d->Contains("abbcd"));
BOOST_CHECK(d->Contains("abbe"));
BOOST_CHECK(d->Contains("abcd"));
BOOST_CHECK(d->Contains("abcde"));
BOOST_CHECK(d->Contains("abdd"));
BOOST_CHECK(d->Contains("bba"));
BOOST_CHECK(d->Contains("bbacd"));

BOOST_CHECK_EQUAL("\"{a:1}\"", d->operator[]("abc").GetValueAsString());

Expand All @@ -382,6 +389,10 @@ BOOST_AUTO_TEST_CASE(MergeJsonDicts) {
BOOST_CHECK_EQUAL("\"{d:4}\"", d->operator[]("abbe").GetValueAsString());
BOOST_CHECK_EQUAL("\"{f:5}\"", d->operator[]("bbacd").GetValueAsString());

BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_);
BOOST_CHECK_EQUAL(1, merger.GetStats().updated_keys_);
BOOST_CHECK_EQUAL(9, merger.GetStats().number_of_keys_);

std::remove(filename.c_str());
}

Expand Down Expand Up @@ -601,6 +612,10 @@ BOOST_AUTO_TEST_CASE(MergeToEmptyDict) {
BOOST_CHECK_EQUAL("\"{b:3}\"", d->operator[]("abbc").GetValueAsString());
BOOST_CHECK_EQUAL("\"{d:4}\"", d->operator[]("abbe").GetValueAsString());

BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_);
BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_);
BOOST_CHECK_EQUAL(2, merger.GetStats().number_of_keys_);

std::remove(filename.c_str());
}

Expand Down Expand Up @@ -636,13 +651,21 @@ BOOST_AUTO_TEST_CASE(Delete) {
merger.Add(dictionary.GetFileName());

std::string filename("merge-delete-key-dict.kv");
merger.Merge(filename);
std::ofstream out_stream(filename, std::ios::binary);
merger.Merge();
merger.Write(out_stream);
out_stream.close();

fsa::automata_t fsa(new fsa::Automata(filename.c_str()));
dictionary_t d(new Dictionary(fsa));

BOOST_CHECK(d->Contains("abcd"));
BOOST_CHECK(!d->Contains("xyz"));

BOOST_CHECK_EQUAL(1, merger.GetStats().deleted_keys_);
BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_);
BOOST_CHECK_EQUAL(1, merger.GetStats().number_of_keys_);

std::remove(filename.c_str());
std::remove(deleted_keys_file.string().c_str());
}
Expand Down Expand Up @@ -693,7 +716,9 @@ BOOST_AUTO_TEST_CASE(MultipleDeletes) {
merger.Add(dictionary3.GetFileName());

std::string filename("merge-multiple-deletes-dict.kv");
merger.Merge(filename);
merger.Merge();

merger.WriteToFile(filename);

fsa::automata_t fsa(new fsa::Automata(filename.c_str()));
dictionary_t d(new Dictionary(fsa));
Expand All @@ -707,12 +732,28 @@ BOOST_AUTO_TEST_CASE(MultipleDeletes) {
BOOST_CHECK(d->Contains("acdd"));
BOOST_CHECK(!d->Contains("afgh"));

BOOST_CHECK_EQUAL(8, merger.GetStats().deleted_keys_);
BOOST_CHECK_EQUAL(13, merger.GetStats().updated_keys_);
BOOST_CHECK_EQUAL(3, merger.GetStats().number_of_keys_);

std::remove(filename.c_str());
std::remove(deleted_keys_file1.string().c_str());
std::remove(deleted_keys_file2.string().c_str());
std::remove(deleted_keys_file3.string().c_str());
}

// Requesting output from a merger that has not merged anything must raise
// merger_exception, for file output as well as for stream output.
BOOST_AUTO_TEST_CASE(WriteWithoutMerge) {
  JsonDictionaryMerger unmerged;
  const std::string target_file = "write-without-merger.kv";

  BOOST_CHECK_THROW(unmerged.WriteToFile(target_file), merger_exception);

  {
    // inner scope: the stream is destroyed before the file gets removed
    std::ofstream sink(target_file, std::ios::binary);
    BOOST_CHECK_THROW(unmerged.Write(sink), merger_exception);
  }

  // remove the file opened for the stream check
  std::remove(target_file.c_str());
}

BOOST_AUTO_TEST_SUITE_END()

} /* namespace dictionary */
Expand Down

0 comments on commit 7159787

Please sign in to comment.