diff --git a/keyvi/include/keyvi/dictionary/dictionary_compiler.h b/keyvi/include/keyvi/dictionary/dictionary_compiler.h index c057db083..93f708777 100644 --- a/keyvi/include/keyvi/dictionary/dictionary_compiler.h +++ b/keyvi/include/keyvi/dictionary/dictionary_compiler.h @@ -248,8 +248,7 @@ class DictionaryCompiler final { generator_->Write(stream); } - template - void WriteToFile(StringType filename) { + void WriteToFile(const std::string& filename) { if (!generator_) { throw compiler_exception("not compiled yet"); } diff --git a/keyvi/include/keyvi/dictionary/dictionary_merger.h b/keyvi/include/keyvi/dictionary/dictionary_merger.h index 0f3d453cc..8dca478bd 100644 --- a/keyvi/include/keyvi/dictionary/dictionary_merger.h +++ b/keyvi/include/keyvi/dictionary/dictionary_merger.h @@ -49,8 +49,24 @@ namespace keyvi { namespace dictionary { +/** + * Exception class for dictionary merger + */ + +struct merger_exception : public std::runtime_error { + using std::runtime_error::runtime_error; +}; + +struct MergeStats { + size_t number_of_keys_ = 0; + size_t deleted_keys_ = 0; + size_t updated_keys_ = 0; +}; + template class DictionaryMerger final { + using GeneratorAdapter = fsa::GeneratorAdapterInterface; + private: class SegmentIterator { using EntryIteratorPtr = std::shared_ptr; @@ -106,7 +122,7 @@ class DictionaryMerger final { * @params params merger parameters */ explicit DictionaryMerger(const keyvi::util::parameters_t& params = keyvi::util::parameters_t()) - : dicts_to_merge_(), params_(params) { + : dicts_to_merge_(), params_(params), stats_() { params_[TEMPORARY_PATH_KEY] = keyvi::util::mapGetTemporaryPath(params); append_merge_ = MERGE_APPEND == keyvi::util::mapGet(params_, MERGE_MODE, ""); @@ -150,8 +166,12 @@ class DictionaryMerger final { void SetManifestFromString(const std::string& manifest) { manifest_ = manifest; } void Merge(const std::string& filename) { - using GeneratorAdapter = fsa::GeneratorAdapterInterface; + Merge(); + generator_->SetManifestFromString(manifest_); + generator_->WriteToFile(filename); + } + void Merge() { size_t sparse_array_size_sum = 0; for (auto fsa : dicts_to_merge_) { sparse_array_size_sum += fsa->SparseArraySize(); @@ -159,7 +179,7 @@ class DictionaryMerger final { ValueStoreT* value_store = append_merge_ ? new ValueStoreT(inputFiles_) : new ValueStoreT(params_); - auto generator = GeneratorAdapter::CreateGenerator(sparse_array_size_sum, params_, value_store); + generator_ = GeneratorAdapter::CreateGenerator(sparse_array_size_sum, params_, value_store); std::string top_key; @@ -171,6 +191,7 @@ class DictionaryMerger final { // check for same keys and merge only the most recent one while (!segments_pqueue_.empty() && segments_pqueue_.top().entryIterator().operator==(top_key)) { + ++stats_.updated_keys_; auto to_inc = segments_pqueue_.top(); segments_pqueue_.pop(); @@ -183,10 +204,11 @@ class DictionaryMerger final { if (!deleted_keys_[segment_it.segmentIndex()].empty() && top_key == deleted_keys_[segment_it.segmentIndex()].back()) { deleted_keys_[segment_it.segmentIndex()].pop_back(); - + ++stats_.deleted_keys_; // check the other deleted_keys for duplicates for (auto& deleted_keys : deleted_keys_) { if (!deleted_keys.empty() && top_key == deleted_keys.back()) { + ++stats_.deleted_keys_; deleted_keys.pop_back(); } } @@ -209,7 +231,8 @@ class DictionaryMerger final { } TRACE("Add key: %s", top_key.c_str()); - generator->Add(std::move(top_key), handle); + ++stats_.number_of_keys_; + generator_->Add(std::move(top_key), handle); } if (++segment_it) { segments_pqueue_.push(segment_it); @@ -220,21 +243,36 @@ class DictionaryMerger final { TRACE("finished iterating, do final compile."); - generator->CloseFeeding(); + generator_->CloseFeeding(); + } + + void Write(std::ostream& stream) { + if (!generator_) { + throw merger_exception("not merged yet"); + } + + generator_->Write(stream); + } - generator->SetManifestFromString(manifest_); - generator->WriteToFile(filename); + void WriteToFile(const std::string& filename) { + if (!generator_) { + throw merger_exception("not merged yet"); + } + generator_->WriteToFile(filename); } + const MergeStats& GetStats() const { return stats_; } + private: + typename GeneratorAdapter::AdapterPtr generator_; bool append_merge_ = false; std::vector dicts_to_merge_; std::vector> deleted_keys_; std::vector inputFiles_; std::priority_queue segments_pqueue_; - keyvi::util::parameters_t params_; std::string manifest_ = std::string(); + MergeStats stats_; /** * Load a file with deleted keys if it exists diff --git a/keyvi/include/keyvi/dictionary/fsa/generator.h b/keyvi/include/keyvi/dictionary/fsa/generator.h index fc2fdef04..dbd35da81 100644 --- a/keyvi/include/keyvi/dictionary/fsa/generator.h +++ b/keyvi/include/keyvi/dictionary/fsa/generator.h @@ -323,8 +323,7 @@ class Generator final { value_store_->Write(stream); } - template - void WriteToFile(StringType filename) { + void WriteToFile(const std::string& filename) { std::ofstream out_stream(filename, std::ios::binary); Write(out_stream); out_stream.close(); diff --git a/keyvi/tests/keyvi/dictionary/dictionary_merger_test.cpp b/keyvi/tests/keyvi/dictionary/dictionary_merger_test.cpp index 74bf4e807..a4cc0a053 100644 --- a/keyvi/tests/keyvi/dictionary/dictionary_merger_test.cpp +++ b/keyvi/tests/keyvi/dictionary/dictionary_merger_test.cpp @@ -73,6 +73,10 @@ BOOST_AUTO_TEST_CASE(MergeKeyOnlyDicts) { BOOST_CHECK(!d->Contains("a")); BOOST_CHECK(!d->Contains("cde")); + BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_); + BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_); + BOOST_CHECK_EQUAL(10, merger.GetStats().number_of_keys_); + std::remove(filename.c_str()); } @@ -361,9 +365,12 @@ BOOST_AUTO_TEST_CASE(MergeJsonDicts) { BOOST_CHECK(d->Contains("abc")); BOOST_CHECK(d->Contains("abbc")); BOOST_CHECK(d->Contains("abbcd")); + BOOST_CHECK(d->Contains("abbe")); + BOOST_CHECK(d->Contains("abcd")); BOOST_CHECK(d->Contains("abcde")); BOOST_CHECK(d->Contains("abdd")); BOOST_CHECK(d->Contains("bba")); + BOOST_CHECK(d->Contains("bbacd")); BOOST_CHECK_EQUAL("\"{a:1}\"", d->operator[]("abc").GetValueAsString()); @@ -382,6 +389,10 @@ BOOST_AUTO_TEST_CASE(MergeJsonDicts) { BOOST_CHECK_EQUAL("\"{d:4}\"", d->operator[]("abbe").GetValueAsString()); BOOST_CHECK_EQUAL("\"{f:5}\"", d->operator[]("bbacd").GetValueAsString()); + BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_); + BOOST_CHECK_EQUAL(1, merger.GetStats().updated_keys_); + BOOST_CHECK_EQUAL(9, merger.GetStats().number_of_keys_); + std::remove(filename.c_str()); } @@ -601,6 +612,10 @@ BOOST_AUTO_TEST_CASE(MergeToEmptyDict) { BOOST_CHECK_EQUAL("\"{b:3}\"", d->operator[]("abbc").GetValueAsString()); BOOST_CHECK_EQUAL("\"{d:4}\"", d->operator[]("abbe").GetValueAsString()); + BOOST_CHECK_EQUAL(0, merger.GetStats().deleted_keys_); + BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_); + BOOST_CHECK_EQUAL(2, merger.GetStats().number_of_keys_); + std::remove(filename.c_str()); } @@ -636,13 +651,21 @@ BOOST_AUTO_TEST_CASE(Delete) { merger.Add(dictionary.GetFileName()); std::string filename("merge-delete-key-dict.kv"); - merger.Merge(filename); + std::ofstream out_stream(filename, std::ios::binary); + merger.Merge(); + merger.Write(out_stream); + out_stream.close(); fsa::automata_t fsa(new fsa::Automata(filename.c_str())); dictionary_t d(new Dictionary(fsa)); BOOST_CHECK(d->Contains("abcd")); BOOST_CHECK(!d->Contains("xyz")); + + BOOST_CHECK_EQUAL(1, merger.GetStats().deleted_keys_); + BOOST_CHECK_EQUAL(0, merger.GetStats().updated_keys_); + BOOST_CHECK_EQUAL(1, merger.GetStats().number_of_keys_); + std::remove(filename.c_str()); std::remove(deleted_keys_file.string().c_str()); } @@ -693,7 +716,9 @@ BOOST_AUTO_TEST_CASE(MultipleDeletes) { merger.Add(dictionary3.GetFileName()); std::string filename("merge-multiple-deletes-dict.kv"); - merger.Merge(filename); + merger.Merge(); + + merger.WriteToFile(filename); fsa::automata_t fsa(new fsa::Automata(filename.c_str())); dictionary_t d(new Dictionary(fsa)); @@ -707,12 +732,28 @@ BOOST_AUTO_TEST_CASE(MultipleDeletes) { BOOST_CHECK(d->Contains("acdd")); BOOST_CHECK(!d->Contains("afgh")); + BOOST_CHECK_EQUAL(8, merger.GetStats().deleted_keys_); + BOOST_CHECK_EQUAL(13, merger.GetStats().updated_keys_); + BOOST_CHECK_EQUAL(3, merger.GetStats().number_of_keys_); + std::remove(filename.c_str()); std::remove(deleted_keys_file1.string().c_str()); std::remove(deleted_keys_file2.string().c_str()); std::remove(deleted_keys_file3.string().c_str()); } +BOOST_AUTO_TEST_CASE(WriteWithoutMerge) { + JsonDictionaryMerger merger; + const std::string filename("write-without-merger.kv"); + + BOOST_CHECK_THROW(merger.WriteToFile(filename), merger_exception); + { + std::ofstream out_stream(filename, std::ios::binary); + BOOST_CHECK_THROW(merger.Write(out_stream), merger_exception); + } + std::remove(filename.c_str()); +} + BOOST_AUTO_TEST_SUITE_END() } /* namespace dictionary */