Skip to content
Permalink
Browse files

add a tiered merge policy (#114)

The tiered merge policy is a smarter algorithm for selecting segments to merge. It scores sets of segments, preferring similar-sized segments while boosting smaller merges and segments with deletes.

The algorithm is inspired by Lucene's tiered merge policy.

This change makes the tiered merge policy the default.
  • Loading branch information...
hendrikmuhs committed Mar 3, 2019
1 parent 038f1f3 commit f27619a605a6af5065b08d699137e53d543bfe97
@@ -1,3 +1,3 @@
linelength=120
root=keyvi/include
root=include
filter=-build/include_subdir
@@ -32,6 +32,7 @@
#include <memory>
#include <queue>
#include <string>
#include <utility>
#include <vector>

#include <boost/filesystem.hpp>
@@ -141,7 +142,9 @@ class DictionaryMerger final {

fsa::automata_t fsa;
if (append_merge_) {
fsa.reset(new fsa::Automata(filename, loading_strategy_types::lazy, false));
// TODO(hendrik) https://github.com/KeyviDev/keyvi/issues/102
fsa.reset(new fsa::Automata(std::make_shared<DictionaryProperties>(DictionaryProperties::FromFile(filename)),
loading_strategy_types::lazy, false));
} else {
fsa.reset(new fsa::Automata(filename));
}
@@ -28,6 +28,7 @@
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <memory>
#include <string>

#include <boost/lexical_cast.hpp>
@@ -300,6 +301,9 @@ class DictionaryProperties {
value_store_properties, manifest);
}
};

//! shared pointer to a DictionaryProperties instance
typedef std::shared_ptr<DictionaryProperties> dictionary_properties_t;

} // namespace dictionary
} // namespace keyvi

@@ -76,25 +76,28 @@ class Automata final {
public:
explicit Automata(const std::string& file_name,
loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: Automata(file_name, loading_strategy, true) {}
: Automata(std::make_shared<DictionaryProperties>(DictionaryProperties::FromFile(file_name)), loading_strategy,
true) {}

private:
explicit Automata(const std::string& file_name, loading_strategy_types loading_strategy, const bool load_value_store)
: dictionary_properties_(DictionaryProperties::FromFile(file_name)) {
file_mapping_ = boost::interprocess::file_mapping(file_name.c_str(), boost::interprocess::read_only);
explicit Automata(const dictionary_properties_t& dictionary_properties, loading_strategy_types loading_strategy,
const bool load_value_store)
: dictionary_properties_(dictionary_properties) {
file_mapping_ = boost::interprocess::file_mapping(dictionary_properties_->GetFileName().c_str(),
boost::interprocess::read_only);

const boost::interprocess::map_options_t map_options =
internal::MemoryMapFlags::FSAGetMemoryMapOptions(loading_strategy);

TRACE("labels start offset: %d", dictionary_properties_.GetPersistenceOffset());
labels_region_ = boost::interprocess::mapped_region(file_mapping_, boost::interprocess::read_only,
dictionary_properties_.GetPersistenceOffset(),
dictionary_properties_.GetSparseArraySize(), 0, map_options);
dictionary_properties_->GetPersistenceOffset(),
dictionary_properties_->GetSparseArraySize(), 0, map_options);

TRACE("transitions start offset: %d", dictionary_properties_.GetTransitionsOffset());
transitions_region_ = boost::interprocess::mapped_region(
file_mapping_, boost::interprocess::read_only, dictionary_properties_.GetTransitionsOffset(),
dictionary_properties_.GetTransitionsSize(), 0, map_options);
file_mapping_, boost::interprocess::read_only, dictionary_properties_->GetTransitionsOffset(),
dictionary_properties_->GetTransitionsSize(), 0, map_options);

const auto advise = internal::MemoryMapFlags::FSAGetMemoryMapAdvices(loading_strategy);

@@ -106,8 +109,8 @@ class Automata final {

if (load_value_store) {
value_store_reader_.reset(
internal::ValueStoreFactory::MakeReader(dictionary_properties_.GetValueStoreType(), &file_mapping_,
dictionary_properties_.GetValueStoreProperties(), loading_strategy));
internal::ValueStoreFactory::MakeReader(dictionary_properties_->GetValueStoreType(), &file_mapping_,
dictionary_properties_->GetValueStoreProperties(), loading_strategy));
}
}

@@ -120,15 +123,15 @@ class Automata final {
*
* @return index of root state.
*/
uint64_t GetStartState() const { return dictionary_properties_.GetStartState(); }
uint64_t GetStartState() const { return dictionary_properties_->GetStartState(); }

uint64_t GetNumberOfKeys() const { return dictionary_properties_.GetNumberOfKeys(); }
uint64_t GetNumberOfKeys() const { return dictionary_properties_->GetNumberOfKeys(); }

bool Empty() const { return 0 == GetNumberOfKeys(); }

size_t SparseArraySize() const { return dictionary_properties_.GetSparseArraySize(); }
size_t SparseArraySize() const { return dictionary_properties_->GetSparseArraySize(); }

internal::value_store_t GetValueStoreType() const { return dictionary_properties_.GetValueStoreType(); }
internal::value_store_t GetValueStoreType() const { return dictionary_properties_->GetValueStoreType(); }

uint64_t TryWalkTransition(uint64_t starting_state, unsigned char c) const {
if (labels_[starting_state + c] == c) {
@@ -378,12 +381,12 @@ class Automata final {
return value_store_reader_->GetRawValueAsString(state_value);
}

std::string GetStatistics() const { return dictionary_properties_.GetStatistics(); }
std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); }

std::string GetManifest() const { return dictionary_properties_.GetManifest(); }
std::string GetManifest() const { return dictionary_properties_->GetManifest(); }

private:
DictionaryProperties dictionary_properties_;
dictionary_properties_t dictionary_properties_;
std::unique_ptr<internal::IValueStoreReader> value_store_reader_;
boost::interprocess::file_mapping file_mapping_;
boost::interprocess::mapped_region labels_region_;
@@ -29,7 +29,7 @@

static const char INDEX_REFRESH_INTERVAL[] = "refresh_interval";
static const char MERGE_POLICY[] = "merge_policy";
static const char DEFAULT_MERGE_POLICY[] = "simple";
static const char DEFAULT_MERGE_POLICY[] = "tiered";
static const char KEYVIMERGER_BIN[] = "keyvimerger_bin";
static const char INDEX_MAX_SEGMENTS[] = "max_segments";
static const char SEGMENT_COMPILE_KEY_THRESHOLD[] = "segment_compile_key_threshold";
@@ -30,6 +30,10 @@

#include "index/internal/merge_policy.h"
#include "index/internal/simple_merge_policy.h"
#include "index/internal/tiered_merge_policy.h"

// #define ENABLE_TRACING
#include "dictionary/util/trace.h"

namespace keyvi {
namespace index {
@@ -38,18 +42,22 @@ namespace internal {
/**
 * Create a merge policy by name.
 *
 * The name is compared case-insensitively; "simple" and "tiered" are recognized.
 *
 * @param name name of the requested merge policy
 * @return shared pointer to a newly created merge policy
 * @throws std::invalid_argument if the name does not match a known merge policy
 */
inline std::shared_ptr<MergePolicy> merge_policy(const std::string& name = "") {
  TRACE("Merge Policy: %s", name.c_str());

  // work on a lowered copy, the original spelling is kept for the error message
  std::string normalized_name(name);
  boost::algorithm::to_lower(normalized_name);

  if (normalized_name == "simple") {
    return std::make_shared<SimpleMergePolicy>();
  }

  if (normalized_name == "tiered") {
    return std::make_shared<TieredMergePolicy>();
  }

  throw std::invalid_argument(name + " is not a valid merge policy");
}
} // namespace internal

//! shared pointer to a MergePolicy instance
typedef std::shared_ptr<MergePolicy> merge_policy_t;

} /* namespace internal */
} /* namespace index */
} /* namespace keyvi */
} // namespace internal
} // namespace index
} // namespace keyvi

#endif /* KEYVI_INDEX_INTERNAL_MERGE_POLICY_SELECTOR_H_ */
#endif // KEYVI_INDEX_INTERNAL_MERGE_POLICY_SELECTOR_H_
@@ -51,6 +51,8 @@ class ReadOnlySegment {
public:
explicit ReadOnlySegment(const boost::filesystem::path& path)
: dictionary_path_(path),
dictionary_properties_(std::make_shared<dictionary::DictionaryProperties>(
dictionary::DictionaryProperties::FromFile(path.string()))),
deleted_keys_path_(path),
deleted_keys_during_merge_path_(path),
dictionary_filename_(path.filename().string()),
@@ -70,8 +72,17 @@ class ReadOnlySegment {

/**
 * Get the dictionary of this segment.
 *
 * @return reference to the dictionary member
 */
dictionary::dictionary_t& GetDictionary() { return dictionary_; }

/**
 * Get the properties of the underlying dictionary.
 *
 * @return reference to the dictionary properties member
 */
dictionary::dictionary_properties_t& GetDictionaryProperties() { return dictionary_properties_; }

/**
 * Check the deleted keys flag of this segment.
 *
 * @return value of the has_deleted_keys_ flag
 */
bool HasDeletedKeys() { return has_deleted_keys_; }

/**
 * Get the number of deleted keys.
 *
 * @return size of the deleted keys set if the deleted keys flag is set, 0 otherwise
 */
size_t DeletedKeysSize() const {
  // the deleted keys set is only consulted when the flag is set
  return has_deleted_keys_ ? deleted_keys_->size() : 0;
}

const std::shared_ptr<std::unordered_set<std::string>> DeletedKeys() {
if (!has_deleted_keys_) {
return std::shared_ptr<std::unordered_set<std::string>>();
@@ -107,6 +118,8 @@ class ReadOnlySegment {
protected:
explicit ReadOnlySegment(const boost::filesystem::path& path, bool load_dictionary, bool load_deleted_keys)
: dictionary_path_(path),
dictionary_properties_(std::make_shared<dictionary::DictionaryProperties>(
dictionary::DictionaryProperties::FromFile(path.string()))),
deleted_keys_path_(path),
deleted_keys_during_merge_path_(path),
dictionary_filename_(path.filename().string()),
@@ -127,6 +140,30 @@ class ReadOnlySegment {
}
}

/**
 * Construct a segment from already available dictionary properties.
 *
 * The dictionary path is taken from the file name stored in the properties. The paths of the
 * deleted keys files are the dictionary path with the suffixes ".dk" and ".dkm" appended.
 *
 * @param dictionary_properties properties of the underlying dictionary
 * @param load_dictionary if true, LoadDictionary() is called during construction
 * @param load_deleted_keys if true, LoadDeletedKeys() is called during construction
 */
explicit ReadOnlySegment(const dictionary::dictionary_properties_t& dictionary_properties, bool load_dictionary,
bool load_deleted_keys)
: dictionary_path_(dictionary_properties->GetFileName()),
dictionary_properties_(dictionary_properties),
// NOTE(review): the following initializers read dictionary_path_, which requires dictionary_path_ to be
// declared before them in the class - confirm when reordering members
deleted_keys_path_(dictionary_path_),
deleted_keys_during_merge_path_(dictionary_path_),
dictionary_filename_(dictionary_path_.filename().string()),
dictionary_(),
has_deleted_keys_(false),
deleted_keys_(),
last_modification_time_deleted_keys_(0),
last_modification_time_deleted_keys_during_merge_(0) {
// append the suffixes to the full path, operator+= does not insert a directory separator
deleted_keys_path_ += ".dk";
deleted_keys_during_merge_path_ += ".dkm";

if (load_dictionary) {
LoadDictionary();
}

if (load_deleted_keys) {
LoadDeletedKeys();
}
}

void LoadDictionary() {
// load dictionary
dictionary_.reset(new dictionary::Dictionary(dictionary_path_.string()));
@@ -183,6 +220,9 @@ class ReadOnlySegment {
//! path of the underlying dictionary
boost::filesystem::path dictionary_path_;

//! the properties of the dictionary
dictionary::dictionary_properties_t dictionary_properties_;

//! list of deleted keys
boost::filesystem::path deleted_keys_path_;

@@ -38,7 +38,9 @@
namespace keyvi {
namespace index {
namespace internal {

namespace unit_test {
class SegmentFriend;
}
class Segment final : public ReadOnlySegment {
public:
explicit Segment(const boost::filesystem::path& path, bool no_deletes = false)
@@ -89,7 +91,12 @@ class Segment final : public ReadOnlySegment {

bool HasDeletedKeys() {
LazyLoadDeletedKeys();
return ReadOnlySegment::HasDeletedKeys();
return deleted_keys_for_write_.size() + deleted_keys_during_merge_for_write_.size() > 0;
}

/**
 * Get the number of deleted keys of this segment.
 *
 * Calls LazyLoadDeletedKeys() before counting.
 *
 * @return sum of the sizes of both deleted keys containers used for writing
 */
size_t DeletedKeysSize() {
  LazyLoadDeletedKeys();

  const auto deleted = deleted_keys_for_write_.size();
  const auto deleted_during_merge = deleted_keys_during_merge_for_write_.size();
  return deleted + deleted_during_merge;
}

const std::shared_ptr<std::unordered_set<std::string>> DeletedKeys() {
@@ -175,6 +182,21 @@ class Segment final : public ReadOnlySegment {
bool new_delete_;
boost::filesystem::path deleted_keys_swap_filename_;

// friend for unit testing only
friend class unit_test::SegmentFriend;

/**
 * Construct a segment from already available dictionary properties.
 *
 * The dictionary is not loaded by the base class (load_dictionary is passed as false). The swap file
 * for deleted keys is the dictionary file name with the suffix ".dk-swap" appended.
 *
 * @param dictionary_properties properties of the underlying dictionary
 * @param no_deletes if true, the base class does not load deleted keys and deletes are marked as loaded
 */
explicit Segment(const dictionary::dictionary_properties_t& dictionary_properties, bool no_deletes = false)
: ReadOnlySegment(dictionary_properties, false, !no_deletes),
deleted_keys_for_write_(),
deleted_keys_during_merge_for_write_(),
dictionary_loaded(false),
// NOTE(review): with no_deletes == false the base class loads deleted keys while deletes_loaded stays false -
// confirm that a later lazy load is intended
deletes_loaded(no_deletes),
in_merge_(false),
new_delete_(false),
deleted_keys_swap_filename_(dictionary_properties->GetFileName()) {
deleted_keys_swap_filename_ += ".dk-swap";
}

inline void LazyLoadDictionary() {
if (!dictionary_loaded) {
LoadDictionary();
@@ -215,6 +237,6 @@ typedef const std::shared_ptr<segment_vec_t> const_segments_t;

} // namespace internal
} // namespace index
} /* namespace keyvi */
} // namespace keyvi

#endif // KEYVI_INDEX_INTERNAL_SEGMENT_H_
@@ -41,7 +41,7 @@ class SimpleMergePolicy final : public MergePolicy {
std::vector<segment_t> to_merge;
for (segment_t& s : *segments) {
if (!s->MarkedForMerge()) {
TRACE("Add to merge list %s", s->GetFilename().c_str());
TRACE("Add to merge list %s", s->GetDictionaryFilename().c_str());
to_merge.push_back(s);
}

Oops, something went wrong.

0 comments on commit f27619a

Please sign in to comment.
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.