Skip to content

Commit

Permalink
Merge b5ce6d8 into 0ee02b2
Browse files Browse the repository at this point in the history
  • Loading branch information
Hendrik Muhs committed Jan 27, 2018
2 parents 0ee02b2 + b5ce6d8 commit 6fafec2
Show file tree
Hide file tree
Showing 29 changed files with 1,147 additions and 220 deletions.
4 changes: 2 additions & 2 deletions keyvi/include/keyvi/dictionary/dictionary_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@
#include "dictionary/fsa/generator_adapter.h"
#include "dictionary/fsa/internal/constants.h"
#include "dictionary/fsa/internal/null_value_store.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/sort/in_memory_sorter.h"
#include "dictionary/sort/sorter_common.h"
#include "util/configuration.h"
#include "util/serialization_utils.h"

#if !defined(KEYVI_DISABLE_TPIE)
#include "dictionary/sort/tpie_sorter.h"
Expand Down Expand Up @@ -222,7 +222,7 @@ class DictionaryCompiler final {
* @param manifest as JSON string
*/
void SetManifestFromString(const std::string& manifest) {
SetManifest(fsa::internal::SerializationUtils::ReadJsonRecord(manifest));
SetManifest(keyvi::util::SerializationUtils::ReadJsonRecord(manifest));
}

/**
Expand Down
85 changes: 68 additions & 17 deletions keyvi/include/keyvi/dictionary/dictionary_merger.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,19 @@

#ifndef KEYVI_DICTIONARY_DICTIONARY_MERGER_H_
#define KEYVI_DICTIONARY_DICTIONARY_MERGER_H_

#include <algorithm>
#include <fstream>
#include <functional>
#include <memory>
#include <queue>
#include <string>
#include <vector>

#include <boost/filesystem.hpp>

#include <msgpack.hpp>

#include "dictionary/fsa/automata.h"
#include "dictionary/fsa/entry_iterator.h"
#include "dictionary/fsa/generator_adapter.h"
Expand Down Expand Up @@ -120,11 +128,15 @@ class DictionaryMerger final {
throw std::invalid_argument("Dictionaries must have the same type.");
}

// check whether dictionary is completely empty
const auto segment_iterator = SegmentIterator(fsa::EntryIterator(fsa), segments_pqueue_.size());
if (!segment_iterator) {
return;
}

// push back deleted keys (list might be empty)
deleted_keys_.push_back(TryLoadDeletedKeys(filename));

segments_pqueue_.push(segment_iterator);
inputFiles_.push_back(filename);
dicts_to_merge_.push_back(fsa);
Expand Down Expand Up @@ -168,25 +180,30 @@ class DictionaryMerger final {
}
}

fsa::ValueHandle handle;
handle.no_minimization = false;

// get the weight value, for now simple: does not require access to the
// value store itself
handle.weight = value_store->GetMergeWeight(segment_it.entryIterator().GetValueId());

if (append_merge_) {
handle.value_idx =
value_store->GetMergeValueId(segment_it.segmentIndex(), segment_it.entryIterator().GetValueId());
if (!deleted_keys_[segment_it.segmentIndex()].empty() &&
top_key == deleted_keys_[segment_it.segmentIndex()].back()) {
// check the other deleted_keys,
deleted_keys_[segment_it.segmentIndex()].pop_back();
} else {
handle.value_idx =
value_store->GetValue(segment_it.entryIterator().GetFsa()->GetValueStore()->GetValueStorePayload(),
segment_it.entryIterator().GetValueId(), &handle.no_minimization);
}

TRACE("Add key: %s", top_key.c_str());
generator->Add(std::move(top_key), handle);
fsa::ValueHandle handle;
handle.no_minimization = false;

// get the weight value, for now simple: does not require access to the
// value store itself
handle.weight = value_store->GetMergeWeight(segment_it.entryIterator().GetValueId());

if (append_merge_) {
handle.value_idx =
value_store->GetMergeValueId(segment_it.segmentIndex(), segment_it.entryIterator().GetValueId());
} else {
handle.value_idx =
value_store->GetValue(segment_it.entryIterator().GetFsa()->GetValueStore()->GetValueStorePayload(),
segment_it.entryIterator().GetValueId(), &handle.no_minimization);
}

TRACE("Add key: %s", top_key.c_str());
generator->Add(std::move(top_key), handle);
}
if (++segment_it) {
segments_pqueue_.push(segment_it);
}
Expand All @@ -205,11 +222,45 @@ class DictionaryMerger final {
private:
bool append_merge_ = false;
std::vector<fsa::automata_t> dicts_to_merge_;
std::vector<std::vector<std::string>> deleted_keys_;
std::vector<std::string> inputFiles_;
std::priority_queue<SegmentIterator> segments_pqueue_;

keyvi::util::parameters_t params_;
std::string manifest_ = std::string();

/**
 * Load the deleted keys that accompany a dictionary file, if such a file exists.
 *
 * The keys are read from a sibling file whose name is the given filename with
 * ".dk" appended. Its content is unpacked with msgpack and converted into a
 * list of strings.
 *
 * @param filename path of the dictionary file; ".dk" is appended to locate the deleted keys file
 * @return the deleted keys sorted in descending order; empty if the ".dk" file could not be opened
 */
std::vector<std::string> TryLoadDeletedKeys(const std::string& filename) {
  std::vector<std::string> deleted_keys;
  boost::filesystem::path deleted_keys_file{filename};
  deleted_keys_file += ".dk";

  TRACE("check for deleted keys file: %s", deleted_keys_file.string().c_str());
  std::ifstream deleted_keys_stream(deleted_keys_file.string(), std::ios::binary);

  if (deleted_keys_stream.good()) {
    TRACE("found deleted keys file");

    {
      // reads the buffer as 1 big chunk, could be improved
      // msgpack v2.x provides a better interface (visitor)
      std::stringstream buffer;
      buffer << deleted_keys_stream.rdbuf();

      // str() returns a new copy on every call: take it once so that pointer
      // and length refer to the same string and the payload is copied only once
      const std::string serialized = buffer.str();

      msgpack::unpacked unpacked_object;
      msgpack::unpack(unpacked_object, serialized.data(), serialized.size());

      unpacked_object.get().convert(deleted_keys);
    }

    // sort in reverse order, the merge loop compares against back() and pops from there
    std::sort(deleted_keys.begin(), deleted_keys.end(), std::greater<std::string>());
  }

  return deleted_keys;
}
};

} /* namespace dictionary */
Expand Down
4 changes: 2 additions & 2 deletions keyvi/include/keyvi/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@
#include "dictionary/fsa/internal/constants.h"
#include "dictionary/fsa/internal/intrinsics.h"
#include "dictionary/fsa/internal/memory_map_flags.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/fsa/internal/value_store_factory.h"
#include "dictionary/fsa/traversal/traversal_base.h"
#include "dictionary/fsa/traversal/weighted_traversal.h"
#include "dictionary/keyvi_file.h"
#include "dictionary/util/endian.h"
#include "util/serialization_utils.h"
#include "util/vint.h"

// #define ENABLE_TRACING
Expand All @@ -69,7 +69,7 @@ class Automata final {
number_of_keys_ = boost::lexical_cast<uint64_t>(automata_properties_.get<std::string>("number_of_keys"));

std::istream& persistenceStream = keyViFile.persistenceStream();
sparse_array_properties_ = internal::SerializationUtils::ReadJsonRecord(persistenceStream);
sparse_array_properties_ = keyvi::util::SerializationUtils::ReadJsonRecord(persistenceStream);

const size_t bucket_size = sizeof(uint16_t);
const size_t array_size = boost::lexical_cast<size_t>(sparse_array_properties_.get<std::string>("size"));
Expand Down
6 changes: 3 additions & 3 deletions keyvi/include/keyvi/dictionary/fsa/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@
#include <boost/property_tree/ptree.hpp>

#include "dictionary/fsa/internal/null_value_store.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/fsa/internal/sparse_array_builder.h"
#include "dictionary/fsa/internal/unpacked_state.h"
#include "dictionary/fsa/internal/unpacked_state_stack.h"
#include "util/configuration.h"
#include "util/serialization_utils.h"

// #define ENABLE_TRACING
#include "dictionary/util/trace.h"
Expand Down Expand Up @@ -338,7 +338,7 @@ class Generator final {
* @param manifest as JSON string
*/
inline void SetManifestFromString(const std::string& manifest) {
SetManifest(internal::SerializationUtils::ReadJsonRecord(manifest));
SetManifest(keyvi::util::SerializationUtils::ReadJsonRecord(manifest));
}

/**
Expand Down Expand Up @@ -373,7 +373,7 @@ class Generator final {
pt.put("number_of_states", std::to_string(number_of_states_));
pt.add_child("manifest", manifest_);

internal::SerializationUtils::WriteJsonRecord(stream, pt);
keyvi::util::SerializationUtils::WriteJsonRecord(stream, pt);
}

inline void FeedStack(const size_t start, const std::string& key) {
Expand Down
10 changes: 5 additions & 5 deletions keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@
#include "dictionary/fsa/internal/lru_generation_cache.h"
#include "dictionary/fsa/internal/memory_map_flags.h"
#include "dictionary/fsa/internal/memory_map_manager.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/fsa/internal/value_store_persistence.h"
#include "dictionary/keyvi_file.h"
#include "util/configuration.h"
#include "util/json_value.h"
#include "util/serialization_utils.h"

// #define ENABLE_TRACING
#include "dictionary/util/trace.h"
Expand Down Expand Up @@ -121,7 +121,7 @@ class JsonValueStore final : public IValueStoreWriter {
KeyViFile keyViFile(filename);

auto& vsStream = keyViFile.valueStoreStream();
const boost::property_tree::ptree props = internal::SerializationUtils::ReadValueStoreProperties(vsStream);
const boost::property_tree::ptree props = keyvi::util::SerializationUtils::ReadValueStoreProperties(vsStream);
offsets_.push_back(values_buffer_size_);

number_of_values_ += boost::lexical_cast<size_t>(props.get<std::string>("values"));
Expand Down Expand Up @@ -205,7 +205,7 @@ class JsonValueStore final : public IValueStoreWriter {
pt.put(std::string("__") + COMPRESSION_THRESHOLD_KEY, compression_threshold_);
}

internal::SerializationUtils::WriteJsonRecord(stream, pt);
keyvi::util::SerializationUtils::WriteJsonRecord(stream, pt);
TRACE("Wrote JSON header, stream at %d", stream.tellp());

if (!mergeMode_) {
Expand All @@ -214,7 +214,7 @@ class JsonValueStore final : public IValueStoreWriter {
for (const auto& filename : inputFiles_) {
KeyViFile keyViFile(filename);
auto& in_stream = keyViFile.valueStoreStream();
internal::SerializationUtils::ReadValueStoreProperties(in_stream);
keyvi::util::SerializationUtils::ReadValueStoreProperties(in_stream);

stream << in_stream.rdbuf();
}
Expand Down Expand Up @@ -303,7 +303,7 @@ class JsonValueStoreReader final : public IValueStoreReader {
: IValueStoreReader(stream, file_mapping) {
TRACE("JsonValueStoreReader construct");

properties_ = internal::SerializationUtils::ReadValueStoreProperties(stream);
properties_ = keyvi::util::SerializationUtils::ReadValueStoreProperties(stream);

const size_t offset = stream.tellg();
const size_t strings_size = boost::lexical_cast<size_t>(properties_.get<std::string>("size"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@

#include "dictionary/fsa/internal/constants.h"
#include "dictionary/fsa/internal/memory_map_manager.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/util/endian.h"
#include "util/serialization_utils.h"
#include "util/vint.h"

// #define PERSISTENCE_DEBUG
Expand Down Expand Up @@ -196,7 +196,7 @@ class SparseArrayPersistence final {

pt.put("size", std::to_string(highest_write_position));

internal::SerializationUtils::WriteJsonRecord(stream, pt);
keyvi::util::SerializationUtils::WriteJsonRecord(stream, pt);
TRACE("Wrote JSON header, stream at %d", stream.tellp());

labels_extern_->Write(stream, highest_write_position);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,8 @@
#include "dictionary/fsa/internal/ivalue_store.h"
#include "dictionary/fsa/internal/memory_map_flags.h"
#include "dictionary/fsa/internal/minimization_hash.h"
#include "dictionary/fsa/internal/serialization_utils.h"

// #define ENABLE_TRACING
#include "dictionary/util/trace.h"
#include "util/serialization_utils.h"

namespace keyvi {
namespace dictionary {
Expand Down Expand Up @@ -180,7 +178,7 @@ class StringValueStore final : public IValueStoreWriter {
boost::property_tree::ptree pt;
pt.put("size", std::to_string(string_values_.size()));

internal::SerializationUtils::WriteJsonRecord(stream, pt);
keyvi::util::SerializationUtils::WriteJsonRecord(stream, pt);
TRACE("Wrote JSON header, stream at %d", stream.tellp());

stream.write((const char*)&string_values_[0], string_values_.size());
Expand Down Expand Up @@ -217,7 +215,7 @@ class StringValueStoreReader final : public IValueStoreReader {
StringValueStoreReader(std::istream& stream, boost::interprocess::file_mapping* file_mapping,
loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: IValueStoreReader(stream, file_mapping) {
const boost::property_tree::ptree properties = internal::SerializationUtils::ReadValueStoreProperties(stream);
const boost::property_tree::ptree properties = keyvi::util::SerializationUtils::ReadValueStoreProperties(stream);

const size_t offset = stream.tellg();
const size_t strings_size = boost::lexical_cast<size_t>(properties.get<std::string>("size"));
Expand Down
6 changes: 3 additions & 3 deletions keyvi/include/keyvi/dictionary/keyvi_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#include <string>

#include "dictionary/fsa/internal/constants.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "util/serialization_utils.h"

namespace keyvi {
namespace dictionary {
Expand All @@ -51,14 +51,14 @@ class KeyViFile {
throw std::invalid_argument("not a keyvi file");
}

automata_properties_ = fsa::internal::SerializationUtils::ReadJsonRecord(file_stream_);
automata_properties_ = keyvi::util::SerializationUtils::ReadJsonRecord(file_stream_);
persistence_offset_ = file_stream_.tellg();

if (boost::lexical_cast<int>(automata_properties_.get<std::string>("version")) < KEYVI_FILE_VERSION_MIN) {
throw std::invalid_argument("this version of keyvi file is unsupported");
}

const ptree sparse_array_properties = fsa::internal::SerializationUtils::ReadJsonRecord(file_stream_);
const ptree sparse_array_properties = keyvi::util::SerializationUtils::ReadJsonRecord(file_stream_);

if (boost::lexical_cast<int>(sparse_array_properties.get<std::string>("version")) <
KEYVI_FILE_PERSISTENCE_VERSION_MIN) {
Expand Down
9 changes: 2 additions & 7 deletions keyvi/include/keyvi/index/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,19 @@
#include <boost/filesystem.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/interprocess/sync/file_lock.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>

#include "dictionary/dictionary.h"
#include "dictionary/dictionary_compiler.h"
#include "dictionary/dictionary_types.h"
#include "dictionary/fsa/internal/serialization_utils.h"
#include "dictionary/match.h"
#include "index/internal/base_index_reader.h"
#include "index/internal/index_writer_worker.h"
#include "index/internal/segment.h"

// #define ENABLE_TRACING
#include "dictionary/util/trace.h"

namespace keyvi {
namespace index {

class Index final : public internal::BaseIndexReader<internal::IndexWriterWorker> {
class Index final : public internal::BaseIndexReader<internal::IndexWriterWorker, internal::Segment> {
public:
explicit Index(const std::string& index_directory,
const keyvi::util::parameters_t& params = keyvi::util::parameters_t())
Expand Down
Loading

0 comments on commit 6fafec2

Please sign in to comment.