From 4ba6aea08bf377e5f353ef1fec8172edb8291e28 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 2 Feb 2022 07:50:14 -0600 Subject: [PATCH 01/85] Replace TraitIdArray with a vector since it doesn't have to live in shared memory --- src/api/traits.cc | 4 ++-- src/api/traits.h | 4 ++-- src/hermes_types.h | 10 ++-------- test/bucket_test.cc | 2 +- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/api/traits.cc b/src/api/traits.cc index 71b9b796a..4dc91a371 100644 --- a/src/api/traits.cc +++ b/src/api/traits.cc @@ -18,7 +18,7 @@ namespace hermes { namespace api { -Trait::Trait(TraitID id, TraitIdArray conflict_traits, TraitType type) +Trait::Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type) : id(id), conflict_traits(conflict_traits), type(type), @@ -32,7 +32,7 @@ using OffsetMap = std::unordered_map<std::string, u64>; PersistTrait::PersistTrait(const std::string &filename, const OffsetMap &offset_map, bool synchronous) - : Trait(HERMES_PERSIST_TRAIT, TraitIdArray(), TraitType::PERSIST), + : Trait(HERMES_PERSIST_TRAIT, std::vector<TraitID>(), TraitType::PERSIST), filename(filename), offset_map(offset_map), synchronous(synchronous) { this->onAttachFn = std::bind(&PersistTrait::onAttach, this, std::placeholders::_1, std::placeholders::_2, diff --git a/src/api/traits.h b/src/api/traits.h index 48bc567fc..7e06857dd 100644 --- a/src/api/traits.h +++ b/src/api/traits.h @@ -43,7 +43,7 @@ struct Trait { /** The trait's ID */ TraitID id; /** \todo ??? */ - TraitIdArray conflict_traits; + std::vector<TraitID> conflict_traits; /** The trait's type */ TraitType type; /** Callback for trait->vbucket attach events */ @@ -56,7 +56,7 @@ struct Trait { OnLinkCallback onUnlinkFn; Trait() {} - Trait(TraitID id, TraitIdArray conflict_traits, TraitType type); + Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type); }; #define HERMES_PERSIST_TRAIT 11 diff --git a/src/hermes_types.h b/src/hermes_types.h index bbe878f24..1ddc4b681 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -310,15 +310,9 @@ namespace api { enum class TraitType : u8 { META = 0, DATA = 1, - FILE_MAPPING = 2, - PERSIST = 3, -}; -} // namespace api - -struct TraitIdArray { - TraitID *ids; - u32 length; + PERSIST = 2, }; +} // namespace api } // namespace hermes #endif // HERMES_TYPES_H_ diff --git a/test/bucket_test.cc b/test/bucket_test.cc index aa9f90e92..772300fac 100644 --- a/test/bucket_test.cc +++ b/test/bucket_test.cc @@ -31,7 +31,7 @@ int compress_blob(HermesPtr hermes, hapi::TraitInput &input, hapi::Trait *trait); struct MyTrait : public hapi::Trait { int compress_level; - MyTrait() : Trait(10001, hermes::TraitIdArray(), hapi::TraitType::META) { + MyTrait() : Trait(10001, std::vector<hermes::TraitID>(), hapi::TraitType::META) { onLinkFn = std::bind(&compress_blob, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); From 4fc7835d2105a243a4a8a3cc6476adc7764ad702 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 2 Feb 2022 16:20:10 -0600 Subject: [PATCH 02/85] Add WriteOnlyTrait and test [skip ci] --- src/api/traits.cc | 71 +++++++++++++++++++++++++++++------ src/api/traits.h | 18 +++++++-- src/buffer_organizer.cc | 4 ++ src/metadata_management.h | 1 + test/bucket_test.cc | 8 ++-- test/buffer_organizer_test.cc | 44 +++++++++++++++++++--- 6 files changed, 121 insertions(+), 25 deletions(-) diff --git a/src/api/traits.cc b/src/api/traits.cc index 4dc91a371..cacf4c945 100644 --- a/src/api/traits.cc +++ b/src/api/traits.cc @@ -34,18 +34,18 @@ PersistTrait::PersistTrait(const std::string &filename, bool synchronous) :
Trait(HERMES_PERSIST_TRAIT, std::vector<TraitID>(), TraitType::PERSIST), filename(filename), offset_map(offset_map), synchronous(synchronous) { - this->onAttachFn = std::bind(&PersistTrait::onAttach, this, - std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - this->onDetachFn = std::bind(&PersistTrait::onDetach, this, - std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - this->onLinkFn = std::bind(&PersistTrait::onLink, this, - std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); - this->onUnlinkFn = std::bind(&PersistTrait::onUnlink, this, - std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); + onAttachFn = std::bind(&PersistTrait::onAttach, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onDetachFn = std::bind(&PersistTrait::onDetach, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onLinkFn = std::bind(&PersistTrait::onLink, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onUnlinkFn = std::bind(&PersistTrait::onUnlink, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); } void PersistTrait::onAttach(HermesPtr hermes, VBucketID id, Trait *trait) { @@ -95,5 +95,52 @@ void PersistTrait::onUnlink(HermesPtr hermes, TraitInput &input, Trait *trait) { (void)trait; } +WriteOnlyTrait::WriteOnlyTrait() + : Trait(HERMES_WRITE_ONLY_TRAIT, std::vector<TraitID>(), TraitType::META) { + onAttachFn = std::bind(&WriteOnlyTrait::onAttach, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onDetachFn = std::bind(&WriteOnlyTrait::onDetach, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onLinkFn = std::bind(&WriteOnlyTrait::onLink, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + onUnlinkFn = std::bind(&WriteOnlyTrait::onUnlink, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); +} + +void WriteOnlyTrait::onAttach(HermesPtr hermes, VBucketID id, Trait *trait) { + (void)hermes; + (void)id; + (void)trait; +} + +void WriteOnlyTrait::onDetach(HermesPtr hermes, VBucketID id, Trait *trait) { + (void)hermes; + (void)id; + (void)trait; +} + +void WriteOnlyTrait::onLink(HermesPtr hermes, TraitInput &input, Trait *trait) { + (void)trait; + + SharedMemoryContext *context = &hermes->context_; + RpcContext *rpc = &hermes->rpc_; + BucketID bucket_id = GetBucketId(context, rpc, input.bucket_name.c_str()); + f32 epsilon = 0.1; + f32 custom_importance = 0; + hermes::OrganizeBlob(context, rpc, bucket_id, input.blob_name, epsilon, + custom_importance); +} + +void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input, + Trait *trait) { + (void)hermes; + (void)input; + (void)trait; +} + } // namespace api } // namespace hermes diff --git a/src/api/traits.h b/src/api/traits.h index 7e06857dd..7668d4e1e 100644 --- a/src/api/traits.h +++ b/src/api/traits.h @@ -21,6 +21,9 @@ namespace hermes { namespace api { +#define HERMES_PERSIST_TRAIT 11 +#define HERMES_WRITE_ONLY_TRAIT 12 + /** A blob's hosting bucket and blob names */ struct BlobInfo { /** The blob-hosting bucket name */ @@ -59,8 +62,6 @@ struct Trait { Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type); }; -#define HERMES_PERSIST_TRAIT 11 - /** (File) Persistence trait */ struct PersistTrait : public Trait { std::string filename; @@ -74,8 +75,17 @@ struct PersistTrait : public Trait { void onAttach(HermesPtr hermes, VBucketID id, Trait
*trait); void onDetach(HermesPtr hermes, VBucketID id, Trait *trait); - void onLink(HermesPtr hermes, TraitInput &blob, Trait *trait); - void onUnlink(HermesPtr hermes, TraitInput &blob, Trait *trait); + void onLink(HermesPtr hermes, TraitInput &input, Trait *trait); + void onUnlink(HermesPtr hermes, TraitInput &input, Trait *trait); +}; + +struct WriteOnlyTrait : public Trait { + WriteOnlyTrait(); + + void onAttach(HermesPtr hermes, VBucketID id, Trait *trait); + void onDetach(HermesPtr hermes, VBucketID id, Trait *trait); + void onLink(HermesPtr hermes, TraitInput &input, Trait *trait); + void onUnlink(HermesPtr hermes, TraitInput &input, Trait *trait); }; } // namespace api diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index ff8b9945d..6e40cfa42 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -174,9 +174,11 @@ void LocalEnqueueBoMove(SharedMemoryContext *context, RpcContext *rpc, BoPriority priority) { ThreadPool *pool = &context->bo->pool; bool is_high_priority = priority == BoPriority::kHigh; + VLOG(1) << "BufferOrganizer moving Blob " << blob_id.as_int; pool->run(std::bind(BoMove, context, rpc, moves, blob_id, bucket_id, internal_blob_name), is_high_priority); + VLOG(1) << "BufferOrganizer " << blob_id.as_int << " done\n"; } /** @@ -260,6 +262,8 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, BlobInfo new_info = {}; BlobInfo *old_info = GetBlobInfoPtr(mdm, blob_id); new_info.stats = old_info->stats; + // Invalidate the old Blob. It will get deleted when its TicketMutex + // reaches old_info->last old_info->stop = true; ReleaseBlobInfoPtr(mdm); LocalPut(mdm, new_blob_id, new_info); diff --git a/src/metadata_management.h b/src/metadata_management.h index acb53aebd..3c1f98530 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -122,6 +122,7 @@ struct VBucketInfo { ChunkedIdList blobs; std::atomic ref_count; std::atomic async_flush_count; + /** Not currently used since Traits are process local. 
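+ * Trait callbacks are process-local function objects, so the IDs stored here cannot be used to invoke a Trait attached by another rank.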
*/ TraitID traits[kMaxTraitsPerVBucket]; bool active; }; diff --git a/test/bucket_test.cc b/test/bucket_test.cc index 772300fac..5f6403fdb 100644 --- a/test/bucket_test.cc +++ b/test/bucket_test.cc @@ -31,10 +31,10 @@ int compress_blob(HermesPtr hermes, hapi::TraitInput &input, hapi::Trait *trait); struct MyTrait : public hapi::Trait { int compress_level; - MyTrait() : Trait(10001, std::vector(), hapi::TraitType::META) { - onLinkFn = - std::bind(&compress_blob, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3); + MyTrait() : Trait(10001, std::vector(), + hapi::TraitType::META) { + onLinkFn = std::bind(&compress_blob, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); } // optional function pointer if only known at runtime diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 882a7b13c..fdb7820c1 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -33,7 +33,8 @@ using hermes::TargetID; using hermes::BucketID; using hermes::BlobID; using hermes::BufferInfo; - +using hapi::VBucket; +using hapi::Bucket; static void TestIsBoFunction() { using hermes::IsBoFunction; @@ -243,6 +244,30 @@ void TestOrganizeBlob() { hermes->Finalize(true); } +static void TestWriteOnlyBucket() { + HermesPtr hermes = hermes::InitHermesDaemon(); + std::string bkt_name = "WriteOnly"; + VBucket vbkt(bkt_name, hermes); + Bucket bkt(bkt_name, hermes); + + + hapi::WriteOnlyTrait trait; + vbkt.Attach(&trait); + + hapi::Blob blob(KILOBYTES(4), 127); + + const int kIters = 128; + for (int i = 0; i < kIters; ++i) { + std::string blob_name = "b" + std::to_string(i); + bkt.Put(blob_name, blob); + vbkt.Link(blob_name, bkt_name); + } + + vbkt.Destroy(); + bkt.Destroy(); + hermes->Finalize(true); +} + int main(int argc, char *argv[]) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -251,10 +276,19 @@ int main(int argc, char *argv[]) { return 1; } - TestIsBoFunction(); - TestBackgroundFlush(); - TestBoMove(); - TestOrganizeBlob(); +#define HERMES_ADD_TEST(test_name) \ + if (argc == 1 || std::string(argv[1]) == #test_name) { \ + fprintf(stdout, "### Running %s\n", #test_name); \ + test_name(); \ + } + + HERMES_ADD_TEST(TestIsBoFunction); + HERMES_ADD_TEST(TestBackgroundFlush); + HERMES_ADD_TEST(TestBoMove); + HERMES_ADD_TEST(TestOrganizeBlob); + HERMES_ADD_TEST(TestWriteOnlyBucket); + +#undef HERMES_ADD_TEST MPI_Finalize(); From b25e8ce3fbd9ce7f1abc946146e50dfc6dfc73cc Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 10 Feb 2022 08:12:49 -0600 Subject: [PATCH 03/85] Fix warning --- benchmarks/vpic_bench.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/vpic_bench.cc b/benchmarks/vpic_bench.cc index b6283ee30..2e3bd93f5 100644 --- a/benchmarks/vpic_bench.cc +++ b/benchmarks/vpic_bench.cc @@ -489,7 +489,8 @@ void CheckResults(float *data, size_t num_elements, CHECK(f); std::vector read_data(num_elements); - fread(read_data.data(), 1, num_elements * sizeof(float), f); + size_t bytes_read = fread(read_data.data(), 1, num_elements * sizeof(float), + f); for (size_t i = 0; i < num_elements; ++i) { Assert(data[i] == read_data[i]); From d47081325fb450318065b029810726b557c898bb Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 10 Feb 2022 08:28:10 -0600 Subject: [PATCH 04/85] Fix unused variable warning --- benchmarks/vpic_bench.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/vpic_bench.cc 
b/benchmarks/vpic_bench.cc index 2e3bd93f5..b83a04969 100644 --- a/benchmarks/vpic_bench.cc +++ b/benchmarks/vpic_bench.cc @@ -489,8 +489,9 @@ void CheckResults(float *data, size_t num_elements, CHECK(f); std::vector<float> read_data(num_elements); - size_t bytes_read = fread(read_data.data(), 1, num_elements * sizeof(float), - f); + size_t num_bytes = num_elements * sizeof(float); + size_t bytes_read = fread(read_data.data(), 1, num_bytes, f); + Assert(bytes_read == num_bytes); for (size_t i = 0; i < num_elements; ++i) { Assert(data[i] == read_data[i]); From 15c938d1a692e6e7230139349150c31d822729d8 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 10 Feb 2022 08:39:52 -0600 Subject: [PATCH 05/85] TestWriteOnly BORG test working --- src/metadata_management.cc | 3 +++ test/buffer_organizer_test.cc | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/metadata_management.cc b/src/metadata_management.cc index e366f5026..b7b635ccb 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -727,6 +727,9 @@ void WaitForOutstandingBlobOps(MetadataManager *mdm, BlobID blob_id) { BlobInfo *blob_info = GetBlobInfoPtr(mdm, blob_id); if (blob_info) { t = TryBeginTicketMutex(&blob_info->lock, ticket); + } else { + // Blob was deleted + break; } if (!t.acquired) { ReleaseBlobInfoPtr(mdm); diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index fdb7820c1..905691a0e 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -250,7 +250,6 @@ static void TestWriteOnlyBucket() { VBucket vbkt(bkt_name, hermes); Bucket bkt(bkt_name, hermes); - hapi::WriteOnlyTrait trait; vbkt.Attach(&trait); From e0fad8e5f63cb013d81c9e1a806d541f37735692 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 10 Feb 2022 11:14:34 -0600 Subject: [PATCH 06/85] Add ReadOnlyTrait and test --- src/api/traits.cc | 18 ++++++++++++++ src/api/traits.h | 9 +++++++ src/api/vbucket.cc | 44 +++++++++++++++++++++++++++-------- src/api/vbucket.h | 19 +++++++++++---- test/buffer_organizer_test.cc | 24 +++++++++++++++++++ 5 files changed, 99 insertions(+), 15 deletions(-) diff --git a/src/api/traits.cc b/src/api/traits.cc index cacf4c945..b6fa06d7d 100644 --- a/src/api/traits.cc +++ b/src/api/traits.cc @@ -142,5 +142,23 @@ void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input, (void)trait; } +ReadOnlyTrait::ReadOnlyTrait() + : Trait(HERMES_READ_ONLY_TRAIT, std::vector<TraitID>(), TraitType::META) { + onGetFn = std::bind(&ReadOnlyTrait::onGet, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); +} + +void ReadOnlyTrait::onGet(HermesPtr hermes, TraitInput &input, Trait *trait) { + (void)trait; + + SharedMemoryContext *context = &hermes->context_; + RpcContext *rpc = &hermes->rpc_; + BucketID bucket_id = GetBucketId(context, rpc, input.bucket_name.c_str()); + f32 epsilon = 0.1f; + f32 custom_importance = 1.0f; + hermes::OrganizeBlob(context, rpc, bucket_id, input.blob_name, epsilon, + custom_importance); +} + } // namespace api } // namespace hermes diff --git a/src/api/traits.h b/src/api/traits.h index 7668d4e1e..abefa2866 100644 --- a/src/api/traits.h +++ b/src/api/traits.h @@ -23,6 +23,7 @@ namespace api { #define HERMES_PERSIST_TRAIT 11 #define HERMES_WRITE_ONLY_TRAIT 12 +#define HERMES_READ_ONLY_TRAIT 13 /** A blob's hosting bucket and blob names */ struct BlobInfo { @@ -57,6 +58,8 @@ struct Trait { OnLinkCallback onLinkFn; /** Callback for blob->vbucket unlink events */ OnLinkCallback onUnlinkFn; + /** Callback for VBucket Get events */ + OnLinkCallback onGetFn; Trait() {} Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type); @@ -86,8 +89,14 @@ struct WriteOnlyTrait : public Trait {
void onDetach(HermesPtr hermes, VBucketID id, Trait *trait); void onLink(HermesPtr hermes, TraitInput &input, Trait *trait); void onUnlink(HermesPtr hermes, TraitInput &input, Trait *trait); + void onGet(HermesPtr hermes, TraitInput &input, Trait *trait); }; +struct ReadOnlyTrait : public Trait { + ReadOnlyTrait(); + + void onGet(HermesPtr hermes, TraitInput &input, Trait *trait); +}; } // namespace api } // namespace hermes diff --git a/src/api/vbucket.cc b/src/api/vbucket.cc index df375184b..07e1f66b5 100644 --- a/src/api/vbucket.cc +++ b/src/api/vbucket.cc @@ -135,16 +135,40 @@ bool VBucket::ContainsBlob(std::string blob_name, std::string bucket_name) { return ret; } -Blob& VBucket::GetBlob(std::string blob_name, std::string bucket_name) { - LOG(INFO) << "Retrieving blob " << blob_name << " from bucket " << bucket_name - << " in VBucket " << name_ << '\n'; - hermes::api::Context ctx; - Bucket bkt(bucket_name, hermes_, ctx); - local_blob = {}; - size_t blob_size = bkt.Get(blob_name, local_blob, ctx); - local_blob.resize(blob_size); - bkt.Get(blob_name, local_blob, ctx); - return local_blob; +size_t VBucket::Get(const std::string &name, Bucket &bkt, Blob &user_blob, + const Context &ctx) { + size_t ret = Get(name, bkt, user_blob.data(), user_blob.size(), ctx); + + return ret; +} + +size_t VBucket::Get(const std::string &name, Bucket &bkt, Blob &user_blob) { + size_t result = Get(name, bkt, user_blob, ctx_); + + return result; +} + +size_t VBucket::Get(const std::string &name, Bucket &bkt, void *user_blob, + size_t blob_size, const Context &ctx) { + bool is_size_query = false; + if (blob_size != 0) { + is_size_query = true; + } + + size_t result = bkt.Get(name, user_blob, blob_size, ctx); + + if (!is_size_query) { + TraitInput input; + input.blob_name = name; + input.bucket_name = bkt.GetName(); + for (const auto& t : attached_traits_) { + if (t->onGetFn != nullptr) { + t->onGetFn(hermes_, input, t); + } + } + } + + return result; } std::vector VBucket::GetLinks(Context& ctx) { diff --git a/src/api/vbucket.h b/src/api/vbucket.h index dddda13c2..dfc8c117c 100644 --- a/src/api/vbucket.h +++ b/src/api/vbucket.h @@ -39,8 +39,6 @@ class VBucket { VBucketID id_; /** Traits attached to this vbucket */ std::list attached_traits_; - /** \todo What's that Bob? */ - Blob local_blob; /** internal Hermes object owned by vbucket */ std::shared_ptr hermes_; /** The Context for this VBucket. \todo Why do we need that? */ @@ -52,7 +50,6 @@ class VBucket { : name_(initial_name), id_({{0, 0}}), attached_traits_(), - local_blob(), hermes_(h), ctx_(ctx) { if (IsVBucketNameTooLong(name_)) { @@ -116,8 +113,20 @@ class VBucket { /** check if blob is in this vbucket */ bool ContainsBlob(std::string blob_name, std::string bucket_name); - /** get a blob linked to this vbucket */ - Blob &GetBlob(std::string blob_name, std::string bucket_name); + /** Get a Blob, calling any OnGet callbacks of attached Traits. + * + * Exactly like Bucket::Get, except this function invokes the OnGet callback + * of any attached Traits. + */ + size_t Get(const std::string &name, Bucket &bkt, Blob &user_blob, + const Context &ctx); + size_t Get(const std::string &name, Bucket &bkt, Blob &user_blob); + + /** + * * * \brief Retrieve a Blob into a user buffer. 
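+   * A minimal usage sketch (names as in buffer_organizer_test.cc's
+   * TestReadOnlyBucket; Get through the VBucket so attached Traits'
+   * OnGet callbacks run):
+   * \code
+   *   hapi::Blob retrieved_data(kBlobSize);
+   *   vbkt.Get(blob_name, bkt, retrieved_data);
+   * \endcode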
+ * * */ + size_t Get(const std::string &name, Bucket &bkt, void *user_blob, + size_t blob_size, const Context &ctx); /** retrieves the subset of blob links satisfying pred */ /** could return iterator */ diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 905691a0e..d404847b0 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -267,6 +267,29 @@ static void TestWriteOnlyBucket() { hermes->Finalize(true); } +static void TestReadOnlyBucket() { + HermesPtr hermes = hermes::InitHermesDaemon(); + std::string bkt_name = "ReadOnly"; + VBucket vbkt(bkt_name, hermes); + Bucket bkt(bkt_name, hermes); + + hapi::ReadOnlyTrait trait; + vbkt.Attach(&trait); + + hapi::Blob blob(KILOBYTES(4), 127); + + const int kIters = 128; + for (int i = 0; i < kIters; ++i) { + std::string blob_name = "b" + std::to_string(i); + bkt.Put(blob_name, blob); + vbkt.Link(blob_name, bkt_name); + } + + vbkt.Destroy(); + bkt.Destroy(); + hermes->Finalize(true); +} + int main(int argc, char *argv[]) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -286,6 +309,7 @@ int main(int argc, char *argv[]) { HERMES_ADD_TEST(TestBoMove); HERMES_ADD_TEST(TestOrganizeBlob); HERMES_ADD_TEST(TestWriteOnlyBucket); + HERMES_ADD_TEST(TestReadOnlyBucket); #undef HERMES_ADD_TEST From 8c92f1cb41e9071112dd6470aed1dec5c255dde7 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 10 Feb 2022 14:37:52 -0600 Subject: [PATCH 07/85] Try to fix undefined symbol that can't be reproduced locally --- test/bucket_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/bucket_test.cc b/test/bucket_test.cc index 5f6403fdb..484ad82ee 100644 --- a/test/bucket_test.cc +++ b/test/bucket_test.cc @@ -31,8 +31,8 @@ int compress_blob(HermesPtr hermes, hapi::TraitInput &input, hapi::Trait *trait); struct MyTrait : public hapi::Trait { int compress_level; - MyTrait() : Trait(10001, std::vector(), - hapi::TraitType::META) { + MyTrait() : hapi::Trait(10001, std::vector(), + hapi::TraitType::META) { onLinkFn = std::bind(&compress_blob, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); } From f4896f69539f07ded0bf11fbd2125c50a4d8cd19 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 11 Feb 2022 09:08:40 -0600 Subject: [PATCH 08/85] Add config variables for BO min/max capacity thresholds --- src/config_parser.cc | 13 +++++++++++-- src/config_parser.h | 1 + src/hermes_types.h | 4 +++- src/utils.cc | 6 ++++++ test/config_parser_test.cc | 7 +++++++ test/data/hermes.conf | 13 +++++++++++++ 6 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/config_parser.cc b/src/config_parser.cc index 7fbe6883b..142c628ae 100644 --- a/src/config_parser.cc +++ b/src/config_parser.cc @@ -82,6 +82,7 @@ static const char *kConfigVariableStrings[ConfigVariable_Count] = { "is_shared_device", "buffer_organizer_num_threads", "default_rr_split", + "bo_capacity_thresholds_mb", }; EntireFile ReadEntireFile(Arena *arena, const char *path) { @@ -505,8 +506,8 @@ Token *ParseIntList(Token *tok, int *out, int n) { return tok; } -Token *ParseIntListList(Token *tok, int out[][hermes::kMaxBufferPoolSlabs], - int n, int *m) { +template +Token *ParseIntListList(Token *tok, int out[][N], int n, int *m) { if (IsOpenCurlyBrace(tok)) { tok = tok->next; for (int i = 0; i < n; ++i) { @@ -979,6 +980,14 @@ void ParseTokens(TokenList *tokens, Config *config) { config->default_rr_split = ParseInt(&tok); break; } + case 
ConfigVariable_BOCapacityThresholdsMiB: { + RequireNumDevices(config); + // Each entry has a min and max threshold + std::vector num_thresholds(config->num_devices, 2); + tok = ParseIntListList(tok, config->bo_capacity_thresholds_mb, + config->num_devices, num_thresholds.data()); + break; + } default: { HERMES_INVALID_CODE_PATH; break; diff --git a/src/config_parser.h b/src/config_parser.h index 480e9f2bd..cf6c4d693 100644 --- a/src/config_parser.h +++ b/src/config_parser.h @@ -69,6 +69,7 @@ enum ConfigVariable { ConfigVariable_IsSharedDevice, ConfigVariable_BoNumThreads, ConfigVariable_RRSplit, + ConfigVariable_BOCapacityThresholdsMiB, ConfigVariable_Count }; diff --git a/src/hermes_types.h b/src/hermes_types.h index 1ddc4b681..f7c1a0cf8 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -245,7 +245,9 @@ struct Config { api::PlacementPolicy default_placement_policy; /** Whether blob splitting is enabled for Round-Robin blob placement. */ bool default_rr_split; - + /** The min and max capacity threshold in MiB for each device at which the + * BufferOrganizer will trigger. */ + int bo_capacity_thresholds_mb[kMaxDevices][2]; /** A base name for the BufferPool shared memory segement. Hermes appends the * value of the USER environment variable to this string. */ diff --git a/src/utils.cc b/src/utils.cc index b3977c000..159e84a46 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -120,6 +120,12 @@ void InitDefaultConfig(Config *config) { config->bo_num_threads = 4; config->default_rr_split = false; + + for (int i = 0; i < config->num_devices; ++i) { + config->bo_capacity_thresholds_mb[i][0] = 0; + int max_capacity_mb = (int)((f32)config->capacities[i] / 1024.0f / 1024.0f); + config->bo_capacity_thresholds_mb[i][1] = max_capacity_mb; + } } void FailedLibraryCall(std::string func) { diff --git a/test/config_parser_test.cc b/test/config_parser_test.cc index b3f7fdb1c..e77bd18a7 100644 --- a/test/config_parser_test.cc +++ b/test/config_parser_test.cc @@ -223,6 +223,13 @@ void TestDefaultConfig(Arena *arena, const char *config_file) { Assert(config.is_shared_device[2] == 0); Assert(config.is_shared_device[3] == 0); + + for (int i = 0; i < config.num_devices; ++i) { + Assert(config.bo_capacity_thresholds_mb[i][0] == 0); + int max_capacity_mb = (int)((f32)config.capacities[i] / 1024.0f / 1024.0f); + Assert(config.bo_capacity_thresholds_mb[i][1] == max_capacity_mb); + } + Assert(config.bo_num_threads == 4); } diff --git a/test/data/hermes.conf b/test/data/hermes.conf index 286d0b269..7f3892154 100644 --- a/test/data/hermes.conf +++ b/test/data/hermes.conf @@ -114,3 +114,16 @@ default_placement_policy = "MinimizeIoTime"; # If true (1) the RoundRobin placement policy algorithm will split each Blob # into a random number of smaller Blobs. default_rr_split = 0; + +# For each device, the minimum and maximum capacity threshold in MiB at which +# the BufferOrganizer will trigger. Decreasing the maximum thresholds will cause +# the BufferOrganizer to move data to lower devices, making more room in faster +# devices (ideal for write-heavy workloads). Conversely, increasing the minimum +# threshold will cause data to be moved from slower devices into faster devices +# (ideal for read-heavy workloads). 
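+# Each inner list is a {min, max} pair for the corresponding device, in the
+# same order as the devices are declared; the parser expects exactly two
+# thresholds per device.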
+bo_capacity_thresholds_mb = { + {0, 50}, + {0, 50}, + {0, 50}, + {0, 50}, +}; \ No newline at end of file From c888df4ff638d9ad1aa8a57cb23a55ac52fee415 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 11 Feb 2022 16:14:29 -0600 Subject: [PATCH 09/85] Groundwork for read/write BORG trigger --- src/metadata_management.cc | 41 ++++++++++++++++++++++++++---- src/metadata_management.h | 5 ++++ src/metadata_management_internal.h | 3 ++- src/rpc_thallium.cc | 6 +++-- test/buffer_organizer_test.cc | 12 ++++++++- 5 files changed, 58 insertions(+), 9 deletions(-) diff --git a/src/metadata_management.cc b/src/metadata_management.cc index b7b635ccb..1238b3ce9 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -968,16 +968,29 @@ SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context) { return result; } -void LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, - std::vector<i64> adjustments) { +std::vector<DeviceID> +LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, + std::vector<i64> adjustments) { + std::vector<DeviceID> result; + for (size_t i = 0; i < adjustments.size(); ++i) { SystemViewState *state = GetGlobalSystemViewState(context); if (adjustments[i]) { state->bytes_available[i].fetch_add(adjustments[i]); DLOG(INFO) << "DeviceID " << i << " adjusted by " << adjustments[i] << " bytes\n"; + + // Collect devices for which to trigger the BufferOrganizer if the + // capacities are beyond the min/max thresholds + int mb_available = (int)(state->bytes_available[i] / 1024.0f / 1024.0f); + if (mb_available < state->bo_capacity_thresholds_mb[i][0] || + mb_available > state->bo_capacity_thresholds_mb[i][1]) { + result.push_back((DeviceID)i); + } } } + + return result; } void UpdateGlobalSystemViewState(SharedMemoryContext *context, @@ -994,15 +1007,26 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, } } + std::vector<DeviceID> devices_to_organize; if (update_needed) { u32 target_node = mdm->global_system_view_state_node_id; if (target_node == rpc->node_id) { - LocalUpdateGlobalSystemViewState(context, adjustments); + devices_to_organize = + LocalUpdateGlobalSystemViewState(context, adjustments); } else { - RpcCall<bool>(rpc, target_node, "RemoteUpdateGlobalSystemViewState", - adjustments); + devices_to_organize = + RpcCall<std::vector<DeviceID>>(rpc, target_node, + "RemoteUpdateGlobalSystemViewState", + adjustments); } } + + for (size_t i = 0; i < devices_to_organize.size(); ++i) { + // TODO(chogan): + // for each blob with (a percentage of ?) buffers in this device: + // OrganizeBlob(context, rpc, bucket_id, blob_name, epsilon, + // custom_importance); + } } TargetID FindTargetIdFromDeviceId(const std::vector<TargetID> &targets, @@ -1033,6 +1057,13 @@ SystemViewState *CreateSystemViewState(Arena *arena, Config *config) { result->num_devices = config->num_devices; for (int i = 0; i < result->num_devices; ++i) { result->bytes_available[i] = config->capacities[i]; + + // Min and max thresholds + const int kNumThresholds = 2; + for (int j = 0; j < kNumThresholds; ++j) { + result->bo_capacity_thresholds_mb[i][j] = + config->bo_capacity_thresholds_mb[i][j]; + } } return result; } diff --git a/src/metadata_management.h b/src/metadata_management.h index 3c1f98530..9d6d86cd1 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -128,7 +128,12 @@ struct VBucketInfo { }; struct SystemViewState { + /** The number of bytes available for buffering in each device.
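+ * Adjusted atomically (fetch_add) by LocalUpdateGlobalSystemViewState: a positive adjustment means bytes were released, a negative one means bytes were consumed.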
*/ std::atomic bytes_available[kMaxDevices]; + /** The min and max threshold for each device at which the BufferOrganizer + * will trigger. */ + int bo_capacity_thresholds_mb[kMaxDevices][2]; + /** The total number of buffering devices. */ int num_devices; }; diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index 7789978c1..432007ba1 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -75,7 +75,8 @@ void LocalPut(MetadataManager *mdm, const char *key, u64 val, MapType map_type); void LocalDelete(MetadataManager *mdm, const char *key, MapType map_type); u64 LocalGetRemainingTargetCapacity(SharedMemoryContext *context, TargetID id); -void LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, +std::vector +LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, std::vector adjustments); SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context); std::vector LocalGetGlobalDeviceCapacities(SharedMemoryContext *context); diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 1bcf2c479..c016807bc 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -312,8 +312,10 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, // Probably should move it to a completely separate tl::engine. auto rpc_update_global_system_view_state = [context](const request &req, std::vector adjustments) { - LocalUpdateGlobalSystemViewState(context, adjustments); - req.respond(true); + std::vector result = + LocalUpdateGlobalSystemViewState(context, adjustments); + + req.respond(result); }; auto rpc_get_blob_ids = [context](const request &req, BucketID bucket_id) { diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index d404847b0..b4eb1a125 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -276,7 +276,9 @@ static void TestReadOnlyBucket() { hapi::ReadOnlyTrait trait; vbkt.Attach(&trait); - hapi::Blob blob(KILOBYTES(4), 127); + const int kBlobSize = KILOBYTES(4); + const u8 kBlobData = 127; + hapi::Blob blob(kBlobSize, kBlobData); const int kIters = 128; for (int i = 0; i < kIters; ++i) { @@ -285,6 +287,14 @@ static void TestReadOnlyBucket() { vbkt.Link(blob_name, bkt_name); } + for (int i = 0; i < kIters; ++i) { + std::string blob_name = "b" + std::to_string(i); + hapi::Blob retrieved_data(kBlobSize); + // Call Get through VBucket so the OnGet callback is triggered + vbkt.Get(blob_name, bkt, retrieved_data); + Assert(retrieved_data == blob); + } + vbkt.Destroy(); bkt.Destroy(); hermes->Finalize(true); From 67458699e1d704e77f8840f8da2577caaaf40357 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 17 Feb 2022 10:15:42 -0600 Subject: [PATCH 10/85] Remove ReadOnlyTrait --- src/api/traits.cc | 18 ------------------ src/api/traits.h | 5 ----- test/buffer_organizer_test.cc | 34 ---------------------------------- 3 files changed, 57 deletions(-) diff --git a/src/api/traits.cc b/src/api/traits.cc index b6fa06d7d..cacf4c945 100644 --- a/src/api/traits.cc +++ b/src/api/traits.cc @@ -142,23 +142,5 @@ void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input, (void)trait; } -ReadOnlyTrait::ReadOnlyTrait() - : Trait(HERMES_READ_ONLY_TRAIT, std::vector(), TraitType::META) { - onGetFn = std::bind(&ReadOnlyTrait::onGet, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3); -} - -void ReadOnlyTrait::onGet(HermesPtr hermes, TraitInput &input, Trait *trait) { - (void)trait; - - SharedMemoryContext *context = 
&hermes->context_; - RpcContext *rpc = &hermes->rpc_; - BucketID bucket_id = GetBucketId(context, rpc, input.bucket_name.c_str()); - f32 epsilon = 0.1f; - f32 custom_importance = 1.0f; - hermes::OrganizeBlob(context, rpc, bucket_id, input.blob_name, epsilon, - custom_importance); -} - } // namespace api } // namespace hermes diff --git a/src/api/traits.h b/src/api/traits.h index abefa2866..f0da428de 100644 --- a/src/api/traits.h +++ b/src/api/traits.h @@ -92,11 +92,6 @@ struct WriteOnlyTrait : public Trait { void onGet(HermesPtr hermes, TraitInput &input, Trait *trait); }; -struct ReadOnlyTrait : public Trait { - ReadOnlyTrait(); - - void onGet(HermesPtr hermes, TraitInput &input, Trait *trait); -}; } // namespace api } // namespace hermes diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index b4eb1a125..905691a0e 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -267,39 +267,6 @@ static void TestWriteOnlyBucket() { hermes->Finalize(true); } -static void TestReadOnlyBucket() { - HermesPtr hermes = hermes::InitHermesDaemon(); - std::string bkt_name = "ReadOnly"; - VBucket vbkt(bkt_name, hermes); - Bucket bkt(bkt_name, hermes); - - hapi::ReadOnlyTrait trait; - vbkt.Attach(&trait); - - const int kBlobSize = KILOBYTES(4); - const u8 kBlobData = 127; - hapi::Blob blob(kBlobSize, kBlobData); - - const int kIters = 128; - for (int i = 0; i < kIters; ++i) { - std::string blob_name = "b" + std::to_string(i); - bkt.Put(blob_name, blob); - vbkt.Link(blob_name, bkt_name); - } - - for (int i = 0; i < kIters; ++i) { - std::string blob_name = "b" + std::to_string(i); - hapi::Blob retrieved_data(kBlobSize); - // Call Get through VBucket so the OnGet callback is triggered - vbkt.Get(blob_name, bkt, retrieved_data); - Assert(retrieved_data == blob); - } - - vbkt.Destroy(); - bkt.Destroy(); - hermes->Finalize(true); -} - int main(int argc, char *argv[]) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -319,7 +286,6 @@ int main(int argc, char *argv[]) { HERMES_ADD_TEST(TestBoMove); HERMES_ADD_TEST(TestOrganizeBlob); HERMES_ADD_TEST(TestWriteOnlyBucket); - HERMES_ADD_TEST(TestReadOnlyBucket); #undef HERMES_ADD_TEST From 2aae73d746adea51862bc565b126d8a9130343ed Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 17 Feb 2022 10:17:19 -0600 Subject: [PATCH 11/85] Remove unused define --- src/api/traits.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api/traits.h b/src/api/traits.h index f0da428de..2dcff3b24 100644 --- a/src/api/traits.h +++ b/src/api/traits.h @@ -23,7 +23,6 @@ namespace api { #define HERMES_PERSIST_TRAIT 11 #define HERMES_WRITE_ONLY_TRAIT 12 -#define HERMES_READ_ONLY_TRAIT 13 /** A blob's hosting bucket and blob names */ struct BlobInfo { From a467bbe8af205da57844e95ea5cdf92f712c7b61 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 17 Feb 2022 16:02:02 -0600 Subject: [PATCH 12/85] Allow passing options to placement algorithms --- benchmarks/dpe_bench.cc | 2 +- src/api/vbucket.h | 4 ---- test/buffer_organizer_test.cc | 4 +++- test/dpe_optimization_test.cc | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/benchmarks/dpe_bench.cc b/benchmarks/dpe_bench.cc index 75c6017d6..4a71fe337 100644 --- a/benchmarks/dpe_bench.cc +++ b/benchmarks/dpe_bench.cc @@ -163,7 +163,7 @@ int main(int argc, char **argv) { time_point start_tm = now(); result = MinimizeIoTimePlacement(blob_sizes, tgt_state.bytes_available, tgt_state.bandwidth, targets, - output_tmp); + 
output_tmp, api::Context()); time_point end_tm = now(); dpe_seconds = std::chrono::duration(end_tm - start_tm).count(); break; diff --git a/src/api/vbucket.h b/src/api/vbucket.h index dfc8c117c..fd963ab13 100644 --- a/src/api/vbucket.h +++ b/src/api/vbucket.h @@ -121,10 +121,6 @@ class VBucket { size_t Get(const std::string &name, Bucket &bkt, Blob &user_blob, const Context &ctx); size_t Get(const std::string &name, Bucket &bkt, Blob &user_blob); - - /** - * * * \brief Retrieve a Blob into a user buffer. - * * */ size_t Get(const std::string &name, Bucket &bkt, void *user_blob, size_t blob_size, const Context &ctx); diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 905691a0e..0714ca600 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -253,7 +253,9 @@ static void TestWriteOnlyBucket() { hapi::WriteOnlyTrait trait; vbkt.Attach(&trait); - hapi::Blob blob(KILOBYTES(4), 127); + const size_t kBlobSize = KILOBYTES(4); + hapi::Blob blob(kBlobSize); + std::iota(blob.begin(), blob.end(), 0); const int kIters = 128; for (int i = 0; i < kIters; ++i) { diff --git a/test/dpe_optimization_test.cc b/test/dpe_optimization_test.cc index 5ae476af4..6de64dcde 100644 --- a/test/dpe_optimization_test.cc +++ b/test/dpe_optimization_test.cc @@ -33,7 +33,7 @@ void MinimizeIoTimePlaceBlob(std::vector &blob_sizes, Status result = MinimizeIoTimePlacement(blob_sizes, node_state.bytes_available, node_state.bandwidth, targets, - schemas_tmp); + schemas_tmp, api::Context()); if (result.Failed()) { std::cout << "\nMinimizeIoTimePlacement failed\n" << std::flush; exit(1); From a57ab3bf01f831ec64d2cb8bb4b7b1f59f23d1e1 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 17 Feb 2022 16:02:22 -0600 Subject: [PATCH 13/85] Initial BORG benchmark --- benchmarks/CMakeLists.txt | 2 +- benchmarks/borg_bench.cc | 76 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 benchmarks/borg_bench.cc diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 7abc369b8..57ae9c3ff 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -3,7 +3,7 @@ include_directories( ${PROJECT_SOURCE_DIR}/test ) -set(BENCHMARKS mdm_bench dpe_bench vpic_bench) +set(BENCHMARKS mdm_bench dpe_bench vpic_bench borg_bench) foreach(benchmark ${BENCHMARKS}) add_executable(${benchmark} ${benchmark}.cc) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc new file mode 100644 index 000000000..a5026e294 --- /dev/null +++ b/benchmarks/borg_bench.cc @@ -0,0 +1,76 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Distributed under BSD 3-Clause license. * + * Copyright by The HDF Group. * + * Copyright by the Illinois Institute of Technology. * + * All rights reserved. * + * * + * This file is part of Hermes. The full Hermes copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the top directory. If you do not * + * have access to the file, you may request a copy from help@hdfgroup.org. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +#include + +#include + +#include "hermes.h" +#include "bucket.h" +#include "vbucket.h" +#include "test_utils.h" + +namespace hapi = hermes::api; +using HermesPtr = std::shared_ptr; + +int main(int argc, char *argv[]) { + int mpi_threads_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); + if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { + fprintf(stderr, "Didn't receive appropriate MPI threading specification\n"); + return 1; + } + + bool use_borg = true; + HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); + + if (hermes->IsApplicationCore()) { + hapi::Context ctx; + // Disable swapping of Blobs + ctx.disable_swap = true; + // disable MinimizeIoTime PlacementPolicy constraints + ctx.minimize_io_time_options.minimum_remaining_capacity = 0; + ctx.minimize_io_time_options.capacity_change_threshold = 0; + + std::string bkt_name = "BORG"; + hapi::VBucket vbkt(bkt_name, hermes); + hapi::Bucket bkt(bkt_name, hermes); + + hapi::WriteOnlyTrait trait; + vbkt.Attach(&trait); + + const size_t kBlobSize = KILOBYTES(4); + hapi::Blob blob(kBlobSize); + std::iota(blob.begin(), blob.end(), 0); + + // MinIoTime with retry + const int kIters = 128; + for (int i = 0; i < kIters; ++i) { + std::string blob_name = "b" + std::to_string(i); + bkt.Put(blob_name, blob); + if (use_borg) { + vbkt.Link(blob_name, bkt_name); + } + } + + vbkt.Destroy(); + bkt.Destroy(); + } + + hermes->Finalize(); + + MPI_Finalize(); + + return 0; +} From 5477e06e81ac3678dce32118cf61b3298297317d Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 22 Feb 2022 16:18:55 -0600 Subject: [PATCH 14/85] Basic write only BORG benchmark running --- benchmarks/borg_bench.cc | 51 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index a5026e294..db50b2561 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -24,6 +24,21 @@ namespace hapi = hermes::api; using HermesPtr = std::shared_ptr; +double GetMPIAverage(double rank_seconds, int num_ranks, MPI_Comm comm) { + double total_secs = 0; + MPI_Reduce(&rank_seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, comm); + double result = total_secs / num_ranks; + + return result; +} + +double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, int ranks) { + double avg_total_seconds = GetMPIAverage(total_elapsed, ranks, comm); + double result = total_mb / avg_total_seconds; + + return result; +} + int main(int argc, char *argv[]) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -36,6 +51,7 @@ int main(int argc, char *argv[]) { HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { + hermes::testing::Timer timer; hapi::Context ctx; // Disable swapping of Blobs ctx.disable_swap = true; @@ -58,14 +74,43 @@ int main(int argc, char *argv[]) { const int kIters = 128; for (int i = 0; i < kIters; ++i) { std::string blob_name = "b" + std::to_string(i); - bkt.Put(blob_name, blob); + timer.resumeTime(); + hapi::Status status; + while (!status.Succeeded()) { + status = bkt.Put(blob_name, blob); + } if (use_borg) { vbkt.Link(blob_name, bkt_name); } + timer.pauseTime(); + hermes->AppBarrier(); + } + + hermes->AppBarrier(); + if (!hermes->IsFirstRankOnNode()) { + vbkt.Release(); + bkt.Release(); + } + + hermes->AppBarrier(); + if 
(hermes->IsFirstRankOnNode()) { + vbkt.Destroy(); + bkt.Destroy(); + } + + hermes->AppBarrier(); + + MPI_Comm *comm = (MPI_Comm *)hermes->GetAppCommunicator(); + int num_ranks = hermes->GetNumProcesses(); + double total_mb = (kBlobSize * kIters * num_ranks) / 1024.0 / 1024.0; + double bandwidth = GetBandwidth(timer.getElapsedTime(), total_mb, *comm, + num_ranks); + + if (hermes->IsFirstRankOnNode()) { + fprintf(stderr, "##################### %f MiB/s\n", bandwidth); } - vbkt.Destroy(); - bkt.Destroy(); + hermes->AppBarrier(); } hermes->Finalize(); From f293a60032a182728b0c43b9d4aef089b1b59332 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 22 Feb 2022 16:20:40 -0600 Subject: [PATCH 15/85] Appease linter --- benchmarks/borg_bench.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index db50b2561..1f86260b7 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -32,7 +32,8 @@ double GetMPIAverage(double rank_seconds, int num_ranks, MPI_Comm comm) { return result; } -double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, int ranks) { +double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, + int ranks) { double avg_total_seconds = GetMPIAverage(total_elapsed, ranks, comm); double result = total_mb / avg_total_seconds; From fa5323d8d24157e5507b58108df612add62f390d Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 2 Mar 2022 10:19:15 -0600 Subject: [PATCH 16/85] WIP --- benchmarks/borg_bench.cc | 35 +++++++++++++++++++++-------------- src/buffer_organizer.cc | 2 ++ src/memory_management.cc | 2 +- src/metadata_management.cc | 1 + test/data/hermes.conf | 2 +- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 1f86260b7..49520ae31 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -52,6 +52,7 @@ int main(int argc, char *argv[]) { HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { + int rank = hermes->GetProcessRank(); hermes::testing::Timer timer; hapi::Context ctx; // Disable swapping of Blobs @@ -60,7 +61,7 @@ int main(int argc, char *argv[]) { ctx.minimize_io_time_options.minimum_remaining_capacity = 0; ctx.minimize_io_time_options.capacity_change_threshold = 0; - std::string bkt_name = "BORG"; + std::string bkt_name = "BORG" + std::string(" ") + std::to_string(rank); hapi::VBucket vbkt(bkt_name, hermes); hapi::Bucket bkt(bkt_name, hermes); @@ -73,31 +74,39 @@ int main(int argc, char *argv[]) { // MinIoTime with retry const int kIters = 128; + size_t failed_puts = 0; + size_t failed_links = 0; for (int i = 0; i < kIters; ++i) { - std::string blob_name = "b" + std::to_string(i); + std::string blob_name = "b_" + std::to_string(rank) + "_" + std::to_string(i); timer.resumeTime(); hapi::Status status; - while (!status.Succeeded()) { - status = bkt.Put(blob_name, blob); + while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { + failed_puts++; } if (use_borg) { - vbkt.Link(blob_name, bkt_name); + hapi::Status link_status = vbkt.Link(blob_name, bkt_name); + if (!link_status.Succeeded()) { + failed_links++; + } } timer.pauseTime(); hermes->AppBarrier(); } - hermes->AppBarrier(); - if (!hermes->IsFirstRankOnNode()) { - vbkt.Release(); - bkt.Release(); - } + std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n"; + std::cout << " " << "failed links: " << failed_links << "\n"; + + // hermes->AppBarrier(); + // if 
(!hermes->IsFirstRankOnNode()) { + // vbkt.Release(); + // bkt.Release(); + // } hermes->AppBarrier(); - if (hermes->IsFirstRankOnNode()) { + // if (hermes->IsFirstRankOnNode()) { vbkt.Destroy(); bkt.Destroy(); - } + // } hermes->AppBarrier(); @@ -110,8 +119,6 @@ int main(int argc, char *argv[]) { if (hermes->IsFirstRankOnNode()) { fprintf(stderr, "##################### %f MiB/s\n", bandwidth); } - - hermes->AppBarrier(); } hermes->Finalize(); diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 6e40cfa42..d620fa0aa 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -187,6 +187,7 @@ void LocalEnqueueBoMove(SharedMemoryContext *context, RpcContext *rpc, void BoMove(SharedMemoryContext *context, RpcContext *rpc, const BoMoveList &moves, BlobID blob_id, BucketID bucket_id, const std::string &internal_blob_name) { + VLOG(1) << "Moving blob " << blob_id.bits.buffer_ids_offset << std::endl; MetadataManager *mdm = GetMetadataManagerFromContext(context); if (LocalLockBlob(context, blob_id)) { @@ -280,6 +281,7 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, LocalFreeBufferIdList(context, blob_id); } LocalUnlockBlob(context, blob_id); + VLOG(1) << "Done moving blob " << blob_id.bits.buffer_ids_offset << std::endl; } else { LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; } diff --git a/src/memory_management.cc b/src/memory_management.cc index f4246c363..f69f8376f 100644 --- a/src/memory_management.cc +++ b/src/memory_management.cc @@ -480,7 +480,7 @@ void BeginTicketMutex(TicketMutex *mutex) { // ticket mutex with a waiting array at some point: // https://arxiv.org/pdf/1810.01573.pdf. It looks like that should give // us the best of both worlds. - sched_yield(); + // sched_yield(); } } diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 1238b3ce9..391d88f98 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -729,6 +729,7 @@ void WaitForOutstandingBlobOps(MetadataManager *mdm, BlobID blob_id) { t = TryBeginTicketMutex(&blob_info->lock, ticket); } else { // Blob was deleted + ReleaseBlobInfoPtr(mdm); break; } if (!t.acquired) { diff --git a/test/data/hermes.conf b/test/data/hermes.conf index 7f3892154..151963f0f 100644 --- a/test/data/hermes.conf +++ b/test/data/hermes.conf @@ -103,7 +103,7 @@ rpc_host_number_range = {}; # The number of handler threads for each RPC server. rpc_num_threads = 1; # The number of threads used in the background organization of internal Hermes buffers. -buffer_organizer_num_threads = 4; +buffer_organizer_num_threads = 1; # The shared memory prefix for the hermes shared memory segment. A user name # will be automatically appended. 
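# For example, for a user named "alice" the segment becomes "/hermes_buffer_pool_alice".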
buffer_pool_shmem_name = "/hermes_buffer_pool_"; From bc0ef4e09a97a5b5ab5f3c77ea09bb6debee6d58 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 5 May 2022 11:24:19 -0500 Subject: [PATCH 17/85] Appease linter --- benchmarks/borg_bench.cc | 3 ++- src/buffer_organizer.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 49520ae31..dcc5cba57 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -77,7 +77,8 @@ int main(int argc, char *argv[]) { size_t failed_puts = 0; size_t failed_links = 0; for (int i = 0; i < kIters; ++i) { - std::string blob_name = "b_" + std::to_string(rank) + "_" + std::to_string(i); + std::string blob_name = ("b_" + std::to_string(rank) + "_" + + std::to_string(i)); timer.resumeTime(); hapi::Status status; while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index d620fa0aa..83468c126 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -281,7 +281,7 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, LocalFreeBufferIdList(context, blob_id); } LocalUnlockBlob(context, blob_id); - VLOG(1) << "Done moving blob " << blob_id.bits.buffer_ids_offset << std::endl; + VLOG(1) << "Done moving blob " << blob_id.bits.buffer_ids_offset; } else { LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; } From f015dc35e276e4425a3c487ff5d561e8184b6e5e Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 6 May 2022 07:17:35 -0500 Subject: [PATCH 18/85] Remove default context arg from MinimizeIoTimePlacement --- benchmarks/dpe_bench.cc | 2 +- src/metadata_management_internal.h | 2 +- test/dpe_optimization_test.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/dpe_bench.cc b/benchmarks/dpe_bench.cc index 4a71fe337..75c6017d6 100644 --- a/benchmarks/dpe_bench.cc +++ b/benchmarks/dpe_bench.cc @@ -163,7 +163,7 @@ int main(int argc, char **argv) { time_point start_tm = now(); result = MinimizeIoTimePlacement(blob_sizes, tgt_state.bytes_available, tgt_state.bandwidth, targets, - output_tmp, api::Context()); + output_tmp); time_point end_tm = now(); dpe_seconds = std::chrono::duration(end_tm - start_tm).count(); break; diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index 432007ba1..df526788d 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -77,7 +77,7 @@ void LocalDelete(MetadataManager *mdm, const char *key, MapType map_type); u64 LocalGetRemainingTargetCapacity(SharedMemoryContext *context, TargetID id); std::vector LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, - std::vector adjustments); + std::vector adjustments); SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context); std::vector LocalGetGlobalDeviceCapacities(SharedMemoryContext *context); std::vector GetGlobalDeviceCapacities(SharedMemoryContext *context, diff --git a/test/dpe_optimization_test.cc b/test/dpe_optimization_test.cc index 6de64dcde..5ae476af4 100644 --- a/test/dpe_optimization_test.cc +++ b/test/dpe_optimization_test.cc @@ -33,7 +33,7 @@ void MinimizeIoTimePlaceBlob(std::vector &blob_sizes, Status result = MinimizeIoTimePlacement(blob_sizes, node_state.bytes_available, node_state.bandwidth, targets, - schemas_tmp, api::Context()); + schemas_tmp); if (result.Failed()) { std::cout << "\nMinimizeIoTimePlacement failed\n" << std::flush; exit(1); From 
baec9d6e348ee9243dbb42bc3849b162674efca9 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 6 May 2022 11:16:40 -0500 Subject: [PATCH 19/85] Fix deadlock in buffer_organizer_test --- src/buffer_organizer.cc | 7 ++++++- src/metadata_management.cc | 1 + test/buffer_organizer_test.cc | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 83468c126..60446e3dc 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -278,7 +278,12 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, if (!BlobIsInSwap(blob_id)) { LocalReleaseBuffers(context, replaced_ids); } - LocalFreeBufferIdList(context, blob_id); + // NOTE(chogan): We don't free the Blob's BufferIdList here because that + // would make the buffer_id_list_offset available for new incoming Blobs, + // and we can't reuse the buffer_id_list_offset until the old BlobInfo is + // deleted. We take care of both in LocalLockBlob when the final + // outstanding operation on this BlobID is complete (which is tracked by + // BlobInfo::last). } LocalUnlockBlob(context, blob_id); VLOG(1) << "Done moving blob " << blob_id.bits.buffer_ids_offset; diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 391d88f98..bab2e4db4 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -1465,6 +1465,7 @@ bool LocalLockBlob(SharedMemoryContext *context, BlobID blob_id) { result = false; if (t.ticket == blob_info->last) { LocalDelete(mdm, blob_id); + LocalFreeBufferIdList(context, blob_id); } } ReleaseBlobInfoPtr(mdm); diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 0714ca600..1fe994993 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -245,7 +245,7 @@ void TestOrganizeBlob() { } static void TestWriteOnlyBucket() { - HermesPtr hermes = hermes::InitHermesDaemon(); + HermesPtr hermes = hermes::InitHermesDaemon(getenv("HERMES_CONF")); std::string bkt_name = "WriteOnly"; VBucket vbkt(bkt_name, hermes); Bucket bkt(bkt_name, hermes); From 4cd7439e4f6b47317a719ffce2cc3c60be4bdaf3 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 12 May 2022 15:30:54 -0500 Subject: [PATCH 20/85] Make capacity thresholds a percentage --- src/config_parser.cc | 19 +++++++++++++------ src/config_parser.h | 2 +- src/hermes_types.h | 7 ++++++- src/memory_management.cc | 2 +- src/metadata_management.cc | 18 ++++++++++-------- src/metadata_management.h | 10 ++++++---- src/utils.cc | 5 ++--- test/config_parser_test.cc | 5 ++--- test/data/hermes.conf | 19 +++++++++++-------- 9 files changed, 52 insertions(+), 35 deletions(-) diff --git a/src/config_parser.cc b/src/config_parser.cc index 142c628ae..3144dbc22 100644 --- a/src/config_parser.cc +++ b/src/config_parser.cc @@ -82,7 +82,7 @@ static const char *kConfigVariableStrings[ConfigVariable_Count] = { "is_shared_device", "buffer_organizer_num_threads", "default_rr_split", - "bo_capacity_thresholds_mb", + "bo_capacity_thresholds", }; EntireFile ReadEntireFile(Arena *arena, const char *path) { @@ -571,8 +571,8 @@ Token *ParseFloatList(Token *tok, f32 *out, int n) { return tok; } -Token *ParseFloatListList(Token *tok, f32 out[][hermes::kMaxBufferPoolSlabs], - int n, int *m) { +template +Token *ParseFloatListList(Token *tok, f32 out[][N], int n, int *m) { if (IsOpenCurlyBrace(tok)) { tok = tok->next; for (int i = 0; i < n; ++i) { @@ -980,12 +980,19 @@ void ParseTokens(TokenList *tokens, Config *config) { config->default_rr_split = ParseInt(&tok); break; } 
- case ConfigVariable_BOCapacityThresholdsMiB: { + case ConfigVariable_BOCapacityThresholds: { RequireNumDevices(config); // Each entry has a min and max threshold std::vector num_thresholds(config->num_devices, 2); - tok = ParseIntListList(tok, config->bo_capacity_thresholds_mb, - config->num_devices, num_thresholds.data()); + float thresholds[kMaxDevices][2] = {0}; + + tok = ParseFloatListList(tok, thresholds, config->num_devices, + num_thresholds.data()); + + for (int i = 0; i < config->num_devices; ++i) { + config->bo_capacity_thresholds[i].min = thresholds[i][0]; + config->bo_capacity_thresholds[i].max = thresholds[i][1]; + } break; } default: { diff --git a/src/config_parser.h b/src/config_parser.h index cf6c4d693..68743e4b1 100644 --- a/src/config_parser.h +++ b/src/config_parser.h @@ -69,7 +69,7 @@ enum ConfigVariable { ConfigVariable_IsSharedDevice, ConfigVariable_BoNumThreads, ConfigVariable_RRSplit, - ConfigVariable_BOCapacityThresholdsMiB, + ConfigVariable_BOCapacityThresholds, ConfigVariable_Count }; diff --git a/src/hermes_types.h b/src/hermes_types.h index f7c1a0cf8..30d999758 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -168,6 +168,11 @@ enum ArenaType { kArenaType_Count /**< Sentinel value */ }; +struct Thresholds { + float min; + float max; +}; + /** * System and user configuration that is used to initialize Hermes. */ @@ -247,7 +252,7 @@ struct Config { bool default_rr_split; /** The min and max capacity threshold in MiB for each device at which the * BufferOrganizer will trigger. */ - int bo_capacity_thresholds_mb[kMaxDevices][2]; + Thresholds bo_capacity_thresholds[kMaxDevices]; /** A base name for the BufferPool shared memory segement. Hermes appends the * value of the USER environment variable to this string. */ diff --git a/src/memory_management.cc b/src/memory_management.cc index f69f8376f..f4246c363 100644 --- a/src/memory_management.cc +++ b/src/memory_management.cc @@ -480,7 +480,7 @@ void BeginTicketMutex(TicketMutex *mutex) { // ticket mutex with a waiting array at some point: // https://arxiv.org/pdf/1810.01573.pdf. It looks like that should give // us the best of both worlds. 
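// In outline, the acquire path this loop implements (member names are
// illustrative, not necessarily the exact TicketMutex fields) is:
//   u32 my_ticket = mutex->ticket.fetch_add(1);
//   while (my_ticket != mutex->serving.load()) {
//     sched_yield();
//   }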
-    // sched_yield();
+    sched_yield();
   }
 }

diff --git a/src/metadata_management.cc b/src/metadata_management.cc
index bab2e4db4..4f76a4685 100644
--- a/src/metadata_management.cc
+++ b/src/metadata_management.cc
@@ -983,9 +983,14 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context,

     // Collect devices for which to trigger the BufferOrganizer if the
     // capacities are beyond the min/max thresholds
-    int mb_available = (int)(state->bytes_available[i] / 1024.0f / 1024.0f);
-    if (mb_available < state->bo_capacity_thresholds_mb[i][0] ||
-        mb_available > state->bo_capacity_thresholds_mb[i][1]) {
+    float percentage_available = 0.0f;
+    if (state->bytes_available[i] > 0) {
+      percentage_available =
+        (f32)state->capacities[i] / (f32)state->bytes_available[i].load();
+    }
+
+    if (percentage_available < state->bo_capacity_thresholds[i].min ||
+        percentage_available > state->bo_capacity_thresholds[i].max) {
       result.push_back((DeviceID)i);
     }
   }
@@ -1057,14 +1062,11 @@ SystemViewState *CreateSystemViewState(Arena *arena, Config *config) {
   SystemViewState *result = PushClearedStruct<SystemViewState>(arena);
   result->num_devices = config->num_devices;
   for (int i = 0; i < result->num_devices; ++i) {
+    result->capacities[i] = config->capacities[i];
     result->bytes_available[i] = config->capacities[i];

     // Min and max thresholds
-    const int kNumThresholds = 2;
-    for (int j = 0; j < kNumThresholds; ++j) {
-      result->bo_capacity_thresholds_mb[i][j] =
-          config->bo_capacity_thresholds_mb[i][j];
-    }
+    result->bo_capacity_thresholds[i] = config->bo_capacity_thresholds[i];
   }

   return result;

diff --git a/src/metadata_management.h b/src/metadata_management.h
index 9d6d86cd1..bd73dc8ca 100644
--- a/src/metadata_management.h
+++ b/src/metadata_management.h
@@ -128,11 +128,13 @@ struct VBucketInfo {
 };

 struct SystemViewState {
-  /** The number of bytes available for buffering in each device. */
+  /** Total capacities of each device. */
+  u64 capacities[kMaxDevices];
+  /** The remaining bytes available for buffering in each device. */
   std::atomic<u64> bytes_available[kMaxDevices];
-  /** The min and max threshold for each device at which the BufferOrganizer
-   * will trigger. */
-  int bo_capacity_thresholds_mb[kMaxDevices][2];
+  /** The min and max threshold (percentage) for each device at which the
+   * BufferOrganizer will trigger. */
+  Thresholds bo_capacity_thresholds[kMaxDevices];
   /** The total number of buffering devices.
*/ int num_devices; }; diff --git a/src/utils.cc b/src/utils.cc index 159e84a46..216818b5d 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -122,9 +122,8 @@ void InitDefaultConfig(Config *config) { config->default_rr_split = false; for (int i = 0; i < config->num_devices; ++i) { - config->bo_capacity_thresholds_mb[i][0] = 0; - int max_capacity_mb = (int)((f32)config->capacities[i] / 1024.0f / 1024.0f); - config->bo_capacity_thresholds_mb[i][1] = max_capacity_mb; + config->bo_capacity_thresholds[i].min = 0.0f; + config->bo_capacity_thresholds[i].max = 1.0f; } } diff --git a/test/config_parser_test.cc b/test/config_parser_test.cc index e77bd18a7..1eec11a24 100644 --- a/test/config_parser_test.cc +++ b/test/config_parser_test.cc @@ -225,9 +225,8 @@ void TestDefaultConfig(Arena *arena, const char *config_file) { for (int i = 0; i < config.num_devices; ++i) { - Assert(config.bo_capacity_thresholds_mb[i][0] == 0); - int max_capacity_mb = (int)((f32)config.capacities[i] / 1024.0f / 1024.0f); - Assert(config.bo_capacity_thresholds_mb[i][1] == max_capacity_mb); + Assert(config.bo_capacity_thresholds[i].min == 0.0f); + Assert(config.bo_capacity_thresholds[i].max == 1.0f); } Assert(config.bo_num_threads == 4); diff --git a/test/data/hermes.conf b/test/data/hermes.conf index 151963f0f..42f0baab1 100644 --- a/test/data/hermes.conf +++ b/test/data/hermes.conf @@ -103,7 +103,7 @@ rpc_host_number_range = {}; # The number of handler threads for each RPC server. rpc_num_threads = 1; # The number of threads used in the background organization of internal Hermes buffers. -buffer_organizer_num_threads = 1; +buffer_organizer_num_threads = 4; # The shared memory prefix for the hermes shared memory segment. A user name # will be automatically appended. buffer_pool_shmem_name = "/hermes_buffer_pool_"; @@ -115,15 +115,18 @@ default_placement_policy = "MinimizeIoTime"; # into a random number of smaller Blobs. default_rr_split = 0; -# For each device, the minimum and maximum capacity threshold in MiB at which +# For each device, the minimum and maximum percent capacity threshold at which # the BufferOrganizer will trigger. Decreasing the maximum thresholds will cause # the BufferOrganizer to move data to lower devices, making more room in faster # devices (ideal for write-heavy workloads). Conversely, increasing the minimum # threshold will cause data to be moved from slower devices into faster devices -# (ideal for read-heavy workloads). -bo_capacity_thresholds_mb = { - {0, 50}, - {0, 50}, - {0, 50}, - {0, 50}, +# (ideal for read-heavy workloads). For example, a maximum capacity threshold of +# 0.8 would have the effect of always keeping 20% of the device's space free for +# incoming writes. Conversely, a minimum capacity threshold of 0.3 would ensure +# that the device is always at least 30% occupied. 
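+#
+# A hypothetical write-optimized configuration (illustrative values, not the
+# defaults below) would cap the two fastest tiers at 80% occupancy so that
+# 20% of each stays free for incoming writes:
+#
+#   bo_capacity_thresholds = {
+#     {0.0, 0.8},
+#     {0.0, 0.8},
+#     {0.0, 1.0},
+#     {0.0, 1.0},
+#   };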
+bo_capacity_thresholds = {
+  {0.0, 1.0},
+  {0.0, 1.0},
+  {0.0, 1.0},
+  {0.0, 1.0},
 };
\ No newline at end of file

From ff2e68b1c2fd56082d62042349c3698b36a7328b Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Tue, 17 May 2022 14:52:05 -0500
Subject: [PATCH 21/85] Placeholders

---
 src/buffer_organizer.cc    | 18 ++++++++++++++++++
 src/buffer_organizer.h     |  2 ++
 src/metadata_management.cc |  9 +++------
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index 60446e3dc..29e55cb79 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -418,6 +418,24 @@ void OrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,
   }
 }

+void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc,
+                               DeviceID devices_id) {
+  (void)context;
+  (void)rpc;
+  (void)devices_id;
+
+  // TODO(chogan): Pass in whether it's min or max that's violated
+  // TODO(chogan): Pass in how much space needs to be filled/freed
+
+  // while (max is violated)
+  //   Choose largest buffer from least important Blob
+  //   Move to lower tier
+
+  // while (min is violated)
+  //   Choose largest buffer from most important Blob
+  //   Move to higher tier
+}
+
 void LocalShutdownBufferOrganizer(SharedMemoryContext *context) {
   // NOTE(chogan): ThreadPool destructor needs to be called manually since we
   // allocated the BO instance with placement new.

diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h
index e958ba0a2..60371d87e 100644
--- a/src/buffer_organizer.h
+++ b/src/buffer_organizer.h
@@ -98,6 +98,8 @@ void LocalOrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,
 void OrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,
                   BucketID bucket_id, const std::string &blob_name,
                   f32 epsilon, f32 importance_score = -1);
+void OrganizeDevice(SharedMemoryContext *context, RpcContext *rpc,
+                    DeviceID devices_id);

 std::vector<BufferInfo> GetBufferInfo(SharedMemoryContext *context,
                                       RpcContext *rpc,
                                       const std::vector<BufferID> &buffer_ids);

diff --git a/src/metadata_management.cc b/src/metadata_management.cc
index 4f76a4685..da8223f9c 100644
--- a/src/metadata_management.cc
+++ b/src/metadata_management.cc
@@ -989,8 +989,8 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context,
         (f32)state->capacities[i] / (f32)state->bytes_available[i].load();
     }

-    if (percentage_available < state->bo_capacity_thresholds[i].min ||
-        percentage_available > state->bo_capacity_thresholds[i].max) {
+    // TODO(chogan): Handle violation of bo_capacity_thresholds[i].min
+    if (percentage_available > state->bo_capacity_thresholds[i].max) {
       result.push_back((DeviceID)i);
     }
   }
@@ -1028,10 +1028,7 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context,
   }

   for (size_t i = 0; i < devices_to_organize.size(); ++i) {
-    // TODO(chogan):
-    // for each blob with (a percentage of ?)
buffers in this device: - // OrganizeBlob(context, rpc, bucket_id, blob_name, epsilon, - // custom_importance); + EnforceCapacityThresholds(context, rpc, devices_to_organize[i]); } } From 59527576803589d06e60d8988da1cc7ac58e1bf0 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 17 May 2022 15:27:27 -0500 Subject: [PATCH 22/85] Fix compilation error --- src/buffer_organizer.h | 3 ++- src/metadata_management.cc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h index 60371d87e..d45e81c58 100644 --- a/src/buffer_organizer.h +++ b/src/buffer_organizer.h @@ -113,7 +113,8 @@ void LocalEnqueueBoMove(SharedMemoryContext *context, RpcContext *rpc, void EnqueueBoMove(RpcContext *rpc, const BoMoveList &moves, BlobID blob_id, BucketID bucket_id, const std::string &internal_name, BoPriority priority); - +void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, + DeviceID devices_id); } // namespace hermes #endif // HERMES_BUFFER_ORGANIZER_H_ diff --git a/src/metadata_management.cc b/src/metadata_management.cc index da8223f9c..eabd647b2 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -20,6 +20,7 @@ #include "memory_management.h" #include "buffer_pool.h" #include "buffer_pool_internal.h" +#include "buffer_organizer.h" #include "rpc.h" #include "metadata_storage.h" From 68007125f3cb42cb813654717e3ef31bd582d950 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 17 May 2022 15:47:55 -0500 Subject: [PATCH 23/85] Remove debugging code from borg_bench.cc --- benchmarks/borg_bench.cc | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index dcc5cba57..6ae676cd4 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -49,6 +49,11 @@ int main(int argc, char *argv[]) { } bool use_borg = true; + + if (argc == 2) { + use_borg = false; + } + HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { @@ -66,7 +71,9 @@ int main(int argc, char *argv[]) { hapi::Bucket bkt(bkt_name, hermes); hapi::WriteOnlyTrait trait; - vbkt.Attach(&trait); + if (use_borg) { + vbkt.Attach(&trait); + } const size_t kBlobSize = KILOBYTES(4); hapi::Blob blob(kBlobSize); @@ -97,17 +104,17 @@ int main(int argc, char *argv[]) { std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n"; std::cout << " " << "failed links: " << failed_links << "\n"; - // hermes->AppBarrier(); - // if (!hermes->IsFirstRankOnNode()) { - // vbkt.Release(); - // bkt.Release(); - // } + hermes->AppBarrier(); + if (!hermes->IsFirstRankOnNode()) { + vbkt.Release(); + bkt.Release(); + } hermes->AppBarrier(); - // if (hermes->IsFirstRankOnNode()) { + if (hermes->IsFirstRankOnNode()) { vbkt.Destroy(); bkt.Destroy(); - // } + } hermes->AppBarrier(); From d0c4f8b58083fe9f1ecf6cb0674bc3349d9897cb Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 18 May 2022 15:43:43 -0500 Subject: [PATCH 24/85] Always subtract 1 in GetHostNumberAsString --- src/rpc_thallium.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index c016807bc..f765ee6f8 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -22,12 +22,10 @@ namespace hermes { std::string GetHostNumberAsString(RpcContext *rpc, u32 node_id) { std::string result = ""; - if (rpc->num_nodes > 1) { - // Subtract 1 because the node_id index starts at 1 instead of 0. 
We reserve - // 0 so that BufferIDs (which are made from the node_id) can be NULL. - int index = (node_id - 1); - result = std::to_string(rpc->host_numbers[index]); - } + // Subtract 1 because the node_id index starts at 1 instead of 0. We reserve + // 0 so that BufferIDs (which are made from the node_id) can be NULL. + int index = (node_id - 1); + result = std::to_string(rpc->host_numbers[index]); return result; } From 79bfd2c61bb21117424247ed1324635a05e30082 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 18 May 2022 16:04:59 -0500 Subject: [PATCH 25/85] Only call onUnlink for valid blobs --- src/api/vbucket.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/api/vbucket.cc b/src/api/vbucket.cc index 07e1f66b5..470b8a14d 100644 --- a/src/api/vbucket.cc +++ b/src/api/vbucket.cc @@ -332,11 +332,13 @@ Status VBucket::Destroy(Context& ctx) { for (const auto& blob_id : blob_ids) { TraitInput input = {}; BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, blob_id); - input.bucket_name = GetBucketNameById(context, rpc, bucket_id); - input.blob_name = GetBlobNameFromId(context, rpc, blob_id); - if (t->onUnlinkFn != nullptr) { - t->onUnlinkFn(hermes_, input, t); - // TODO(hari): @errorhandling Check if unlinking was successful + if (!IsNullBucketId(bucket_id)) { + input.bucket_name = GetBucketNameById(context, rpc, bucket_id); + input.blob_name = GetBlobNameFromId(context, rpc, blob_id); + if (t->onUnlinkFn != nullptr) { + t->onUnlinkFn(hermes_, input, t); + // TODO(hari): @errorhandling Check if unlinking was successful + } } } } From 486000a424a5f68cb5089beef7286a01ce4a3ec0 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 20 May 2022 08:05:57 -0500 Subject: [PATCH 26/85] Fix case where rpc_host_number_range is empty --- src/api/hermes.cc | 8 ++++++-- src/rpc.h | 3 +++ src/rpc_thallium.cc | 11 +++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 1fe2f6ce0..9bce23378 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -248,9 +248,13 @@ SharedMemoryContext InitHermesCore(Config *config, CommunicationContext *comm, mdm->host_names_offset = (u8 *)rpc->host_names - (u8 *)shmem_base; } else { + // The number of host numbers in the rpc_host_number_range entry of the + // configuration file + size_t num_host_numbers = config->host_numbers.size(); + rpc->num_host_numbers = num_host_numbers; rpc->host_numbers = PushArray(&arenas[kArenaType_MetaData], - config->host_numbers.size()); - for (size_t i = 0; i < config->host_numbers.size(); ++i) { + num_host_numbers); + for (size_t i = 0; num_host_numbers; ++i) { rpc->host_numbers[i] = config->host_numbers[i]; } mdm->host_numbers_offset = (u8 *)rpc->host_numbers - (u8 *)shmem_base; diff --git a/src/rpc.h b/src/rpc.h index 1ba429c6c..55d49315e 100644 --- a/src/rpc.h +++ b/src/rpc.h @@ -42,6 +42,9 @@ struct RpcContext { /** Array of host numbers in shared memory. This size is * RpcContext::num_nodes */ int *host_numbers; + /** The number of host numbers that were present in the rpc_host_number_range + * entry in the config file*/ + size_t num_host_numbers; /** Array of host names stored in shared memory. This array size is * RpcContext::num_nodes. 
*/ ShmemString *host_names; diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index f765ee6f8..264f2477b 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -22,10 +22,13 @@ namespace hermes { std::string GetHostNumberAsString(RpcContext *rpc, u32 node_id) { std::string result = ""; - // Subtract 1 because the node_id index starts at 1 instead of 0. We reserve - // 0 so that BufferIDs (which are made from the node_id) can be NULL. - int index = (node_id - 1); - result = std::to_string(rpc->host_numbers[index]); + + if (rpc->num_host_numbers > 0) { + // Subtract 1 because the node_id index starts at 1 instead of 0. We reserve + // 0 so that BufferIDs (which are made from the node_id) can be NULL. + int index = (node_id - 1); + result = std::to_string(rpc->host_numbers[index]); + } return result; } From 09fe08a272b36b260256e6e090022b32aabe45a8 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 24 May 2022 09:09:53 -0500 Subject: [PATCH 27/85] Add missing check in for loop --- src/api/hermes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 9bce23378..3ee692758 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -254,7 +254,7 @@ SharedMemoryContext InitHermesCore(Config *config, CommunicationContext *comm, rpc->num_host_numbers = num_host_numbers; rpc->host_numbers = PushArray(&arenas[kArenaType_MetaData], num_host_numbers); - for (size_t i = 0; num_host_numbers; ++i) { + for (size_t i = 0; i < num_host_numbers; ++i) { rpc->host_numbers[i] = config->host_numbers[i]; } mdm->host_numbers_offset = (u8 *)rpc->host_numbers - (u8 *)shmem_base; From eb6b3bd52fddb5e35b49240cc73d24b179ddaea9 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 24 May 2022 09:42:13 -0500 Subject: [PATCH 28/85] All ranks must get RpcContext::num_host_numbers --- benchmarks/borg_bench.cc | 2 +- src/api/hermes.cc | 8 ++------ src/rpc_thallium.cc | 4 ++++ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 6ae676cd4..307dfd28c 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) { ctx.minimize_io_time_options.minimum_remaining_capacity = 0; ctx.minimize_io_time_options.capacity_change_threshold = 0; - std::string bkt_name = "BORG" + std::string(" ") + std::to_string(rank); + std::string bkt_name = "BORG_" + std::to_string(rank); hapi::VBucket vbkt(bkt_name, hermes); hapi::Bucket bkt(bkt_name, hermes); diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 3ee692758..316a062be 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -248,13 +248,9 @@ SharedMemoryContext InitHermesCore(Config *config, CommunicationContext *comm, mdm->host_names_offset = (u8 *)rpc->host_names - (u8 *)shmem_base; } else { - // The number of host numbers in the rpc_host_number_range entry of the - // configuration file - size_t num_host_numbers = config->host_numbers.size(); - rpc->num_host_numbers = num_host_numbers; rpc->host_numbers = PushArray(&arenas[kArenaType_MetaData], - num_host_numbers); - for (size_t i = 0; i < num_host_numbers; ++i) { + rpc->num_host_numbers); + for (size_t i = 0; i < rpc->num_host_numbers; ++i) { rpc->host_numbers[i] = config->host_numbers[i]; } mdm->host_numbers_offset = (u8 *)rpc->host_numbers - (u8 *)shmem_base; diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 264f2477b..1af756076 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -642,6 +642,10 @@ void 
StopGlobalSystemViewStateUpdateThread(RpcContext *rpc) { void InitRpcContext(RpcContext *rpc, u32 num_nodes, u32 node_id, Config *config) { rpc->num_nodes = num_nodes; + // The number of host numbers in the rpc_host_number_range entry of the + // configuration file. Not necessarily the number of nodes because when there + // is only 1 node, the entry can be left blank, or contain 1 host number. + rpc->num_host_numbers = config->host_numbers.size(); rpc->node_id = node_id; rpc->start_server = ThalliumStartRpcServer; rpc->state_size = sizeof(ThalliumState); From d1294fc8a0d0f2744ede116239e4f37805919634 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 24 May 2022 10:51:15 -0500 Subject: [PATCH 29/85] Blob name in verbose BORG logging --- src/buffer_organizer.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 29e55cb79..d8a0bd53c 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -187,7 +187,9 @@ void LocalEnqueueBoMove(SharedMemoryContext *context, RpcContext *rpc, void BoMove(SharedMemoryContext *context, RpcContext *rpc, const BoMoveList &moves, BlobID blob_id, BucketID bucket_id, const std::string &internal_blob_name) { - VLOG(1) << "Moving blob " << blob_id.bits.buffer_ids_offset << std::endl; + VLOG(1) << "Moving blob " + << internal_blob_name.substr(kBucketIdStringSize, std::string::npos) + << std::endl; MetadataManager *mdm = GetMetadataManagerFromContext(context); if (LocalLockBlob(context, blob_id)) { @@ -286,7 +288,9 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, // BlobInfo::last). } LocalUnlockBlob(context, blob_id); - VLOG(1) << "Done moving blob " << blob_id.bits.buffer_ids_offset; + VLOG(1) << "Done moving blob " + << internal_blob_name.substr(kBucketIdStringSize, std::string::npos) + << std::endl; } else { LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; } From d5db940b9bba015a3d5a65de6bc46f4acef3cca9 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 24 May 2022 15:48:36 -0500 Subject: [PATCH 30/85] Check for NULL BucketInfo --- src/api/vbucket.cc | 2 +- src/buffer_organizer.cc | 3 +-- src/metadata_storage_stb_ds.cc | 19 +++++++++++-------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/api/vbucket.cc b/src/api/vbucket.cc index 470b8a14d..f567d7c2d 100644 --- a/src/api/vbucket.cc +++ b/src/api/vbucket.cc @@ -316,7 +316,7 @@ Status VBucket::Destroy(Context& ctx) { Status result; if (IsValid()) { - // NOTE(chogan): Let all flusing tasks complete before destroying the + // NOTE(chogan): Let all flushing tasks complete before destroying the // VBucket. 
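   // An in-flight flush that outlived the VBucket could otherwise run its
   // trait callbacks against Blobs that are about to be deleted.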
WaitForBackgroundFlush(); diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index d8a0bd53c..3ac5e8f7c 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -174,11 +174,10 @@ void LocalEnqueueBoMove(SharedMemoryContext *context, RpcContext *rpc, BoPriority priority) { ThreadPool *pool = &context->bo->pool; bool is_high_priority = priority == BoPriority::kHigh; - VLOG(1) << "BufferOrganizer moving Blob " << blob_id.as_int; + VLOG(1) << "BufferOrganizer queuing Blob " << blob_id.as_int; pool->run(std::bind(BoMove, context, rpc, moves, blob_id, bucket_id, internal_blob_name), is_high_priority); - VLOG(1) << "BufferOrganizer " << blob_id.as_int << " done\n"; } /** diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc index 21e79f884..9b5d92d2a 100644 --- a/src/metadata_storage_stb_ds.cc +++ b/src/metadata_storage_stb_ds.cc @@ -424,17 +424,20 @@ void LocalReplaceBlobIdInBucket(SharedMemoryContext *context, BlobID new_blob_id) { MetadataManager *mdm = GetMetadataManagerFromContext(context); BeginTicketMutex(&mdm->bucket_mutex); - BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); - ChunkedIdList *blobs = &info->blobs; - BlobID *blobs_arr = (BlobID *)GetIdsPtr(mdm, *blobs); - for (u32 i = 0; i < blobs->length; ++i) { - if (blobs_arr[i].as_int == old_blob_id.as_int) { - blobs_arr[i] = new_blob_id; - break; + if (info) { + BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); + ChunkedIdList *blobs = &info->blobs; + + BlobID *blobs_arr = (BlobID *)GetIdsPtr(mdm, *blobs); + for (u32 i = 0; i < blobs->length; ++i) { + if (blobs_arr[i].as_int == old_blob_id.as_int) { + blobs_arr[i] = new_blob_id; + break; + } } + ReleaseIdsPtr(mdm); } - ReleaseIdsPtr(mdm); EndTicketMutex(&mdm->bucket_mutex); } From fa3cc678bba14420a3d186da3542e9a9c2132f77 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 24 May 2022 15:51:45 -0500 Subject: [PATCH 31/85] Fix typo --- src/metadata_storage_stb_ds.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc index 9b5d92d2a..79e616da7 100644 --- a/src/metadata_storage_stb_ds.cc +++ b/src/metadata_storage_stb_ds.cc @@ -424,9 +424,9 @@ void LocalReplaceBlobIdInBucket(SharedMemoryContext *context, BlobID new_blob_id) { MetadataManager *mdm = GetMetadataManagerFromContext(context); BeginTicketMutex(&mdm->bucket_mutex); + BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); if (info) { - BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); ChunkedIdList *blobs = &info->blobs; BlobID *blobs_arr = (BlobID *)GetIdsPtr(mdm, *blobs); From fb8094c71aed5cfde8ca657b92449333b40c7cbb Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 25 May 2022 15:08:16 -0500 Subject: [PATCH 32/85] Fix deadlock issue with aggressive locking --- benchmarks/borg_bench.cc | 2 +- src/buffer_organizer.cc | 7 +++++++ src/metadata_storage_stb_ds.cc | 7 ++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 307dfd28c..601ed66b5 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) { vbkt.Attach(&trait); } - const size_t kBlobSize = KILOBYTES(4); + const size_t kBlobSize = KILOBYTES(32); hapi::Blob blob(kBlobSize); std::iota(blob.begin(), blob.end(), 0); diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 3ac5e8f7c..4f824c619 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -191,6 
+191,10 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc,
              << std::endl;

   MetadataManager *mdm = GetMetadataManagerFromContext(context);
+  // TODO(chogan): This locking is too aggressive but I don't know how else to
+  // solve the deadlock that results when the following block of code is running
+  // after LocalDestroyBlobByName holds MetadataManager::bucket_mutex
+  BeginTicketMutex(&mdm->bucket_mutex);
   if (LocalLockBlob(context, blob_id)) {
     auto warning_string = [](BufferID id) {
       std::ostringstream ss;
@@ -240,6 +244,8 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc,
     }

     if (replacement_ids.size() > 0) {
+      // TODO(chogan): Only need to allocate a new BufferIdList if
+      // replacement.size > replaced.size
       std::vector<BufferID> buffer_ids = LocalGetBufferIdList(mdm, blob_id);
       using BufferIdSet = std::unordered_set<BufferID>;
       BufferIdSet new_buffer_ids(buffer_ids.begin(), buffer_ids.end());
@@ -293,6 +299,7 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc,
   } else {
     LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n";
   }
+  EndTicketMutex(&mdm->bucket_mutex);
 }

 void LocalOrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,

diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc
index 79e616da7..1ab1764fe 100644
--- a/src/metadata_storage_stb_ds.cc
+++ b/src/metadata_storage_stb_ds.cc
@@ -419,14 +419,15 @@ i64 GetIndexOfId(MetadataManager *mdm, ChunkedIdList *id_list, u64 id) {
   return result;
 }

+/** Assumes MetadataManager::bucket_mutex is held by the caller. */
 void LocalReplaceBlobIdInBucket(SharedMemoryContext *context,
                                 BucketID bucket_id, BlobID old_blob_id,
                                 BlobID new_blob_id) {
   MetadataManager *mdm = GetMetadataManagerFromContext(context);
-  BeginTicketMutex(&mdm->bucket_mutex);
+  // BeginTicketMutex(&mdm->bucket_mutex);
   BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id);
-  if (info) {
+  if (info && info->active) {
     ChunkedIdList *blobs = &info->blobs;

     BlobID *blobs_arr = (BlobID *)GetIdsPtr(mdm, *blobs);
@@ -439,7 +440,7 @@ void LocalReplaceBlobIdInBucket(SharedMemoryContext *context,
     ReleaseIdsPtr(mdm);
   }
-  EndTicketMutex(&mdm->bucket_mutex);
+  // EndTicketMutex(&mdm->bucket_mutex);
 }

 void LocalAddBlobIdToBucket(MetadataManager *mdm, BucketID bucket_id,

From 3715eeca3fb2e9f6269f3d7d632c95264621af38 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 26 May 2022 10:42:46 -0500
Subject: [PATCH 33/85] Default MinimizeIoTimeOptions should be simplest case

---
 src/data_placement_engine.cc | 28 +++++++++++++++-------------
 src/hermes_types.h           | 13 ++++++++-----
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/data_placement_engine.cc b/src/data_placement_engine.cc
index aea6d6504..e773130dc 100644
--- a/src/data_placement_engine.cc
+++ b/src/data_placement_engine.cc
@@ -351,22 +351,24 @@ Status MinimizeIoTimePlacement(const std::vector<size_t> &blob_sizes,
   int last4 = 0;

   // Placement Ratio
-  for (size_t j {0}; j < num_targets-1; ++j) {
-    std::string row_name {"pr_row_" + std::to_string(j)};
-    glp_set_row_name(lp, num_constrts+j+1, row_name.c_str());
-    glp_set_row_bnds(lp, num_constrts+j+1, GLP_LO, 0.0, 0.0);
-
-    for (size_t i {0}; i < num_blobs; ++i) {
-      int ij = j * num_blobs + i + 1 + last3 + j;
-      ia[ij] = num_constrts+j+1, ja[ij] = j+2,
+  if (ctx.minimize_io_time_options.use_placement_ratio) {
+    for (size_t j {0}; j < num_targets-1; ++j) {
+      std::string row_name {"pr_row_" + std::to_string(j)};
+      glp_set_row_name(lp, num_constrts+j+1, row_name.c_str());
+      glp_set_row_bnds(lp, num_constrts+j+1, GLP_LO, 0.0, 0.0);
+
+      for
(size_t i {0}; i < num_blobs; ++i) {
+        int ij = j * num_blobs + i + 1 + last3 + j;
+        ia[ij] = num_constrts+j+1, ja[ij] = j+2,
         ar[ij] = static_cast<double>(blob_sizes[i]);

-      double placement_ratio = static_cast<double>(node_state[j+1])/
-                               node_state[j];
-      ij = ij + 1;
-      ia[ij] = num_constrts+j+1, ja[ij] = j+1,
+        double placement_ratio = static_cast<double>(node_state[j+1])/
+                                 node_state[j];
+        ij = ij + 1;
+        ia[ij] = num_constrts+j+1, ja[ij] = j+1,
         ar[ij] = static_cast<double>(blob_sizes[i])*(0-placement_ratio);

-      last4 = ij;
+        last4 = ij;
+      }
     }
   }

diff --git a/src/hermes_types.h b/src/hermes_types.h
index 30d999758..91d2c450b 100644
--- a/src/hermes_types.h
+++ b/src/hermes_types.h
@@ -56,11 +56,14 @@ enum class PlacementPolicy {
 struct MinimizeIoTimeOptions {
   double minimum_remaining_capacity;
   double capacity_change_threshold;
-
-  MinimizeIoTimeOptions(double minimum_remaining_capacity = 0.1,
-                        double capacity_change_threshold = 0.2)
-    : minimum_remaining_capacity(minimum_remaining_capacity),
-      capacity_change_threshold(capacity_change_threshold) {
+  bool use_placement_ratio;
+
+  MinimizeIoTimeOptions(double minimum_remaining_capacity_ = 0.0,
+                        double capacity_change_threshold_ = 0.0,
+                        bool use_placement_ratio_ = false)
+    : minimum_remaining_capacity(minimum_remaining_capacity_),
+      capacity_change_threshold(capacity_change_threshold_),
+      use_placement_ratio(use_placement_ratio_) {
   }
 };

From bc927c12d20b5b8cf5768476e8d9930252887d93 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 26 May 2022 12:56:05 -0500
Subject: [PATCH 34/85] Add Timer::reset

---
 test/test_utils.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_utils.h b/test/test_utils.h
index 906a508db..81d078f33 100644
--- a/test/test_utils.h
+++ b/test/test_utils.h
@@ -43,6 +43,9 @@ class Timer {
   double getElapsedTime() {
     return elapsed_time;
   }
+  void reset() {
+    elapsed_time = 0;
+  }
  private:
   std::chrono::high_resolution_clock::time_point t1;
   double elapsed_time;

From 43d54e23a01e36c134d5953312d8f12d7b5f3515 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 26 May 2022 14:03:49 -0500
Subject: [PATCH 35/85] More granular locking in DestroyBucket

---
 src/buffer_organizer.cc        |  5 -----
 src/metadata_storage_stb_ds.cc | 13 ++++++++-----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index 4f824c619..5c65323d5 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -191,10 +191,6 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc,
              << std::endl;

   MetadataManager *mdm = GetMetadataManagerFromContext(context);
-  // TODO(chogan): This locking is too aggressive but I don't know how else to
-  // solve the deadlock that results when the following block of code is running
-  // after LocalDestroyBlobByName holds MetadataManager::bucket_mutex
-  BeginTicketMutex(&mdm->bucket_mutex);
   if (LocalLockBlob(context, blob_id)) {
     auto warning_string = [](BufferID id) {
       std::ostringstream ss;
@@ -299,7 +295,6 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc,
   } else {
     LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n";
   }
-  EndTicketMutex(&mdm->bucket_mutex);
 }

 void LocalOrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,

diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc
index 1ab1764fe..bfef33de8 100644
--- a/src/metadata_storage_stb_ds.cc
+++ b/src/metadata_storage_stb_ds.cc
@@ -419,12 +419,11 @@ i64 GetIndexOfId(MetadataManager *mdm, ChunkedIdList *id_list, u64 id) {
   return result;
 }

-/** Assumes
MetadataManager::bucket_mutex is held by the caller. */ void LocalReplaceBlobIdInBucket(SharedMemoryContext *context, BucketID bucket_id, BlobID old_blob_id, BlobID new_blob_id) { MetadataManager *mdm = GetMetadataManagerFromContext(context); - // BeginTicketMutex(&mdm->bucket_mutex); + BeginTicketMutex(&mdm->bucket_mutex); BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); if (info && info->active) { @@ -440,7 +439,7 @@ void LocalReplaceBlobIdInBucket(SharedMemoryContext *context, ReleaseIdsPtr(mdm); } - // EndTicketMutex(&mdm->bucket_mutex); + EndTicketMutex(&mdm->bucket_mutex); } void LocalAddBlobIdToBucket(MetadataManager *mdm, BucketID bucket_id, @@ -622,8 +621,6 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc, BeginTicketMutex(&mdm->bucket_mutex); BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); - // TODO(chogan): @optimization Lock granularity can probably be relaxed if - // this is slow int ref_count = info->ref_count.load(); if (ref_count == 1) { if (HasAllocatedBlobs(info)) { @@ -638,9 +635,15 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc, } ReleaseIdsPtr(mdm); + // NOTE(chogan): Holding the mdm->bucket_mutex while destroying Blobs can + // result in deadlock if the BORG is in the middle of moving a Blob's + // Buffers. + EndTicketMutex(&mdm->bucket_mutex); for (auto blob_id : blobs_to_destroy) { DestroyBlobById(context, rpc, blob_id, bucket_id); } + BeginTicketMutex(&mdm->bucket_mutex); + // Delete BlobId list FreeIdList(mdm, info->blobs); } From be08c5269fed134b4a902f68bba0c8181e43ec68 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 3 Jun 2022 07:49:34 -0500 Subject: [PATCH 36/85] Time groups of Puts --- benchmarks/borg_bench.cc | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 601ed66b5..79bee61a9 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -53,6 +53,13 @@ int main(int argc, char *argv[]) { if (argc == 2) { use_borg = false; } + // int gdb_iii = 0; + // char gdb_DEBUG_hostname[256]; + // gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); + // printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname); + // fflush(stdout); + // while (0 == gdb_iii) + // sleep(5); HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); @@ -62,13 +69,10 @@ int main(int argc, char *argv[]) { hapi::Context ctx; // Disable swapping of Blobs ctx.disable_swap = true; - // disable MinimizeIoTime PlacementPolicy constraints - ctx.minimize_io_time_options.minimum_remaining_capacity = 0; - ctx.minimize_io_time_options.capacity_change_threshold = 0; std::string bkt_name = "BORG_" + std::to_string(rank); hapi::VBucket vbkt(bkt_name, hermes); - hapi::Bucket bkt(bkt_name, hermes); + hapi::Bucket bkt(bkt_name, hermes, ctx); hapi::WriteOnlyTrait trait; if (use_borg) { @@ -80,24 +84,40 @@ int main(int argc, char *argv[]) { std::iota(blob.begin(), blob.end(), 0); // MinIoTime with retry - const int kIters = 128; + const int kIters = 1500; + const int kReportFrequency = 30; + hermes::testing::Timer put_timer; size_t failed_puts = 0; size_t failed_links = 0; for (int i = 0; i < kIters; ++i) { std::string blob_name = ("b_" + std::to_string(rank) + "_" + std::to_string(i)); timer.resumeTime(); + put_timer.resumeTime(); hapi::Status status; + int consecutive_fails = 0; while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { failed_puts++; + if (++consecutive_fails > 10) { + 
break; + } } - if (use_borg) { + put_timer.pauseTime(); + + if (use_borg && consecutive_fails <= 10) { hapi::Status link_status = vbkt.Link(blob_name, bkt_name); if (!link_status.Succeeded()) { failed_links++; } } timer.pauseTime(); + if (i > 0 && i % kReportFrequency == 0) { + // TODO(chogan): Support more than 1 rank + constexpr double total_mb = (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; + + std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n"; + put_timer.reset(); + } hermes->AppBarrier(); } From 64985cfd39c6a0461b4bdc94ada7a22244c0e097 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 3 Jun 2022 10:10:31 -0500 Subject: [PATCH 37/85] Add bucket_delete_mutex --- benchmarks/borg_bench.cc | 3 ++- src/buffer_organizer.cc | 7 ++++++- src/memory_management.cc | 16 ++++++++++++++++ src/metadata_management.h | 2 ++ src/metadata_storage_stb_ds.cc | 6 ++++-- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 79bee61a9..ceb71e0cd 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -113,7 +113,8 @@ int main(int argc, char *argv[]) { timer.pauseTime(); if (i > 0 && i % kReportFrequency == 0) { // TODO(chogan): Support more than 1 rank - constexpr double total_mb = (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; + constexpr double total_mb = + (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n"; put_timer.reset(); diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 5c65323d5..13beb6be2 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -191,7 +191,8 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, << std::endl; MetadataManager *mdm = GetMetadataManagerFromContext(context); - if (LocalLockBlob(context, blob_id)) { + bool got_lock = BeginTicketMutexIfNoWait(&mdm->bucket_delete_mutex); + if (got_lock && LocalLockBlob(context, blob_id)) { auto warning_string = [](BufferID id) { std::ostringstream ss; ss << "BufferID" << id.as_int << " not found on this node\n"; @@ -295,6 +296,10 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, } else { LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; } + + if (got_lock) { + EndTicketMutex(&mdm->bucket_delete_mutex); + } } void LocalOrganizeBlob(SharedMemoryContext *context, RpcContext *rpc, diff --git a/src/memory_management.cc b/src/memory_management.cc index f4246c363..9bffe0f92 100644 --- a/src/memory_management.cc +++ b/src/memory_management.cc @@ -469,6 +469,22 @@ Ticket TryBeginTicketMutex(TicketMutex *mutex, Ticket *existing_ticket) { return result; } +/** + * + */ +bool BeginTicketMutexIfNoWait(TicketMutex *mutex) { + u32 serving = mutex->serving.load(); + u32 ticket = mutex->ticket.load(); + u32 next = ticket + 1; + + bool result = false; + if (serving == ticket) { + result = mutex->ticket.compare_exchange_strong(ticket, next); + } + + return result; +} + void BeginTicketMutex(TicketMutex *mutex) { u32 ticket = mutex->ticket.fetch_add(1); while (ticket != mutex->serving.load()) { diff --git a/src/metadata_management.h b/src/metadata_management.h index bd73dc8ca..3e8da6e2d 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -170,6 +170,8 @@ struct MetadataManager { /** Lock for accessing `BucketInfo` structures located at * `bucket_info_offset` */ TicketMutex bucket_mutex; + TicketMutex bucket_delete_mutex; + /** Lock for accessing `VBucketInfo` structures located at * 
`vbucket_info_offset` */ TicketMutex vbucket_mutex; diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc index bfef33de8..44497bc25 100644 --- a/src/metadata_storage_stb_ds.cc +++ b/src/metadata_storage_stb_ds.cc @@ -618,6 +618,7 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc, const char *bucket_name, BucketID bucket_id) { bool destroyed = false; MetadataManager *mdm = GetMetadataManagerFromContext(context); + BeginTicketMutex(&mdm->bucket_delete_mutex); BeginTicketMutex(&mdm->bucket_mutex); BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id); @@ -638,11 +639,11 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc, // NOTE(chogan): Holding the mdm->bucket_mutex while destroying Blobs can // result in deadlock if the BORG is in the middle of moving a Blob's // Buffers. - EndTicketMutex(&mdm->bucket_mutex); + // EndTicketMutex(&mdm->bucket_mutex); for (auto blob_id : blobs_to_destroy) { DestroyBlobById(context, rpc, blob_id, bucket_id); } - BeginTicketMutex(&mdm->bucket_mutex); + // BeginTicketMutex(&mdm->bucket_mutex); // Delete BlobId list FreeIdList(mdm, info->blobs); @@ -668,6 +669,7 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc, << ". It's refcount is " << ref_count << std::endl; } EndTicketMutex(&mdm->bucket_mutex); + EndTicketMutex(&mdm->bucket_delete_mutex); return destroyed; } From e6bdd3555357bf27aa5d395ae1fa38752ff3b2b1 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 3 Jun 2022 12:53:38 -0500 Subject: [PATCH 38/85] Add RwLock for BoMove and DestroyBucket --- src/buffer_organizer.cc | 4 +-- src/memory_management.cc | 49 ++++++++++++++++++++++++++++++++++ src/memory_management.h | 11 ++++++++ src/metadata_management.h | 2 +- src/metadata_storage_stb_ds.cc | 4 +-- 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 13beb6be2..80c1d8dc2 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -191,7 +191,7 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, << std::endl; MetadataManager *mdm = GetMetadataManagerFromContext(context); - bool got_lock = BeginTicketMutexIfNoWait(&mdm->bucket_delete_mutex); + bool got_lock = BeginReaderLock(&mdm->bucket_delete_lock); if (got_lock && LocalLockBlob(context, blob_id)) { auto warning_string = [](BufferID id) { std::ostringstream ss; @@ -298,7 +298,7 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, } if (got_lock) { - EndTicketMutex(&mdm->bucket_delete_mutex); + EndReaderLock(&mdm->bucket_delete_lock); } } diff --git a/src/memory_management.cc b/src/memory_management.cc index 9bffe0f92..cacbe01d6 100644 --- a/src/memory_management.cc +++ b/src/memory_management.cc @@ -504,4 +504,53 @@ void EndTicketMutex(TicketMutex *mutex) { mutex->serving.fetch_add(1); } +const int kAttemptsBeforeYield = 100; + +bool BeginReaderLock(RwLock *lock) { + bool result = false; + if (!lock->writer_waiting.load()) { + lock->readers++; + result = true; + } + + return result; +} + +void EndReaderLock(RwLock *lock) { + u32 readers = lock->readers.load(); + + int retry = 0; + while (true) { + if (readers > 0) { + if (lock->readers.compare_exchange_weak(readers, readers - 1)) { + break; + } + } + retry++; + if (retry > kAttemptsBeforeYield) { + retry = 0; + sched_yield(); + } + } +} + +void BeginWriterLock(RwLock *lock) { + lock->writer_waiting.store(true); + + int retry = 0; + while (lock->readers.load() > 0) { + retry++; + if (retry > kAttemptsBeforeYield) { + 
retry = 0;
+      sched_yield();
+    }
+  }
+  BeginTicketMutex(&lock->mutex);
+}
+
+void EndWriterLock(RwLock *lock) {
+  EndTicketMutex(&lock->mutex);
+  lock->writer_waiting.store(false);
+}
+
 }  // namespace hermes

diff --git a/src/memory_management.h b/src/memory_management.h
index cbe4dc26e..a629dd179 100644
--- a/src/memory_management.h
+++ b/src/memory_management.h
@@ -43,6 +43,12 @@ struct Ticket {
   bool acquired;
 };

+struct RwLock {
+  TicketMutex mutex;
+  std::atomic<u32> readers;
+  std::atomic<bool> writer_waiting;
+};
+
 struct ArenaInfo {
   size_t sizes[kArenaType_Count];
   size_t total;
@@ -374,6 +380,11 @@ u8 *HeapExtentToPtr(Heap *heap);
 void BeginTicketMutex(TicketMutex *mutex);
 void EndTicketMutex(TicketMutex *mutex);

+bool BeginReaderLock(RwLock *lock);
+void EndReaderLock(RwLock *lock);
+void BeginWriterLock(RwLock *lock);
+void EndWriterLock(RwLock *lock);
+
 }  // namespace hermes

 #endif  // HERMES_MEMORY_MANAGEMENT_H_

diff --git a/src/metadata_management.h b/src/metadata_management.h
index 3e8da6e2d..4671362b7 100644
--- a/src/metadata_management.h
+++ b/src/metadata_management.h
@@ -170,7 +170,7 @@ struct MetadataManager {
   /** Lock for accessing `BucketInfo` structures located at
    * `bucket_info_offset` */
   TicketMutex bucket_mutex;
-  TicketMutex bucket_delete_mutex;
+  RwLock bucket_delete_lock;

   /** Lock for accessing `VBucketInfo` structures located at
    * `vbucket_info_offset` */
   TicketMutex vbucket_mutex;

diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc
index 44497bc25..cb328f36f 100644
--- a/src/metadata_storage_stb_ds.cc
+++ b/src/metadata_storage_stb_ds.cc
@@ -618,7 +618,7 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc,
                         const char *bucket_name, BucketID bucket_id) {
   bool destroyed = false;
   MetadataManager *mdm = GetMetadataManagerFromContext(context);
-  BeginTicketMutex(&mdm->bucket_delete_mutex);
+  BeginWriterLock(&mdm->bucket_delete_lock);
   BeginTicketMutex(&mdm->bucket_mutex);
   BucketInfo *info = LocalGetBucketInfoById(mdm, bucket_id);

@@ -669,7 +669,7 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc,
                << ". 
It's refcount is " << ref_count << std::endl; } EndTicketMutex(&mdm->bucket_mutex); - EndTicketMutex(&mdm->bucket_delete_mutex); + EndWriterLock(&mdm->bucket_delete_lock); return destroyed; } From 4ceb6b02e5ccdc104d0c704d8337f89bdb8b753c Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 3 Jun 2022 13:08:24 -0500 Subject: [PATCH 39/85] Don't warn about locking blob when RwLock wasn't taken --- src/buffer_organizer.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 80c1d8dc2..fe9dd3f21 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -294,7 +294,9 @@ void BoMove(SharedMemoryContext *context, RpcContext *rpc, << internal_blob_name.substr(kBucketIdStringSize, std::string::npos) << std::endl; } else { - LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; + if (got_lock) { + LOG(WARNING) << "Couldn't lock BlobID " << blob_id.as_int << "\n"; + } } if (got_lock) { From a450103bd347fcd918fc6610431510e15ef26527 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 3 Jun 2022 14:27:19 -0500 Subject: [PATCH 40/85] More iters in borg_bench.cc --- benchmarks/borg_bench.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index ceb71e0cd..b33e0fae6 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -84,7 +84,7 @@ int main(int argc, char *argv[]) { std::iota(blob.begin(), blob.end(), 0); // MinIoTime with retry - const int kIters = 1500; + const int kIters = 2000; const int kReportFrequency = 30; hermes::testing::Timer put_timer; size_t failed_puts = 0; From d6b41cc98204ad2d36b48dbdf802ba34d712b493 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 8 Jun 2022 09:04:18 -0500 Subject: [PATCH 41/85] Adding results verification to borg_bench --- benchmarks/borg_bench.cc | 157 +++++++++++++++++++++++++++++++-------- 1 file changed, 128 insertions(+), 29 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index b33e0fae6..b9b06caf8 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -10,7 +10,9 @@ * have access to the file, you may request a copy from help@hdfgroup.org. 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +#include #include +#include #include @@ -21,6 +23,60 @@ #include "vbucket.h" #include "test_utils.h" +struct Options { + bool use_borg; + bool verify; + char *output_filename; +}; + +void PrintUsage(char *program) { + fprintf(stderr, "Usage: %s [-b ] [-f] \n", program); + fprintf(stderr, " -b\n"); + fprintf(stderr, " If present, enable the BORG.\n"); + fprintf(stderr, " -f\n"); + fprintf(stderr, " The filename of the persisted data (for correctness verification).\n"); + fprintf(stderr, " -v\n"); + fprintf(stderr, " If present, verify results at the end.\n"); +} + +Options HandleArgs(int argc, char **argv) { + Options result = {}; + int option = -1; + while ((option = getopt(argc, argv, "bf:hv")) != -1) { + switch (option) { + case 'h': { + PrintUsage(argv[0]); + exit(0); + } + case 'b': { + result.use_borg = true; + break; + } + case 'f': { + result.output_filename = optarg; + break; + } + case 'v': { + result.verify = true; + break; + } + default: { + PrintUsage(argv[0]); + exit(1); + } + } + } + if (optind < argc) { + fprintf(stderr, "non-option ARGV-elements: "); + while (optind < argc) { + fprintf(stderr, "%s ", argv[optind++]); + } + fprintf(stderr, "\n"); + } + return result; +} + + namespace hapi = hermes::api; using HermesPtr = std::shared_ptr; @@ -40,7 +96,18 @@ double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, return result; } +std::string MakeBlobName(int rank, int i) { + std::string result = std::to_string(rank) + "_" + std::to_string(i); + + return result; +} + int main(int argc, char *argv[]) { + const size_t kBlobSize = KILOBYTES(32); + const int kIters = 2000; + + Options options = HandleArgs(argc, argv); + int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { @@ -48,11 +115,6 @@ int main(int argc, char *argv[]) { return 1; } - bool use_borg = true; - - if (argc == 2) { - use_borg = false; - } // int gdb_iii = 0; // char gdb_DEBUG_hostname[256]; // gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); @@ -65,6 +127,9 @@ int main(int argc, char *argv[]) { if (hermes->IsApplicationCore()) { int rank = hermes->GetProcessRank(); + const int kNumRanks = hermes->GetNumProcesses(); + const size_t kTotalBytes = kNumRanks * kBlobSize * kIters; + hermes::testing::Timer timer; hapi::Context ctx; // Disable swapping of Blobs @@ -75,25 +140,21 @@ int main(int argc, char *argv[]) { hapi::Bucket bkt(bkt_name, hermes, ctx); hapi::WriteOnlyTrait trait; - if (use_borg) { + if (options.use_borg) { vbkt.Attach(&trait); } - const size_t kBlobSize = KILOBYTES(32); - hapi::Blob blob(kBlobSize); - std::iota(blob.begin(), blob.end(), 0); - // MinIoTime with retry - const int kIters = 2000; - const int kReportFrequency = 30; - hermes::testing::Timer put_timer; + // const int kReportFrequency = 30; + // hermes::testing::Timer put_timer; size_t failed_puts = 0; size_t failed_links = 0; for (int i = 0; i < kIters; ++i) { - std::string blob_name = ("b_" + std::to_string(rank) + "_" + - std::to_string(i)); + std::string blob_name = MakeBlobName(rank, i); + hapi::Blob blob(kBlobSize, i % 255); + timer.resumeTime(); - put_timer.resumeTime(); + // put_timer.resumeTime(); hapi::Status status; int consecutive_fails = 0; while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { @@ -102,23 +163,23 @@ int main(int argc, char *argv[]) { break; } } - put_timer.pauseTime(); + // put_timer.pauseTime(); - if 
(use_borg && consecutive_fails <= 10) {
+      if (options.use_borg && consecutive_fails <= 10) {
         hapi::Status link_status = vbkt.Link(blob_name, bkt_name);
         if (!link_status.Succeeded()) {
           failed_links++;
         }
       }
       timer.pauseTime();
-      if (i > 0 && i % kReportFrequency == 0) {
-        // TODO(chogan): Support more than 1 rank
-        constexpr double total_mb =
-          (kBlobSize * kReportFrequency) / 1024.0 / 1024.0;

-        std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n";
-        put_timer.reset();
-      }
+      // if (i > 0 && i % kReportFrequency == 0) {
+      //   // TODO(chogan): Support more than 1 rank
+      //   constexpr double total_mb =
+      //     (kBlobSize * kReportFrequency) / 1024.0 / 1024.0;

+      //   std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n";
+      //   put_timer.reset();
+      // }
       hermes->AppBarrier();
     }

     std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n";
     std::cout << " " << "failed links: " << failed_links << "\n";

     hermes->AppBarrier();
     if (!hermes->IsFirstRankOnNode()) {
       vbkt.Release();
       bkt.Release();
     }

     hermes->AppBarrier();
     if (hermes->IsFirstRankOnNode()) {
       vbkt.Destroy();
+      if (options.verify) {
+        hapi::VBucket file_vbucket(options.output_filename, hermes);
+        auto offset_map = std::unordered_map<std::string, hermes::u64>();
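+        // Layout sketch: rank i's j-th Blob is persisted at byte offset
+        // i * (kIters * kBlobSize) + j * kBlobSize, which is exactly the
+        // arithmetic the loop below records in offset_map.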
+        for (int i = 0; i < kNumRanks; ++i) {
+          for (int j = 0; j < kIters; ++j) {
+            std::string blob_name = MakeBlobName(i, j);
+            file_vbucket.Link(blob_name, options.output_filename, ctx);
+            const size_t kBytesPerRank = kIters * kBlobSize;
+            size_t offset = (i * kBytesPerRank) + (j * kBlobSize);
+            offset_map.emplace(blob_name, offset);
+          }
+        }
+        bool flush_synchronously = true;
+        hapi::PersistTrait persist_trait(options.output_filename, offset_map,
+                                         flush_synchronously);
+        file_vbucket.Attach(&persist_trait);
+
+        file_vbucket.Destroy();
+      }
       bkt.Destroy();
     }

     hermes->AppBarrier();

     MPI_Comm *comm = (MPI_Comm *)hermes->GetAppCommunicator();
-    int num_ranks = hermes->GetNumProcesses();
-    double total_mb = (kBlobSize * kIters * num_ranks) / 1024.0 / 1024.0;
+    double total_mb = kTotalBytes / 1024.0 / 1024.0;
     double bandwidth = GetBandwidth(timer.getElapsedTime(), total_mb, *comm,
-                                    num_ranks);
+                                    kNumRanks);

     if (hermes->IsFirstRankOnNode()) {
-      fprintf(stderr, "##################### %f MiB/s\n", bandwidth);
+      std::cout << "##################### " << bandwidth << " MiB/s\n";
     }
   }

   hermes->Finalize();

+  int rank;
+  int comm_size;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
+
+  const size_t kAppCores = comm_size - 1;
+  const size_t kTotalBytes = kAppCores * kIters * kBlobSize;
+  if (options.verify && rank == 0) {
+    std::vector<hermes::u8> data(kTotalBytes);
+    FILE *f = fopen(options.output_filename, "r");
+    Assert(f);
+    Assert(fseek(f, 0L, SEEK_END) == 0);
+    size_t file_size = ftell(f);
+    Assert(file_size == kTotalBytes);
+    Assert(fseek(f, 0L, SEEK_SET) == 0);
+    size_t result = fread(data.data(), kTotalBytes, 1, f);
+    Assert(result == 1);
+  }
+
   MPI_Finalize();

   return 0;

From a09d3ba60d3e1c44f77360f9157453689dc91bc2 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 8 Jun 2022 09:39:31 -0500
Subject: [PATCH 42/85] Verification working

---
 benchmarks/borg_bench.cc | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index b9b06caf8..8ccaa70a3 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -66,6 +66,11 @@ Options HandleArgs(int argc, char **argv) {
       }
     }
   }
+
+  if (result.verify && !result.output_filename) {
+    fprintf(stderr, "Please supply filename via -f\n");
+    exit(1);
+  }
   if (optind < argc) {
     fprintf(stderr, "non-option ARGV-elements: ");
     while (optind < argc) {
@@ -202,7 +207,7 @@ int main(int argc, char *argv[]) {
       for (int i = 0; i < kNumRanks; ++i) {
         for (int j = 0; j < kIters; ++j) {
           std::string blob_name = MakeBlobName(i, j);
-          file_vbucket.Link(blob_name, options.output_filename, ctx);
+          file_vbucket.Link(blob_name, bkt_name, ctx);
           const size_t kBytesPerRank = kIters * kBlobSize;
           size_t offset = (i * kBytesPerRank) + (j * kBlobSize);
           offset_map.emplace(blob_name, offset);
@@ -232,14 +237,14 @@ int main(int argc, char *argv[]) {

   hermes->Finalize();

-  int rank;
+  int my_rank;
   int comm_size;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

   const size_t kAppCores = comm_size - 1;
   const size_t kTotalBytes = kAppCores * kIters * kBlobSize;
-  if (options.verify && rank == 0) {
+  if (options.verify && my_rank == 0) {
     std::vector<hermes::u8> data(kTotalBytes);
     FILE *f = fopen(options.output_filename, "r");
     Assert(f);
@@ -249,6 +254,15 @@ int main(int argc, char *argv[]) {
     Assert(fseek(f, 0L, SEEK_SET) == 0);
     size_t result = fread(data.data(), kTotalBytes, 1, f);
     Assert(result == 1);
+
+    for (size_t rank = 0; rank < kAppCores; ++rank) {
+      for (size_t iter = 0; iter < kIters; ++iter) {
+        for (size_t byte = 0; byte < kBlobSize; ++byte) {
+          Assert(data[(rank * kIters * kBlobSize) + (iter * kBlobSize) + byte]
+                 == iter % 255);
+        }
+      }
+    }
   }

   MPI_Finalize();

From 7f2f4faa22fb5e0d4d62edaf647451cd327540b8 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 8 Jun 2022 09:42:59 -0500
Subject: [PATCH 43/85] Appease linter

---
 benchmarks/borg_bench.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 8ccaa70a3..6dd4ce10d 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -34,7 +34,8 @@ void PrintUsage(char *program) {
   fprintf(stderr, "  -b\n");
   fprintf(stderr, "     If present, enable the BORG.\n");
   fprintf(stderr, "  -f\n");
-  fprintf(stderr, "     The filename of the persisted data (for correctness verification).\n");
+  fprintf(stderr, "     The filename of the persisted data (for correctness "
+          "verification).\n");
   fprintf(stderr, "  -v\n");
   fprintf(stderr, "     If present, verify results at the end.\n");
 }
@@ -182,7 +183,8 @@ int main(int argc, char *argv[]) {
       //   constexpr double total_mb =
       //     (kBlobSize * kReportFrequency) / 1024.0 / 1024.0;

-      //   std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n";
+      //   std::cout << i << ", " << total_mb / put_timer.getElapsedTime()
+      //             << "\n";
       //   put_timer.reset();
       //  }
       hermes->AppBarrier();

From 03bb5d48745132936850c9871ec944f1c1012ce1 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 8 Jun 2022 11:35:53 -0500
Subject: [PATCH 44/85] Wait for outstanding BORG tasks before flushing a blob

---
 src/api/traits.cc                  | 10 ++++++++--
 src/metadata_management_internal.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/api/traits.cc b/src/api/traits.cc
index cacf4c945..d9337e1be 100644
--- a/src/api/traits.cc
+++ b/src/api/traits.cc
@@ -15,6 +15,7 @@
 #include

 #include "buffer_organizer.h"
+#include "metadata_management_internal.h"

 namespace hermes {
 namespace api {
@@ -137,9 +138,14 @@ void WriteOnlyTrait::onLink(HermesPtr hermes, TraitInput &input, Trait *trait) {

 void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input,
                               Trait *trait) {
-  (void)hermes;
-  (void)input;
   (void)trait;
+
+  BucketID bucket_id = GetBucketId(&hermes->context_, &hermes->rpc_,
+                                   input.bucket_name.c_str());
+  BlobID blob_id = GetBlobId(&hermes->context_, &hermes->rpc_, input.blob_name,
+                             bucket_id, false);
+
MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_); + WaitForOutstandingBlobOps(mdm, blob_id); } } // namespace api diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index df526788d..449f655d2 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -136,6 +136,7 @@ std::string LocalGetBucketNameById(SharedMemoryContext *context, BucketID blob_id); +void WaitForOutstandingBlobOps(MetadataManager *mdm, BlobID blob_id); int LocalGetNumOutstandingFlushingTasks(SharedMemoryContext *context, VBucketID id); int GetNumOutstandingFlushingTasks(SharedMemoryContext *context, From 6eef7940d4a430727fb626de1b9b2ddf62ec2328 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 8 Jun 2022 15:44:40 -0500 Subject: [PATCH 45/85] Revert wait in WriteOnlyTrait::onUnlink --- src/api/traits.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api/traits.cc b/src/api/traits.cc index d9337e1be..f74e060ee 100644 --- a/src/api/traits.cc +++ b/src/api/traits.cc @@ -138,14 +138,18 @@ void WriteOnlyTrait::onLink(HermesPtr hermes, TraitInput &input, Trait *trait) { void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input, Trait *trait) { + (void)hermes; + (void)input; (void)trait; +#if 0 BucketID bucket_id = GetBucketId(&hermes->context_, &hermes->rpc_, input.bucket_name.c_str()); BlobID blob_id = GetBlobId(&hermes->context_, &hermes->rpc_, input.blob_name, bucket_id, false); MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_); WaitForOutstandingBlobOps(mdm, blob_id); +#endif } } // namespace api From 09599ca5e358ddb8c8beafbd5dac513f7857069f Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 16 Jun 2022 09:06:02 -0500 Subject: [PATCH 46/85] WIP: Get all necessary info to EnforceCapacityThresholds --- src/buffer_organizer.cc | 30 ++++++++++++++++++---------- src/buffer_organizer.h | 2 +- src/metadata_management.cc | 32 ++++++++++++++++++++++-------- src/metadata_management.h | 11 ++++++++++ src/metadata_management_internal.h | 2 +- src/rpc_thallium.cc | 2 +- src/rpc_thallium.h | 25 +++++++++++++++++++++++ 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index fe9dd3f21..690434db4 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -431,21 +431,29 @@ void OrganizeBlob(SharedMemoryContext *context, RpcContext *rpc, } void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, - DeviceID devices_id) { + const ViolationInfo &info) { (void)context; (void)rpc; - (void)devices_id; - // TODO(chogan): Pass in whether it's min or max that's violated - // TODO(chogan): Pass in how much space needs to be filled/freed + // DeviceID device_id = info.device_id; - // while (max is violated) - // Choose largest buffer from least important Blob - // Move to lower tier - - // while (min is violated) - // Choose largest buffer from most important Blob - // Move to higher tier + switch (info.violation) { + case ThresholdViolation::kMin: { + // while (min is violated) + // Choose largest buffer from most important Blob + // Move to higher tier + break; + } + case ThresholdViolation::kMax: { + // while (max is violated) + // Choose largest buffer from least important Blob + // Move to lower tier + break; + } + default: { + HERMES_INVALID_CODE_PATH; + } + } } void LocalShutdownBufferOrganizer(SharedMemoryContext *context) { diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h index d45e81c58..a77a93911 
100644 --- a/src/buffer_organizer.h +++ b/src/buffer_organizer.h @@ -114,7 +114,7 @@ void EnqueueBoMove(RpcContext *rpc, const BoMoveList &moves, BlobID blob_id, BucketID bucket_id, const std::string &internal_name, BoPriority priority); void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, - DeviceID devices_id); + const ViolationInfo &info); } // namespace hermes #endif // HERMES_BUFFER_ORGANIZER_H_ diff --git a/src/metadata_management.cc b/src/metadata_management.cc index eabd647b2..4debbbad0 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -970,10 +970,10 @@ SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context) { return result; } -std::vector +std::vector LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, std::vector adjustments) { - std::vector result; + std::vector result; for (size_t i = 0; i < adjustments.size(); ++i) { SystemViewState *state = GetGlobalSystemViewState(context); @@ -990,9 +990,25 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, (f32)state->capacities[i] / (f32)state->bytes_available[i].load(); } - // TODO(chogan): Handle violation of bo_capacity_thresholds[i].min if (percentage_available > state->bo_capacity_thresholds[i].max) { - result.push_back((DeviceID)i); + float percentage_violation = + percentage_available - state->bo_capacity_thresholds[i].max; + ViolationInfo info = {}; + info.device_id = (DeviceID)i; + info.violation = ThresholdViolation::kMax; + info.violation_size = + (size_t)(percentage_violation * state->capacities[i]); + result.push_back(info); + } + if (percentage_available < state->bo_capacity_thresholds[i].min) { + float percentage_violation = + state->bo_capacity_thresholds[i].max - percentage_available; + ViolationInfo info = {}; + info.device_id = (DeviceID)i; + info.violation = ThresholdViolation::kMin; + info.violation_size = + (size_t)(percentage_violation * state->capacities[i]); + result.push_back(info); } } } @@ -1014,7 +1030,7 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, } } - std::vector devices_to_organize; + std::vector devices_to_organize; if (update_needed) { u32 target_node = mdm->global_system_view_state_node_id; if (target_node == rpc->node_id) { @@ -1022,9 +1038,9 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, LocalUpdateGlobalSystemViewState(context, adjustments); } else { devices_to_organize = - RpcCall>(rpc, target_node, - "RemoteUpdateGlobalSystemViewState", - adjustments); + RpcCall>(rpc, target_node, + "RemoteUpdateGlobalSystemViewState", + adjustments); } } diff --git a/src/metadata_management.h b/src/metadata_management.h index 4671362b7..70ae6793c 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -63,6 +63,17 @@ enum MapType { kMapType_Count }; +enum class ThresholdViolation { + kMin, + kMax +}; + +struct ViolationInfo { + DeviceID device_id; + ThresholdViolation violation; + size_t violation_size; +}; + struct Stats { u32 recency; u32 frequency; diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index 449f655d2..ac8c12bb8 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -75,7 +75,7 @@ void LocalPut(MetadataManager *mdm, const char *key, u64 val, MapType map_type); void LocalDelete(MetadataManager *mdm, const char *key, MapType map_type); u64 LocalGetRemainingTargetCapacity(SharedMemoryContext *context, TargetID id); -std::vector +std::vector 
LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, std::vector adjustments); SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context); diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 1af756076..1b6b40bf3 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -313,7 +313,7 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, // Probably should move it to a completely separate tl::engine. auto rpc_update_global_system_view_state = [context](const request &req, std::vector adjustments) { - std::vector result = + std::vector result = LocalUpdateGlobalSystemViewState(context, adjustments); req.respond(result); diff --git a/src/rpc_thallium.h b/src/rpc_thallium.h index 451539130..e3367745e 100644 --- a/src/rpc_thallium.h +++ b/src/rpc_thallium.h @@ -129,6 +129,11 @@ void serialize(A &ar, BufferInfo &info) { } #ifndef THALLIUM_USE_CEREAL + +// NOTE(chogan): Thallium's default serialization doesn't handle enums by +// default so we must write serialization code for all enums when we're not +// using cereal. + /** * Lets Thallium know how to serialize a MapType. * @@ -170,6 +175,19 @@ void load(A &ar, BoPriority &priority) { ar.read(&val, 1); priority = (BoPriority)val; } + +template +void save(A &ar, ThresholdViolation &violation) { + int val = (int)violation; + ar.write(&val, 1); +} + +template +void load(A &ar, ThresholdViolation &violation) { + int val = 0; + ar.read(&val, 1); + violation = (ThresholdViolation)val; +} #endif // #ifndef THALLIUM_USE_CEREAL @@ -198,6 +216,13 @@ void serialize(A &ar, BoTask &bo_task) { ar & bo_task.args; } +template +void serialize(A &ar, ViolationInfo &info) { + ar & info.device_id; + ar & info.violation; + ar & info.violation_size; +} + namespace api { template #ifndef THALLIUM_USE_CEREAL From c1f4594db9efc79f7cc237d4702c10293293f75a Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 17 Jun 2022 14:24:04 -0500 Subject: [PATCH 47/85] Rework global SVS to track all Target remaining capacities --- src/api/hermes.cc | 5 +- src/buffer_organizer.cc | 2 +- src/metadata_management.cc | 105 +++++++++++++++++++---------- src/metadata_management.h | 33 ++++++++- src/metadata_management_internal.h | 6 +- src/rpc_thallium.cc | 8 +-- 6 files changed, 108 insertions(+), 51 deletions(-) diff --git a/src/api/hermes.cc b/src/api/hermes.cc index 316a062be..d53615f38 100644 --- a/src/api/hermes.cc +++ b/src/api/hermes.cc @@ -256,7 +256,7 @@ SharedMemoryContext InitHermesCore(Config *config, CommunicationContext *comm, mdm->host_numbers_offset = (u8 *)rpc->host_numbers - (u8 *)shmem_base; } - InitMetadataManager(mdm, &arenas[kArenaType_MetaData], config, comm->node_id); + InitMetadataManager(mdm, rpc, &arenas[kArenaType_MetaData], config); InitMetadataStorage(&context, mdm, &arenas[kArenaType_MetaData], config); ShmemClientInfo *client_info = (ShmemClientInfo *)shmem_base; @@ -381,8 +381,7 @@ std::shared_ptr InitHermes(Config *config, bool is_daemon, double sleep_ms = config->system_view_state_update_interval_ms; StartGlobalSystemViewStateUpdateThread(&result->context_, &result->rpc_, - &result->trans_arena_, - sleep_ms); + &result->trans_arena_, sleep_ms); } WorldBarrier(&comm); diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 690434db4..44ababab2 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -435,7 +435,7 @@ void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, (void)context; (void)rpc; - // DeviceID device_id = info.device_id; + // 
DeviceID dev_id = info.device_id; switch (info.violation) { case ThresholdViolation::kMin: { diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 4debbbad0..9bea2a015 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -934,7 +934,7 @@ SystemViewState *GetLocalSystemViewState(SharedMemoryContext *context) { } std::vector LocalGetGlobalDeviceCapacities(SharedMemoryContext *context) { - SystemViewState *global_svs = GetGlobalSystemViewState(context); + GlobalSystemViewState *global_svs = GetGlobalSystemViewState(context); std::vector result(global_svs->num_devices); for (size_t i = 0; i < result.size(); ++i) { @@ -945,7 +945,7 @@ std::vector LocalGetGlobalDeviceCapacities(SharedMemoryContext *context) { } std::vector GetGlobalDeviceCapacities(SharedMemoryContext *context, - RpcContext *rpc) { + RpcContext *rpc) { MetadataManager *mdm = GetMetadataManagerFromContext(context); u32 target_node = mdm->global_system_view_state_node_id; @@ -961,55 +961,57 @@ std::vector GetGlobalDeviceCapacities(SharedMemoryContext *context, return result; } -SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context) { +GlobalSystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context) { MetadataManager *mdm = GetMetadataManagerFromContext(context); - SystemViewState *result = - (SystemViewState *)((u8 *)mdm + mdm->global_system_view_state_offset); + GlobalSystemViewState *result = + (GlobalSystemViewState *)((u8 *)mdm + mdm->global_system_view_state_offset); assert((u8 *)result != (u8 *)mdm); return result; } std::vector -LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, +LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, std::vector adjustments) { std::vector result; - - for (size_t i = 0; i < adjustments.size(); ++i) { - SystemViewState *state = GetGlobalSystemViewState(context); - if (adjustments[i]) { - state->bytes_available[i].fetch_add(adjustments[i]); - DLOG(INFO) << "DeviceID " << i << " adjusted by " << adjustments[i] - << " bytes\n"; + // TODO(chogan): Take node_id into account when updating GlobalSVS + for (size_t device_idx = 0; device_idx < adjustments.size(); ++device_idx) { + GlobalSystemViewState *state = GetGlobalSystemViewState(context); + if (adjustments[device_idx]) { + u32 target_idx = ((node_id - 1) * adjustments.size()) + device_idx; + state->bytes_available[target_idx].fetch_add(adjustments[device_idx]); + DLOG(INFO) << "DeviceID " << device_idx << "on node " << node_id + << " adjusted by " << adjustments[device_idx] << " bytes\n"; // Collect devices for which to trigger the BufferOrganizer if the // capacities are beyond the min/max thresholds float percentage_available = 0.0f; - if (state->bytes_available[i] > 0) { - percentage_available = - (f32)state->capacities[i] / (f32)state->bytes_available[i].load(); + if (state->bytes_available[target_idx] > 0) { + percentage_available = ((f32)state->capacities[device_idx] / + (f32)state->bytes_available[target_idx].load()); } - if (percentage_available > state->bo_capacity_thresholds[i].max) { - float percentage_violation = - percentage_available - state->bo_capacity_thresholds[i].max; - ViolationInfo info = {}; - info.device_id = (DeviceID)i; + ViolationInfo info = {}; + info.device_id = (DeviceID)device_idx; + info.node_id = node_id; + + float percentage_violation = 0.0f; + if (percentage_available > + state->bo_capacity_thresholds[device_idx].max) { + percentage_violation = + percentage_available - 
state->bo_capacity_thresholds[device_idx].max; info.violation = ThresholdViolation::kMax; - info.violation_size = - (size_t)(percentage_violation * state->capacities[i]); - result.push_back(info); } - if (percentage_available < state->bo_capacity_thresholds[i].min) { - float percentage_violation = - state->bo_capacity_thresholds[i].max - percentage_available; - ViolationInfo info = {}; - info.device_id = (DeviceID)i; + if (percentage_available < + state->bo_capacity_thresholds[device_idx].min) { + percentage_violation = + state->bo_capacity_thresholds[device_idx].max - percentage_available; info.violation = ThresholdViolation::kMin; - info.violation_size = - (size_t)(percentage_violation * state->capacities[i]); - result.push_back(info); } + + info.violation_size = + (size_t)(percentage_violation * state->capacities[device_idx]); + result.push_back(info); } } @@ -1022,6 +1024,7 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, BufferPool *pool = GetBufferPoolFromContext(context); bool update_needed = false; + // TODO(chogan): BufferPool code should post adjustments via 1-sided rpc. std::vector adjustments(pool->num_devices); for (size_t i = 0; i < adjustments.size(); ++i) { adjustments[i] = pool->capacity_adjustments[i].exchange(0); @@ -1035,7 +1038,7 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, u32 target_node = mdm->global_system_view_state_node_id; if (target_node == rpc->node_id) { devices_to_organize = - LocalUpdateGlobalSystemViewState(context, adjustments); + LocalUpdateGlobalSystemViewState(context, rpc->node_id, adjustments); } else { devices_to_organize = RpcCall>(rpc, target_node, @@ -1086,6 +1089,32 @@ SystemViewState *CreateSystemViewState(Arena *arena, Config *config) { return result; } +GlobalSystemViewState *CreateGlobalSystemViewState(RpcContext *rpc, Arena *arena, + Config *config) { + GlobalSystemViewState *result = + PushClearedStruct(arena); + result->num_devices = config->num_devices; + + for (int i = 0; i < result->num_devices; ++i) { + result->capacities[i] = config->capacities[i]; + // Min and max thresholds + result->bo_capacity_thresholds[i] = config->bo_capacity_thresholds[i]; + } + size_t num_targets = config->num_devices * rpc->num_nodes; + result->num_targets = num_targets; + result->bytes_available = + PushClearedArray>(arena, num_targets); + + for (u32 node_idx = 0; node_idx < rpc->num_nodes; ++node_idx) { + for (int device_idx = 0; device_idx < result->num_devices; ++device_idx) { + u64 index = (node_idx * result->num_devices) + device_idx; + result->bytes_available[index].store(result->capacities[device_idx]); + } + } + + return result; +} + std::string GetSwapFilename(MetadataManager *mdm, u32 node_id) { char *prefix = (char *)((u8 *)mdm + mdm->swap_filename_prefix_offset); char *suffix = (char *)((u8 *)mdm + mdm->swap_filename_suffix_offset); @@ -1140,11 +1169,11 @@ SwapBlob IdArrayToSwapBlob(BufferIdArray ids) { return result; } -void InitMetadataManager(MetadataManager *mdm, Arena *arena, Config *config, - int node_id) { +void InitMetadataManager(MetadataManager *mdm, RpcContext *rpc, Arena *arena, + Config *config) { // NOTE(chogan): All MetadataManager offsets are relative to the address of // the MDM itself. 
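  // Offsets are required, rather than raw pointers, because each process can
  // map the shared-memory segment at a different base address.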
- + u32 node_id = rpc->node_id; arena->error_handler = MetadataArenaErrorHandler; mdm->map_seed = 0x4E58E5DF; @@ -1160,9 +1189,11 @@ void InitMetadataManager(MetadataManager *mdm, Arena *arena, Config *config, // Initialize Global SystemViewState + // TODO(chogan): if (node_id == 1) { // NOTE(chogan): Only Node 1 has the Global SystemViewState - SystemViewState *global_state = CreateSystemViewState(arena, config); + GlobalSystemViewState *global_state = + CreateGlobalSystemViewState(rpc, arena, config); mdm->global_system_view_state_offset = GetOffsetFromMdm(mdm, global_state); } mdm->global_system_view_state_node_id = 1; diff --git a/src/metadata_management.h b/src/metadata_management.h index 70ae6793c..381ccb284 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -72,6 +72,7 @@ struct ViolationInfo { DeviceID device_id; ThresholdViolation violation; size_t violation_size; + u32 node_id; }; struct Stats { @@ -150,6 +151,34 @@ struct SystemViewState { int num_devices; }; +// TODO(chogan): +/** + * A snapshot view of the entire system's Targets' available capacities. + * + * This information is only stored on 1 node, designated by + * MetadataManager::global_system_view_state_node_id, and is only updated by 1 + * rank (the Hermes process on that node). Hence, it does not need to be stored + * in shared memory and we are able to use normal std containers. However, since + * multiple RPC handler threads can potentially update the `bytes_available` + * field concurrently, we must not do any operations on the vector itself. We + * can only do operations on the atomics within. The vector is created in + * StartGlobalSystemViewStateUpdateThread, and thereafter we can only call + * functions on the individual atomics (e.g., bytes_available[i].fetch_add), + * which is thread safe. 
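+ * The entry for a given (node, device) pair lives at index
+ * ((node_id - 1) * num_devices) + device_idx, matching the layout used in
+ * LocalUpdateGlobalSystemViewState.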
+ */ +struct GlobalSystemViewState { + /** The total number of buffering Targets in the system */ + u64 num_targets; + /** The number of devices per node */ + int num_devices; + u64 capacities[kMaxDevices]; + /** The remaining capacity of each Target in the system */ + std::atomic *bytes_available; + /** The min and max capacity thresholds (percentage) for each Target in the + * system */ + Thresholds bo_capacity_thresholds[kMaxDevices]; +}; + struct MetadataManager { // All offsets are relative to the beginning of the MDM ptrdiff_t bucket_info_offset; @@ -217,8 +246,8 @@ struct RpcContext; /** * */ -void InitMetadataManager(MetadataManager *mdm, Arena *arena, Config *config, - int node_id); +void InitMetadataManager(MetadataManager *mdm, RpcContext *rpc, Arena *arena, + Config *config); /** * diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index ac8c12bb8..d2f475215 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -76,9 +76,9 @@ void LocalDelete(MetadataManager *mdm, const char *key, MapType map_type); u64 LocalGetRemainingTargetCapacity(SharedMemoryContext *context, TargetID id); std::vector -LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, +LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, std::vector adjustments); -SystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context); +GlobalSystemViewState *GetGlobalSystemViewState(SharedMemoryContext *context); std::vector LocalGetGlobalDeviceCapacities(SharedMemoryContext *context); std::vector GetGlobalDeviceCapacities(SharedMemoryContext *context, RpcContext *rpc); @@ -87,7 +87,7 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context, void StartGlobalSystemViewStateUpdateThread(SharedMemoryContext *context, RpcContext *rpc, Arena *arena, - double slepp_ms); + double sleep_ms); void InitMetadataStorage(SharedMemoryContext *context, MetadataManager *mdm, Arena *arena, Config *config); diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 1b6b40bf3..9bc2f2c32 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -309,12 +309,10 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, req.respond(result); }; - // TODO(chogan): Only need this on mdm->global_system_view_state_node_id. - // Probably should move it to a completely separate tl::engine. 
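+  // NOTE: adjustments are applied under the id of the node servicing this
+  // RPC (rpc->node_id).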
auto rpc_update_global_system_view_state = - [context](const request &req, std::vector adjustments) { + [context, rpc](const request &req, std::vector adjustments) { std::vector result = - LocalUpdateGlobalSystemViewState(context, adjustments); + LocalUpdateGlobalSystemViewState(context, rpc->node_id, adjustments); req.respond(result); }; @@ -604,7 +602,7 @@ void StartBufferOrganizer(SharedMemoryContext *context, RpcContext *rpc, void StartGlobalSystemViewStateUpdateThread(SharedMemoryContext *context, RpcContext *rpc, Arena *arena, - double sleep_ms) { + double sleep_ms) { struct ThreadArgs { SharedMemoryContext *context; RpcContext *rpc; From 217467f3419f654dc1ce974359e52fa967432812 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 17 Jun 2022 14:26:20 -0500 Subject: [PATCH 48/85] Appease linter --- src/metadata_management.cc | 5 +++-- src/rpc_thallium.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 9bea2a015..48d25c723 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -1089,8 +1089,9 @@ SystemViewState *CreateSystemViewState(Arena *arena, Config *config) { return result; } -GlobalSystemViewState *CreateGlobalSystemViewState(RpcContext *rpc, Arena *arena, - Config *config) { +GlobalSystemViewState *CreateGlobalSystemViewState(RpcContext *rpc, + Arena *arena, + Config *config) { GlobalSystemViewState *result = PushClearedStruct(arena); result->num_devices = config->num_devices; diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 9bc2f2c32..bfba2cf82 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -602,7 +602,7 @@ void StartBufferOrganizer(SharedMemoryContext *context, RpcContext *rpc, void StartGlobalSystemViewStateUpdateThread(SharedMemoryContext *context, RpcContext *rpc, Arena *arena, - double sleep_ms) { + double sleep_ms) { struct ThreadArgs { SharedMemoryContext *context; RpcContext *rpc; From 7764302eb4e2000a06fce5c2aa3265723641ffb8 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 21 Jun 2022 09:35:46 -0500 Subject: [PATCH 49/85] Fix division bug --- src/metadata_management.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 48d25c723..35d6fff5b 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -974,21 +974,20 @@ std::vector LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, std::vector adjustments) { std::vector result; - // TODO(chogan): Take node_id into account when updating GlobalSVS for (size_t device_idx = 0; device_idx < adjustments.size(); ++device_idx) { GlobalSystemViewState *state = GetGlobalSystemViewState(context); if (adjustments[device_idx]) { u32 target_idx = ((node_id - 1) * adjustments.size()) + device_idx; state->bytes_available[target_idx].fetch_add(adjustments[device_idx]); - DLOG(INFO) << "DeviceID " << device_idx << "on node " << node_id + DLOG(INFO) << "DeviceID " << device_idx << " on node " << node_id << " adjusted by " << adjustments[device_idx] << " bytes\n"; // Collect devices for which to trigger the BufferOrganizer if the // capacities are beyond the min/max thresholds float percentage_available = 0.0f; if (state->bytes_available[target_idx] > 0) { - percentage_available = ((f32)state->capacities[device_idx] / - (f32)state->bytes_available[target_idx].load()); + percentage_available = ((f32)state->bytes_available[target_idx].load() / + (f32)state->capacities[device_idx]); } 
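      // percentage_available is the fraction of the Target's capacity that is
      // still free (1.0 means completely empty).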
       ViolationInfo info = {};

From 87ebfa73f9c2535fe0943417a29a7c70e711b648 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Tue, 21 Jun 2022 09:47:49 -0500
Subject: [PATCH 50/85] Don't return false threshold violations

---
 src/metadata_management.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/metadata_management.cc b/src/metadata_management.cc
index 35d6fff5b..4338b7e93 100644
--- a/src/metadata_management.cc
+++ b/src/metadata_management.cc
@@ -991,10 +991,8 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id,
       }

       ViolationInfo info = {};
-      info.device_id = (DeviceID)device_idx;
-      info.node_id = node_id;
-
       float percentage_violation = 0.0f;
+
       if (percentage_available >
           state->bo_capacity_thresholds[device_idx].max) {
         percentage_violation =
@@ -1008,9 +1006,13 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id,
         info.violation = ThresholdViolation::kMin;
       }

-      info.violation_size =
-        (size_t)(percentage_violation * state->capacities[device_idx]);
-      result.push_back(info);
+      if (percentage_violation > 0.0f) {
+        info.device_id = (DeviceID)device_idx;
+        info.node_id = node_id;
+        info.violation_size =
+          (size_t)(percentage_violation * state->capacities[device_idx]);
+        result.push_back(info);
+      }
     }
   }

From af60a353e6da33a95e19cdc1f6df9d0b6e18bfdf Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 22 Jun 2022 07:47:39 -0500
Subject: [PATCH 51/85] Adding more options to borg_bench

---
 benchmarks/borg_bench.cc | 70 ++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 6dd4ce10d..91401d556 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -26,6 +26,8 @@
 struct Options {
   bool use_borg;
   bool verify;
+  bool time_puts;
+  long sleep_ms;
   char *output_filename;
 };

@@ -36,6 +38,10 @@ void PrintUsage(char *program) {
   fprintf(stderr, "  -f\n");
   fprintf(stderr, "     The filename of the persisted data (for correctness"
                   " verification).\n");
+  fprintf(stderr, "  -p\n");
+  fprintf(stderr, "     Get average for groups of puts.\n");
+  fprintf(stderr, "  -s\n");
+  fprintf(stderr, "     Sleep ms between each Put.\n");
   fprintf(stderr, "  -v\n");
   fprintf(stderr, "     If present, verify results at the end.\n");
 }
@@ -43,7 +49,7 @@ Options HandleArgs(int argc, char **argv) {
   Options result = {};
   int option = -1;
-  while ((option = getopt(argc, argv, "bf:hv")) != -1) {
+  while ((option = getopt(argc, argv, "bf:hps:v")) != -1) {
     switch (option) {
       case 'h': {
         PrintUsage(argv[0]);
@@ -57,6 +63,14 @@ Options HandleArgs(int argc, char **argv) {
         result.output_filename = optarg;
         break;
       }
+      case 'p': {
+        result.time_puts = true;
+        break;
+      }
+      case 's': {
+        result.sleep_ms = strtol(optarg, NULL, 0);
+        break;
+      }
       case 'v': {
         result.verify = true;
         break;
       }
@@ -140,6 +154,7 @@ int main(int argc, char *argv[]) {
   hapi::Context ctx;
   // Disable swapping of Blobs
   ctx.disable_swap = true;
+  // ctx.policy = hapi::PlacementPolicy::kRoundRobin;

   std::string bkt_name = "BORG_" + std::to_string(rank);
   hapi::VBucket vbkt(bkt_name, hermes);
@@ -151,25 +166,26 @@ int main(int argc, char *argv[]) {
   }

   // MinIoTime with retry
-  // const int kReportFrequency = 30;
-  // hermes::testing::Timer put_timer;
+  const int kReportFrequency = 30;
+  hermes::testing::Timer put_timer;
   size_t failed_puts = 0;
   size_t failed_links = 0;
+  size_t retries = 0;
   for (int i = 0; i < kIters; ++i) {
     std::string blob_name = MakeBlobName(rank, i);
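    // Each Blob is filled with the byte value (i % 255); the verification
    // pass recomputes this pattern when checking the persisted file.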
hapi::Blob blob(kBlobSize, i % 255); timer.resumeTime(); - // put_timer.resumeTime(); + put_timer.resumeTime(); hapi::Status status; int consecutive_fails = 0; while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { + retries++; + if (++consecutive_fails > 10) { failed_puts++; - if (++consecutive_fails > 10) { - break; - } + break; + } } - // put_timer.pauseTime(); if (options.use_borg && consecutive_fails <= 10) { hapi::Status link_status = vbkt.Link(blob_name, bkt_name); @@ -177,21 +193,32 @@ int main(int argc, char *argv[]) { failed_links++; } } + + if (options.sleep_ms > 0 && i > 0 && i % kReportFrequency == 0) { + std::this_thread::sleep_for( + std::chrono::milliseconds(options.sleep_ms)); + } + + put_timer.pauseTime(); timer.pauseTime(); - // if (i > 0 && i % kReportFrequency == 0) { - // // TODO(chogan): Support more than 1 rank - // constexpr double total_mb = - // (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; - - // std::cout << i << ", " << total_mb / put_timer.getElapsedTime() - // << "\n"; - // put_timer.reset(); - // } + + if (options.time_puts && i > 0 && i % kReportFrequency == 0) { + // TODO(chogan): Support more than 1 rank + Assert(kNumRanks == 1); + constexpr double total_mb = + (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; + + std::cout << i << ", " << total_mb / put_timer.getElapsedTime() + << "\n"; + put_timer.reset(); + } hermes->AppBarrier(); } - std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n"; - std::cout << " " << "failed links: " << failed_links << "\n"; + Assert(failed_puts == 0); + // std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n"; + // std::cout << "Rank " << rank << " failed links: " << failed_links << "\n"; + // std::cout << "Rank " << rank << " Put retries: " << retries << "\n"; hermes->AppBarrier(); if (!hermes->IsFirstRankOnNode()) { @@ -218,6 +245,7 @@ int main(int argc, char *argv[]) { bool flush_synchronously = true; hapi::PersistTrait persist_trait(options.output_filename, offset_map, flush_synchronously); + std::cout << "Flushing buffers...\n"; file_vbucket.Attach(&persist_trait); file_vbucket.Destroy(); @@ -233,7 +261,8 @@ int main(int argc, char *argv[]) { kNumRanks); if (hermes->IsFirstRankOnNode()) { - std::cout << "##################### " << bandwidth << " MiB/s\n"; + std::cout << bandwidth << "," << kNumRanks << "," << options.use_borg + << "," << options.sleep_ms << "\n"; } } @@ -248,6 +277,7 @@ int main(int argc, char *argv[]) { const size_t kTotalBytes = kAppCores * kIters * kBlobSize; if (options.verify && my_rank == 0) { std::vector data(kTotalBytes); + std::cout << "Verifying data\n"; FILE *f = fopen(options.output_filename, "r"); Assert(f); Assert(fseek(f, 0L, SEEK_END) == 0); From 4020727bec4f6211cb93d6e0e83696c270a2ffa6 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 22 Jun 2022 08:14:40 -0500 Subject: [PATCH 52/85] Add more options to borg_bench --- benchmarks/borg_bench.cc | 51 +++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 91401d556..9ccf14e74 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -27,6 +27,8 @@ struct Options { bool use_borg; bool verify; bool time_puts; + bool verbose; + bool debug; long sleep_ms; char *output_filename; }; @@ -35,6 +37,8 @@ void PrintUsage(char *program) { fprintf(stderr, "Usage: %s [-b ] [-f] \n", program); fprintf(stderr, " -b\n"); fprintf(stderr, " If present, enable the BORG.\n"); + 
   fprintf(stderr, "  -d\n");
+  fprintf(stderr, "     If present, enable MPI breakpoint for debugging.\n");
   fprintf(stderr, "  -f\n");
   fprintf(stderr, "     The filename of the persisted data (for correctness"
                   " verification).\n");
   fprintf(stderr, "  -p\n");
   fprintf(stderr, "     Get average for groups of puts.\n");
   fprintf(stderr, "  -s\n");
   fprintf(stderr, "     Sleep ms between each Put.\n");
   fprintf(stderr, "  -v\n");
+  fprintf(stderr, "     Print verbose information.\n");
+  fprintf(stderr, "  -x\n");
   fprintf(stderr, "     If present, verify results at the end.\n");
 }
@@ -43,7 +49,7 @@ Options HandleArgs(int argc, char **argv) {
   Options result = {};
   int option = -1;
-  while ((option = getopt(argc, argv, "bf:hps:v")) != -1) {
+  while ((option = getopt(argc, argv, "bdf:hps:vx")) != -1) {
     switch (option) {
       case 'h': {
         PrintUsage(argv[0]);
@@ -59,6 +65,10 @@ Options HandleArgs(int argc, char **argv) {
         result.use_borg = true;
         break;
       }
+      case 'd': {
+        result.debug = true;
+        break;
+      }
       case 'f': {
         result.output_filename = optarg;
         break;
       }
@@ -72,6 +82,10 @@ Options HandleArgs(int argc, char **argv) {
         break;
       }
       case 'v': {
+        result.verbose = true;
+        break;
+      }
+      case 'x': {
         result.verify = true;
         break;
       }
@@ -135,13 +149,15 @@ int main(int argc, char *argv[]) {
     return 1;
   }

-  // int gdb_iii = 0;
-  // char gdb_DEBUG_hostname[256];
-  // gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname));
-  // printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname);
-  // fflush(stdout);
-  // while (0 == gdb_iii)
-  //   sleep(5);
+  if (options.debug) {
+    int gdb_iii = 0;
+    char gdb_DEBUG_hostname[256];
+    gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname));
+    printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname);
+    fflush(stdout);
+    while (0 == gdb_iii)
+      sleep(5);
+  }

   HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF"));

@@ -203,7 +219,6 @@ int main(int argc, char *argv[]) {

       if (options.time_puts && i > 0 && i % kReportFrequency == 0) {
-        // TODO(chogan): Support more than 1 rank
         Assert(kNumRanks == 1);
         constexpr double total_mb =
           (kBlobSize * kReportFrequency) / 1024.0 / 1024.0;
@@ -216,9 +231,11 @@ int main(int argc, char *argv[]) {
   }

   Assert(failed_puts == 0);
-  // std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n";
-  // std::cout << "Rank " << rank << " failed links: " << failed_links << "\n";
-  // std::cout << "Rank " << rank << " Put retries: " << retries << "\n";
+  if (options.verbose) {
+    std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n";
+    std::cout << "Rank " << rank << " failed links: " << failed_links << "\n";
+    std::cout << "Rank " << rank << " Put retries: " << retries << "\n";
+  }

   hermes->AppBarrier();
   if (!hermes->IsFirstRankOnNode()) {
@@ -245,7 +262,9 @@ int main(int argc, char *argv[]) {
       bool flush_synchronously = true;
       hapi::PersistTrait persist_trait(options.output_filename, offset_map,
                                        flush_synchronously);
-      std::cout << "Flushing buffers...\n";
+      if (options.verbose) {
+        std::cout << "Flushing buffers...\n";
+      }
       file_vbucket.Attach(&persist_trait);

       file_vbucket.Destroy();
@@ -277,7 +296,11 @@ int main(int argc, char *argv[]) {
   const size_t kTotalBytes = kAppCores * kIters * kBlobSize;
   if (options.verify && my_rank == 0) {
     std::vector data(kTotalBytes);
-    std::cout << "Verifying data\n";
+
+    if (options.verbose) {
+      std::cout << "Verifying data\n";
+    }
+
     FILE *f = fopen(options.output_filename, "r");
     Assert(f);
     Assert(fseek(f, 0L, SEEK_END) == 0);

From e535e806b4e7b380cae0ff171bfbbb9e0eec9f41 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 22 Jun 2022 12:59:28 -0500
Subject: [PATCH 53/85] [skip ci] Add BlobInfo::effective_target

---
 src/buffer_organizer.cc            | 18 +++++++++++++-----
 src/buffer_organizer.h             |  4 +++-
 src/buffer_pool.cc                 | 15 ++++++++++++---
 src/buffer_pool.h                  |  1 +
 src/hermes_types.h                 |  6 ++++++
 src/metadata_management.cc         | 25 +++++++++++++++++--------
 src/metadata_management.h          | 13 ++++---------
 src/metadata_management_internal.h |  2 +-
 src/rpc_thallium.cc                | 13 +++++++++++--
 src/rpc_thallium.h                 |  2 +-
 10 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index 44ababab2..061a9681d 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -431,23 +431,31 @@ void OrganizeBlob(SharedMemoryContext *context, RpcContext *rpc,
 }

 void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc,
-                               const ViolationInfo &info) {
-  (void)context;
-  (void)rpc;
-
-  // DeviceID dev_id = info.device_id;
+                               ViolationInfo info) {
+  u32 target_node = info.target_id.bits.node_id;
+  if (target_node == rpc->node_id) {
+    LocalEnforceCapacityThresholds(context, info);
+  } else {
+    RpcCall<bool>(rpc, target_node, "RemoteEnforceCapacityThresholds", info);
+  }
+}

+void LocalEnforceCapacityThresholds(SharedMemoryContext *context,
+                                    ViolationInfo info) {
   switch (info.violation) {
     case ThresholdViolation::kMin: {
       // while (min is violated)
       //   Choose largest buffer from most important Blob
       //   Move to higher tier
+      //   Ensure info.violation_size has been moved
       break;
     }
     case ThresholdViolation::kMax: {
       // while (max is violated)
       //   Choose largest buffer from least important Blob
+      //   find least important blob
       //   Move to lower tier
+      //   Ensure info.violation_size has been moved
       break;
     }
     default: {
diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h
index a77a93911..87178c7b4 100644
--- a/src/buffer_organizer.h
+++ b/src/buffer_organizer.h
@@ -114,7 +114,9 @@ void EnqueueBoMove(RpcContext *rpc, const BoMoveList &moves, BlobID blob_id,
                    BucketID bucket_id, const std::string &internal_name,
                    BoPriority priority);
 void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc,
-                               const ViolationInfo &info);
+                               ViolationInfo info);
+void LocalEnforceCapacityThresholds(SharedMemoryContext *context,
+                                    ViolationInfo info);
 }  // namespace hermes

 #endif  // HERMES_BUFFER_ORGANIZER_H_
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
index 59b68ccb4..26c7cf6ae 100644
--- a/src/buffer_pool.cc
+++ b/src/buffer_pool.cc
@@ -781,7 +781,7 @@ Device *InitDevices(Arena *arena, Config *config, f32 &min_bw, f32 &max_bw) {

 Target *InitTargets(Arena *arena, Config *config, Device *devices,
                     int node_id) {
-  Target *result = PushArray<Target>(arena, config->num_targets);
+  Target *result = PushClearedArray<Target>(arena, config->num_targets);

   if (config->num_targets != config->num_devices) {
     HERMES_NOT_IMPLEMENTED_YET;
@@ -1723,7 +1723,9 @@ SwapBlob PutToSwap(SharedMemoryContext *context, RpcContext *rpc,
   u32 target_node = rpc->node_id;
   SwapBlob swap_blob = WriteToSwap(context, blob, target_node, bucket_id);
   std::vector<BufferID> buffer_ids = SwapBlobToVec(swap_blob);
-  AttachBlobToBucket(context, rpc, name.c_str(), bucket_id, buffer_ids, true);
+  TargetID effective_target = {};
+  AttachBlobToBucket(context, rpc, name.c_str(), bucket_id, buffer_ids,
+                     effective_target, true);

   return swap_blob;
 }
@@ -1768,9 +1770,16 @@ api::Status PlaceBlob(SharedMemoryContext *context, RpcContext *rpc,
     WriteBlobToBuffers(context, rpc, blob, buffer_ids);
     HERMES_END_TIMED_BLOCK();

+    std::pair<size_t, TargetID> max_target =
*std::max_element(schema.begin(), schema.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + TargetID effective_target = max_target.second; + // NOTE(chogan): Update all metadata associated with this Put AttachBlobToBucket(context, rpc, name.c_str(), bucket_id, buffer_ids, - false, called_from_buffer_organizer); + effective_target, false, called_from_buffer_organizer); } else { if (ctx.disable_swap) { result = PLACE_SWAP_BLOB_TO_BUF_FAILED; diff --git a/src/buffer_pool.h b/src/buffer_pool.h index 707303198..92e183c7c 100644 --- a/src/buffer_pool.h +++ b/src/buffer_pool.h @@ -75,6 +75,7 @@ struct Target { u64 capacity; std::atomic remaining_space; std::atomic speed; + ChunkedIdList effective_blobs; }; /** diff --git a/src/hermes_types.h b/src/hermes_types.h index 91d2c450b..59565d916 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -43,6 +43,12 @@ typedef double f64; typedef u16 DeviceID; +struct ChunkedIdList { + u32 head_offset; + u32 length; + u32 capacity; +}; + namespace api { typedef std::vector Blob; diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 4338b7e93..520ff171e 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -651,29 +651,32 @@ BufferIdArray GetBufferIdsFromBlobId(Arena *arena, } void LocalCreateBlobMetadata(MetadataManager *mdm, const std::string &blob_name, - BlobID blob_id) { + BlobID blob_id, TargetID effective_target) { LocalPut(mdm, blob_name.c_str(), blob_id.as_int, kMapType_BlobId); BlobInfo blob_info = {}; blob_info.stats.frequency = 1; blob_info.stats.recency = mdm->clock++; + blob_info.effective_target = effective_target; LocalPut(mdm, blob_id, blob_info); } void CreateBlobMetadata(MetadataManager *mdm, RpcContext *rpc, - const std::string &blob_name, BlobID blob_id) { + const std::string &blob_name, BlobID blob_id, + TargetID effective_target) { u32 target_node = GetBlobNodeId(blob_id); if (target_node == rpc->node_id) { - LocalCreateBlobMetadata(mdm, blob_name, blob_id); + LocalCreateBlobMetadata(mdm, blob_name, blob_id, effective_target); } else { RpcCall(rpc, target_node, "RemoteCreateBlobMetadata", blob_name, - blob_id); + blob_id, effective_target); } } void AttachBlobToBucket(SharedMemoryContext *context, RpcContext *rpc, const char *blob_name, BucketID bucket_id, const std::vector &buffer_ids, - bool is_swap_blob, bool called_from_buffer_organizer) { + TargetID effective_target, bool is_swap_blob, + bool called_from_buffer_organizer) { MetadataManager *mdm = GetMetadataManagerFromContext(context); std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); @@ -700,7 +703,7 @@ void AttachBlobToBucket(SharedMemoryContext *context, RpcContext *rpc, blob_id.bits.buffer_ids_offset = AllocateBufferIdList(context, rpc, target_node, buffer_ids); - CreateBlobMetadata(mdm, rpc, internal_name, blob_id); + CreateBlobMetadata(mdm, rpc, internal_name, blob_id, effective_target); AddBlobIdToBucket(mdm, rpc, blob_id, bucket_id); } @@ -1007,8 +1010,14 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, } if (percentage_violation > 0.0f) { - info.device_id = (DeviceID)device_idx; - info.node_id = node_id; + TargetID target_id = {}; + target_id.bits.node_id = node_id; + target_id.bits.device_id = (DeviceID)device_idx; + // TODO(chogan): This needs to change when we support num_devices != + // num_targets + target_id.bits.index = device_idx; + + info.target_id = target_id; info.violation_size = (size_t)(percentage_violation * 
state->capacities[device_idx]); result.push_back(info); diff --git a/src/metadata_management.h b/src/metadata_management.h index 381ccb284..2d85387a4 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -69,10 +69,9 @@ enum class ThresholdViolation { }; struct ViolationInfo { - DeviceID device_id; + TargetID target_id; ThresholdViolation violation; size_t violation_size; - u32 node_id; }; struct Stats { @@ -82,12 +81,6 @@ struct Stats { const int kIdListChunkSize = 10; -struct ChunkedIdList { - u32 head_offset; - u32 length; - u32 capacity; -}; - struct IdList { u32 head_offset; u32 length; @@ -101,6 +94,7 @@ struct BufferIdArray { struct BlobInfo { Stats stats; TicketMutex lock; + TargetID effective_target; u32 last; bool stop; @@ -109,6 +103,7 @@ struct BlobInfo { stats.frequency = 0; lock.ticket.store(0); lock.serving.store(0); + effective_target.as_int = 0; } BlobInfo& operator=(const BlobInfo &other) { @@ -337,7 +332,7 @@ VBucketID GetOrCreateVBucketId(SharedMemoryContext *context, RpcContext *rpc, void AttachBlobToBucket(SharedMemoryContext *context, RpcContext *rpc, const char *blob_name, BucketID bucket_id, const std::vector &buffer_ids, - bool is_swap_blob = false, + TargetID effective_target, bool is_swap_blob = false, bool called_from_buffer_organizer = false); /** diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index d2f475215..363ba9e3f 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -142,7 +142,7 @@ int LocalGetNumOutstandingFlushingTasks(SharedMemoryContext *context, int GetNumOutstandingFlushingTasks(SharedMemoryContext *context, RpcContext *rpc, VBucketID id); void LocalCreateBlobMetadata(MetadataManager *mdm, const std::string &blob_name, - BlobID blob_id); + BlobID blob_id, TargetID effective_target); Heap *GetIdHeap(MetadataManager *mdm); Heap *GetMapHeap(MetadataManager *mdm); IdList AllocateIdList(MetadataManager *mdm, u32 length); diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index bfba2cf82..3e419801e 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -419,9 +419,9 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, auto rpc_create_blob_metadata = [context](const request &req, const std::string &blob_name, - BlobID blob_id) { + BlobID blob_id, TargetID effective_target) { MetadataManager *mdm = GetMetadataManagerFromContext(context); - LocalCreateBlobMetadata(mdm, blob_name, blob_id); + LocalCreateBlobMetadata(mdm, blob_name, blob_id, effective_target); req.respond(true); }; @@ -434,6 +434,13 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, req.respond(true); }; + auto rpc_enforce_capacity_thresholds = [context](const request &req, + ViolationInfo info) { + LocalEnforceCapacityThresholds(context, info); + // TODO(chogan): Can this be async? + req.respond(true); + }; + // TODO(chogan): Currently these three are only used for testing. 
rpc_server->define("GetBuffers", rpc_get_buffers); rpc_server->define("SplitBuffers", rpc_split_buffers).disable_response(); @@ -499,6 +506,8 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, rpc_server->define("RemoteCreateBlobMetadata", rpc_create_blob_metadata); rpc_server->define("RemoteReplaceBlobIdInBucket", rpc_replace_blob_id_in_bucket); + rpc_server->define("RemoteEnforceCapacityThresholds", + rpc_enforce_capacity_thresholds); } void StartBufferOrganizer(SharedMemoryContext *context, RpcContext *rpc, diff --git a/src/rpc_thallium.h b/src/rpc_thallium.h index e3367745e..6a2aeb700 100644 --- a/src/rpc_thallium.h +++ b/src/rpc_thallium.h @@ -218,7 +218,7 @@ void serialize(A &ar, BoTask &bo_task) { template void serialize(A &ar, ViolationInfo &info) { - ar & info.device_id; + ar & info.target_id; ar & info.violation; ar & info.violation_size; } From 15affb987935f5d5664443207dd3ee59c708efd0 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 22 Jun 2022 13:30:59 -0500 Subject: [PATCH 54/85] [skip ci] Track Target::effective_blobs --- src/buffer_organizer.cc | 11 ++++++----- src/buffer_pool.h | 1 + src/metadata_management.cc | 20 +++++++++++++++----- src/metadata_management_internal.h | 6 ++++-- src/rpc_thallium.cc | 3 ++- 5 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 061a9681d..c00bd272d 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -445,16 +445,17 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, switch (info.violation) { case ThresholdViolation::kMin: { // while (min is violated) - // Choose largest buffer from most important Blob - // Move to higher tier + // find most important blob in Target::effective_blobs + // Choose largest buffer from most important Blob + // Move to higher tier // Ensure info.violation_size has been moved break; } case ThresholdViolation::kMax: { // while (max is violated) - // Choose largest buffer from least important Blob - // find least important blob - // Move to lower tier + // find least important blob in Target + // Choose largest buffer from least important Blob + // Move to lower tier // Ensure info.violation_size has been moved break; } diff --git a/src/buffer_pool.h b/src/buffer_pool.h index 92e183c7c..3002b69f9 100644 --- a/src/buffer_pool.h +++ b/src/buffer_pool.h @@ -76,6 +76,7 @@ struct Target { std::atomic remaining_space; std::atomic speed; ChunkedIdList effective_blobs; + TicketMutex effective_blobs_lock; }; /** diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 520ff171e..55a150f4e 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -18,6 +18,7 @@ #include #include "memory_management.h" +#include "metadata_management_internal.h" #include "buffer_pool.h" #include "buffer_pool_internal.h" #include "buffer_organizer.h" @@ -650,22 +651,31 @@ BufferIdArray GetBufferIdsFromBlobId(Arena *arena, return result; } -void LocalCreateBlobMetadata(MetadataManager *mdm, const std::string &blob_name, - BlobID blob_id, TargetID effective_target) { +void LocalCreateBlobMetadata(SharedMemoryContext *context, MetadataManager *mdm, + const std::string &blob_name, BlobID blob_id, + TargetID effective_target) { LocalPut(mdm, blob_name.c_str(), blob_id.as_int, kMapType_BlobId); BlobInfo blob_info = {}; blob_info.stats.frequency = 1; blob_info.stats.recency = mdm->clock++; blob_info.effective_target = effective_target; + assert(blob_id.bits.node_id == 
(int)effective_target.bits.node_id); + + Target *target = GetTargetFromId(context, effective_target); + BeginTicketMutex(&target->effective_blobs_lock); + AppendToChunkedIdList(mdm, &target->effective_blobs, blob_id.as_int); + EndTicketMutex(&target->effective_blobs_lock); + LocalPut(mdm, blob_id, blob_info); } -void CreateBlobMetadata(MetadataManager *mdm, RpcContext *rpc, +void CreateBlobMetadata(SharedMemoryContext *context, RpcContext *rpc, const std::string &blob_name, BlobID blob_id, TargetID effective_target) { + MetadataManager *mdm = GetMetadataManagerFromContext(context); u32 target_node = GetBlobNodeId(blob_id); if (target_node == rpc->node_id) { - LocalCreateBlobMetadata(mdm, blob_name, blob_id, effective_target); + LocalCreateBlobMetadata(context, mdm, blob_name, blob_id, effective_target); } else { RpcCall(rpc, target_node, "RemoteCreateBlobMetadata", blob_name, blob_id, effective_target); @@ -703,7 +713,7 @@ void AttachBlobToBucket(SharedMemoryContext *context, RpcContext *rpc, blob_id.bits.buffer_ids_offset = AllocateBufferIdList(context, rpc, target_node, buffer_ids); - CreateBlobMetadata(mdm, rpc, internal_name, blob_id, effective_target); + CreateBlobMetadata(context, rpc, internal_name, blob_id, effective_target); AddBlobIdToBucket(mdm, rpc, blob_id, bucket_id); } diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h index 363ba9e3f..18add4dc4 100644 --- a/src/metadata_management_internal.h +++ b/src/metadata_management_internal.h @@ -141,11 +141,13 @@ int LocalGetNumOutstandingFlushingTasks(SharedMemoryContext *context, VBucketID id); int GetNumOutstandingFlushingTasks(SharedMemoryContext *context, RpcContext *rpc, VBucketID id); -void LocalCreateBlobMetadata(MetadataManager *mdm, const std::string &blob_name, - BlobID blob_id, TargetID effective_target); +void LocalCreateBlobMetadata(SharedMemoryContext *context, MetadataManager *mdm, + const std::string &blob_name, BlobID blob_id, + TargetID effective_target); Heap *GetIdHeap(MetadataManager *mdm); Heap *GetMapHeap(MetadataManager *mdm); IdList AllocateIdList(MetadataManager *mdm, u32 length); void FreeIdList(MetadataManager *mdm, IdList id_list); +u32 AppendToChunkedIdList(MetadataManager *mdm, ChunkedIdList *id_list, u64 id); } // namespace hermes #endif // HERMES_METADATA_MANAGEMENT_INTERNAL_H_ diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index 3e419801e..c5bd5e353 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -421,7 +421,8 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, [context](const request &req, const std::string &blob_name, BlobID blob_id, TargetID effective_target) { MetadataManager *mdm = GetMetadataManagerFromContext(context); - LocalCreateBlobMetadata(mdm, blob_name, blob_id, effective_target); + LocalCreateBlobMetadata(context, mdm, blob_name, blob_id, + effective_target); req.respond(true); }; From fcedca20cf05c256cdc2e8aa180aac70dce44a6e Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 22 Jun 2022 16:12:34 -0500 Subject: [PATCH 55/85] [skip ci] Ready to test BlobInfo::effective_target and Target::effective_blobs --- src/buffer_organizer.cc | 2 ++ src/buffer_pool.cc | 3 +-- src/hermes_types.h | 2 ++ src/metadata_management.cc | 16 +++++++---- test/buffer_organizer_test.cc | 6 ----- test/mdm_test.cc | 50 ++++++++++++++++++++++++----------- test/test_utils.h | 6 +++++ 7 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index c00bd272d..acde279b6 100644 
--- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -442,6 +442,7 @@ void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, void LocalEnforceCapacityThresholds(SharedMemoryContext *context, ViolationInfo info) { + (void)context; switch (info.violation) { case ThresholdViolation::kMin: { // while (min is violated) @@ -454,6 +455,7 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, case ThresholdViolation::kMax: { // while (max is violated) // find least important blob in Target + // f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); // Choose largest buffer from least important Blob // Move to lower tier // Ensure info.violation_size has been moved diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc index 26c7cf6ae..0614d2f42 100644 --- a/src/buffer_pool.cc +++ b/src/buffer_pool.cc @@ -1723,9 +1723,8 @@ SwapBlob PutToSwap(SharedMemoryContext *context, RpcContext *rpc, u32 target_node = rpc->node_id; SwapBlob swap_blob = WriteToSwap(context, blob, target_node, bucket_id); std::vector buffer_ids = SwapBlobToVec(swap_blob); - TargetID effective_target = {}; AttachBlobToBucket(context, rpc, name.c_str(), bucket_id, buffer_ids, - effective_target, true); + kSwapTargetId, true); return swap_blob; } diff --git a/src/hermes_types.h b/src/hermes_types.h index 59565d916..2af2d0f32 100644 --- a/src/hermes_types.h +++ b/src/hermes_types.h @@ -150,6 +150,8 @@ union TargetID { u64 as_int; }; +const TargetID kSwapTargetId = {{0, 0, 0}}; + /** * A PlacementSchema is a vector of (size, target) pairs where size is the * number of bytes to buffer and target is the TargetID where to buffer those diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 55a150f4e..8dc5884df 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -27,6 +27,10 @@ namespace hermes { +static bool operator!=(const TargetID &lhs, const TargetID &rhs) { + return lhs.as_int != rhs.as_int; +} + static bool IsNameTooLong(const std::string &name, size_t max) { bool result = false; if (name.size() + 1 >= max) { @@ -659,12 +663,14 @@ void LocalCreateBlobMetadata(SharedMemoryContext *context, MetadataManager *mdm, blob_info.stats.frequency = 1; blob_info.stats.recency = mdm->clock++; blob_info.effective_target = effective_target; - assert(blob_id.bits.node_id == (int)effective_target.bits.node_id); - Target *target = GetTargetFromId(context, effective_target); - BeginTicketMutex(&target->effective_blobs_lock); - AppendToChunkedIdList(mdm, &target->effective_blobs, blob_id.as_int); - EndTicketMutex(&target->effective_blobs_lock); + if (effective_target != kSwapTargetId) { + assert(blob_id.bits.node_id == (int)effective_target.bits.node_id); + Target *target = GetTargetFromId(context, effective_target); + BeginTicketMutex(&target->effective_blobs_lock); + AppendToChunkedIdList(mdm, &target->effective_blobs, blob_id.as_int); + EndTicketMutex(&target->effective_blobs_lock); + } LocalPut(mdm, blob_id, blob_info); } diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 1fe994993..7a831ad5a 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -277,12 +277,6 @@ int main(int argc, char *argv[]) { return 1; } -#define HERMES_ADD_TEST(test_name) \ - if (argc == 1 || std::string(argv[1]) == #test_name) { \ - fprintf(stdout, "### Running %s\n", #test_name); \ - test_name(); \ - } - HERMES_ADD_TEST(TestIsBoFunction); HERMES_ADD_TEST(TestBackgroundFlush); HERMES_ADD_TEST(TestBoMove); diff --git 
a/test/mdm_test.cc b/test/mdm_test.cc index 3706a9070..520e2657a 100644 --- a/test/mdm_test.cc +++ b/test/mdm_test.cc @@ -380,6 +380,25 @@ static void TestMdmViz() { hermes->Finalize(true); } +static void TestEffectiveTarget() { + using namespace hermes; // NOLINT(*) + + hermes::Config config = {}; + hermes::InitDefaultConfig(&config); + config.default_placement_policy = hapi::PlacementPolicy::kRoundRobin; + config.default_rr_split = 0; + HermesPtr hermes = hermes::InitHermesDaemon(&config); + SharedMemoryContext *context = &hermes->context_; + MetadataManager *mdm = GetMetadataManagerFromContext(context); + + hermes::RoundRobinState rr_state; + size_t num_devices = rr_state.GetNumDevices(); + + // TODO(chogan): + + hermes->Finalize(true); +} + int main(int argc, char **argv) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -390,24 +409,25 @@ int main(int argc, char **argv) { HermesPtr hermes = hapi::InitHermes(NULL, true); - TestNullIds(); - TestGetMapMutex(); - TestLocalGetNextFreeBucketId(hermes); - TestGetOrCreateBucketId(hermes); - TestRenameBlob(hermes); - TestRenameBucket(hermes); - TestBucketRefCounting(hermes); - TestMaxNameLength(hermes); - TestGetRelativeNodeId(); - TestDuplicateBlobNames(hermes); - TestGetBucketIdFromBlobId(hermes); - TestHexStringToU64(); + HERMES_ADD_TEST(TestNullIds); + HERMES_ADD_TEST(TestGetMapMutex); + HERMES_ADD_TEST(TestLocalGetNextFreeBucketId, hermes); + HERMES_ADD_TEST(TestGetOrCreateBucketId, hermes); + HERMES_ADD_TEST(TestRenameBlob, hermes); + HERMES_ADD_TEST(TestRenameBucket, hermes); + HERMES_ADD_TEST(TestBucketRefCounting, hermes); + HERMES_ADD_TEST(TestMaxNameLength, hermes); + HERMES_ADD_TEST(TestGetRelativeNodeId); + HERMES_ADD_TEST(TestDuplicateBlobNames, hermes); + HERMES_ADD_TEST(TestGetBucketIdFromBlobId, hermes); + HERMES_ADD_TEST(TestHexStringToU64); hermes->Finalize(true); - TestSwapBlobsExistInBucket(); - TestBlobInfoMap(); - TestMdmViz(); + HERMES_ADD_TEST(TestSwapBlobsExistInBucket); + HERMES_ADD_TEST(TestBlobInfoMap); + HERMES_ADD_TEST(TestMdmViz); + HERMES_ADD_TEST(TestEffectiveTarget); MPI_Finalize(); diff --git a/test/test_utils.h b/test/test_utils.h index 81d078f33..8a2ea063e 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -24,6 +24,12 @@ #include "hermes_types.h" #include "bucket.h" +#define HERMES_ADD_TEST(test_name, ...) 
\ + if (argc == 1 || std::string(argv[1]) == #test_name) { \ + fprintf(stdout, "### Running %s\n", #test_name); \ + test_name(__VA_ARGS__); \ + } + namespace hermes { namespace testing { From 3bad02520603c67811ac3dc42f4d21ba6c001601 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 23 Jun 2022 15:22:17 -0500 Subject: [PATCH 56/85] [skip ci] Add effective_target test and fix BlobInfo copy assignment operator --- src/metadata_management.cc | 2 +- src/metadata_management.h | 3 +++ test/mdm_test.cc | 23 ++++++++++++++++++++--- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 8dc5884df..6a5cd78a9 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -27,7 +27,7 @@ namespace hermes { -static bool operator!=(const TargetID &lhs, const TargetID &rhs) { +bool operator!=(const TargetID &lhs, const TargetID &rhs) { return lhs.as_int != rhs.as_int; } diff --git a/src/metadata_management.h b/src/metadata_management.h index 2d85387a4..102853001 100644 --- a/src/metadata_management.h +++ b/src/metadata_management.h @@ -110,6 +110,9 @@ struct BlobInfo { stats = other.stats; lock.ticket.store(other.lock.ticket.load()); lock.serving.store(other.lock.serving.load()); + effective_target = other.effective_target; + last = other.last; + stop = other.stop; return *this; } diff --git a/test/mdm_test.cc b/test/mdm_test.cc index 520e2657a..0ca960f3c 100644 --- a/test/mdm_test.cc +++ b/test/mdm_test.cc @@ -18,6 +18,7 @@ #include "bucket.h" #include "vbucket.h" #include "metadata_management_internal.h" +#include "metadata_storage.h" #include "test_utils.h" using namespace hermes; // NOLINT(*) @@ -388,13 +389,29 @@ static void TestEffectiveTarget() { config.default_placement_policy = hapi::PlacementPolicy::kRoundRobin; config.default_rr_split = 0; HermesPtr hermes = hermes::InitHermesDaemon(&config); + + hermes::RoundRobinState rr_state; + // size_t num_devices = rr_state.GetNumDevices(); + rr_state.SetCurrentDeviceIndex(0); + + std::string bucket_name(__func__); + hapi::Bucket bucket(bucket_name, hermes); + hapi::Blob data(4 * 1024, 'z'); + std::string blob_name("1"); + Assert(bucket.Put(blob_name, data).Succeeded()); + SharedMemoryContext *context = &hermes->context_; + RpcContext *rpc = &hermes->rpc_; MetadataManager *mdm = GetMetadataManagerFromContext(context); - hermes::RoundRobinState rr_state; - size_t num_devices = rr_state.GetNumDevices(); + BucketID bucket_id = GetBucketId(context, rpc, bucket_name.c_str()); + BlobID blob_id = GetBlobId(context, rpc, blob_name, bucket_id, false); + BlobInfo *info = GetBlobInfoPtr(mdm, blob_id); + TargetID expected_target_id = {{1, 0, 0}}; + Assert(info->effective_target.as_int == expected_target_id.as_int); + ReleaseBlobInfoPtr(mdm); - // TODO(chogan): + bucket.Destroy(); hermes->Finalize(true); } From 9dd1ddb25c654b2d1ebcb9a79dd57d0039c2a13b Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 23 Jun 2022 15:52:05 -0500 Subject: [PATCH 57/85] [skip ci] add Target::effective_blobs test --- src/metadata_storage.h | 4 ++++ test/mdm_test.cc | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/metadata_storage.h b/src/metadata_storage.h index 9503c4eab..e78cf97ce 100644 --- a/src/metadata_storage.h +++ b/src/metadata_storage.h @@ -88,6 +88,10 @@ BlobInfo *GetBlobInfoPtr(MetadataManager *mdm, BlobID blob_id); */ void ReleaseBlobInfoPtr(MetadataManager *mdm); +template +u64 *GetIdsPtr(MetadataManager *mdm, T id_list); +void ReleaseIdsPtr(MetadataManager *mdm); + } 
// namespace hermes #endif // HERMES_METADATA_STORAGE_H_ diff --git a/test/mdm_test.cc b/test/mdm_test.cc index 0ca960f3c..0b2444145 100644 --- a/test/mdm_test.cc +++ b/test/mdm_test.cc @@ -17,6 +17,7 @@ #include "hermes.h" #include "bucket.h" #include "vbucket.h" +#include "buffer_pool_internal.h" #include "metadata_management_internal.h" #include "metadata_storage.h" #include "test_utils.h" @@ -404,6 +405,7 @@ static void TestEffectiveTarget() { RpcContext *rpc = &hermes->rpc_; MetadataManager *mdm = GetMetadataManagerFromContext(context); + // Check BlobInfo::effective_target BucketID bucket_id = GetBucketId(context, rpc, bucket_name.c_str()); BlobID blob_id = GetBlobId(context, rpc, blob_name, bucket_id, false); BlobInfo *info = GetBlobInfoPtr(mdm, blob_id); @@ -411,6 +413,15 @@ static void TestEffectiveTarget() { Assert(info->effective_target.as_int == expected_target_id.as_int); ReleaseBlobInfoPtr(mdm); + // Check Target::effective_blobs + Target *ram_target = GetTarget(context, 0); + Assert(ram_target->effective_blobs.length == 1); + u64 *ids = GetIdsPtr(mdm, ram_target->effective_blobs); + BlobID effective_blob_id = {}; + effective_blob_id.as_int = ids[0]; + Assert(effective_blob_id.as_int == blob_id.as_int); + ReleaseIdsPtr(mdm); + bucket.Destroy(); hermes->Finalize(true); From ce0f51329e400fd4757893a9f503f605b02e392d Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 24 Jun 2022 11:35:28 -0500 Subject: [PATCH 58/85] EnforceCapacityThreshold skeleton implementation done --- src/buffer_organizer.cc | 87 ++++++++++++++++++++++++++++++---- src/buffer_organizer.h | 2 +- src/buffer_pool.cc | 2 +- src/hermes_types.h | 2 +- src/metadata_storage.h | 5 ++ src/metadata_storage_stb_ds.cc | 18 +++++++ src/rpc_thallium.cc | 6 +-- 7 files changed, 107 insertions(+), 15 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index acde279b6..41fc2d54c 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -16,6 +16,7 @@ #include "hermes.h" #include "buffer_organizer.h" +#include "metadata_storage.h" #include "data_placement_engine.h" namespace hermes { @@ -434,15 +435,16 @@ void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, ViolationInfo info) { u32 target_node = info.target_id.bits.node_id; if (target_node == rpc->node_id) { - LocalEnforceCapacityThresholds(context, info); + LocalEnforceCapacityThresholds(context, rpc, info); } else { RpcCall(rpc, target_node, "RemoteEnforceCapacityThresholds", info); } } void LocalEnforceCapacityThresholds(SharedMemoryContext *context, - ViolationInfo info) { - (void)context; + RpcContext *rpc, ViolationInfo info) { + MetadataManager *mdm = GetMetadataManagerFromContext(context); + switch (info.violation) { case ThresholdViolation::kMin: { // while (min is violated) @@ -453,12 +455,79 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, break; } case ThresholdViolation::kMax: { - // while (max is violated) - // find least important blob in Target - // f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); - // Choose largest buffer from least important Blob - // Move to lower tier - // Ensure info.violation_size has been moved + Target *target = GetTargetFromId(context, info.target_id); + + f32 min_importance = FLT_MAX; + BlobID least_important_blob = {}; + + BeginTicketMutex(&target->effective_blobs_lock); + std::vector blob_ids = GetChunkedIdList(mdm, + target->effective_blobs); + EndTicketMutex(&target->effective_blobs_lock); + + // Find least important blob in 
violated Target + for (size_t i = 0; i < blob_ids.size(); ++i) { + BlobID blob_id = {}; + blob_id.as_int = blob_ids[i]; + f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); + if (importance_score < min_importance) { + min_importance = importance_score; + least_important_blob = blob_id; + } + } + + assert(!IsNullBlobId(least_important_blob)); + + std::vector all_buffer_ids = + LocalGetBufferIdList(mdm, least_important_blob); + std::vector buffer_ids_in_target; + // Filter out BufferIDs not in this Target + for (size_t i = 0; i < all_buffer_ids.size(); ++i) { + BufferHeader *header = GetHeaderByBufferId(context, all_buffer_ids[i]); + DeviceID device_id = header->device_id; + if (device_id == info.target_id.bits.device_id) { + // TODO(chogan): Needs to change when we support num_devices != + // num_targets + buffer_ids_in_target.push_back(all_buffer_ids[i]); + } + } + + std::vector buffer_info = + GetBufferInfo(context, rpc, buffer_ids_in_target); + auto buffer_info_comparator = [](const BufferInfo &lhs, + const BufferInfo &rhs) { + return lhs.size > rhs.size; + }; + // Sort in descending order + std::sort(buffer_info.begin(), buffer_info.end(), buffer_info_comparator); + + size_t bytes_moved = 0; + std::vector buffers_to_move; + size_t index = 0; + // Choose largest buffer until we've moved info.violation_size + while (bytes_moved < info.violation_size) { + buffers_to_move.push_back(buffer_info[index]); + bytes_moved += buffer_info[index].size; + index++; + } + // TODO(chogan): which target? + TargetID target_dest = {}; + // TODO(chogan): combine src buffers into dest (need slab size info) + PlacementSchema schema; + schema.push_back(std::pair(bytes_moved, target_dest)); + std::vector dests = GetBuffers(context, schema); + BoMoveList moves; + // TODO(chogan): + // moves.push_back(std::pair(src, dest)); + + // Queue BO task to move to lower tier + BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, + least_important_blob); + std::string blob_name = + LocalGetBlobNameFromId(context, least_important_blob); + std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); + EnqueueBoMove(rpc, moves, least_important_blob, bucket_id, internal_name, + BoPriority::kLow); break; } default: { diff --git a/src/buffer_organizer.h b/src/buffer_organizer.h index 87178c7b4..968adf19d 100644 --- a/src/buffer_organizer.h +++ b/src/buffer_organizer.h @@ -116,7 +116,7 @@ void EnqueueBoMove(RpcContext *rpc, const BoMoveList &moves, BlobID blob_id, void EnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, ViolationInfo info); void LocalEnforceCapacityThresholds(SharedMemoryContext *context, - ViolationInfo info); + RpcContext *rpc, ViolationInfo info); } // namespace hermes #endif // HERMES_BUFFER_ORGANIZER_H_ diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc index 26c7cf6ae..0614d2f42 100644... wait
a/src/metadata_storage.h b/src/metadata_storage.h index e78cf97ce..9f54dee3e 100644 --- a/src/metadata_storage.h +++ b/src/metadata_storage.h @@ -92,6 +92,11 @@ template u64 *GetIdsPtr(MetadataManager *mdm, T id_list); void ReleaseIdsPtr(MetadataManager *mdm); +/** + * + */ +std::vector GetChunkedIdList(MetadataManager *mdm, ChunkedIdList id_list); + } // namespace hermes #endif // HERMES_METADATA_STORAGE_H_ diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc index cb328f36f..f043353af 100644 --- a/src/metadata_storage_stb_ds.cc +++ b/src/metadata_storage_stb_ds.cc @@ -359,6 +359,24 @@ u32 AppendToChunkedIdList(MetadataManager *mdm, ChunkedIdList *id_list, return result; } +/** + * Assumes the caller has protected @p id_list with a lock. + * + * @return A vector of the IDs. + */ +std::vector GetChunkedIdList(MetadataManager *mdm, ChunkedIdList id_list) { + std::vector result(id_list.length); + if (id_list.length > 0) { + u64 *head = GetIdsPtr(mdm, id_list); + for (u32 i = 0; i < id_list.length; ++i) { + result[i] = head[i]; + } + ReleaseIdsPtr(mdm); + } + + return result; +} + u64 GetChunkedIdListElement(MetadataManager *mdm, ChunkedIdList *id_list, u32 index) { u64 result = 0; diff --git a/src/rpc_thallium.cc b/src/rpc_thallium.cc index c5bd5e353..0edc3af61 100644 --- a/src/rpc_thallium.cc +++ b/src/rpc_thallium.cc @@ -435,9 +435,9 @@ void ThalliumStartRpcServer(SharedMemoryContext *context, RpcContext *rpc, req.respond(true); }; - auto rpc_enforce_capacity_thresholds = [context](const request &req, - ViolationInfo info) { - LocalEnforceCapacityThresholds(context, info); + auto rpc_enforce_capacity_thresholds = [context, rpc](const request &req, + ViolationInfo info) { + LocalEnforceCapacityThresholds(context, rpc, info); // TODO(chogan): Can this be async? 
req.respond(true); }; From 62e01e9d11398428633fafea84276ef9b6d3e3d3 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Tue, 28 Jun 2022 08:13:46 -0500 Subject: [PATCH 59/85] [skip ci] Keep old behavior of DPE test --- test/dpe_optimization_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/dpe_optimization_test.cc b/test/dpe_optimization_test.cc index 5ae476af4..b650b9e5d 100644 --- a/test/dpe_optimization_test.cc +++ b/test/dpe_optimization_test.cc @@ -30,10 +30,12 @@ void MinimizeIoTimePlaceBlob(std::vector &blob_sizes, << blob_sizes[0] << " to targets\n" << std::flush; std::vector targets = testing::GetDefaultTargets(node_state.num_devices); + api::Context ctx; + ctx.minimize_io_time_options = api::MinimizeIoTimeOptions(0, 0, true); Status result = MinimizeIoTimePlacement(blob_sizes, node_state.bytes_available, node_state.bandwidth, targets, - schemas_tmp); + schemas_tmp, ctx); if (result.Failed()) { std::cout << "\nMinimizeIoTimePlacement failed\n" << std::flush; exit(1); From 90a62f1d0aefa1741ded56bdedd3792a300a84bb Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 29 Jun 2022 13:00:17 -0500 Subject: [PATCH 60/85] Remove unnecessary #undef --- test/buffer_organizer_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 7a831ad5a..9d486ee93 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -283,8 +283,6 @@ int main(int argc, char *argv[]) { HERMES_ADD_TEST(TestOrganizeBlob); HERMES_ADD_TEST(TestWriteOnlyBucket); -#undef HERMES_ADD_TEST - MPI_Finalize(); return 0; From 80936f531b85658b187e12ccec1f82d2c31dfe99 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 29 Jun 2022 16:17:41 -0500 Subject: [PATCH 61/85] [skip ci] WIP: selecting Targets and Buffers to move --- src/buffer_organizer.cc | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 41fc2d54c..1c953e4a6 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -510,15 +510,29 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, bytes_moved += buffer_info[index].size; index++; } - // TODO(chogan): which target? - TargetID target_dest = {}; + + + // TODO(chogan): Allow sorting Targets by any metric. This implementation + // only works if the Targets are listed in the configuration in order of + // decreasing bandwidth. 
+ + // Select Target 1 Tier lower than violated Target + u16 target_index = info.target_id.bits.index + 1; + assert(target_index < mdm->node_targets.length); + TargetID target_dest = { + info.target_id.bits.node_id, target_index, target_index + }; + + // TODO(chogan): + // for (i in buffers_to_move) { + + // } // TODO(chogan): combine src buffers into dest (need slab size info) PlacementSchema schema; schema.push_back(std::pair(bytes_moved, target_dest)); std::vector dests = GetBuffers(context, schema); BoMoveList moves; - // TODO(chogan): - // moves.push_back(std::pair(src, dest)); + moves.push_back(std::pair(src, dests)); // Queue BO task to move to lower tier BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, From cf1d10bfb1049366e7b70a6f83de15f7783e0e8a Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 30 Jun 2022 08:39:38 -0500 Subject: [PATCH 62/85] Initial BORG implementation complete --- src/buffer_organizer.cc | 67 +++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 1c953e4a6..1895478c7 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -511,37 +511,44 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, index++; } - - // TODO(chogan): Allow sorting Targets by any metric. This implementation - // only works if the Targets are listed in the configuration in order of - // decreasing bandwidth. - - // Select Target 1 Tier lower than violated Target - u16 target_index = info.target_id.bits.index + 1; - assert(target_index < mdm->node_targets.length); - TargetID target_dest = { - info.target_id.bits.node_id, target_index, target_index - }; - - // TODO(chogan): - // for (i in buffers_to_move) { - - // } - // TODO(chogan): combine src buffers into dest (need slab size info) - PlacementSchema schema; - schema.push_back(std::pair(bytes_moved, target_dest)); - std::vector dests = GetBuffers(context, schema); BoMoveList moves; - moves.push_back(std::pair(src, dests)); - - // Queue BO task to move to lower tier - BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, - least_important_blob); - std::string blob_name = - LocalGetBlobNameFromId(context, least_important_blob); - std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); - EnqueueBoMove(rpc, moves, least_important_blob, bucket_id, internal_name, - BoPriority::kLow); + for (size_t i = 0; i < buffers_to_move.size(); ++i) { + // TODO(chogan): Allow sorting Targets by any metric. This + // implementation only works if the Targets are listed in the + // configuration in order of decreasing bandwidth. 
+ for (u16 target_index = info.target_id.bits.index + 1; + target_index < mdm->node_targets.length; + ++target_index) { + // Select Target 1 Tier lower than violated Target + TargetID target_dest = { + info.target_id.bits.node_id, target_index, target_index + }; + + // TODO(chogan): combine src buffers into dest (need slab size info) + PlacementSchema schema; + schema.push_back(std::pair(bytes_moved, + target_dest)); + std::vector dests = GetBuffers(context, schema); + if (dests.size() == 0) { + continue; + } + moves.push_back(std::pair(buffers_to_move[i].id, dests)); + } + } + + if (moves.size() > 0) { + // Queue BO task to move to lower tier + BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, + least_important_blob); + std::string blob_name = + LocalGetBlobNameFromId(context, least_important_blob); + std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); + EnqueueBoMove(rpc, moves, least_important_blob, bucket_id, + internal_name, BoPriority::kLow); + } else { + LOG(WARNING) + << "BufferOrganizer: No capacity available in lower Targets.\n"; + } break; } default: { From a6becfeb6887100308abc7693b959718efc5d691 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 30 Jun 2022 15:19:26 -0500 Subject: [PATCH 63/85] Fix BORG bugs and add mixed workload test --- src/buffer_organizer.cc | 9 ++++--- src/metadata_management.cc | 11 ++++---- test/buffer_organizer_test.cc | 49 +++++++++++++++++++++++++++++++++++ test/data/hermes.conf | 2 -- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 1895478c7..6b7af6865 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -512,6 +512,8 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, } BoMoveList moves; + // TODO(chogan): Combine multiple smaller buffers into fewer larger + // buffers for (size_t i = 0; i < buffers_to_move.size(); ++i) { // TODO(chogan): Allow sorting Targets by any metric. 
This // implementation only works if the Targets are listed in the @@ -524,15 +526,14 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, info.target_id.bits.node_id, target_index, target_index }; - // TODO(chogan): combine src buffers into dest (need slab size info) PlacementSchema schema; schema.push_back(std::pair(bytes_moved, target_dest)); std::vector dests = GetBuffers(context, schema); - if (dests.size() == 0) { - continue; + if (dests.size() != 0) { + moves.push_back(std::pair(buffers_to_move[i].id, dests)); + break; } - moves.push_back(std::pair(buffers_to_move[i].id, dests)); } } diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 6a5cd78a9..3a835c777 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -1011,17 +1011,16 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, ViolationInfo info = {}; float percentage_violation = 0.0f; + f32 percentage_used = 1.0f - percentage_available; - if (percentage_available > - state->bo_capacity_thresholds[device_idx].max) { + if (percentage_used > state->bo_capacity_thresholds[device_idx].max) { percentage_violation = - percentage_available - state->bo_capacity_thresholds[device_idx].max; + percentage_used - state->bo_capacity_thresholds[device_idx].max; info.violation = ThresholdViolation::kMax; } - if (percentage_available < - state->bo_capacity_thresholds[device_idx].min) { + if (percentage_used < state->bo_capacity_thresholds[device_idx].min) { percentage_violation = - state->bo_capacity_thresholds[device_idx].max - percentage_available; + state->bo_capacity_thresholds[device_idx].max - percentage_used; info.violation = ThresholdViolation::kMin; } diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 9d486ee93..22b6cbca3 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -25,6 +25,7 @@ namespace hapi = hermes::api; using HermesPtr = std::shared_ptr; using hermes::u8; using hermes::f32; +using hermes::u64; using hermes::SharedMemoryContext; using hermes::RpcContext; using hermes::BoTask; @@ -269,6 +270,53 @@ static void TestWriteOnlyBucket() { hermes->Finalize(true); } +void TestMixedWorkload() { + hermes::Config config = {}; + InitDefaultConfig(&config); + + size_t cap = MEGABYTES(1); + config.capacities[0] = cap; + config.capacities[1] = cap; + config.capacities[2] = cap; + config.capacities[3] = cap; + + for (int i = 0; i < config.num_devices; ++i) { + config.num_slabs[i] = 1; + config.desired_slab_percentages[i][0] = 1.0; + } + + f32 min = 0.25f; + f32 max = 0.75f; + config.bo_capacity_thresholds[0] = {min, max}; + config.bo_capacity_thresholds[1] = {min, max}; + config.bo_capacity_thresholds[2] = {min, max}; + + HermesPtr hermes = hermes::InitHermesDaemon(&config); + + hermes::RoundRobinState rr_state; + rr_state.SetCurrentDeviceIndex(1); + hapi::Context ctx; + ctx.policy = hapi::PlacementPolicy::kRoundRobin; + Bucket bkt(__func__, hermes, ctx); + // Exceed maximum capacity of Target 1 by 4KiB + const size_t kBlobSize = (max * MEGABYTES(1)) + KILOBYTES(4); + hapi::Blob blob(kBlobSize, 'q'); + Assert(bkt.Put("1", blob).Succeeded()); + + // Let the BORG run. 
It should move 4KiB from Target 1 to 2 + std::this_thread::sleep_for(std::chrono::seconds(2)); + + // Check remaining capacities + std::vector targets = {{1, 1, 1}, {1, 2, 2}}; + std::vector capacities = + GetRemainingTargetCapacities(&hermes->context_, &hermes->rpc_, targets); + Assert(capacities[0] == cap - kBlobSize + KILOBYTES(4)); + Assert(capacities[1] == cap - KILOBYTES(4)); + + bkt.Destroy(); + hermes->Finalize(true); +} + int main(int argc, char *argv[]) { int mpi_threads_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); @@ -282,6 +330,7 @@ int main(int argc, char *argv[]) { HERMES_ADD_TEST(TestBoMove); HERMES_ADD_TEST(TestOrganizeBlob); HERMES_ADD_TEST(TestWriteOnlyBucket); + HERMES_ADD_TEST(TestMixedWorkload); MPI_Finalize(); diff --git a/test/data/hermes.conf b/test/data/hermes.conf index 42f0baab1..5bde5ca8d 100644 --- a/test/data/hermes.conf +++ b/test/data/hermes.conf @@ -1,7 +1,5 @@ # Example Hermes configuration file -# TODO(chogan): Allow specifying capacity values in bytes, KiB, or GiB. - # The number of buffering tiers available. For example, RAM, NVMe, burst # buffer, and parallel file system would be 4 tiers. num_devices = 4; From cd1d8ed15f1bc89bed5106e9628a250c5c249f91 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 6 Jul 2022 08:02:14 -0500 Subject: [PATCH 64/85] Handle kMin case in EnforceCapacityThresholds --- adapter/test/pubsub/pubsub_topic_test.cc | 4 + src/buffer_organizer.cc | 95 ++++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/adapter/test/pubsub/pubsub_topic_test.cc b/adapter/test/pubsub/pubsub_topic_test.cc index ad818dc65..aa8de63ac 100644 --- a/adapter/test/pubsub/pubsub_topic_test.cc +++ b/adapter/test/pubsub/pubsub_topic_test.cc @@ -34,4 +34,8 @@ int main(int argc, char **argv) { auto disconnect_ret = hermes::pubsub::disconnect(); Assert(disconnect_ret.Succeeded()); + + MPI_Finalize(); + + return 0; } diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 6b7af6865..515d2c342 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -447,13 +447,108 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, switch (info.violation) { case ThresholdViolation::kMin: { + // TODO(chogan): // while (min is violated) // find most important blob in Target::effective_blobs // Choose largest buffer from most important Blob // Move to higher tier // Ensure info.violation_size has been moved + Target *target = GetTargetFromId(context, info.target_id); + + f32 max_importance = FLT_MIN; + BlobID most_important_blob = {}; + + BeginTicketMutex(&target->effective_blobs_lock); + std::vector blob_ids = GetChunkedIdList(mdm, + target->effective_blobs); + EndTicketMutex(&target->effective_blobs_lock); + + // Find most important blob in violated Target + for (size_t i = 0; i < blob_ids.size(); ++i) { + BlobID blob_id = {}; + blob_id.as_int = blob_ids[i]; + f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); + if (importance_score > max_importance) { + max_importance = importance_score; + most_important_blob = blob_id; + } + } + + assert(!IsNullBlobId(most_important_blob)); + + std::vector all_buffer_ids = + LocalGetBufferIdList(mdm, most_important_blob); + std::vector buffer_ids_in_target; + // Filter out BufferIDs not in this Target + for (size_t i = 0; i < all_buffer_ids.size(); ++i) { + BufferHeader *header = GetHeaderByBufferId(context, all_buffer_ids[i]); + DeviceID device_id = header->device_id; + if (device_id == info.target_id.bits.device_id) { + // 
TODO(chogan): Needs to changes when we support num_devices != + // num_targets + buffer_ids_in_target.push_back(all_buffer_ids[i]); + } + } + + std::vector buffer_info = + GetBufferInfo(context, rpc, buffer_ids_in_target); + auto buffer_info_comparator = [](const BufferInfo &lhs, + const BufferInfo &rhs) { + return lhs.size > rhs.size; + }; + // Sort in descending order + std::sort(buffer_info.begin(), buffer_info.end(), buffer_info_comparator); + + size_t bytes_moved = 0; + std::vector buffers_to_move; + size_t index = 0; + // Choose largest buffer until we've moved info.violation_size + while (bytes_moved < info.violation_size) { + buffers_to_move.push_back(buffer_info[index]); + bytes_moved += buffer_info[index].size; + index++; + } + + BoMoveList moves; + for (size_t i = 0; i < buffers_to_move.size(); ++i) { + // TODO(chogan): Allow sorting Targets by any metric. This + // implementation only works if the Targets are listed in the + // configuration in order of decreasing bandwidth. + for (int target_index = (int)info.target_id.bits.index - 1; + target_index >= 0; + --target_index) { + // Select Target 1 Tier lower than violated Target + TargetID target_dest = { + info.target_id.bits.node_id, (u16)target_index, (u16)target_index + }; + + PlacementSchema schema; + schema.push_back(std::pair(bytes_moved, + target_dest)); + std::vector dests = GetBuffers(context, schema); + if (dests.size() != 0) { + moves.push_back(std::pair(buffers_to_move[i].id, dests)); + break; + } + } + } + + if (moves.size() > 0) { + // Queue BO task to move to lower tier + BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, + most_important_blob); + std::string blob_name = + LocalGetBlobNameFromId(context, most_important_blob); + std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); + EnqueueBoMove(rpc, moves, most_important_blob, bucket_id, + internal_name, BoPriority::kLow); + } else { + LOG(WARNING) + << "BufferOrganizer: No capacity available in higher Targets.\n"; + } break; } + case ThresholdViolation::kMax: { Target *target = GetTargetFromId(context, info.target_id); From e5b85cac4c404389ea880d1fc159e8df01b4f194 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 6 Jul 2022 11:15:22 -0500 Subject: [PATCH 65/85] BORG min violation working with test --- src/buffer_organizer.cc | 173 +++++++++++++++++----------------- src/metadata_management.cc | 68 ++++++------- test/buffer_organizer_test.cc | 54 ++++++++++- 3 files changed, 174 insertions(+), 121 deletions(-) diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc index 515d2c342..f9a612eae 100644 --- a/src/buffer_organizer.cc +++ b/src/buffer_organizer.cc @@ -445,106 +445,109 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context, RpcContext *rpc, ViolationInfo info) { MetadataManager *mdm = GetMetadataManagerFromContext(context); + // TODO(chogan): Factor out the common code in the kMin and kMax cases switch (info.violation) { case ThresholdViolation::kMin: { - // TODO(chogan): - // while (min is violated) - // find most important blob in Target::effective_blobs - // Choose largest buffer from most important Blob - // Move to higher tier - // Ensure info.violation_size has been moved - Target *target = GetTargetFromId(context, info.target_id); - - f32 max_importance = FLT_MIN; - BlobID most_important_blob = {}; - - BeginTicketMutex(&target->effective_blobs_lock); - std::vector blob_ids = GetChunkedIdList(mdm, - target->effective_blobs); - EndTicketMutex(&target->effective_blobs_lock); - - // Find most important 
blob in violated Target - for (size_t i = 0; i < blob_ids.size(); ++i) { - BlobID blob_id = {}; - blob_id.as_int = blob_ids[i]; - f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); - if (importance_score > max_importance) { - max_importance = importance_score; - most_important_blob = blob_id; + // TODO(chogan): Allow sorting Targets by any metric. This + // implementation only works if the Targets are listed in the + // configuration in order of decreasing bandwidth. + for (u16 target_index = mdm->node_targets.length - 1; + target_index != info.target_id.bits.index; + --target_index) { + TargetID src_target_id = { + info.target_id.bits.node_id, target_index, target_index + }; + + Target *src_target = GetTargetFromId(context, src_target_id); + BeginTicketMutex(&src_target->effective_blobs_lock); + std::vector blob_ids = + GetChunkedIdList(mdm, src_target->effective_blobs); + EndTicketMutex(&src_target->effective_blobs_lock); + + f32 max_importance = -FLT_MAX; + BlobID most_important_blob = {}; + + // Find most important blob in source Target + for (size_t i = 0; i < blob_ids.size(); ++i) { + BlobID blob_id = {}; + blob_id.as_int = blob_ids[i]; + f32 importance_score = LocalGetBlobImportanceScore(context, blob_id); + if (importance_score > max_importance) { + max_importance = importance_score; + most_important_blob = blob_id; + } } - } - assert(!IsNullBlobId(most_important_blob)); - - std::vector all_buffer_ids = - LocalGetBufferIdList(mdm, most_important_blob); - std::vector buffer_ids_in_target; - // Filter out BufferIDs not in this Target - for (size_t i = 0; i < all_buffer_ids.size(); ++i) { - BufferHeader *header = GetHeaderByBufferId(context, all_buffer_ids[i]); - DeviceID device_id = header->device_id; - if (device_id == info.target_id.bits.device_id) { - // TODO(chogan): Needs to changes when we support num_devices != - // num_targets - buffer_ids_in_target.push_back(all_buffer_ids[i]); + if (IsNullBlobId(most_important_blob)) { + continue; } - } - - std::vector buffer_info = - GetBufferInfo(context, rpc, buffer_ids_in_target); - auto buffer_info_comparator = [](const BufferInfo &lhs, - const BufferInfo &rhs) { - return lhs.size > rhs.size; - }; - // Sort in descending order - std::sort(buffer_info.begin(), buffer_info.end(), buffer_info_comparator); - size_t bytes_moved = 0; - std::vector buffers_to_move; - size_t index = 0; - // Choose largest buffer until we've moved info.violation_size - while (bytes_moved < info.violation_size) { - buffers_to_move.push_back(buffer_info[index]); - bytes_moved += buffer_info[index].size; - index++; - } + std::vector all_buffer_ids = + LocalGetBufferIdList(mdm, most_important_blob); + std::vector buffer_ids_in_target; + // Filter out BufferIDs not in the Target + for (size_t i = 0; i < all_buffer_ids.size(); ++i) { + BufferHeader *header = GetHeaderByBufferId(context, + all_buffer_ids[i]); + DeviceID device_id = header->device_id; + if (device_id == src_target_id.bits.device_id) { + // TODO(chogan): Needs to changes when we support num_devices != + // num_targets + buffer_ids_in_target.push_back(all_buffer_ids[i]); + } + } - BoMoveList moves; - for (size_t i = 0; i < buffers_to_move.size(); ++i) { - // TODO(chogan): Allow sorting Targets by any metric. This - // implementation only works if the Targets are listed in the - // configuration in order of decreasing bandwidth. 
- for (int target_index = (int)info.target_id.bits.index - 1; - target_index >= 0; - --target_index) { - // Select Target 1 Tier lower than violated Target - TargetID target_dest = { - info.target_id.bits.node_id, (u16)target_index, (u16)target_index - }; + std::vector buffer_info = + GetBufferInfo(context, rpc, buffer_ids_in_target); + auto buffer_info_comparator = [](const BufferInfo &lhs, + const BufferInfo &rhs) { + return lhs.size > rhs.size; + }; + // Sort in descending order + std::sort(buffer_info.begin(), buffer_info.end(), + buffer_info_comparator); + + size_t bytes_moved = 0; + std::vector buffers_to_move; + size_t index = 0; + size_t num_buffers = buffer_info.size(); + + if (num_buffers > 0) { + // Choose largest buffer until we've moved info.violation_size or we + // run out of buffers + while (index < num_buffers && bytes_moved < info.violation_size) { + buffers_to_move.push_back(buffer_info[index]); + bytes_moved += buffer_info[index].size; + index++; + } + } + BoMoveList moves; + for (size_t i = 0; i < buffers_to_move.size(); ++i) { PlacementSchema schema; - schema.push_back(std::pair(bytes_moved, - target_dest)); + schema.push_back(std::pair(buffers_to_move[i].size, + info.target_id)); std::vector dests = GetBuffers(context, schema); if (dests.size() != 0) { moves.push_back(std::pair(buffers_to_move[i].id, dests)); - break; } } - } - if (moves.size() > 0) { - // Queue BO task to move to lower tier - BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, - most_important_blob); - std::string blob_name = - LocalGetBlobNameFromId(context, most_important_blob); - std::string internal_name = MakeInternalBlobName(blob_name, bucket_id); - EnqueueBoMove(rpc, moves, most_important_blob, bucket_id, - internal_name, BoPriority::kLow); - } else { - LOG(WARNING) - << "BufferOrganizer: No capacity available in higher Targets.\n"; + if (moves.size() > 0) { + // Queue BO task to move to lower tier + BucketID bucket_id = GetBucketIdFromBlobId(context, rpc, + most_important_blob); + std::string blob_name = + LocalGetBlobNameFromId(context, most_important_blob); + std::string internal_name = MakeInternalBlobName(blob_name, + bucket_id); + EnqueueBoMove(rpc, moves, most_important_blob, bucket_id, + internal_name, BoPriority::kLow); + } + + if (bytes_moved >= info.violation_size) { + break; + } } break; } diff --git a/src/metadata_management.cc b/src/metadata_management.cc index 3a835c777..846e542d8 100644 --- a/src/metadata_management.cc +++ b/src/metadata_management.cc @@ -995,48 +995,48 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id, std::vector result; for (size_t device_idx = 0; device_idx < adjustments.size(); ++device_idx) { GlobalSystemViewState *state = GetGlobalSystemViewState(context); + u32 target_idx = ((node_id - 1) * adjustments.size()) + device_idx; if (adjustments[device_idx]) { - u32 target_idx = ((node_id - 1) * adjustments.size()) + device_idx; state->bytes_available[target_idx].fetch_add(adjustments[device_idx]); DLOG(INFO) << "DeviceID " << device_idx << " on node " << node_id << " adjusted by " << adjustments[device_idx] << " bytes\n"; + } - // Collect devices for which to trigger the BufferOrganizer if the - // capacities are beyond the min/max thresholds - float percentage_available = 0.0f; - if (state->bytes_available[target_idx] > 0) { - percentage_available = ((f32)state->bytes_available[target_idx].load() / - (f32)state->capacities[device_idx]); - } + // Collect devices for which to trigger the BufferOrganizer if the + // capacities 
are beyond the min/max thresholds + float percentage_available = 0.0f; + if (state->bytes_available[target_idx] > 0) { + percentage_available = ((f32)state->bytes_available[target_idx].load() / + (f32)state->capacities[device_idx]); + } - ViolationInfo info = {}; - float percentage_violation = 0.0f; - f32 percentage_used = 1.0f - percentage_available; + ViolationInfo info = {}; + float percentage_violation = 0.0f; + f32 percentage_used = 1.0f - percentage_available; - if (percentage_used > state->bo_capacity_thresholds[device_idx].max) { - percentage_violation = - percentage_used - state->bo_capacity_thresholds[device_idx].max; - info.violation = ThresholdViolation::kMax; - } - if (percentage_used < state->bo_capacity_thresholds[device_idx].min) { - percentage_violation = - state->bo_capacity_thresholds[device_idx].max - percentage_used; - info.violation = ThresholdViolation::kMin; - } + if (percentage_used > state->bo_capacity_thresholds[device_idx].max) { + percentage_violation = + percentage_used - state->bo_capacity_thresholds[device_idx].max; + info.violation = ThresholdViolation::kMax; + } + if (percentage_used < state->bo_capacity_thresholds[device_idx].min) { + percentage_violation = + state->bo_capacity_thresholds[device_idx].min - percentage_used; + info.violation = ThresholdViolation::kMin; + } - if (percentage_violation > 0.0f) { - TargetID target_id = {}; - target_id.bits.node_id = node_id; - target_id.bits.device_id = (DeviceID)device_idx; - // TODO(chogan): This needs to change when we support num_devices != - // num_targets - target_id.bits.index = device_idx; - - info.target_id = target_id; - info.violation_size = - (size_t)(percentage_violation * state->capacities[device_idx]); - result.push_back(info); - } + if (percentage_violation > 0.0f) { + TargetID target_id = {}; + target_id.bits.node_id = node_id; + target_id.bits.device_id = (DeviceID)device_idx; + // TODO(chogan): This needs to change when we support num_devices != + // num_targets + target_id.bits.index = device_idx; + + info.target_id = target_id; + info.violation_size = + (size_t)(percentage_violation * state->capacities[device_idx]); + result.push_back(info); } } diff --git a/test/buffer_organizer_test.cc b/test/buffer_organizer_test.cc index 22b6cbca3..8b54f6ad4 100644 --- a/test/buffer_organizer_test.cc +++ b/test/buffer_organizer_test.cc @@ -270,7 +270,7 @@ static void TestWriteOnlyBucket() { hermes->Finalize(true); } -void TestMixedWorkload() { +void TestMinThresholdViolation() { hermes::Config config = {}; InitDefaultConfig(&config); @@ -287,6 +287,55 @@ void TestMixedWorkload() { f32 min = 0.25f; f32 max = 0.75f; + config.bo_capacity_thresholds[0] = {0, max}; + config.bo_capacity_thresholds[1] = {min, max}; + config.bo_capacity_thresholds[2] = {0, max}; + + HermesPtr hermes = hermes::InitHermesDaemon(&config); + + + hermes::RoundRobinState rr_state; + rr_state.SetCurrentDeviceIndex(2); + hapi::Context ctx; + ctx.policy = hapi::PlacementPolicy::kRoundRobin; + Bucket bkt(__func__, hermes, ctx); + // Blob is big enough to exceed minimum capacity of Target 1 + const size_t kBlobSize = (min * cap) + KILOBYTES(4); + hapi::Blob blob(kBlobSize, 'q'); + Assert(bkt.Put("1", blob).Succeeded()); + + // Let the BORG run. 
It should move enough data from Target 2 to Target 1 to + // fill > the minimum capacity threshold + std::this_thread::sleep_for(std::chrono::seconds(2)); + + // Check remaining capacities + std::vector targets = {{1, 1, 1}, {1, 2, 2}}; + std::vector capacities = + GetRemainingTargetCapacities(&hermes->context_, &hermes->rpc_, targets); + Assert(capacities[0] == cap - kBlobSize + KILOBYTES(4)); + Assert(capacities[1] == cap - KILOBYTES(4)); + + bkt.Destroy(); + hermes->Finalize(true); +} + +void TestMaxThresholdViolation() { + hermes::Config config = {}; + InitDefaultConfig(&config); + + size_t cap = MEGABYTES(1); + config.capacities[0] = cap; + config.capacities[1] = cap; + config.capacities[2] = cap; + config.capacities[3] = cap; + + for (int i = 0; i < config.num_devices; ++i) { + config.num_slabs[i] = 1; + config.desired_slab_percentages[i][0] = 1.0; + } + + f32 min = 0.0f; + f32 max = 0.75f; config.bo_capacity_thresholds[0] = {min, max}; config.bo_capacity_thresholds[1] = {min, max}; config.bo_capacity_thresholds[2] = {min, max}; @@ -330,7 +379,8 @@ int main(int argc, char *argv[]) { HERMES_ADD_TEST(TestBoMove); HERMES_ADD_TEST(TestOrganizeBlob); HERMES_ADD_TEST(TestWriteOnlyBucket); - HERMES_ADD_TEST(TestMixedWorkload); + HERMES_ADD_TEST(TestMinThresholdViolation); + HERMES_ADD_TEST(TestMaxThresholdViolation); MPI_Finalize(); From 0e5d0a22815663b1ae27523f2bfc87ffe998273c Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Wed, 6 Jul 2022 16:04:48 -0500 Subject: [PATCH 66/85] Reorganize borg_bench in preperation for mixed workload --- benchmarks/borg_bench.cc | 80 +++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 9ccf14e74..848be1a64 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -30,6 +30,8 @@ struct Options { bool verbose; bool debug; long sleep_ms; + size_t blob_size; + int iters; char *output_filename; }; @@ -136,35 +138,13 @@ std::string MakeBlobName(int rank, int i) { return result; } -int main(int argc, char *argv[]) { - const size_t kBlobSize = KILOBYTES(32); - const int kIters = 2000; - - Options options = HandleArgs(argc, argv); - - int mpi_threads_provided; - MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); - if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { - fprintf(stderr, "Didn't receive appropriate MPI threading specification\n"); - return 1; - } - - if (options.debug) { - int gdb_iii = 0; - char gdb_DEBUG_hostname[256]; - gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); - printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname); - fflush(stdout); - while (0 == gdb_iii) - sleep(5); - } - +void WriteOnlyWorkload(const Options &options) { HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { int rank = hermes->GetProcessRank(); const int kNumRanks = hermes->GetNumProcesses(); - const size_t kTotalBytes = kNumRanks * kBlobSize * kIters; + const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters; hermes::testing::Timer timer; hapi::Context ctx; @@ -187,9 +167,9 @@ int main(int argc, char *argv[]) { size_t failed_puts = 0; size_t failed_links = 0; size_t retries = 0; - for (int i = 0; i < kIters; ++i) { + for (int i = 0; i < options.iters; ++i) { std::string blob_name = MakeBlobName(rank, i); - hapi::Blob blob(kBlobSize, i % 255); + hapi::Blob blob(options.blob_size, i % 255); timer.resumeTime(); put_timer.resumeTime(); @@ -220,11 
+200,10 @@ int main(int argc, char *argv[]) { if (options.time_puts && i > 0 && i % kReportFrequency == 0) { Assert(kNumRanks == 1); - constexpr double total_mb = - (kBlobSize * kReportFrequency) / 1024.0 / 1024.0; + double total_mb = + (options.blob_size * kReportFrequency) / 1024.0 / 1024.0; - std::cout << i << ", " << total_mb / put_timer.getElapsedTime() - << "\n"; + std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n"; put_timer.reset(); } hermes->AppBarrier(); @@ -251,11 +230,11 @@ int main(int argc, char *argv[]) { auto offset_map = std::unordered_map(); for (int i = 0; i < kNumRanks; ++i) { - for (int j = 0; j < kIters; ++j) { + for (int j = 0; j < options.iters; ++j) { std::string blob_name = MakeBlobName(i, j); file_vbucket.Link(blob_name, bkt_name, ctx); - const size_t kBytesPerRank = kIters * kBlobSize; - size_t offset = (i * kBytesPerRank) + (j * kBlobSize); + const size_t kBytesPerRank = options.iters * options.blob_size; + size_t offset = (i * kBytesPerRank) + (j * options.blob_size); offset_map.emplace(blob_name, offset); } } @@ -286,6 +265,31 @@ int main(int argc, char *argv[]) { } hermes->Finalize(); +} + +int main(int argc, char *argv[]) { + Options options = HandleArgs(argc, argv); + options.iters = 2000; + options.blob_size = KILOBYTES(32); + + int mpi_threads_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); + if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { + fprintf(stderr, "Didn't receive appropriate MPI threading specification\n"); + return 1; + } + + if (options.debug) { + int gdb_iii = 0; + char gdb_DEBUG_hostname[256]; + gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); + printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname); + fflush(stdout); + while (0 == gdb_iii) + sleep(5); + } + + WriteOnlyWorkload(options); int my_rank; int comm_size; @@ -293,7 +297,7 @@ int main(int argc, char *argv[]) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); const size_t kAppCores = comm_size - 1; - const size_t kTotalBytes = kAppCores * kIters * kBlobSize; + const size_t kTotalBytes = kAppCores * options.iters * options.blob_size; if (options.verify && my_rank == 0) { std::vector data(kTotalBytes); @@ -311,10 +315,10 @@ int main(int argc, char *argv[]) { Assert(result == 1); for (size_t rank = 0; rank < kAppCores; ++rank) { - for (size_t iter = 0; iter < kIters; ++iter) { - for (size_t byte = 0; byte < kBlobSize; ++byte) { - Assert(data[(rank * kIters * kBlobSize) + (iter * kBlobSize) + byte] - == iter % 255); + for (int iter = 0; iter < options.iters; ++iter) { + for (size_t byte = 0; byte < options.blob_size; ++byte) { + Assert(data[(rank * options.iters * options.blob_size) + + (iter * options.blob_size) + byte] == iter % 255); } } } From ab119ecf19ad3dcfc3cbd75d11169ad156da0b40 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Thu, 7 Jul 2022 16:14:53 -0500 Subject: [PATCH 67/85] Adding Mixed BORG benchmark --- benchmarks/borg_bench.cc | 147 ++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 34 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 848be1a64..2eac382b3 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -21,43 +21,63 @@ #include "hermes.h" #include "bucket.h" #include "vbucket.h" +#include "metadata_management_internal.h" #include "test_utils.h" +namespace hapi = hermes::api; +using HermesPtr = std::shared_ptr; + +const int kDefaultIters = 2000; +const size_t kDefaultBlobSize = KILOBYTES(32); + struct 
Options { bool use_borg; bool verify; bool time_puts; bool verbose; bool debug; + bool write_only; + bool mixed; long sleep_ms; size_t blob_size; int iters; char *output_filename; }; -void PrintUsage(char *program) { +static void PrintUsage(char *program) { fprintf(stderr, "Usage: %s [-b ] [-f] \n", program); fprintf(stderr, " -b\n"); - fprintf(stderr, " If present, enable the BORG.\n"); + fprintf(stderr, " Enable the BORG for the write-only case.\n"); fprintf(stderr, " -d\n"); fprintf(stderr, " If present, enable MPI breakpoint for debugging.\n"); fprintf(stderr, " -f\n"); fprintf(stderr, " The filename of the persisted data (for correctness" "verification).\n"); + fprintf(stderr, " -i\n"); + fprintf(stderr, " Number of iterations (default: %d)\n", kDefaultIters); + fprintf(stderr, " -m\n"); + fprintf(stderr, " Run mixed workload.\n"); fprintf(stderr, " -p\n"); fprintf(stderr, " Get average for groups of puts.\n"); fprintf(stderr, " -s\n"); fprintf(stderr, " Sleep ms between each Put.\n"); fprintf(stderr, " -v\n"); fprintf(stderr, " Print verbose information.\n"); + fprintf(stderr, " -w\n"); + fprintf(stderr, " Run write only workload.\n"); fprintf(stderr, " -x\n"); fprintf(stderr, " If present, verify results at the end.\n"); + fprintf(stderr, " -z\n"); + fprintf(stderr, " Blob size in bytes (default: %zu).\n", kDefaultBlobSize); } -Options HandleArgs(int argc, char **argv) { +static Options HandleArgs(int argc, char **argv) { Options result = {}; + result.iters = kDefaultIters; + result.blob_size = kDefaultBlobSize; + int option = -1; - while ((option = getopt(argc, argv, "bdf:hps:vx")) != -1) { + while ((option = getopt(argc, argv, "bdf:hi:ps:vxwz:")) != -1) { switch (option) { case 'h': { PrintUsage(argv[0]); @@ -75,6 +95,14 @@ Options HandleArgs(int argc, char **argv) { result.output_filename = optarg; break; } + case 'i': { + result.iters = strtol(optarg, NULL, 0); + break; + } + case 'm': { + result.mixed = true; + break; + } case 'p': { result.time_puts = true; break; @@ -91,6 +119,14 @@ Options HandleArgs(int argc, char **argv) { result.verify = true; break; } + case 'w': { + result.write_only = true; + break; + } + case 'z': { + result.blob_size = strtoll(optarg, NULL, 0); + break; + } default: { PrintUsage(argv[0]); exit(1); @@ -112,11 +148,7 @@ Options HandleArgs(int argc, char **argv) { return result; } - -namespace hapi = hermes::api; -using HermesPtr = std::shared_ptr; - -double GetMPIAverage(double rank_seconds, int num_ranks, MPI_Comm comm) { +static double GetMPIAverage(double rank_seconds, int num_ranks, MPI_Comm comm) { double total_secs = 0; MPI_Reduce(&rank_seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, comm); double result = total_secs / num_ranks; @@ -124,7 +156,7 @@ double GetMPIAverage(double rank_seconds, int num_ranks, MPI_Comm comm) { return result; } -double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, +static double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, int ranks) { double avg_total_seconds = GetMPIAverage(total_elapsed, ranks, comm); double result = total_mb / avg_total_seconds; @@ -132,13 +164,13 @@ double GetBandwidth(double total_elapsed, double total_mb, MPI_Comm comm, return result; } -std::string MakeBlobName(int rank, int i) { +static std::string MakeBlobName(int rank, int i) { std::string result = std::to_string(rank) + "_" + std::to_string(i); return result; } -void WriteOnlyWorkload(const Options &options) { +static void WriteOnlyWorkload(const Options &options) { HermesPtr hermes = 
hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { @@ -150,7 +182,7 @@ void WriteOnlyWorkload(const Options &options) { hapi::Context ctx; // Disable swapping of Blobs ctx.disable_swap = true; - // ctx.policy = hapi::PlacementPolicy::kRoundRobin; + ctx.policy = hapi::PlacementPolicy::kMinimizeIoTime; std::string bkt_name = "BORG_" + std::to_string(rank); hapi::VBucket vbkt(bkt_name, hermes); @@ -204,7 +236,7 @@ void WriteOnlyWorkload(const Options &options) { (options.blob_size * kReportFrequency) / 1024.0 / 1024.0; std::cout << i << ", " << total_mb / put_timer.getElapsedTime() << "\n"; - put_timer.reset(); + } hermes->AppBarrier(); } @@ -267,30 +299,42 @@ void WriteOnlyWorkload(const Options &options) { hermes->Finalize(); } -int main(int argc, char *argv[]) { - Options options = HandleArgs(argc, argv); - options.iters = 2000; - options.blob_size = KILOBYTES(32); +static void MixedWorkload(const Options &options) { + HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); - int mpi_threads_provided; - MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); - if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { - fprintf(stderr, "Didn't receive appropriate MPI threading specification\n"); - return 1; - } + if (hermes->IsApplicationCore()) { + using namespace hermes; + MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_); + std::vector targets(mdm->node_targets.length); + + for (u16 i = 0; i < mdm->node_targets.length; ++i) { + targets[i] = {1, i, i}; + } + + std::vector capacities = + GetRemainingTargetCapacities(&hermes->context_, &hermes->rpc_, targets); + + // See how many blobs we can fit in each Target + std::vector num_blobs(capacities.size()); + for (size_t i = 0; i < num_blobs.size(); ++i) { + num_blobs[i] = capacities[i] / options.blob_size; + } + + // Optimize reads + // Fill hierarchy + // Delete all RAM Blobs + // BORG moves BB Blobs to RAM + // Read all BB Blobs at RAM BW + + // Optimize writes + // Fill hierarchy - if (options.debug) { - int gdb_iii = 0; - char gdb_DEBUG_hostname[256]; - gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); - printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname); - fflush(stdout); - while (0 == gdb_iii) - sleep(5); } - WriteOnlyWorkload(options); + hermes->Finalize(); +} +static void Verify(const Options &options) { int my_rank; int comm_size; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -298,7 +342,7 @@ int main(int argc, char *argv[]) { const size_t kAppCores = comm_size - 1; const size_t kTotalBytes = kAppCores * options.iters * options.blob_size; - if (options.verify && my_rank == 0) { + if (my_rank == 0) { std::vector data(kTotalBytes); if (options.verbose) { @@ -323,6 +367,41 @@ int main(int argc, char *argv[]) { } } } +} + +static void DebugBreak() { + int gdb_iii = 0; + char gdb_DEBUG_hostname[256]; + gethostname(gdb_DEBUG_hostname, sizeof(gdb_DEBUG_hostname)); + printf("PID %d on %s ready for attach\n", getpid(), gdb_DEBUG_hostname); + fflush(stdout); + while (0 == gdb_iii) + sleep(5); +} + +int main(int argc, char *argv[]) { + Options options = HandleArgs(argc, argv); + + int mpi_threads_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided); + if (mpi_threads_provided < MPI_THREAD_MULTIPLE) { + fprintf(stderr, "Didn't receive appropriate MPI threading specification\n"); + return 1; + } + + if (options.debug) { + DebugBreak(); + } + + if (options.write_only) { + WriteOnlyWorkload(options); + } + if (options.mixed) 
{ + MixedWorkload(options); + } + if (options.verify) { + Verify(options); + } MPI_Finalize(); From 00e2ba588bba0d2a64a842378e4af32fb2cc2115 Mon Sep 17 00:00:00 2001 From: Chris Hogan Date: Fri, 8 Jul 2022 14:25:44 -0500 Subject: [PATCH 68/85] BORG read benchmark basics --- benchmarks/borg_bench.cc | 80 +++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index 2eac382b3..4f4a5c86b 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -45,7 +45,7 @@ struct Options { }; static void PrintUsage(char *program) { - fprintf(stderr, "Usage: %s [-b ] [-f] \n", program); + fprintf(stderr, "Usage: %s [-bdmpvwx] [-f ]\n", program); fprintf(stderr, " -b\n"); fprintf(stderr, " Enable the BORG for the write-only case.\n"); fprintf(stderr, " -d\n"); @@ -77,7 +77,7 @@ static Options HandleArgs(int argc, char **argv) { result.blob_size = kDefaultBlobSize; int option = -1; - while ((option = getopt(argc, argv, "bdf:hi:ps:vxwz:")) != -1) { + while ((option = getopt(argc, argv, "bdf:hi:mps:vxwz:")) != -1) { switch (option) { case 'h': { PrintUsage(argv[0]); @@ -299,11 +299,21 @@ static void WriteOnlyWorkload(const Options &options) { hermes->Finalize(); } -static void MixedWorkload(const Options &options) { +static void OptimizeReads(const Options &options) { HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF")); if (hermes->IsApplicationCore()) { + // Optimize reads + // Fill hierarchy + // Delete all RAM Blobs + // BORG moves BB Blobs to RAM + // Read all BB Blobs at RAM BW + using namespace hermes; + + int rank = hermes->GetProcessRank(); + // const int kNumRanks = hermes->GetNumProcesses(); + // const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters; MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_); std::vector targets(mdm->node_targets.length); @@ -315,20 +325,64 @@ static void MixedWorkload(const Options &options) { GetRemainingTargetCapacities(&hermes->context_, &hermes->rpc_, targets); // See how many blobs we can fit in each Target - std::vector num_blobs(capacities.size()); - for (size_t i = 0; i < num_blobs.size(); ++i) { - num_blobs[i] = capacities[i] / options.blob_size; + std::vector blobs_per_target(capacities.size()); + for (size_t i = 0; i < blobs_per_target.size(); ++i) { + blobs_per_target[i] = capacities[i] / options.blob_size; } - // Optimize reads - // Fill hierarchy - // Delete all RAM Blobs - // BORG moves BB Blobs to RAM - // Read all BB Blobs at RAM BW + hermes::testing::Timer timer; + hapi::Context ctx; + // Disable swapping of Blobs + ctx.disable_swap = true; + ctx.policy = hapi::PlacementPolicy::kMinimizeIoTime; + + std::string bkt_name = __func__ + std::to_string(rank); + hapi::Bucket bkt(bkt_name, hermes, ctx); + + // MinIoTime with retry + // const int kReportFrequency = 30; + hermes::testing::Timer put_timer; + size_t failed_puts = 0; + size_t retries = 0; - // Optimize writes // Fill hierarchy + for (size_t target_idx = 0; target_idx < blobs_per_target.size(); ++target_idx) { + for (int i = 0; i < blobs_per_target[target_idx]; ++i) { + std::string blob_name = (std::to_string(rank) + "_" + + std::to_string(target_idx) + "_" + + std::to_string(i)); + hapi::Blob blob(options.blob_size, i % 255); + + hapi::Status status; + int consecutive_fails = 0; + + timer.resumeTime(); + while (!((status = bkt.Put(blob_name, blob)).Succeeded())) { + retries++; + if (++consecutive_fails > 10) { + failed_puts++; + break; + } + } + 
+    Assert(failed_puts == 0);
+    if (options.verbose) {
+      std::cout << "Rank " << rank << " failed puts: " << failed_puts << "\n";
+      std::cout << "Rank " << rank << " Put retries: " << retries << "\n";
+    }
+
+    if (!hermes->IsFirstRankOnNode()) {
+      bkt.Release();
+    }
+    hermes->AppBarrier();
+    if (hermes->IsFirstRankOnNode()) {
+      bkt.Destroy();
+    }
+    hermes->AppBarrier();
   }
 
   hermes->Finalize();
@@ -397,7 +451,7 @@ int main(int argc, char *argv[]) {
     WriteOnlyWorkload(options);
   }
   if (options.mixed) {
-    MixedWorkload(options);
+    OptimizeReads(options);
  }
   if (options.verify) {
     Verify(options);

From 4a4a2512693d191e63dfb1882fcbd16fe83bcfb4 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 8 Jul 2022 15:46:05 -0500
Subject: [PATCH 69/85] Optimized-read BORG bench complete

---
 benchmarks/borg_bench.cc | 52 ++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 4f4a5c86b..b9270553b 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -236,7 +236,6 @@ static void WriteOnlyWorkload(const Options &options) {
           (options.blob_size * kReportFrequency) / 1024.0 / 1024.0;
         std::cout << i << ", " << total_mb / put_timer.getElapsedTime()
                   << "\n";
-
       }
       hermes->AppBarrier();
     }
@@ -309,11 +308,11 @@ static void OptimizeReads(const Options &options) {
     // BORG moves BB Blobs to RAM
     // Read all BB Blobs at RAM BW
 
-    using namespace hermes;
+    using namespace hermes;  // NOLINT(*)
 
     int rank = hermes->GetProcessRank();
-    // const int kNumRanks = hermes->GetNumProcesses();
-    // const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters;
+    const int kNumRanks = hermes->GetNumProcesses();
+    const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters;
     MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_);
     std::vector<TargetID> targets(mdm->node_targets.length);
@@ -346,7 +345,8 @@ static void OptimizeReads(const Options &options) {
     size_t retries = 0;
 
     // Fill hierarchy
-    for (size_t target_idx = 0; target_idx < blobs_per_target.size(); ++target_idx) {
+    for (size_t target_idx = 0; target_idx < blobs_per_target.size();
+         ++target_idx) {
       for (int i = 0; i < blobs_per_target[target_idx]; ++i) {
         std::string blob_name = (std::to_string(rank) + "_" +
                                  std::to_string(target_idx) + "_" +
                                  std::to_string(i));
         hapi::Blob blob(options.blob_size, i % 255);
 
         hapi::Status status;
         int consecutive_fails = 0;
-        timer.resumeTime();
         while (!((status = bkt.Put(blob_name, blob)).Succeeded())) {
           retries++;
           if (++consecutive_fails > 10) {
             failed_puts++;
             break;
           }
         }
-        timer.pauseTime();
       }
       hermes->AppBarrier();
     }
@@ -375,6 +373,36 @@ static void OptimizeReads(const Options &options) {
       std::cout << "Rank " << rank << " Put retries: " << retries << "\n";
     }
 
+    // Delete all RAM and NVMe Blobs
+    for (size_t j = 0; j < blobs_per_target.size() - 1; ++j) {
+      for (int i = 0; i < blobs_per_target[j]; ++i) {
+        std::string blob_name = (std::to_string(rank) + "_" +
+                                 std::to_string(j) + "_" +
+                                 std::to_string(i));
+        Assert(bkt.DeleteBlob(blob_name).Succeeded());
+      }
+    }
+
+    // Give the BORG time to move BB Blobs to RAM and NVMe
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+
+    // Read all BB Blobs at RAM and NVMe BW
+    const int kBbIndex = 2;
+    for (int i = 0; i < blobs_per_target[kBbIndex]; ++i) {
+      std::string blob_name = (std::to_string(rank) + "_" +
+                               std::to_string(kBbIndex) + "_" +
+                               std::to_string(i));
+
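+      // These Gets should be served at RAM/NVMe bandwidth rather than
+      // from the burst buffers, assuming the BORG finished promoting the
+      // BB Blobs during the sleep above.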
+      hapi::Blob blob(options.blob_size);
+      timer.resumeTime();
+      Assert(bkt.Get(blob_name, blob) == options.blob_size);
+      timer.pauseTime();
+
+      // Verify
+      hapi::Blob expected_blob(options.blob_size, i % 255);
+      Assert(blob == expected_blob);
+    }
+
     if (!hermes->IsFirstRankOnNode()) {
       bkt.Release();
     }
@@ -383,6 +411,16 @@ static void OptimizeReads(const Options &options) {
       bkt.Destroy();
     }
     hermes->AppBarrier();
+
+    MPI_Comm *comm = (MPI_Comm *)hermes->GetAppCommunicator();
+    double total_mb = kTotalBytes / 1024.0 / 1024.0;
+    double bandwidth = GetBandwidth(timer.getElapsedTime(), total_mb, *comm,
+                                    kNumRanks);
+
+    if (hermes->IsFirstRankOnNode()) {
+      std::cout << bandwidth << "," << kNumRanks << "," << options.use_borg
+                << "," << options.sleep_ms << "\n";
+    }
   }
 
   hermes->Finalize();

From acdfd566f1dcd38b4547a442c54d314e0c4b91f6 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 15 Jul 2022 08:04:23 -0500
Subject: [PATCH 70/85] Fix bug in DPE

---
 src/data_placement_engine.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/data_placement_engine.cc b/src/data_placement_engine.cc
index e773130dc..b963c8a02 100644
--- a/src/data_placement_engine.cc
+++ b/src/data_placement_engine.cc
@@ -370,6 +370,8 @@ Status MinimizeIoTimePlacement(const std::vector<size_t> &blob_sizes,
         last4 = ij;
       }
     }
+  } else {
+    last4 = last3;
   }
 
   // Objective to minimize IO time

From 88fa3056bdfcb523b82338a739a46808fa132326 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Tue, 19 Jul 2022 09:28:12 -0500
Subject: [PATCH 71/85] Fix new warnings revealed by updated compiler

---
 benchmarks/dpe_bench.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dpe_bench.cc b/benchmarks/dpe_bench.cc
index 75c6017d6..50d88faaf 100644
--- a/benchmarks/dpe_bench.cc
+++ b/benchmarks/dpe_bench.cc
@@ -17,6 +17,7 @@
 
 #include "hermes.h"
 #include "utils.h"
+#include "test_utils.h"
 #include "data_placement_engine.h"
 
 /* example usage: ./bin/dpe_bench -m -s 4096 */
@@ -28,6 +29,7 @@ const auto now = std::chrono::high_resolution_clock::now;
 const u64 dpe_total_targets = 10;
 const size_t dpe_total_num_blobs = 10;
 const size_t dpe_total_blob_size = GIGABYTES(10);
+const size_t kDefaultBlobSize = KILOBYTES(4);
 
 void PrintUsage(char *program) {
   fprintf(stderr, "Usage %s [-r]\n", program);
@@ -54,7 +56,7 @@ int main(int argc, char **argv) {
   bool fixed_total_num_blobs {true}, fixed_total_blob_size {false};
   int option = -1;
   char *rvalue = NULL;
-  size_t each_blob_size;
+  size_t each_blob_size = kDefaultBlobSize;
   size_t total_placed_size;
   double dpe_seconds;
   api::Status result;
@@ -84,7 +86,7 @@ int main(int argc, char **argv) {
       PrintUsage(argv[0]);
       policy = api::PlacementPolicy::kRandom;
       fixed_total_blob_size = true;
-      each_blob_size = 4096;
+      each_blob_size = kDefaultBlobSize;
       std::cout << "Using Random policy for data placement engine.\n"
                 << "Using fixed number of blobs of size 4KB for test.\n\n";
     }
@@ -174,7 +176,7 @@ int main(int argc, char **argv) {
   for (auto schema : output_tmp) {
     placed_size += testing::UpdateDeviceState(schema, tgt_state);
  }
-  assert(placed_size == total_placed_size);
+  Assert(placed_size == total_placed_size);
 
   // Aggregate placement schemas from the same target
   if (result.Succeeded()) {
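A note on the assert → Assert change above: assert() compiles to nothing
under -DNDEBUG, so an optimized build would silently skip the placed-size
check. A build-independent helper of roughly this shape — a sketch only,
assuming the semantics of the Assert pulled in via the new test_utils.h
include — keeps the check alive in all builds:

    #include <cstdio>
    #include <cstdlib>

    // Unlike assert(), this check is not compiled out by -DNDEBUG.
    inline void Assert(bool expression) {
      if (!expression) {
        std::fprintf(stderr, "Assertion failed\n");
        std::abort();
      }
    }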
+++++++++++----- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc index b9270553b..74cb3f8fe 100644 --- a/benchmarks/borg_bench.cc +++ b/benchmarks/borg_bench.cc @@ -52,7 +52,7 @@ static void PrintUsage(char *program) { fprintf(stderr, " If present, enable MPI breakpoint for debugging.\n"); fprintf(stderr, " -f\n"); fprintf(stderr, " The filename of the persisted data (for correctness" - "verification).\n"); + " verification).\n"); fprintf(stderr, " -i\n"); fprintf(stderr, " Number of iterations (default: %d)\n", kDefaultIters); fprintf(stderr, " -m\n"); diff --git a/src/metadata_storage.h b/src/metadata_storage.h index 9f54dee3e..1d219a60c 100644 --- a/src/metadata_storage.h +++ b/src/metadata_storage.h @@ -88,8 +88,8 @@ BlobInfo *GetBlobInfoPtr(MetadataManager *mdm, BlobID blob_id); */ void ReleaseBlobInfoPtr(MetadataManager *mdm); -template -u64 *GetIdsPtr(MetadataManager *mdm, T id_list); +u64 *GetIdsPtr(MetadataManager *mdm, IdList id_list); +u64 *GetIdsPtr(MetadataManager *mdm, ChunkedIdList id_list); void ReleaseIdsPtr(MetadataManager *mdm); /** diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc index f043353af..40c427201 100644 --- a/src/metadata_storage_stb_ds.cc +++ b/src/metadata_storage_stb_ds.cc @@ -223,12 +223,18 @@ Stats LocalGetBlobStats(SharedMemoryContext *context, BlobID blob_id) { * Return a pointer to the the internal array of IDs that the `id_list` * represents. * - * T must be an `IdList` or a `ChunkedIdList`. This call acquires a lock, and - * must be paired with a corresponding call to `ReleaseIdsPtr` to release the - * lock. + * This call acquires a lock, and must be paired with a corresponding call to + * `ReleaseIdsPtr` to release the lock. 
  */
-template<typename T>
-u64 *GetIdsPtr(MetadataManager *mdm, T id_list) {
+u64 *GetIdsPtr(MetadataManager *mdm, IdList id_list) {
+  Heap *id_heap = GetIdHeap(mdm);
+  BeginTicketMutex(&mdm->id_mutex);
+  u64 *result = (u64 *)HeapOffsetToPtr(id_heap, id_list.head_offset);
+
+  return result;
+}
+
+u64 *GetIdsPtr(MetadataManager *mdm, ChunkedIdList id_list) {
   Heap *id_heap = GetIdHeap(mdm);
   BeginTicketMutex(&mdm->id_mutex);
   u64 *result = (u64 *)HeapOffsetToPtr(id_heap, id_list.head_offset);

From 0e0a3af9f3921dc757e5ad8364e9a6eda37b7b73 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 22 Jul 2022 09:40:06 -0500
Subject: [PATCH 73/85] Use VLOG for DPE logging

---
 src/data_placement_engine.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/data_placement_engine.cc b/src/data_placement_engine.cc
index b963c8a02..70e965466 100644
--- a/src/data_placement_engine.cc
+++ b/src/data_placement_engine.cc
@@ -258,7 +258,7 @@ Status MinimizeIoTimePlacement(const std::vector<size_t> &blob_sizes,
     ctx.minimize_io_time_options.capacity_change_threshold;
   size_t constraints_per_target = 1;
 
-  DLOG(INFO) << "MinimizeIoTimePlacement()::minimum_remaining_capacity=" <<
+  VLOG(1) << "MinimizeIoTimePlacement()::minimum_remaining_capacity=" <<
     minimum_remaining_capacity;
   if (minimum_remaining_capacity != 0) {
     constraints_per_target++;
   }
   if (capacity_change_threshold != 0) {
     constraints_per_target++;
   }
-  DLOG(INFO) << "MinimizeIoTimePlacement()::constraints_per_target=" <<
+  VLOG(1) << "MinimizeIoTimePlacement()::constraints_per_target=" <<
     constraints_per_target;
   const size_t total_constraints =
     num_blobs + (num_targets * constraints_per_target) - 1;
@@ -382,7 +382,7 @@ Status MinimizeIoTimePlacement(const std::vector<size_t> &blob_sizes,
       static_cast<double>(blob_sizes[i])/bandwidths[j]);
     }
   }
-  DLOG(INFO) << "MinimizeIoTimePlacement()::last4=" << last4;
+  VLOG(1) << "MinimizeIoTimePlacement()::last4=" << last4;
 
   glp_load_matrix(lp, last4, ia, ja, ar);
 
   glp_smcp parm;

From f419848bef043dd49476ce1c6d17dcba1899671e Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 22 Jul 2022 09:40:21 -0500
Subject: [PATCH 74/85] Clear memory on heap free

---
 src/memory_management.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/memory_management.cc b/src/memory_management.cc
index cacbe01d6..3d367d344 100644
--- a/src/memory_management.cc
+++ b/src/memory_management.cc
@@ -415,6 +415,8 @@ void HeapFree(Heap *heap, void *ptr) {
       new_block = (FreeBlock *)((u8 *)(header + 1) + header->size -
                                 sizeof(FreeBlock));
     }
+
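+    // new_block points into the block being freed, so the payload must be
+    // zeroed before new_block's header fields are written below.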
+    memset(ptr, 0, size);
     new_block->size = size + sizeof(FreeBlockHeader);
 
     HERMES_DEBUG_TRACK_FREE(header, new_block->size, heap->grows_up);

From fd3325af1ff4b26261de9a8ba5ccc73a58ef5fb3 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 22 Jul 2022 09:40:41 -0500
Subject: [PATCH 75/85] Refactor to avoid long names

---
 src/metadata_management.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/metadata_management.cc b/src/metadata_management.cc
index 846e542d8..bd0269359 100644
--- a/src/metadata_management.cc
+++ b/src/metadata_management.cc
@@ -1013,15 +1013,15 @@ LocalUpdateGlobalSystemViewState(SharedMemoryContext *context, u32 node_id,
     ViolationInfo info = {};
     float percentage_violation = 0.0f;
     f32 percentage_used = 1.0f - percentage_available;
+    float min_threshold = state->bo_capacity_thresholds[device_idx].min;
+    float max_threshold = state->bo_capacity_thresholds[device_idx].max;
 
-    if (percentage_used > state->bo_capacity_thresholds[device_idx].max) {
-      percentage_violation =
-        percentage_used - state->bo_capacity_thresholds[device_idx].max;
+    if (percentage_used > max_threshold) {
+      percentage_violation = percentage_used - max_threshold;
       info.violation = ThresholdViolation::kMax;
     }
-    if (percentage_used < state->bo_capacity_thresholds[device_idx].min) {
-      percentage_violation =
-        state->bo_capacity_thresholds[device_idx].min - percentage_used;
+    if (percentage_used < min_threshold) {
+      percentage_violation = min_threshold - percentage_used;
       info.violation = ThresholdViolation::kMin;
     }

From 473f745e64c03baed52cd91cfeaea4507a162def Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 27 Jul 2022 16:23:01 -0500
Subject: [PATCH 76/85] WIP: Fixing LocalEnforceCapacityThresholds

---
 src/buffer_organizer.cc | 144 +++++++++++++++++++---------------------
 1 file changed, 70 insertions(+), 74 deletions(-)

diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index f9a612eae..8237a477b 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -464,89 +464,85 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context,
         GetChunkedIdList(mdm, src_target->effective_blobs);
       EndTicketMutex(&src_target->effective_blobs_lock);
 
-      f32 max_importance = -FLT_MAX;
-      BlobID most_important_blob = {};
-
-      // Find most important blob in source Target
-      for (size_t i = 0; i < blob_ids.size(); ++i) {
-        BlobID blob_id = {};
-        blob_id.as_int = blob_ids[i];
-        f32 importance_score = LocalGetBlobImportanceScore(context, blob_id);
-        if (importance_score > max_importance) {
-          max_importance = importance_score;
-          most_important_blob = blob_id;
-        }
-      }
+      auto compare_importance = [context](const u64 lhs, const u64 rhs) {
+        BlobID lhs_blob_id = {};
+        lhs_blob_id.as_int = lhs;
+        f32 lhs_importance_score = LocalGetBlobImportanceScore(context,
+                                                               lhs_blob_id);
+
+        BlobID rhs_blob_id = {};
+        rhs_blob_id.as_int = rhs;
+        f32 rhs_importance_score = LocalGetBlobImportanceScore(context,
+                                                               rhs_blob_id);
+
+        return lhs_importance_score < rhs_importance_score;
+      };
 
-      if (IsNullBlobId(most_important_blob)) {
-        continue;
-      }
+      std::sort(blob_ids.begin(), blob_ids.end(), compare_importance);
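+      // blob_ids is now ordered least- to most-important, so the loop
+      // below moves the least valuable Blobs out of the violating Target
+      // first.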
 
-      std::vector<BufferID> all_buffer_ids =
-        LocalGetBufferIdList(mdm, most_important_blob);
-      std::vector<BufferID> buffer_ids_in_target;
-      // Filter out BufferIDs not in the Target
-      for (size_t i = 0; i < all_buffer_ids.size(); ++i) {
-        BufferHeader *header = GetHeaderByBufferId(context,
-                                                   all_buffer_ids[i]);
-        DeviceID device_id = header->device_id;
-        if (device_id == src_target_id.bits.device_id) {
-          // TODO(chogan): Needs to changes when we support num_devices !=
-          // num_targets
-          buffer_ids_in_target.push_back(all_buffer_ids[i]);
-        }
-      }
-
-      std::vector<BufferInfo> buffer_info =
-        GetBufferInfo(context, rpc, buffer_ids_in_target);
-      auto buffer_info_comparator = [](const BufferInfo &lhs,
-                                       const BufferInfo &rhs) {
-        return lhs.size > rhs.size;
-      };
-      // Sort in descending order
-      std::sort(buffer_info.begin(), buffer_info.end(),
-                buffer_info_comparator);
+      // TODO(chogan): Get enough buffer_ids to cover info.violation_size
 
       size_t bytes_moved = 0;
       std::vector<BufferInfo> buffers_to_move;
-      size_t index = 0;
-      size_t num_buffers = buffer_info.size();
-
-      if (num_buffers > 0) {
-        // Choose largest buffer until we've moved info.violation_size or we
-        // run out of buffers
-        while (index < num_buffers && bytes_moved < info.violation_size) {
-          buffers_to_move.push_back(buffer_info[index]);
-          bytes_moved += buffer_info[index].size;
-          index++;
-        }
-      }
-
-      BoMoveList moves;
-      for (size_t i = 0; i < buffers_to_move.size(); ++i) {
-        PlacementSchema schema;
-        schema.push_back(std::pair(buffers_to_move[i].size,
-                                   info.target_id));
-        std::vector<BufferID> dests = GetBuffers(context, schema);
-        if (dests.size() != 0) {
-          moves.push_back(std::pair(buffers_to_move[i].id, dests));
+      for (size_t idx = 0;
+           idx < blob_ids.size() && bytes_moved < info.violation_size;
+           ++idx) {
+        BlobID most_important_blob {};
+        most_important_blob.as_int = blob_ids[idx];
+        std::vector<BufferID> buffer_ids =
+          LocalGetBufferIdList(mdm, most_important_blob);
+
+        // Filter out BufferIDs not in the Target
+        std::vector<BufferID> buffer_ids_in_target;
+        for (size_t i = 0; i < buffer_ids.size(); ++i) {
+          BufferHeader *header = GetHeaderByBufferId(context, buffer_ids[i]);
+          DeviceID device_id = header->device_id;
+          if (device_id == src_target_id.bits.device_id) {
+            // TODO(chogan): Needs to change when we support num_devices !=
+            // num_targets
+            buffer_ids_in_target.push_back(buffer_ids[i]);
+          }
+        }
+        std::vector<BufferInfo> buffer_info =
+          GetBufferInfo(context, rpc, buffer_ids_in_target);
+        auto buffer_info_comparator = [](const BufferInfo &lhs,
+                                         const BufferInfo &rhs) {
+          return lhs.size > rhs.size;
+        };
+        // Sort in descending order
+        std::sort(buffer_info.begin(), buffer_info.end(),
+                  buffer_info_comparator);
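+        // Take the largest buffers first so that violation_size is
+        // covered with as few moves (and queued BO tasks) as possible.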
+        for (size_t j = 0;
+             j < buffer_info.size() && bytes_moved < info.violation_size;
+             ++j) {
+          buffers_to_move.push_back(buffer_info[j]);
+          bytes_moved += buffer_info[j].size;
         }
-      }
 
-      if (moves.size() > 0) {
-        // Queue BO task to move to lower tier
-        BucketID bucket_id = GetBucketIdFromBlobId(context, rpc,
-                                                   most_important_blob);
-        std::string blob_name =
-          LocalGetBlobNameFromId(context, most_important_blob);
-        std::string internal_name = MakeInternalBlobName(blob_name,
-                                                         bucket_id);
-        EnqueueBoMove(rpc, moves, most_important_blob, bucket_id,
-                      internal_name, BoPriority::kLow);
-      }
+        BoMoveList moves;
+        for (size_t i = 0; i < buffers_to_move.size(); ++i) {
+          PlacementSchema schema;
+          using SchemaPair = std::pair<size_t, TargetID>;
+          schema.push_back(SchemaPair(buffers_to_move[i].size,
+                                      info.target_id));
+          std::vector<BufferID> dests = GetBuffers(context, schema);
+          if (dests.size() != 0) {
+            moves.push_back(std::pair(buffers_to_move[i].id, dests));
+          }
+        }
 
-      if (bytes_moved >= info.violation_size) {
-        break;
+        if (moves.size() > 0) {
+          // Queue BO task to move to lower tier
+          BucketID bucket_id = GetBucketIdFromBlobId(context, rpc,
+                                                     most_important_blob);
+          std::string blob_name =
+            LocalGetBlobNameFromId(context, most_important_blob);
+          std::string internal_name = MakeInternalBlobName(blob_name,
+                                                           bucket_id);
+          EnqueueBoMove(rpc, moves, most_important_blob, bucket_id,
+                        internal_name, BoPriority::kLow);
+        }
       }
     }
   } break;

From b088c55edbe5346d2e643f727e8cd47243794dcb Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 28 Jul 2022 08:24:23 -0500
Subject: [PATCH 77/85] Fix scope of buffers_to_move

---
 src/buffer_organizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index 8237a477b..4847444da 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -483,12 +483,12 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context,
 
       size_t bytes_moved = 0;
-      std::vector<BufferInfo> buffers_to_move;
 
       for (size_t idx = 0;
            idx < blob_ids.size() && bytes_moved < info.violation_size;
           ++idx) {
         BlobID most_important_blob {};
+        std::vector<BufferInfo> buffers_to_move;
         most_important_blob.as_int = blob_ids[idx];
         std::vector<BufferID> buffer_ids =
           LocalGetBufferIdList(mdm, most_important_blob);

From fbd13339dee459c3370ce5445d9c5f0bd37a5f02 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 28 Jul 2022 08:39:19 -0500
Subject: [PATCH 78/85] Use bytes_read instead of kTotalBytes

---
 benchmarks/borg_bench.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 74cb3f8fe..16026a1d5 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -312,7 +312,7 @@ static void OptimizeReads(const Options &options) {
 
     int rank = hermes->GetProcessRank();
     const int kNumRanks = hermes->GetNumProcesses();
-    const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters;
+    // const size_t kTotalBytes = kNumRanks * options.blob_size * options.iters;
 
     MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_);
     std::vector<TargetID> targets(mdm->node_targets.length);
@@ -413,7 +413,8 @@ static void OptimizeReads(const Options &options) {
     hermes->AppBarrier();
 
     MPI_Comm *comm = (MPI_Comm *)hermes->GetAppCommunicator();
-    double total_mb = kTotalBytes / 1024.0 / 1024.0;
+    size_t bytes_read = blobs_per_target[kBbIndex] * options.blob_size;
+    double total_mb = bytes_read / 1024.0 / 1024.0;
     double bandwidth = GetBandwidth(timer.getElapsedTime(), total_mb, *comm,
                                     kNumRanks);
 

From 61d636b4f6a354876e34a328bd007fb23b2c4f4c Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 29 Jul 2022 08:57:43 -0500
Subject: [PATCH 79/85] Only read the number of blobs that fit in the first 2
 tiers

---
 benchmarks/borg_bench.cc | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 16026a1d5..48f8f707d 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -320,6 +320,10 @@ static void OptimizeReads(const Options &options) {
       targets[i] = {1, i, i};
     }
 
+    GlobalSystemViewState *gsvs = GetGlobalSystemViewState(&hermes->context_);
+    f32 ram_min_threshold = gsvs->bo_capacity_thresholds[0].min;
+    f32 nvme_min_threshold = gsvs->bo_capacity_thresholds[1].min;
+
     std::vector<u64> capacities =
       GetRemainingTargetCapacities(&hermes->context_, &hermes->rpc_, targets);
 
@@ -388,7 +392,14 @@ static void OptimizeReads(const Options &options) {
 
     // Read all BB Blobs at RAM and NVMe BW
     const int kBbIndex = 2;
-    for (int i = 0; i < blobs_per_target[kBbIndex]; ++i) {
+
+    int blobs_to_read = blobs_per_target[kBbIndex];
+    if (ram_min_threshold > 0) {
+      blobs_to_read = (ram_min_threshold * blobs_per_target[0] +
+                       nvme_min_threshold * blobs_per_target[1]);
+    }
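+    // blobs_to_read approximates how many BB Blobs fit in the share of
+    // RAM (tier 0) and NVMe (tier 1) that the min capacity thresholds
+    // keep populated.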
+    int stopping_index = blobs_per_target[kBbIndex] - blobs_to_read;
+    for (int i = blobs_per_target[kBbIndex] - 1; i > stopping_index; --i) {
       std::string blob_name = (std::to_string(rank) + "_" +
                                std::to_string(kBbIndex) + "_" +
                                std::to_string(i));
 

From c1ba0ced76865ebc90faba7729bf1bc5a8dbf218 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 29 Jul 2022 10:22:15 -0500
Subject: [PATCH 80/85] Pass sleep on cli

---
 benchmarks/borg_bench.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 48f8f707d..552d569ae 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -298,8 +298,11 @@ static void WriteOnlyWorkload(const Options &options) {
   hermes->Finalize();
 }
 
-static void OptimizeReads(const Options &options) {
+static void OptimizeReads(Options &options) {
   HermesPtr hermes = hapi::InitHermes(getenv("HERMES_CONF"));
+  if (options.sleep_ms == 0) {
+    options.sleep_ms = 3000;
+  }
 
   if (hermes->IsApplicationCore()) {
     // Optimize reads

From 61ceeabf78fb812be38810b9a44678db1e2d0849 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 29 Jul 2022 10:26:04 -0500
Subject: [PATCH 81/85] Seconds -> milliseconds

---
 benchmarks/borg_bench.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 552d569ae..64461258f 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -391,7 +391,7 @@ static void OptimizeReads(Options &options) {
     }
 
     // Give the BORG time to move BB Blobs to RAM and NVMe
-    std::this_thread::sleep_for(std::chrono::seconds(options.sleep_ms));
+    std::this_thread::sleep_for(std::chrono::milliseconds(options.sleep_ms));
 
     // Read all BB Blobs at RAM and NVMe BW
     const int kBbIndex = 2;

From ec47df76b7ea72022343bfae0eba01fb1ecdb185 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Fri, 19 Aug 2022 08:46:15 -0500
Subject: [PATCH 82/85] Skip low_buf test for now.

---
 adapter/test/stdio/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/adapter/test/stdio/CMakeLists.txt b/adapter/test/stdio/CMakeLists.txt
index ffec2d182..30003a636 100644
--- a/adapter/test/stdio/CMakeLists.txt
+++ b/adapter/test/stdio/CMakeLists.txt
@@ -56,7 +56,10 @@ target_link_libraries(hermes_stdio_low_buf_adapter_test hermes_stdio)
 add_dependencies(hermes_stdio_low_buf_adapter_test hermes_stdio hermes_daemon)
 set_target_properties(hermes_stdio_low_buf_adapter_test PROPERTIES COMPILE_FLAGS "-DHERMES_INTERCEPT=1")
 gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small "")
-gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small async)
+# TODO: The DPE doesn't respect available buffering space. In this test, it
+# gives out over 1 MiB of RAM even though the RAM tier only has ~756 KiB
+# available. See issue #TODO
+# gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small async)
 
 add_executable(hermes_stdio_adapter_mode_test stdio_adapter_mode_test.cpp ${ADAPTER_COMMON})
 target_link_libraries(hermes_stdio_adapter_mode_test hermes_stdio)
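For context on the disabled async variant: the DPE can hand out more buffer
space than a tier has left. A guard of roughly this shape — a sketch with
hypothetical names, not the project's code — is the kind of fix the TODO
above calls for:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // (bytes requested, target index) — mirrors a placement schema entry.
    using SchemaEntry = std::pair<std::size_t, std::size_t>;

    // Returns false if any target would be overcommitted, so the caller
    // can re-run placement instead of handing out space that isn't there.
    bool SchemaFits(const std::vector<SchemaEntry> &schema,
                    const std::vector<std::size_t> &remaining) {
      std::vector<std::size_t> needed(remaining.size(), 0);
      for (const SchemaEntry &entry : schema) {
        needed[entry.second] += entry.first;
      }
      for (std::size_t i = 0; i < needed.size(); ++i) {
        if (needed[i] > remaining[i]) {
          return false;
        }
      }
      return true;
    }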
From d8ed4c260fdffe1ebaefb6d170029ea780e2a306 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 24 Aug 2022 16:02:16 -0500
Subject: [PATCH 83/85] Cleanup before PR

---
 adapter/test/stdio/CMakeLists.txt | 2 +-
 benchmarks/borg_bench.cc          | 1 -
 src/api/traits.cc                 | 9 ---------
 src/buffer_organizer.cc           | 2 --
 src/metadata_management.cc        | 2 --
 src/metadata_storage_stb_ds.cc    | 5 -----
 test/mdm_test.cc                  | 1 -
 7 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/adapter/test/stdio/CMakeLists.txt b/adapter/test/stdio/CMakeLists.txt
index 30003a636..b258faa56 100644
--- a/adapter/test/stdio/CMakeLists.txt
+++ b/adapter/test/stdio/CMakeLists.txt
@@ -58,7 +58,7 @@ set_target_properties(hermes_stdio_low_buf_adapter_test PROPERTIES COMPILE_FLAGS
 gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small "")
 # TODO: The DPE doesn't respect available buffering space. In this test, it
 # gives out over 1 MiB of RAM even though the RAM tier only has ~756 KiB
-# available. See issue #TODO
+# available. See issue #439.
 # gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small async)
 
 add_executable(hermes_stdio_adapter_mode_test stdio_adapter_mode_test.cpp ${ADAPTER_COMMON})
diff --git a/benchmarks/borg_bench.cc b/benchmarks/borg_bench.cc
index 64461258f..6e8516f22 100644
--- a/benchmarks/borg_bench.cc
+++ b/benchmarks/borg_bench.cc
@@ -346,7 +346,6 @@ static void OptimizeReads(Options &options) {
     hapi::Bucket bkt(bkt_name, hermes, ctx);
 
     // MinIoTime with retry
-    // const int kReportFrequency = 30;
     hermes::testing::Timer put_timer;
     size_t failed_puts = 0;
     size_t retries = 0;
diff --git a/src/api/traits.cc b/src/api/traits.cc
index f74e060ee..5626bfa68 100644
--- a/src/api/traits.cc
+++ b/src/api/traits.cc
@@ -141,15 +141,6 @@ void WriteOnlyTrait::onUnlink(HermesPtr hermes, TraitInput &input,
   (void)hermes;
   (void)input;
   (void)trait;
-
-#if 0
-  BucketID bucket_id = GetBucketId(&hermes->context_, &hermes->rpc_,
-                                   input.bucket_name.c_str());
-  BlobID blob_id = GetBlobId(&hermes->context_, &hermes->rpc_, input.blob_name,
-                             bucket_id, false);
-  MetadataManager *mdm = GetMetadataManagerFromContext(&hermes->context_);
-  WaitForOutstandingBlobOps(mdm, blob_id);
-#endif
 }
 
 }  // namespace api
diff --git a/src/buffer_organizer.cc b/src/buffer_organizer.cc
index 4847444da..798daa1b1 100644
--- a/src/buffer_organizer.cc
+++ b/src/buffer_organizer.cc
@@ -480,8 +480,6 @@ void LocalEnforceCapacityThresholds(SharedMemoryContext *context,
 
       std::sort(blob_ids.begin(), blob_ids.end(), compare_importance);
 
-      // TODO(chogan): Get enough buffer_ids to cover info.violation_size
-
       size_t bytes_moved = 0;
 
       for (size_t idx = 0;
diff --git a/src/metadata_management.cc b/src/metadata_management.cc
index bd0269359..ff3283f4c 100644
--- a/src/metadata_management.cc
+++ b/src/metadata_management.cc
@@ -1049,7 +1049,6 @@ void UpdateGlobalSystemViewState(SharedMemoryContext *context,
   BufferPool *pool = GetBufferPoolFromContext(context);
 
   bool update_needed = false;
-  // TODO(chogan): BufferPool code should post adjustments via 1-sided rpc.
   std::vector<i64> adjustments(pool->num_devices);
   for (size_t i = 0; i < adjustments.size(); ++i) {
     adjustments[i] = pool->capacity_adjustments[i].exchange(0);
@@ -1215,7 +1214,6 @@ void InitMetadataManager(MetadataManager *mdm, RpcContext *rpc, Arena *arena,
 
   // Initialize Global SystemViewState
 
-  // TODO(chogan):
   if (node_id == 1) {
     // NOTE(chogan): Only Node 1 has the Global SystemViewState
     GlobalSystemViewState *global_state =
diff --git a/src/metadata_storage_stb_ds.cc b/src/metadata_storage_stb_ds.cc
index 40c427201..f83ebc374 100644
--- a/src/metadata_storage_stb_ds.cc
+++ b/src/metadata_storage_stb_ds.cc
@@ -660,14 +660,9 @@ bool LocalDestroyBucket(SharedMemoryContext *context, RpcContext *rpc,
     }
     ReleaseIdsPtr(mdm);
 
-    // NOTE(chogan): Holding the mdm->bucket_mutex while destroying Blobs can
-    // result in deadlock if the BORG is in the middle of moving a Blob's
-    // Buffers.
-    // EndTicketMutex(&mdm->bucket_mutex);
     for (auto blob_id : blobs_to_destroy) {
       DestroyBlobById(context, rpc, blob_id, bucket_id);
     }
-    // BeginTicketMutex(&mdm->bucket_mutex);
 
     // Delete BlobId list
     FreeIdList(mdm, info->blobs);
diff --git a/test/mdm_test.cc b/test/mdm_test.cc
index 0b2444145..18bfce2ed 100644
--- a/test/mdm_test.cc
+++ b/test/mdm_test.cc
@@ -392,7 +392,6 @@ static void TestEffectiveTarget() {
   HermesPtr hermes = hermes::InitHermesDaemon(&config);
 
   hermes::RoundRobinState rr_state;
-  // size_t num_devices = rr_state.GetNumDevices();
   rr_state.SetCurrentDeviceIndex(0);
 
   std::string bucket_name(__func__);

From 747310a9fd13f18fde6e0173af94d104c5adc7e2 Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Wed, 24 Aug 2022 16:55:21 -0500
Subject: [PATCH 84/85] Disable low_buf adapter test for now

---
 adapter/test/stdio/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/adapter/test/stdio/CMakeLists.txt b/adapter/test/stdio/CMakeLists.txt
index b258faa56..bb03f38cb 100644
--- a/adapter/test/stdio/CMakeLists.txt
+++ b/adapter/test/stdio/CMakeLists.txt
@@ -55,10 +55,10 @@ add_executable(hermes_stdio_low_buf_adapter_test stdio_adapter_low_buffer_space_
 target_link_libraries(hermes_stdio_low_buf_adapter_test hermes_stdio)
 add_dependencies(hermes_stdio_low_buf_adapter_test hermes_stdio hermes_daemon)
 set_target_properties(hermes_stdio_low_buf_adapter_test PROPERTIES COMPILE_FLAGS "-DHERMES_INTERCEPT=1")
-gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small "")
 # TODO: The DPE doesn't respect available buffering space. In this test, it
 # gives out over 1 MiB of RAM even though the RAM tier only has ~756 KiB
 # available. See issue #439.
+# gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small "")
 # gcc_hermes(hermes_stdio_low_buf_adapter_test "" "" hermes_small async)
 
 add_executable(hermes_stdio_adapter_mode_test stdio_adapter_mode_test.cpp ${ADAPTER_COMMON})

From 5f3e95179ae471c09d58a83cea71146dd021855c Mon Sep 17 00:00:00 2001
From: Chris Hogan
Date: Thu, 25 Aug 2022 07:50:32 -0500
Subject: [PATCH 85/85] Fix Codacy warning

---
 src/api/traits.cc | 3 ++-
 src/api/traits.h  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/api/traits.cc b/src/api/traits.cc
index 5626bfa68..e607c130b 100644
--- a/src/api/traits.cc
+++ b/src/api/traits.cc
@@ -19,7 +19,8 @@
 namespace hermes {
 namespace api {
 
-Trait::Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type)
+Trait::Trait(TraitID id, const std::vector<TraitID> &conflict_traits,
+             TraitType type)
     : id(id),
       conflict_traits(conflict_traits),
       type(type),
diff --git a/src/api/traits.h b/src/api/traits.h
index 2dcff3b24..5006a1179 100644
--- a/src/api/traits.h
+++ b/src/api/traits.h
@@ -61,7 +61,8 @@ struct Trait {
   OnLinkCallback onGetFn;
 
   Trait() {}
-  Trait(TraitID id, std::vector<TraitID> conflict_traits, TraitType type);
+  Trait(TraitID id, const std::vector<TraitID> &conflict_traits,
+        TraitType type);
 };
 
 /** (File) Persistence trait */
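A closing note on the Codacy fix in this last patch: passing std::vector by
value copies the container at every call, which is what the warning flags.
The const reference binds directly to the caller's vector, and the one
remaining copy happens deliberately in the member initializer. Illustration
only (not project code):

    #include <vector>

    void ByValue(std::vector<int> v);            // copies on every call
    void ByConstRef(const std::vector<int> &v);  // no copy at the call site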