Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes and improvements related to Hermes VPIC proxy #207

Merged
merged 41 commits into from
Jun 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
8659b52
Add initial Hermes version of vpic
ChristopherHogan Jun 2, 2020
f3c6b8f
VPIC bench arguments
ChristopherHogan Apr 23, 2021
c11a021
VPIC posix ready for Ares
ChristopherHogan Apr 27, 2021
5bee977
Add num nodes as input
ChristopherHogan Apr 28, 2021
6ec9983
Add direct io option
ChristopherHogan Apr 29, 2021
3c88b5f
Aligned writes for direct I/O
ChristopherHogan Apr 29, 2021
0b2a769
Use global aligned_alloc because Ares has older gcc
ChristopherHogan Apr 29, 2021
77d3b21
Add flush/sync flag
ChristopherHogan Apr 29, 2021
608e25c
Include fsync in timing
ChristopherHogan Apr 29, 2021
2ce66b8
Include sync flag in output
ChristopherHogan Apr 29, 2021
a4e619d
Update Hermes portion of VPIC
ChristopherHogan Apr 29, 2021
25b3492
Running Hermes vpic with all DPE policies
ChristopherHogan Apr 30, 2021
17e6f8b
Add sync point
ChristopherHogan Apr 30, 2021
8987ad3
More sync
ChristopherHogan Apr 30, 2021
72e0cf7
Add TODOs for bug investigation
ChristopherHogan Apr 30, 2021
155dd11
Add option to pass config path
ChristopherHogan May 4, 2021
cf5b569
Sleep between puts
ChristopherHogan May 4, 2021
8087d27
Calculate RAM buffering capacity based on headers
ChristopherHogan May 6, 2021
4f20f9d
Allow 0 RAM buffers and don't consider Targets that have no capacity
ChristopherHogan May 6, 2021
4a9c826
Fixed round robin DPE for case when we have no RAM buffers
ChristopherHogan May 7, 2021
08ec453
gethostbyname error handling
ChristopherHogan May 7, 2021
cd746c5
Use threadsafe version of gethostbyname
ChristopherHogan May 7, 2021
d7da627
Debugging
ChristopherHogan May 18, 2021
8657655
Fix finalization for daemon/client case in adapter
ChristopherHogan May 18, 2021
bda830d
Silence linter
ChristopherHogan May 18, 2021
9af59bf
Update vpic to new spec
ChristopherHogan May 20, 2021
3d0a0d5
Tweaks for webinar
ChristopherHogan May 20, 2021
9fc825b
posix_fallocate is too slow on OrangeFS
ChristopherHogan May 21, 2021
c37b163
Add is_shared_device option to config
ChristopherHogan May 21, 2021
e75b13d
Properly initialize files on shared devices
ChristopherHogan May 21, 2021
f41bf17
Try multiple puts before resorting to swap
ChristopherHogan May 21, 2021
d0f1d72
Never go to swap
ChristopherHogan May 21, 2021
c6d553e
Vpic working up to 512 MiB
ChristopherHogan May 25, 2021
bf83bfe
Set option defaults, add option for smaller io size, time open and close
ChristopherHogan May 26, 2021
18b3c89
Clean up for PR
ChristopherHogan May 28, 2021
b7b49a9
Fix logging typo
ChristopherHogan Jun 2, 2021
8af0cfe
Revert noisy formatting changes
ChristopherHogan Jun 18, 2021
b7cb4cf
Fix syntax error
ChristopherHogan Jun 18, 2021
a42add7
Protect round robin current device and rename its wrapper class
ChristopherHogan Jun 18, 2021
1df0098
Fix MPI-IO adapter init and finalize
ChristopherHogan Jun 22, 2021
e54bc29
Documentation clarification
ChristopherHogan Jun 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions adapter/src/hermes/adapter/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,13 @@ const char kPathDelimiter = ',';
const char* kAdapterDefaultMode = "DEFAULT";
const char* kAdapterBypassMode = "BYPASS";
const char* kAdapterScratchMode = "SCRATCH";

/**
* Name of the environment variable that controls whether the adapter client
* stops the Hermes daemon when it finishes.
*
* If the \c HERMES_STOP_DAEMON environment variable is unset or has a non-zero
* value, the adapter client will kill the running Hermes daemon when it
* finishes execution.
*
* Default value: \c 1
*
* NOTE(review): how the value is parsed is up to each adapter that reads this
* variable (e.g. whether only the leading character is compared against '0');
* confirm against the adapter's finalization code.
*/
const char* kStopDaemon = "HERMES_STOP_DAEMON";
#endif // HERMES_ADAPTER_CONSTANTS_H
12 changes: 3 additions & 9 deletions adapter/src/hermes/adapter/mpiio/metadata_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class MetadataManager {
char* hermes_config = getenv(kHermesConf);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
// TODO(chogan): Need a better way to distinguish between client and
// daemon. https://github.com/HDFGroup/hermes/issues/206
if (comm_size > 1) {
hermes = hermes::InitHermesClient(hermes_config);
} else {
Expand All @@ -98,15 +100,7 @@ class MetadataManager {
*/
void FinalizeHermes() {
if (ref == 1) {
if (this->comm_size > 1) {
MPI_Barrier(MPI_COMM_WORLD);
if (this->rank == 0) {
hermes->RemoteFinalize();
}
hermes->Finalize();
} else {
hermes->Finalize(true);
}
hermes->FinalizeClient();
}
ref--;
}
Expand Down
2 changes: 0 additions & 2 deletions adapter/src/hermes/adapter/mpiio/mpiio.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ int simple_open(MPI_Comm &comm, const char *path, int &amode, MPI_Info &info,
}
stat.info = info;
stat.comm = comm;
mdm->InitializeHermes();
hapi::Context ctx;
stat.st_bkid = std::make_shared<hapi::Bucket>(path, mdm->GetHermes(), ctx);
mdm->Create(fh, stat);
Expand Down Expand Up @@ -612,7 +611,6 @@ int HERMES_DECL(MPI_File_close)(MPI_File *fh) {
INTERCEPTOR_LIST->hermes_flush_exclusion.erase(filename);
}
existing.first.st_bkid->Destroy(ctx);
mdm->FinalizeHermes();
if (existing.first.a_mode & MPI_MODE_DELETE_ON_CLOSE) {
fs::remove(filename);
}
Expand Down
15 changes: 12 additions & 3 deletions adapter/src/hermes/adapter/stdio/metadata_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ class MetadataManager {
if (this->is_mpi) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
// TODO(chogan): Need a better way to distinguish between client and
// daemon. https://github.com/HDFGroup/hermes/issues/206
if (comm_size > 1) {
hermes = hermes::InitHermesClient(hermes_config);
} else {
Expand All @@ -108,10 +110,17 @@ class MetadataManager {
if (ref == 1) {
if (this->is_mpi) {
MPI_Barrier(MPI_COMM_WORLD);
if (this->rank == 0) {
hermes->RemoteFinalize();
char *stop_daemon = getenv(kStopDaemon);
bool shutdown_daemon = true;

if (stop_daemon && stop_daemon[0] == '0') {
HERMES_NOT_IMPLEMENTED_YET;
// TODO(chogan): The Hermes core needs a few tweaks before it can
// support this feature. https://github.com/HDFGroup/hermes/issues/181
shutdown_daemon = false;
}
hermes->Finalize();

hermes->FinalizeClient(shutdown_daemon);
} else {
hermes->Finalize(true);
}
Expand Down
2 changes: 1 addition & 1 deletion adapter/src/hermes/adapter/stdio/stdio.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ size_t write_internal(std::pair<AdapterStat, bool> &existing, const void *ptr,
FileStruct(mdm->Convert(fp), existing.first.st_ptr, total_size));
size_t data_offset = 0;
auto filename = existing.first.st_bkid->GetName();
LOG(INFO) << "Mapping for read has " << mapping.size() << " mapping."
LOG(INFO) << "Mapping for write has " << mapping.size() << " mapping."
<< std::endl;
for (const auto &item : mapping) {
hapi::Context ctx;
Expand Down
2 changes: 1 addition & 1 deletion adapter/test/hermes_daemon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ int main(int argc, char* argv[]) {
hermes_config = getenv(kHermesConf);
}
auto hermes = hermes::InitHermesDaemon(hermes_config);
hermes->Finalize();
hermes->RunDaemon();
MPI_Finalize();
}
19 changes: 9 additions & 10 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
include_directories(
${PROJECT_SOURCE_DIR}/src/api
${PROJECT_SOURCE_DIR}/test
)

add_executable(mdm_bench mdm_bench.cc)
target_link_libraries(mdm_bench hermes MPI::MPI_CXX
$<$<BOOL:${HERMES_RPC_THALLIUM}>:thallium>)
target_compile_definitions(mdm_bench
PRIVATE $<$<BOOL:${HERMES_RPC_THALLIUM}>:HERMES_RPC_THALLIUM>)
set(BENCHMARKS mdm_bench dpe_bench)

add_executable(dpe_bench dpe_bench.cc)
target_link_libraries(dpe_bench hermes MPI::MPI_CXX
$<$<BOOL:${HERMES_RPC_THALLIUM}>:thallium>)
target_compile_definitions(dpe_bench
PRIVATE $<$<BOOL:${HERMES_RPC_THALLIUM}>:HERMES_RPC_THALLIUM>)
foreach(benchmark ${BENCHMARKS})
add_executable(${benchmark} ${benchmark}.cc)
target_link_libraries(${benchmark} hermes MPI::MPI_CXX
$<$<BOOL:${HERMES_RPC_THALLIUM}>:thallium>)
target_compile_definitions(${benchmark}
PRIVATE $<$<BOOL:${HERMES_RPC_THALLIUM}>:HERMES_RPC_THALLIUM>)
endforeach()
4 changes: 4 additions & 0 deletions src/api/bucket.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,10 @@ Status Bucket::PlaceBlobs(std::vector<PlacementSchema> &schemas,
<< "'" << std::endl;
result = PlaceBlob(&hermes_->context_, &hermes_->rpc_, schema, blob,
names[i], id_, ctx);
if (result.Failed()) {
// TODO(chogan): Need to return a std::vector<Status>
break;
}
}

return result;
Expand Down
21 changes: 20 additions & 1 deletion src/api/hermes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

namespace hermes {

// Out-of-class definition of the static data member RoundRobinState::devices_
// (a vector of DeviceID). It is default-constructed, i.e. empty, here.
std::vector<DeviceID> RoundRobinState::devices_;

namespace api {

int Context::default_buffer_organizer_retries;
Expand Down Expand Up @@ -112,10 +114,19 @@ void Hermes::Finalize(bool force_rpc_shutdown) {
is_initialized = false;
}

/**
 * Forwards to the free function hermes::FinalizeClient(), passing the
 * addresses of this instance's \c context_, \c rpc_, \c comm_ and
 * \c trans_arena_ members.
 *
 * \param stop_daemon Passed through unchanged to hermes::FinalizeClient().
 * NOTE(review): presumably requests that the Hermes daemon be shut down when
 * true; confirm against the implementation of hermes::FinalizeClient().
 */
void Hermes::FinalizeClient(bool stop_daemon) {
hermes::FinalizeClient(&context_, &rpc_, &comm_, &trans_arena_, stop_daemon);
}

/**
 * Issues an RPC named "RemoteFinalize" with a \c void result, addressed to
 * the node id stored in \c rpc_.node_id.
 *
 * NOTE(review): the handler registered for "RemoteFinalize" is not visible
 * here; what it does on the receiving side should be confirmed there.
 */
void Hermes::RemoteFinalize() {
hermes::RpcCall<void>(&rpc_, rpc_.node_id, "RemoteFinalize");
}

/**
 * Forwards to the free function hermes::RunDaemon(), passing the addresses of
 * this instance's \c context_, \c rpc_, \c comm_ and \c trans_arena_ members
 * together with \c shmem_name_ as a C string.
 *
 * NOTE(review): whether this call blocks until the daemon is told to stop is
 * determined by hermes::RunDaemon(); confirm there.
 */
void Hermes::RunDaemon() {
hermes::RunDaemon(&context_, &rpc_, &comm_, &trans_arena_,
shmem_name_.c_str());
}

} // namespace api

ArenaInfo GetArenaInfo(Config *config) {
Expand Down Expand Up @@ -257,7 +268,8 @@ std::shared_ptr<api::Hermes> InitHermes(Config *config, bool is_daemon,
}
bool create_shared_files = (comm.proc_kind == ProcessKind::kHermes &&
comm.first_on_node);
InitFilesForBuffering(&context, create_shared_files);
InitFilesForBuffering(&context, create_shared_files, comm.node_id,
comm.first_on_node);

WorldBarrier(&comm);

Expand Down Expand Up @@ -301,6 +313,13 @@ std::shared_ptr<api::Hermes> InitHermes(Config *config, bool is_daemon,
config->num_buffer_organizer_retries;
api::Context::default_placement_policy = config->default_placement_policy;

RoundRobinState::devices_.reserve(config->num_devices);
for (DeviceID id = 0; id < config->num_devices; ++id) {
if (GetNumBuffersAvailable(&result->context_, id)) {
RoundRobinState::devices_.push_back(id);
}
}

InitRpcClients(&result->rpc_);

// NOTE(chogan): Can only initialize the neighborhood Targets once the RPC
Expand Down
2 changes: 2 additions & 0 deletions src/api/hermes.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ class Hermes {
int GetNumProcesses();
void *GetAppCommunicator();
void Finalize(bool force_rpc_shutdown = false);
void FinalizeClient(bool stop_daemon = true);
void RemoteFinalize();
void RunDaemon();

bool BucketContainsBlob(const std::string &bucket_name,
const std::string &blob_name);
Expand Down
Loading