Skip to content

Commit

Permalink
Add faster hex string to u64 conversion and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristopherHogan committed Mar 10, 2021
1 parent 2652ba9 commit 61ff438
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 16 deletions.
6 changes: 6 additions & 0 deletions src/api/hermes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,12 @@ namespace api {

std::shared_ptr<Hermes> InitHermes(const char *config_file, bool is_daemon,
bool is_adapter) {
u16 endian_test = 0x1;
char *endian_ptr = (char *)&endian_test;
if (endian_ptr[0] != 1) {
LOG(FATAL) << "Big endian machines not supported yet." << std::endl;
}

hermes::Config config = {};
const size_t kConfigMemorySize = KILOBYTES(16);
hermes::u8 config_memory[kConfigMemorySize];
Expand Down
3 changes: 2 additions & 1 deletion src/hermes_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ union BucketID {
// NOTE(chogan): We reserve sizeof(BucketID) * 2 bytes in order to embed the
// BucketID into the Blob name. See MakeInternalBlobName() for a description of
// why we need double the bytes of a BucketID.
constexpr int kBucketIdStringSize = sizeof(BucketID) * 2;
// What is left of the 64 byte name budget once the BucketID prefix has been
// reserved. Defined exactly once, in terms of kBucketIdStringSize, so the two
// constants cannot drift apart.
constexpr int kMaxBlobNameSize = 64 - kBucketIdStringSize;

union VBucketID {
struct {
Expand Down
50 changes: 37 additions & 13 deletions src/metadata_management.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,19 +151,17 @@ VBucketID GetVBucketId(SharedMemoryContext *context, RpcContext *rpc,
}

std::string MakeInternalBlobName(const std::string &name, BucketID id) {
static_assert(sizeof(BucketID) <= sizeof(unsigned long long));

unsigned long long id_as_uint = id.as_int;
std::stringstream ss;

// NOTE(chogan): Store the bytes of the blob_id at the beginning of the name.
// We can't just stick the raw bytes in there because the Blob name will
// eventually treated as a C string, which means a null byte will be treated
// as a null terminator. Instead, we store the string representation of each
// byte, which means we need two bytes to represent one byte.
for (int i = sizeof(unsigned long long) - 1; i >= 0 ; --i) {
// TODO(chogan): May require an endian swap
u8 *byte = (u8 *)&id_as_uint + i;
// NOTE(chogan): Store the bytes of \p id at the beginning of the name. We
// can't just stick the raw bytes in there because the Blob name will
// eventually be treated as a C string, which means a null byte will be
// treated as a null terminator. Instead, we store the string representation
// of each byte in hex, which means we need two bytes to represent one byte.
for (int i = sizeof(BucketID) - 1; i >= 0 ; --i) {
// TODO(chogan): @portability Need to perform this loop in reverse on a
// big-endian platform
u8 *byte = ((u8 *)&id.as_int) + i;
ss << std::hex << std::setw(2) << std::setfill('0') << (int)(*byte);
}
ss << name;
Expand Down Expand Up @@ -267,13 +265,39 @@ std::string GetBlobNameFromId(SharedMemoryContext *context, RpcContext *rpc,
return result;
}

// NOTE(chogan): Lookup table for HexStringToU64(). It has one entry per
// possible byte value (256 in total), laid out 16 entries per row. The entry
// for a hex digit character ('0'-'9', 'A'-'F', 'a'-'f') is that digit's 4-bit
// value; every other entry is 0.
static const u64 hextable[] = {
  // 0x00 - 0x0f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x30 - 0x3f ('0'..'9' occupy 0x30..0x39)
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0,
  // 0x40 - 0x4f ('A'..'F' occupy 0x41..0x46)
  0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x50 - 0x5f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x60 - 0x6f ('a'..'f' occupy 0x61..0x66)
  0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x70 - 0x7f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x80 - 0x8f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x90 - 0x9f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xa0 - 0xaf
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xb0 - 0xbf
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xc0 - 0xcf
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xd0 - 0xdf
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xe0 - 0xef
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xf0 - 0xff
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/**
 * Convert the first kBucketIdStringSize characters of \p s, interpreted as hex
 * digits (most significant digit first), into a u64.
 *
 * No validation is performed: the caller must pass a string that holds at
 * least kBucketIdStringSize characters. Characters that are not hex digits
 * contribute a zero nibble because that is what hextable stores for them.
 *
 * \param s The string whose leading hex digits are converted.
 *
 * \return The accumulated value of the leading kBucketIdStringSize digits.
 */
u64 HexStringToU64(const std::string &s) {
  u64 result = 0;
  for (size_t i = 0; i < static_cast<size_t>(kBucketIdStringSize); ++i) {
    // NOTE: Index through unsigned char. Where plain char is signed, a byte
    // >= 0x80 would otherwise become a negative index and read memory in
    // front of hextable.
    const unsigned char digit = static_cast<unsigned char>(s[i]);
    result = (result << 4) | hextable[digit];
  }

  return result;
}

/**
 * Recover the BucketID that is embedded as a hex prefix in the internal name
 * stored for Blob \p id.
 *
 * The internal name is fetched with ReverseGetFromStorage() and its first
 * kBucketIdStringSize characters are decoded with HexStringToU64().
 *
 * NOTE: std::stoull must not be used on the internal name. It keeps consuming
 * characters for as long as they are valid hex digits, so a user-visible name
 * that starts with [0-9a-f] (e.g. "blob1") would be folded into the id, and it
 * throws on an empty or overflowing string.
 *
 * \param context The shared memory context used to reach the MetadataManager.
 * \param id The BlobID whose owning bucket is wanted.
 *
 * \return The decoded BucketID, or a zero-initialized BucketID when the stored
 * name is not longer than kBucketIdStringSize (presumably because no name was
 * found for \p id).
 */
BucketID LocalGetBucketIdFromBlobId(SharedMemoryContext *context, BlobID id) {
  MetadataManager *mdm = GetMetadataManagerFromContext(context);
  std::string internal_name = ReverseGetFromStorage(mdm, id.as_int,
                                                    kMapType_Blob);
  BucketID result = {};
  if (internal_name.size() > static_cast<size_t>(kBucketIdStringSize)) {
    result.as_int = HexStringToU64(internal_name);
  }

  return result;
}
Expand Down
21 changes: 21 additions & 0 deletions src/metadata_management_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,5 +103,26 @@ BucketID LocalGetBucketIdFromBlobId(SharedMemoryContext *context, BlobID id);
std::string LocalGetBlobNameFromId(SharedMemoryContext *context,
BlobID blob_id);

/**
 * Faster version of std::stoull for fixed-width hex strings.
 *
 * This is 4.1x faster than std::stoull. Since we generate all the numbers that
 * we use this function on, we can guarantee the following:
 *   - The size will always be at least kBucketIdStringSize.
 *   - The number will always be unsigned and within the range of a u64.
 *   - There will never be invalid characters passed in (only 0-9 and a-f).
 *
 * Avoiding all this input sanitization and error checking is how we can get a
 * 4.1x speedup. The flip side is that nothing is validated: it is the caller's
 * responsibility to uphold the guarantees above. Unlike std::stoull, only the
 * first kBucketIdStringSize characters are read, so any text that follows the
 * hex prefix is ignored.
 *
 * \param s A string with size at least kBucketIdStringSize, where the first
 * kBucketIdStringSize characters consist only of 0-9 and a-f.
 *
 * \return The u64 representation of the first kBucketIdStringSize characters of
 * \p s.
 */
u64 HexStringToU64(const std::string &s);
/**
 * Build the internal name of a Blob by prefixing \p name with the hex string
 * form of \p id.
 *
 * Each byte of the BucketID is written as two hex characters, so the prefix is
 * sizeof(BucketID) * 2 characters long.
 *
 * \param name The user-visible Blob name.
 * \param id The BucketID to embed in front of \p name.
 *
 * \return Presumably the hex prefix followed by \p name; confirm against the
 * definition.
 */
std::string MakeInternalBlobName(const std::string &name, BucketID id);

} // namespace hermes
#endif // HERMES_METADATA_MANAGEMENT_INTERNAL_H_
59 changes: 57 additions & 2 deletions test/mdm_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ static void TestMaxNameLength(HermesPtr hermes) {
bucket.Destroy(ctx);
}

void TestGetRelativeNodeId() {
static void TestGetRelativeNodeId() {
RpcContext rpc = {};
rpc.num_nodes = 10;
rpc.node_id = 1;
Expand All @@ -215,7 +215,7 @@ void TestGetRelativeNodeId() {
Assert(GetPreviousNode(&rpc) == 9);
}

void TestDuplicateBlobNames(HermesPtr hermes) {
static void TestDuplicateBlobNames(HermesPtr hermes) {
hapi::Context ctx;
const size_t blob_size = 8;
hapi::Bucket b1("b1", hermes, ctx);
Expand All @@ -225,6 +225,8 @@ void TestDuplicateBlobNames(HermesPtr hermes) {
hapi::Blob blob2(blob_size, 'z');

Assert(b1.Put(blob_name, blob1, ctx).Succeeded());
Assert(!b2.ContainsBlob(blob_name));

Assert(b2.Put(blob_name, blob2, ctx).Succeeded());

Assert(b1.ContainsBlob(blob_name));
Expand All @@ -240,6 +242,56 @@ void TestDuplicateBlobNames(HermesPtr hermes) {
Assert(b2.Destroy(ctx).Succeeded());
}

// Round trip: put a Blob into a Bucket, look up the Blob's id, and check that
// the BucketID recovered from that BlobID matches the Bucket's own id.
static void TestGetBucketIdFromBlobId(HermesPtr hermes) {
  hapi::Context ctx;
  const size_t kPayloadBytes = 8;
  hapi::Bucket bucket("b1", hermes, ctx);
  std::string name("blob1");
  hapi::Blob payload(kPayloadBytes, 'x');
  Assert(bucket.Put(name, payload, ctx).Succeeded());

  // The id the Blob was stored under.
  BucketID expected_id = {};
  expected_id.as_int = bucket.GetId();
  BlobID stored_blob_id = hermes::GetBlobId(&hermes->context_, &hermes->rpc_,
                                            name, expected_id);

  // The id decoded back out of the Blob's metadata.
  BucketID recovered_id = hermes::GetBucketIdFromBlobId(&hermes->context_,
                                                        &hermes->rpc_,
                                                        stored_blob_id);

  Assert(recovered_id.as_int == bucket.GetId());
  Assert(bucket.Destroy(ctx).Succeeded());
}

static void TestHexStringToU64() {
std::string zero1("0");
std::string zero2 = zero1 + zero1;
std::string zero4 = zero2 + zero2;
std::string zero8 = zero4 + zero4;

std::string one_str = zero8 + zero4 + zero2 + "01";
std::string ff_str = zero8 + zero4 + zero2 + "ff";
std::string all_f_str("ffffffffffffffff");
std::string bucket_id_str = zero4 + zero2 + "01" + zero4 + zero2 + "0e";
std::string count_str("123456789abcdef0");

u64 one = 0x1ULL;
u64 ff = 0xffULL;
u64 bucket_id = 0x10000000eULL;
u64 all_f = 0xffffffffffffffffULL;
u64 count = 0x123456789abcdef0ULL;

BucketID id = {};
id.as_int = 1311768467463790320;
std::string blob_name = MakeInternalBlobName(std::string("my_blob"), id);

Assert(HexStringToU64(one_str) == one);
Assert(HexStringToU64(ff_str) == ff);
Assert(HexStringToU64(bucket_id_str) == bucket_id);
Assert(HexStringToU64(all_f_str) == all_f);
Assert(HexStringToU64(count_str) == count);
Assert(HexStringToU64(blob_name) == count);
}

int main(int argc, char **argv) {
int mpi_threads_provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided);
Expand All @@ -259,6 +311,9 @@ int main(int argc, char **argv) {
TestBucketRefCounting(hermes);
TestMaxNameLength(hermes);
TestGetRelativeNodeId();
TestDuplicateBlobNames(hermes);
TestGetBucketIdFromBlobId(hermes);
TestHexStringToU64();

hermes->Finalize(true);

Expand Down

0 comments on commit 61ff438

Please sign in to comment.