Add faster hex string to u64 conversion and tests

HDFGroup · Mar 10, 2021 · 61ff438 · 61ff438
1 parent 2652ba9
commit 61ff438
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 16 deletions.
diff --git a/src/api/hermes.cc b/src/api/hermes.cc
@@ -311,6 +311,12 @@ namespace api {
 
 std::shared_ptr<Hermes> InitHermes(const char *config_file, bool is_daemon,
                                    bool is_adapter) {
+  u16 endian_test = 0x1;
+  char *endian_ptr = (char *)&endian_test;
+  if (endian_ptr[0] != 1) {
+    LOG(FATAL) << "Big endian machines not supported yet." << std::endl;
+  }
+
   hermes::Config config = {};
   const size_t kConfigMemorySize = KILOBYTES(16);
   hermes::u8 config_memory[kConfigMemorySize];

diff --git a/src/hermes_types.h b/src/hermes_types.h
@@ -179,7 +179,8 @@ union BucketID {
 // NOTE(chogan): We reserve sizeof(BucketID) * 2 bytes in order to embed the
 // BucketID into the Blob name. See MakeInternalBlobName() for a description of
 // why we need double the bytes of a BucketID.
-constexpr int kMaxBlobNameSize = 64 - (sizeof(BucketID) * 2);
+constexpr int kBucketIdStringSize = sizeof(BucketID) * 2;
+constexpr int kMaxBlobNameSize = 64 - kBucketIdStringSize;
 
 union VBucketID {
   struct {

diff --git a/src/metadata_management.cc b/src/metadata_management.cc
@@ -151,19 +151,17 @@ VBucketID GetVBucketId(SharedMemoryContext *context, RpcContext *rpc,
 }
 
 std::string MakeInternalBlobName(const std::string &name, BucketID id) {
-  static_assert(sizeof(BucketID) <= sizeof(unsigned long long));
-
-  unsigned long long id_as_uint = id.as_int;
   std::stringstream ss;
 
-  // NOTE(chogan): Store the bytes of the blob_id at the beginning of the name.
-  // We can't just stick the raw bytes in there because the Blob name will
-  // eventually treated as a C string, which means a null byte will be treated
-  // as a null terminator. Instead, we store the string representation of each
-  // byte, which means we need two bytes to represent one byte.
-  for (int i = sizeof(unsigned long long) - 1; i >= 0 ; --i) {
-    // TODO(chogan): May require an endian swap
-    u8 *byte = (u8 *)&id_as_uint + i;
+  // NOTE(chogan): Store the bytes of \p id at the beginning of the name. We
+  // can't just stick the raw bytes in there because the Blob name will
+  // eventually be treated as a C string, which means a null byte will be
+  // treated as a null terminator. Instead, we store the string representation
+  // of each byte in hex, which means we need two bytes to represent one byte.
+  for (int i = sizeof(BucketID) - 1; i >= 0 ; --i) {
+    // TODO(chogan): @portability Need to perform this loop in reverse on a
+    // big-endian platform
+    u8 *byte = ((u8 *)&id.as_int) + i;
     ss << std::hex << std::setw(2) << std::setfill('0') << (int)(*byte);
   }
   ss << name;
@@ -267,13 +265,39 @@ std::string GetBlobNameFromId(SharedMemoryContext *context, RpcContext *rpc,
   return result;
 }
 
+// NOTE(chogan): Lookup table for HexStringToU64()
+static const u64 hextable[] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12,
+  13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0
+};
+
+u64 HexStringToU64(const std::string &s) {
+  u64 result = 0;
+  for (size_t i = 0; i < kBucketIdStringSize; ++i) {
+    result = (result << 4) | hextable[(int)s[i]];
+  }
+
+  return result;
+}
+
 BucketID LocalGetBucketIdFromBlobId(SharedMemoryContext *context, BlobID id) {
   MetadataManager *mdm = GetMetadataManagerFromContext(context);
   std::string internal_name = ReverseGetFromStorage(mdm, id.as_int,
                                                     kMapType_Blob);
   BucketID result = {};
-  int base = 16;
-  result.as_int = (u64)std::stoull(internal_name, nullptr, base);
+  if (internal_name.size() > kBucketIdStringSize) {
+    result.as_int = HexStringToU64(internal_name);
+  }
 
   return result;
 }

diff --git a/src/metadata_management_internal.h b/src/metadata_management_internal.h
@@ -103,5 +103,26 @@ BucketID LocalGetBucketIdFromBlobId(SharedMemoryContext *context, BlobID id);
 std::string LocalGetBlobNameFromId(SharedMemoryContext *context,
                                    BlobID blob_id);
 
+/**
+ * Faster version of std::stoull.
+ *
+ * This is 4.1x faster than std::stoull. Since we generate all the numbers that
+ * we use this function on, we can guarantee the following:
+ *   - The size will always be kBucketIdStringSize.
+ *   - The number will always be unsigned and within the range of a u64.
+ *   - There will never be invalid characters passed in (only 0-9 and a-f).
+ *
+ * Avoiding all this input sanitization and error checking is how we can get a
+ * 4.1x speedup.
+ *
+ * \param s A string with size at least kBucketIdStringSize, where the first
+ *          kBucketIdStringSize characters consist only of 0-9 and a-f.
+ *
+ * \return The u64 representation of the first kBucketIdStringSize characters of
+ *         \p s.
+ */
+u64 HexStringToU64(const std::string &s);
+std::string MakeInternalBlobName(const std::string &name, BucketID id);
+
 }  // namespace hermes
 #endif  // HERMES_METADATA_MANAGEMENT_INTERNAL_H_
diff --git a/test/mdm_test.cc b/test/mdm_test.cc
@@ -202,7 +202,7 @@ static void TestMaxNameLength(HermesPtr hermes) {
   bucket.Destroy(ctx);
 }
 
-void TestGetRelativeNodeId() {
+static void TestGetRelativeNodeId() {
   RpcContext rpc = {};
   rpc.num_nodes = 10;
   rpc.node_id = 1;
@@ -215,7 +215,7 @@ void TestGetRelativeNodeId() {
   Assert(GetPreviousNode(&rpc) == 9);
 }
 
-void TestDuplicateBlobNames(HermesPtr hermes) {
+static void TestDuplicateBlobNames(HermesPtr hermes) {
   hapi::Context ctx;
   const size_t blob_size = 8;
   hapi::Bucket b1("b1", hermes, ctx);
@@ -225,6 +225,8 @@ void TestDuplicateBlobNames(HermesPtr hermes) {
   hapi::Blob blob2(blob_size, 'z');
 
   Assert(b1.Put(blob_name, blob1, ctx).Succeeded());
+  Assert(!b2.ContainsBlob(blob_name));
+
   Assert(b2.Put(blob_name, blob2, ctx).Succeeded());
 
   Assert(b1.ContainsBlob(blob_name));
@@ -240,6 +242,56 @@ void TestDuplicateBlobNames(HermesPtr hermes) {
   Assert(b2.Destroy(ctx).Succeeded());
 }
 
+static void TestGetBucketIdFromBlobId(HermesPtr hermes) {
+  hapi::Context ctx;
+  const size_t blob_size = 8;
+  hapi::Bucket b1("b1", hermes, ctx);
+  std::string blob_name("blob1");
+  hapi::Blob blob1(blob_size, 'x');
+  Assert(b1.Put(blob_name, blob1, ctx).Succeeded());
+
+  BucketID b1_id = {};
+  b1_id.as_int = b1.GetId();
+  BlobID blob_id =
+    hermes::GetBlobId(&hermes->context_, &hermes->rpc_, blob_name, b1_id);
+
+  BucketID bucket_id =
+    hermes::GetBucketIdFromBlobId(&hermes->context_, &hermes->rpc_, blob_id);
+
+  Assert(bucket_id.as_int == b1.GetId());
+  Assert(b1.Destroy(ctx).Succeeded());
+}
+
+static void TestHexStringToU64() {
+  std::string zero1("0");
+  std::string zero2 = zero1 + zero1;
+  std::string zero4 = zero2 + zero2;
+  std::string zero8 = zero4 + zero4;
+
+  std::string one_str = zero8 + zero4 + zero2 + "01";
+  std::string ff_str = zero8 + zero4 + zero2 + "ff";
+  std::string all_f_str("ffffffffffffffff");
+  std::string bucket_id_str = zero4 + zero2 + "01" + zero4 + zero2 + "0e";
+  std::string count_str("123456789abcdef0");
+
+  u64 one = 0x1ULL;
+  u64 ff = 0xffULL;
+  u64 bucket_id = 0x10000000eULL;
+  u64 all_f = 0xffffffffffffffffULL;
+  u64 count = 0x123456789abcdef0ULL;
+
+  BucketID id = {};
+  id.as_int = 1311768467463790320;
+  std::string blob_name = MakeInternalBlobName(std::string("my_blob"), id);
+
+  Assert(HexStringToU64(one_str) == one);
+  Assert(HexStringToU64(ff_str) == ff);
+  Assert(HexStringToU64(bucket_id_str) == bucket_id);
+  Assert(HexStringToU64(all_f_str) == all_f);
+  Assert(HexStringToU64(count_str) == count);
+  Assert(HexStringToU64(blob_name) == count);
+}
+
 int main(int argc, char **argv) {
   int mpi_threads_provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_threads_provided);
@@ -259,6 +311,9 @@ int main(int argc, char **argv) {
   TestBucketRefCounting(hermes);
   TestMaxNameLength(hermes);
   TestGetRelativeNodeId();
+  TestDuplicateBlobNames(hermes);
+  TestGetBucketIdFromBlobId(hermes);
+  TestHexStringToU64();
 
   hermes->Finalize(true);