Re-implement MVMPtrHashTable as a Robin Hood Hash.

Like the previous commits, this re-implements in C the design and optimisations of Martin Ankerl's C++ templated Robin Hood Hash. (Again, not all optimisations are implemented yet; this is the "Minimum Viable Product" edition.) It also renames struct MVMPtrHashHandle to struct MVMPtrHashEntry.
MoarVM · Jul 2, 2020 · 4e503a4 · 4e503a4
1 parent f352a5c
commit 4e503a4
Show file tree

Hide file tree

Showing 5 changed files with 336 additions and 347 deletions.
diff --git a/src/core/ptr_hash_table.c b/src/core/ptr_hash_table.c
@@ -1,104 +1,243 @@
 #include "moar.h"
 
+#define PTR_LOAD_FACTOR 0.75 /* max_items is official_size multiplied by this */
+#define PTR_INITIAL_SIZE 8 /* official_size of a freshly allocated table */
+#define PTR_INITIAL_KEY_RIGHT_SHIFT (8 * sizeof(uintptr_t) - 3) /* starting key_right_shift; presumably keeps the top 3 bits of the hash code - TODO confirm its width */
+
+/* Frees the entire contents of the hash, leaving you just the hashtable itself,
+   which you allocated (heap, stack, inside another struct, wherever).
+   The entries and metadata pointers are reset to NULL and cur_items to 0, so
+   the table reads as "never allocated" afterwards: calling this twice is
+   harmless, and a later MVM_ptr_hash_lvalue_fetch starts from a fresh
+   allocation instead of touching freed memory. */
 void MVM_ptr_hash_demolish(MVMThreadContext *tc, MVMPtrHashTable *hashtable) {
-    /* Never allocated? (or already demolished?) */
-    if (MVM_UNLIKELY(hashtable->log2_num_buckets == 0))
-        return;
-
-    struct MVMPtrHashBucket *bucket = hashtable->buckets;
-    const struct MVMPtrHashBucket *const bucket_end
-        = hashtable->buckets + hashtable->num_buckets;
-
-    do {
-        struct MVMPtrHashHandle *head = bucket->hh_head;
-        while (head) {
-            struct MVMPtrHashHandle *next = head->hh_next;
-            MVM_fixed_size_free(tc, tc->instance->fsa, sizeof(struct MVMPtrHashHandle), head);
-            head = next;
+    /* free(NULL) is a no-op, so a never-allocated table needs no special case. */
+    free(hashtable->entries);
+    free(hashtable->metadata);
+    /* The lookup and insert functions test entries == NULL to detect an
+       unallocated table, so don't leave dangling pointers behind. */
+    hashtable->entries = NULL;
+    hashtable->metadata = NULL;
+    hashtable->cur_items = 0;
+}
+/* and then free memory if you allocated it */
+
+
+/* Returns the number of entry slots allocated for the table's current
+   official_size: the official buckets plus the overflow area that follows
+   them, with the overflow capped at MVM_HASH_MAX_PROBE_DISTANCE slots.
+   The overflow is derived from official_size and the load factor rather than
+   read back from hashtable->max_items, because hash_insert_internal zeroes
+   max_items to force a grow. Reading that zero here would under-report the
+   allocation, and the rehash loop in MVM_ptr_hash_lvalue_fetch would skip
+   every entry stored at or beyond slot official_size - 1. */
+MVM_STATIC_INLINE MVMuint32 hash_true_size(MVMPtrHashTable *hashtable) {
+    /* Same expression hash_allocate_common uses to set max_items. */
+    MVMuint32 nominal_max_items = hashtable->official_size * PTR_LOAD_FACTOR;
+    MVMuint32 true_size = hashtable->official_size + nominal_max_items - 1;
+    if (hashtable->official_size + MVM_HASH_MAX_PROBE_DISTANCE < true_size) {
+        true_size = hashtable->official_size + MVM_HASH_MAX_PROBE_DISTANCE;
+    }
+    return true_size;
+}
+
+/* Allocates fresh entries and metadata arrays sized for the table's current
+   official_size, and recomputes max_items from the load factor.
+   The previous entries/metadata pointers are overwritten, not freed; a caller
+   that is growing an existing table must keep its own copies.
+   Aborts the process if either allocation fails. */
+MVM_STATIC_INLINE void hash_allocate_common(MVMPtrHashTable *hashtable) {
+    hashtable->max_items = hashtable->official_size * PTR_LOAD_FACTOR;
+    size_t actual_items = hash_true_size(hashtable);
+    hashtable->entries = malloc(sizeof(struct MVMPtrHashEntry) * actual_items);
+    /* Zero-filled metadata marks every slot as empty. The extra byte holds the
+       sentinel. */
+    hashtable->metadata = calloc(actual_items + 1, 1);
+    if (hashtable->entries == NULL || hashtable->metadata == NULL) {
+        /* Carrying on would write through a NULL-based pointer just below and
+           on every later insert. NOTE(review): swap in the project's own
+           out-of-memory panic helper here if one is preferred. */
+        abort();
+    }
+    /* A sentinel. This marks an occupied slot, at its ideal position. */
+    hashtable->metadata[actual_items] = 1;
+}
+
+/* First-time setup: gives the table its starting size and key shift, then
+   allocates its storage. */
+MVM_STATIC_INLINE void hash_initial_allocate(MVMPtrHashTable *hashtable) {
+    hashtable->official_size = PTR_INITIAL_SIZE;
+    hashtable->key_right_shift = PTR_INITIAL_KEY_RIGHT_SHIFT;
+    hash_allocate_common(hashtable);
+}
+
+/* Doubles the official size (using one more bit of the hash code to pick a
+   bucket) and allocates new, empty storage. The old entries and metadata
+   pointers are overwritten, so take your own copies of them before you call
+   this. */
+MVM_STATIC_INLINE void hash_grow(MVMPtrHashTable *hashtable) {
+    hashtable->official_size = hashtable->official_size * 2;
+    hashtable->key_right_shift = hashtable->key_right_shift - 1;
+    hash_allocate_common(hashtable);
+}
+/* Finds the slot for key, making room for it if it is not already present.
+   Each metadata byte is 0 for an empty slot, otherwise the occupant's probe
+   distance, where 1 means "in its ideal bucket".
+   Returns the existing entry if key is already stored. Otherwise returns a
+   newly opened slot whose key has been set to NULL; the caller must fill it
+   in (this function does not touch cur_items).
+   Calls MVM_oops if the table is already at or over max_items, so callers must
+   grow first. May set max_items to 0 to force a grow before the next insert. */
+MVM_STATIC_INLINE struct MVMPtrHashEntry *hash_insert_internal(MVMThreadContext *tc,
+                                                               MVMPtrHashTable *hashtable,
+                                                               const void *key) {
+    if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) {
+        MVM_oops(tc, "oops, attempt to recursively call grow when adding %p",
+                 key);
+    }
+
+    unsigned int probe_distance = 1; /* 1 == sitting in the ideal bucket */
+    MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
+    char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry);
+    MVMuint8 *metadata = hashtable->metadata + bucket;
+    while (1) {
+        if (*metadata < probe_distance) {
+            /* this is our slot. occupied or not, it is our rightful place. */
+
+            if (*metadata == 0) {
+                /* Open goal. Score! */
+            } else {
+                /* make room. */
+
+                /* Optimisation first seen in Martin Ankerl's implementation -
+                   we don't actually need to implement the "stealing" by
+                   swapping elements and carrying on with insert. The invariant
+                   of the hash is that probe distances are never out of order,
+                   and as all the following elements have probe distances in
+                   order, we can maintain the invariant just as well by moving
+                   everything along by one. */
+                MVMuint8 *find_me_a_gap = metadata;
+                MVMuint8 old_probe_distance = *metadata;
+                do {
+                    MVMuint8 new_probe_distance = 1 + old_probe_distance;
+                    if (new_probe_distance == MVM_HASH_MAX_PROBE_DISTANCE) {
+                        /* Optimisation from Martin Ankerl's implementation:
+                           setting this to zero forces a resize on any insert,
+                           *before* the actual insert, so that we never end up
+                           having to handle overflow *during* this loop. This
+                           loop can always complete. */
+                        hashtable->max_items = 0;
+                    }
+                    /* a swap: */
+                    old_probe_distance = *++find_me_a_gap;
+                    *find_me_a_gap = new_probe_distance;
+                } while (old_probe_distance);
+
+                MVMuint32 entries_to_move = find_me_a_gap - metadata; /* metadata is already shifted; now shift the entries to match */
+                memmove(entry_raw + sizeof(struct MVMPtrHashEntry), entry_raw,
+                        sizeof(struct MVMPtrHashEntry) * entries_to_move);
+            }
+
+            *metadata = probe_distance;
+            struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
+            entry->key = NULL; /* NULL key tells the caller this entry is new */
+            return entry;
+        }
+
+        if (*metadata == probe_distance) {
+            struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
+            if (entry->key == key) {
+                return entry;
+            }
+        }
+        ++probe_distance;
+        ++metadata;
+        entry_raw += sizeof(struct MVMPtrHashEntry);
+        assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE);
+        assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items);
+        assert(metadata < hashtable->metadata + hashtable->official_size + 256);
+    }
+}
+/* Returns the entry for key, creating one if the key is not yet stored.
+   Allocates the table on first use, and grows it (re-inserting every existing
+   entry into the new storage) once cur_items has reached max_items.
+   A newly created entry is returned with its key set to NULL and has already
+   been counted in cur_items; the caller is expected to fill in key and value,
+   as MVM_ptr_hash_insert does. */
+struct MVMPtrHashEntry *MVM_ptr_hash_lvalue_fetch(MVMThreadContext *tc,
+                                                  MVMPtrHashTable *hashtable,
+                                                  const void *key) {
+    if (MVM_UNLIKELY(hashtable->entries == NULL)) {
+        hash_initial_allocate(hashtable);
+    }
+    else if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) {
+        MVMuint32 true_size =  hash_true_size(hashtable); /* measured before growing: it is the old arrays we walk */
+        char *entry_raw_orig = hashtable->entries;
+        MVMuint8 *metadata_orig = hashtable->metadata;
+
+        hash_grow(hashtable); /* installs new, empty arrays; the old ones are ours to free below */
+
+        char *entry_raw = entry_raw_orig;
+        MVMuint8 *metadata = metadata_orig;
+        MVMHashNumItems bucket = 0;
+        while (bucket < true_size) {
+            if (*metadata) {
+                struct MVMPtrHashEntry *old_entry = (struct MVMPtrHashEntry *) entry_raw;
+                struct MVMPtrHashEntry *new_entry =
+                    hash_insert_internal(tc, hashtable, old_entry->key);
+                assert(new_entry->key == NULL); /* every re-inserted key must land in a fresh slot */
+                *new_entry = *old_entry;
+            }
+            ++bucket;
+            ++metadata;
+            entry_raw += sizeof(struct MVMPtrHashEntry);
         }
-    } while (++bucket < bucket_end);
-
-    MVM_fixed_size_free(tc, tc->instance->fsa,
-                        hashtable->num_buckets*sizeof(struct MVMPtrHashBucket),
-                        hashtable->buckets);
-    /* We shouldn't need either of these, but make something foolproof and they
-       invent a better fool: */
-    hashtable->buckets = NULL;
-    hashtable->log2_num_buckets = 0;
-    hashtable->num_items = 0;
+        free(entry_raw_orig);
+        free(metadata_orig);
+    }
+    struct MVMPtrHashEntry *new_entry
+        = hash_insert_internal(tc, hashtable, key);
+    if (!new_entry->key) {
+        ++hashtable->cur_items;
+    }
+    return new_entry;
 }
 
-/* Bucket expansion has the effect of doubling the number of buckets
- * and redistributing the items into the new buckets. Ideally the
- * items will distribute more or less evenly into the new buckets
- * (the extent to which this is true is a measure of the quality of
- * the hash function as it applies to the key domain).
- *
- * With the items distributed into more buckets, the chain length
- * (item count) in each bucket is reduced. Thus by expanding buckets
- * the hash keeps a bound on the chain length. This bounded chain
- * length is the essence of how a hash provides constant time lookup.
- *
- * The calculation of tbl->ideal_chain_maxlen below deserves some
- * explanation. First, keep in mind that we're calculating the ideal
- * maximum chain length based on the *new* (doubled) bucket count.
- * In fractions this is just n/b (n=number of items,b=new num buckets).
- * Since the ideal chain length is an integer, we want to calculate
- * ceil(n/b). We don't depend on floating point arithmetic in this
- * hash, so to calculate ceil(n/b) with integers we could write
- *
- *      ceil(n/b) = (n/b) + ((n%b)?1:0)
- *
- * and in fact a previous version of this hash did just that.
- * But now we have improved things a bit by recognizing that b is
- * always a power of two. We keep its base 2 log handy (call it lb),
- * so now we can write this with a bit shift and logical AND:
- *
- *      ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
- *
- */
-void MVM_ptr_hash_expand_buckets(MVMThreadContext *tc, MVMPtrHashTable *tbl) {
-    MVMHashBktNum he_bkt;
-    MVMHashBktNum he_bkt_i;
-    struct MVMPtrHashHandle *he_thh, *_he_hh_nxt;
-    struct MVMPtrHashBucket *he_new_buckets, *_he_newbkt;
-    MVMHashBktNum new_num_bkts = tbl->num_buckets * 2;
-    MVMHashUInt new_log2_num_buckets = tbl->log2_num_buckets + 1;
-    he_new_buckets =
-        MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa,
-                                    new_num_bkts * sizeof(struct MVMPtrHashBucket));
-    tbl->ideal_chain_maxlen =
-        (tbl->num_items >> new_log2_num_buckets) +
-        ((tbl->num_items & (new_num_bkts-1)) ? 1 : 0);
-    tbl->nonideal_items = 0;
-    /* Iterate the buckets */
-    for(he_bkt_i = 0; he_bkt_i < tbl->num_buckets; he_bkt_i++) {
-        he_thh = tbl->buckets[ he_bkt_i ].hh_head;
-        /* Iterate items in the bucket */
-        while (he_thh) {
-            _he_hh_nxt = he_thh->hh_next;
-            he_bkt = MVM_ptr_hash_bucket(he_thh->key, new_log2_num_buckets);
-            _he_newbkt = &(he_new_buckets[ he_bkt ]);
-            if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) {
-                tbl->nonideal_items++;
-                _he_newbkt->expand_mult = _he_newbkt->count /
-                                          tbl->ideal_chain_maxlen;
-           }
-           he_thh->hh_next = _he_newbkt->hh_head;
-           _he_newbkt->hh_head = he_thh;
-           he_thh = _he_hh_nxt;
+/* UNCONDITIONALLY creates a new hash entry with the given key and value.
+ * Doesn't check if the key already exists. Use with care.
+ * (Well, that's the official line. As the XXX below suggests, we currently
+ * don't exploit the documented freedom: if the key is already present with the
+ * same value this is a no-op, and if it is present with a different value we
+ * MVM_oops, reporting both values in hex.) */
+void MVM_ptr_hash_insert(MVMThreadContext *tc,
+                         MVMPtrHashTable *hashtable,
+                         const void *key,
+                         uintptr_t value) {
+    struct MVMPtrHashEntry *new_entry = MVM_ptr_hash_lvalue_fetch(tc, hashtable, key);
+    if (new_entry->key) {
+        if (value != new_entry->value) {
+            MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
+            /* definitely XXX - what should we do here? */
+            MVM_oops(tc, "insert conflict, %p is %u, 0x%"PRIx64" != 0x%"PRIx64,
+                     key, bucket, (MVMuint64) value, (MVMuint64) new_entry->value);
         }
+    } else {
+        /* A NULL key marks an entry that the fetch has just created. */
+        new_entry->key = key;
+        new_entry->value = value;
+    }
+}
+/* Removes key from the table and returns the value that was stored for it.
+   Returns 0 if the table was never allocated or the key is not present, which
+   a caller cannot tell apart from a stored value of 0.
+   The entries that follow the deleted one and are not in their ideal bucket
+   are shifted back by one slot to close the gap. */
+uintptr_t MVM_ptr_hash_fetch_and_delete(MVMThreadContext *tc,
+                                        MVMPtrHashTable *hashtable,
+                                        const void *key) {
+    if (MVM_UNLIKELY(hashtable->entries == NULL)) {
+        /* Should this be an oops? */
+        return 0;
     }
-    MVM_fixed_size_free(tc, tc->instance->fsa,
-                        tbl->num_buckets*sizeof(struct MVMPtrHashBucket),
-                        tbl->buckets);
-    tbl->num_buckets = new_num_bkts;
-    tbl->log2_num_buckets = new_log2_num_buckets;
-    tbl->buckets = he_new_buckets;
-    tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1))
-        ? (tbl->ineff_expands+1)
-        : 0;
-    if (tbl->ineff_expands > 1) {
-        tbl->noexpand=1;
+    unsigned int probe_distance = 1; /* 1 == sitting in the ideal bucket */
+    MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
+    char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry);
+    uint8_t *metadata = hashtable->metadata + bucket;
+    while (1) {
+        if (*metadata == probe_distance) {
+            struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
+            if (entry->key == key) {
+                /* Target acquired. */
+                uintptr_t retval = entry->value;
+
+                uint8_t *metadata_target = metadata;
+                /* Look at the next slot */
+                uint8_t old_probe_distance = metadata_target[1];
+                while (old_probe_distance > 1) {
+                    /* OK, we can move this one. */
+                    *metadata_target = old_probe_distance - 1;
+                    /* Try the next one, etc */
+                    ++metadata_target;
+                    old_probe_distance = metadata_target[1];
+                }
+                /* metadata_target now points to the metadata for the last thing
+                   we did move. (possibly still our target). */
+
+                uint32_t entries_to_move = metadata_target - metadata;
+                if (entries_to_move) {
+                    memmove(entry_raw, entry_raw + sizeof(struct MVMPtrHashEntry),
+                            sizeof(struct MVMPtrHashEntry) * entries_to_move);
+                }
+                /* and this slot is now empty. */
+                *metadata_target = 0;
+                --hashtable->cur_items;
+
+                /* Job's a good 'un. */
+                return retval;
+            }
+        }
+        /* There's a sentinel at the end. This will terminate: */
+        if (*metadata < probe_distance) {
+            /* So, if we hit 0, the bucket is empty. "Not found".
+               If we hit something with a lower probe distance then...
+               consider what would have happened had this key been inserted into
+               the hash table - it would have stolen this slot, and the key we
+               find here now would have been displaced further on. Hence, the
+               key we seek can't be in the hash table. */
+            /* Strange. Not in the hash. Should this be an oops? */
+            return 0;
+        }
+        ++probe_distance;
+        ++metadata;
+        entry_raw += sizeof(struct MVMPtrHashEntry);
+        assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE);
+        assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items);
+        assert(metadata < hashtable->metadata + hashtable->official_size + 256);
     }
 }