Skip to content

Commit

Permalink
Re-implement MVMPtrHashTable as a Robin Hood Hash.
Browse files Browse the repository at this point in the history
Like the previous commits, this re-implements in C the design and
optimisations of Martin Ankerl's C++ templated Robin Hood Hash.

(Again, not all optimisations are implemented yet. This is the "Minimal Viable
Product" edition.)

Rename struct MVMPtrHashHandle to struct MVMPtrHashEntry.
  • Loading branch information
nwc10 committed Jul 2, 2020
1 parent f352a5c commit 4e503a4
Show file tree
Hide file tree
Showing 5 changed files with 336 additions and 347 deletions.
327 changes: 233 additions & 94 deletions src/core/ptr_hash_table.c
@@ -1,104 +1,243 @@
#include "moar.h"

#define PTR_LOAD_FACTOR 0.75
#define PTR_INITIAL_SIZE 8
#define PTR_INITIAL_KEY_RIGHT_SHIFT (8 * sizeof(uintptr_t) - 3)

/* Frees the entire contents of the hash, leaving you just the hashtable itself,
which you allocated (heap, stack, inside another struct, wherever) */
void MVM_ptr_hash_demolish(MVMThreadContext *tc, MVMPtrHashTable *hashtable) {
/* Never allocated? (or already demolished?) */
if (MVM_UNLIKELY(hashtable->log2_num_buckets == 0))
return;

struct MVMPtrHashBucket *bucket = hashtable->buckets;
const struct MVMPtrHashBucket *const bucket_end
= hashtable->buckets + hashtable->num_buckets;

do {
struct MVMPtrHashHandle *head = bucket->hh_head;
while (head) {
struct MVMPtrHashHandle *next = head->hh_next;
MVM_fixed_size_free(tc, tc->instance->fsa, sizeof(struct MVMPtrHashHandle), head);
head = next;
free(hashtable->entries);
free(hashtable->metadata);
}
/* and then free memory if you allocated it */


/* Returns the number of entry slots that are really allocated for the table.
 * This is the official size plus an overflow area past the last official
 * bucket. The overflow area is (max_items - 1) slots, but it is capped so that
 * the total never exceeds official_size + MVM_HASH_MAX_PROBE_DISTANCE. */
MVM_STATIC_INLINE MVMuint32 hash_true_size(MVMPtrHashTable *hashtable) {
    MVMuint32 uncapped = hashtable->official_size + hashtable->max_items - 1;

    return hashtable->official_size + MVM_HASH_MAX_PROBE_DISTANCE < uncapped
        ? hashtable->official_size + MVM_HASH_MAX_PROBE_DISTANCE
        : uncapped;
}

/* Allocates the storage for a table whose official_size has already been set.
 *
 * Derives max_items from official_size and PTR_LOAD_FACTOR, then allocates
 * fresh `entries` and `metadata` arrays sized by hash_true_size(). The previous
 * values of hashtable->entries and hashtable->metadata are overwritten, not
 * freed; the caller owns any old arrays.
 *
 * Aborts the process if either allocation fails. There is no thread context
 * here to report through, and carrying on would either write through a NULL
 * pointer or leave `entries` NULL, which the rest of this file treats as
 * "table never allocated". */
MVM_STATIC_INLINE void hash_allocate_common(MVMPtrHashTable *hashtable) {
    hashtable->max_items = hashtable->official_size * PTR_LOAD_FACTOR;
    size_t actual_items = hash_true_size(hashtable);

    hashtable->entries = malloc(sizeof(struct MVMPtrHashEntry) * actual_items);
    /* One metadata byte per slot plus one for the sentinel. calloc zeroes them
     * all, and a zero metadata byte means "empty slot". */
    hashtable->metadata = calloc(actual_items + 1, 1);
    if (hashtable->entries == NULL || hashtable->metadata == NULL) {
        abort();
    }

    /* A sentinel. This marks an occupied slot, at its ideal position. */
    hashtable->metadata[actual_items] = 1;
}

/* First-time set-up of a table that has no storage yet: start at
 * PTR_INITIAL_SIZE buckets with the corresponding hash shift, then allocate
 * the entries and metadata arrays. */
MVM_STATIC_INLINE void hash_initial_allocate(MVMPtrHashTable *hashtable) {
    hashtable->official_size = PTR_INITIAL_SIZE;
    hashtable->key_right_shift = PTR_INITIAL_KEY_RIGHT_SHIFT;
    hash_allocate_common(hashtable);
}

/* Doubles the official size of the table and allocates new, empty storage for
 * it. One more bit of the hash code is used for bucket selection, so the right
 * shift shrinks by one.
 *
 * hashtable->entries and hashtable->metadata are replaced without being freed:
 * keep your own copies of both pointers before calling this. */
MVM_STATIC_INLINE void hash_grow(MVMPtrHashTable *hashtable) {
    hashtable->official_size = hashtable->official_size * 2;
    hashtable->key_right_shift -= 1;
    hash_allocate_common(hashtable);
}

/* Locates the slot for `key`, opening one up if the key is not yet present.
 *
 * If the key is already stored, its existing entry is returned untouched.
 * Otherwise a slot is made available at the position the probe sequence
 * dictates (shifting later entries along by one if necessary), its `key` is
 * set to NULL, and it is returned. A NULL key in the returned entry is how the
 * caller tells that the slot is new and still has to be filled in.
 *
 * This function never changes cur_items, and it never grows the table: it
 * calls MVM_oops if the table is already at (or above) max_items. */
MVM_STATIC_INLINE struct MVMPtrHashEntry *hash_insert_internal(MVMThreadContext *tc,
MVMPtrHashTable *hashtable,
const void *key) {
/* The caller must have grown the table already if it was full. */
if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) {
MVM_oops(tc, "oops, attempt to recursively call grow when adding %p",
key);
}

/* Probe distances are 1-based: an entry sitting in its ideal bucket has
distance 1, and a metadata byte of 0 means the slot is empty. */
unsigned int probe_distance = 1;
/* The top bits of the hash code select the starting (ideal) bucket. */
MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry);
MVMuint8 *metadata = hashtable->metadata + bucket;
while (1) {
if (*metadata < probe_distance) {
/* this is our slot. occupied or not, it is our rightful place. */

if (*metadata == 0) {
/* Open goal. Score! */
} else {
/* make room. */

/* Optimisation first seen in Martin Ankerl's implementation -
we don't actually need to implement the "stealing" by swapping
elements and carrying on with insert. The invariant of the
hash is that probe distances are never out of order, and as
all the following elements have probe distances in order, we
can maintain the invariant just as well by moving everything
along by one. */
MVMuint8 *find_me_a_gap = metadata;
MVMuint8 old_probe_distance = *metadata;
/* Walk forward to the first empty slot, rewriting each metadata byte
with the (incremented) probe distance of the entry that is about
to be shifted into it. */
do {
MVMuint8 new_probe_distance = 1 + old_probe_distance;
if (new_probe_distance == MVM_HASH_MAX_PROBE_DISTANCE) {
/* Optimisation from Martin Ankerl's implementation:
setting this to zero forces a resize on any insert,
*before* the actual insert, so that we never end up
having to handle overflow *during* this loop. This
loop can always complete. */
hashtable->max_items = 0;
}
/* a swap: */
old_probe_distance = *++find_me_a_gap;
*find_me_a_gap = new_probe_distance;
} while (old_probe_distance);

/* Now slide the entries themselves up by one slot, to match the
metadata that was just rewritten. */
MVMuint32 entries_to_move = find_me_a_gap - metadata;
memmove(entry_raw + sizeof(struct MVMPtrHashEntry), entry_raw,
sizeof(struct MVMPtrHashEntry) * entries_to_move);
}

*metadata = probe_distance;
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
/* A NULL key marks the slot as freshly created for the caller. */
entry->key = NULL;
return entry;
}

/* An entry at the same probe distance shares our ideal bucket, so it
could be the key we are looking for. */
if (*metadata == probe_distance) {
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
if (entry->key == key) {
return entry;
}
}
/* Not here; step to the next slot. */
++probe_distance;
++metadata;
entry_raw += sizeof(struct MVMPtrHashEntry);
assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE);
assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items);
assert(metadata < hashtable->metadata + hashtable->official_size + 256);
}
}

struct MVMPtrHashEntry *MVM_ptr_hash_lvalue_fetch(MVMThreadContext *tc,
MVMPtrHashTable *hashtable,
const void *key) {
if (MVM_UNLIKELY(hashtable->entries == NULL)) {
hash_initial_allocate(hashtable);
}
else if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) {
MVMuint32 true_size = hash_true_size(hashtable);
char *entry_raw_orig = hashtable->entries;
MVMuint8 *metadata_orig = hashtable->metadata;

hash_grow(hashtable);

char *entry_raw = entry_raw_orig;
MVMuint8 *metadata = metadata_orig;
MVMHashNumItems bucket = 0;
while (bucket < true_size) {
if (*metadata) {
struct MVMPtrHashEntry *old_entry = (struct MVMPtrHashEntry *) entry_raw;
struct MVMPtrHashEntry *new_entry =
hash_insert_internal(tc, hashtable, old_entry->key);
assert(new_entry->key == NULL);
*new_entry = *old_entry;
}
++bucket;
++metadata;
entry_raw += sizeof(struct MVMPtrHashEntry);
}
} while (++bucket < bucket_end);

MVM_fixed_size_free(tc, tc->instance->fsa,
hashtable->num_buckets*sizeof(struct MVMPtrHashBucket),
hashtable->buckets);
/* We shouldn't need either of these, but make something foolproof and they
invent a better fool: */
hashtable->buckets = NULL;
hashtable->log2_num_buckets = 0;
hashtable->num_items = 0;
free(entry_raw_orig);
free(metadata_orig);
}
struct MVMPtrHashEntry *new_entry
= hash_insert_internal(tc, hashtable, key);
if (!new_entry->key) {
++hashtable->cur_items;
}
return new_entry;
}

/* Bucket expansion has the effect of doubling the number of buckets
* and redistributing the items into the new buckets. Ideally the
* items will distribute more or less evenly into the new buckets
* (the extent to which this is true is a measure of the quality of
* the hash function as it applies to the key domain).
*
* With the items distributed into more buckets, the chain length
* (item count) in each bucket is reduced. Thus by expanding buckets
* the hash keeps a bound on the chain length. This bounded chain
* length is the essence of how a hash provides constant time lookup.
*
* The calculation of tbl->ideal_chain_maxlen below deserves some
* explanation. First, keep in mind that we're calculating the ideal
* maximum chain length based on the *new* (doubled) bucket count.
* In fractions this is just n/b (n=number of items,b=new num buckets).
* Since the ideal chain length is an integer, we want to calculate
* ceil(n/b). We don't depend on floating point arithmetic in this
* hash, so to calculate ceil(n/b) with integers we could write
*
* ceil(n/b) = (n/b) + ((n%b)?1:0)
*
* and in fact a previous version of this hash did just that.
* But now we have improved things a bit by recognizing that b is
* always a power of two. We keep its base 2 log handy (call it lb),
* so now we can write this with a bit shift and logical AND:
*
* ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
*
*/
void MVM_ptr_hash_expand_buckets(MVMThreadContext *tc, MVMPtrHashTable *tbl) {
MVMHashBktNum he_bkt;
MVMHashBktNum he_bkt_i;
struct MVMPtrHashHandle *he_thh, *_he_hh_nxt;
struct MVMPtrHashBucket *he_new_buckets, *_he_newbkt;
MVMHashBktNum new_num_bkts = tbl->num_buckets * 2;
MVMHashUInt new_log2_num_buckets = tbl->log2_num_buckets + 1;
he_new_buckets =
MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa,
new_num_bkts * sizeof(struct MVMPtrHashBucket));
tbl->ideal_chain_maxlen =
(tbl->num_items >> new_log2_num_buckets) +
((tbl->num_items & (new_num_bkts-1)) ? 1 : 0);
tbl->nonideal_items = 0;
/* Iterate the buckets */
for(he_bkt_i = 0; he_bkt_i < tbl->num_buckets; he_bkt_i++) {
he_thh = tbl->buckets[ he_bkt_i ].hh_head;
/* Iterate items in the bucket */
while (he_thh) {
_he_hh_nxt = he_thh->hh_next;
he_bkt = MVM_ptr_hash_bucket(he_thh->key, new_log2_num_buckets);
_he_newbkt = &(he_new_buckets[ he_bkt ]);
if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) {
tbl->nonideal_items++;
_he_newbkt->expand_mult = _he_newbkt->count /
tbl->ideal_chain_maxlen;
}
he_thh->hh_next = _he_newbkt->hh_head;
_he_newbkt->hh_head = he_thh;
he_thh = _he_hh_nxt;
/* Creates a new hash entry with the given key and value.
 *
 * The documented contract is that this inserts UNCONDITIONALLY and does not
 * check whether the key already exists, so use with care.
 *
 * XXX: the current implementation does not exploit that freedom. It looks the
 * key up via MVM_ptr_hash_lvalue_fetch, and:
 *   - if the key is absent, stores key and value in the new entry;
 *   - if the key is present with the same value, does nothing;
 *   - if the key is present with a different value, calls MVM_oops. */
void MVM_ptr_hash_insert(MVMThreadContext *tc,
                         MVMPtrHashTable *hashtable,
                         const void *key,
                         uintptr_t value) {
    struct MVMPtrHashEntry *new_entry = MVM_ptr_hash_lvalue_fetch(tc, hashtable, key);
    if (new_entry->key) {
        if (value != new_entry->value) {
            MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
            /* definitely XXX - what should we do here? */
            /* Both values are printed in decimal. The previous format string
             * appended a stray literal 'x' after each number. */
            MVM_oops(tc, "insert conflict, %p is %u, %"PRIu64" != %"PRIu64,
                     key, bucket, (MVMuint64) value, (MVMuint64) new_entry->value);
        }
    } else {
        /* Freshly created slot (lvalue_fetch has already counted it). */
        new_entry->key = key;
        new_entry->value = value;
    }
}

uintptr_t MVM_ptr_hash_fetch_and_delete(MVMThreadContext *tc,
MVMPtrHashTable *hashtable,
const void *key) {
if (MVM_UNLIKELY(hashtable->entries == NULL)) {
/* Should this be an oops? */
return 0;
}
MVM_fixed_size_free(tc, tc->instance->fsa,
tbl->num_buckets*sizeof(struct MVMPtrHashBucket),
tbl->buckets);
tbl->num_buckets = new_num_bkts;
tbl->log2_num_buckets = new_log2_num_buckets;
tbl->buckets = he_new_buckets;
tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1))
? (tbl->ineff_expands+1)
: 0;
if (tbl->ineff_expands > 1) {
tbl->noexpand=1;
unsigned int probe_distance = 1;
MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift;
char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry);
uint8_t *metadata = hashtable->metadata + bucket;
while (1) {
if (*metadata == probe_distance) {
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw;
if (entry->key == key) {
/* Target acquired. */
uintptr_t retval = entry->value;

uint8_t *metadata_target = metadata;
/* Look at the next slot */
uint8_t old_probe_distance = metadata_target[1];
while (old_probe_distance > 1) {
/* OK, we can move this one. */
*metadata_target = old_probe_distance - 1;
/* Try the next one, etc */
++metadata_target;
old_probe_distance = metadata_target[1];
}
/* metadata_target now points to the metadata for the last thing
we did move. (possibly still our target). */

uint32_t entries_to_move = metadata_target - metadata;
if (entries_to_move) {
memmove(entry_raw, entry_raw + sizeof(struct MVMPtrHashEntry),
sizeof(struct MVMPtrHashEntry) * entries_to_move);
}
/* and this slot is now empty. */
*metadata_target = 0;
--hashtable->cur_items;

/* Job's a good 'un. */
return retval;
}
}
/* There's a sentinel at the end. This will terminate: */
if (*metadata < probe_distance) {
/* So, if we hit 0, the bucket is empty. "Not found".
If we hit something with a lower probe distance then...
consider what would have happened had this key been inserted into
the hash table - it would have stolen this slot, and the key we
find here now would have been displaced further on. Hence, the key
we seek can't be in the hash table. */
/* Strange. Not in the hash. Should this be an oops? */
return 0;
}
++probe_distance;
++metadata;
entry_raw += sizeof(struct MVMPtrHashEntry);
assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE);
assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items);
assert(metadata < hashtable->metadata + hashtable->official_size + 256);
}
}

0 comments on commit 4e503a4

Please sign in to comment.