Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Re-implement MVMPtrHashTable as a Robin Hood Hash.
Like the previous commits, this re-implements in C the design and optimisations of Martin Ankerl's C++ templated Robin Hood Hash. (Again, not all optimisations are implemented yet. This is the "Minimum Viable Product" edition.) Rename struct MVMPtrHashHandle to struct MVMPtrHashEntry.
- Loading branch information
Showing
5 changed files
with
336 additions
and
347 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,104 +1,243 @@ | ||
#include "moar.h" | ||
|
||
#define PTR_LOAD_FACTOR 0.75 | ||
#define PTR_INITIAL_SIZE 8 | ||
#define PTR_INITIAL_KEY_RIGHT_SHIFT (8 * sizeof(uintptr_t) - 3) | ||
|
||
/* Frees the entire contents of the hash, leaving you just the hashtable itself, | ||
which you allocated (heap, stack, inside another struct, wherever) */ | ||
void MVM_ptr_hash_demolish(MVMThreadContext *tc, MVMPtrHashTable *hashtable) { | ||
/* Never allocated? (or already demolished?) */ | ||
if (MVM_UNLIKELY(hashtable->log2_num_buckets == 0)) | ||
return; | ||
|
||
struct MVMPtrHashBucket *bucket = hashtable->buckets; | ||
const struct MVMPtrHashBucket *const bucket_end | ||
= hashtable->buckets + hashtable->num_buckets; | ||
|
||
do { | ||
struct MVMPtrHashHandle *head = bucket->hh_head; | ||
while (head) { | ||
struct MVMPtrHashHandle *next = head->hh_next; | ||
MVM_fixed_size_free(tc, tc->instance->fsa, sizeof(struct MVMPtrHashHandle), head); | ||
head = next; | ||
free(hashtable->entries); | ||
free(hashtable->metadata); | ||
} | ||
/* and then free memory if you allocated it */ | ||
|
||
|
||
/* The number of entry/metadata slots actually allocated: the official size
   plus an overflow region for entries probed past the end, capped at the
   maximum probe distance.  (Computed with the same unsigned arithmetic as
   before, so the max_items == 0 "forced resize" case behaves identically.) */
MVM_STATIC_INLINE MVMuint32 hash_true_size(MVMPtrHashTable *hashtable) {
    MVMuint32 const capped = hashtable->official_size + MVM_HASH_MAX_PROBE_DISTANCE;
    MVMuint32 size = hashtable->official_size + hashtable->max_items - 1;
    if (capped < size) {
        size = capped;
    }
    return size;
}
|
||
/* Shared tail of initial allocation and growth: sets the load-factor limit
   and allocates the entry and metadata arrays for the current official size. */
MVM_STATIC_INLINE void hash_allocate_common(MVMPtrHashTable *hashtable) {
    hashtable->max_items = hashtable->official_size * PTR_LOAD_FACTOR;
    size_t const total_slots = hash_true_size(hashtable);
    hashtable->entries = malloc(total_slots * sizeof(struct MVMPtrHashEntry));
    /* calloc zeroes the metadata, i.e. marks every slot empty; one extra
       byte is allocated for the sentinel below. */
    hashtable->metadata = calloc(1 + total_slots, 1);
    /* A sentinel. This marks an occupied slot, at its ideal position, so
       probe loops walking the metadata always terminate. */
    hashtable->metadata[total_slots] = 1;
}
|
||
/* First-time allocation: start from the fixed initial size and the matching
   key right-shift, then allocate the arrays. */
MVM_STATIC_INLINE void hash_initial_allocate(MVMPtrHashTable *hashtable) {
    hashtable->official_size = PTR_INITIAL_SIZE;
    hashtable->key_right_shift = PTR_INITIAL_KEY_RIGHT_SHIFT;

    hash_allocate_common(hashtable);
}
|
||
/* make sure you still have your copies of entries and metadata before you | ||
call this. */ | ||
MVM_STATIC_INLINE void hash_grow(MVMPtrHashTable *hashtable) { | ||
--hashtable->key_right_shift; | ||
hashtable->official_size *= 2; | ||
|
||
hash_allocate_common(hashtable); | ||
} | ||
|
||
MVM_STATIC_INLINE struct MVMPtrHashEntry *hash_insert_internal(MVMThreadContext *tc, | ||
MVMPtrHashTable *hashtable, | ||
const void *key) { | ||
if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) { | ||
MVM_oops(tc, "oops, attempt to recursively call grow when adding %p", | ||
key); | ||
} | ||
|
||
unsigned int probe_distance = 1; | ||
MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift; | ||
char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry); | ||
MVMuint8 *metadata = hashtable->metadata + bucket; | ||
while (1) { | ||
if (*metadata < probe_distance) { | ||
/* this is our slot. occupied or not, it is our rightful place. */ | ||
|
||
if (*metadata == 0) { | ||
/* Open goal. Score! */ | ||
} else { | ||
/* make room. */ | ||
|
||
/* Optimisation first seen in Martin Ankerl's implementation - | ||
we don't need actually implement the "stealing" by swapping | ||
elements and carrying on with insert. The invariant of the | ||
hash is that probe distances are never out of order, and as | ||
all the following elements have probe distances in order, we | ||
can maintain the invariant just as well by moving everything | ||
along by one. */ | ||
MVMuint8 *find_me_a_gap = metadata; | ||
MVMuint8 old_probe_distance = *metadata; | ||
do { | ||
MVMuint8 new_probe_distance = 1 + old_probe_distance; | ||
if (new_probe_distance == MVM_HASH_MAX_PROBE_DISTANCE) { | ||
/* Optimisation from Martin Ankerl's implementation: | ||
setting this to zero forces a resize on any insert, | ||
*before* the actual insert, so that we never end up | ||
having to handle overflow *during* this loop. This | ||
loop can always complete. */ | ||
hashtable->max_items = 0; | ||
} | ||
/* a swap: */ | ||
old_probe_distance = *++find_me_a_gap; | ||
*find_me_a_gap = new_probe_distance; | ||
} while (old_probe_distance); | ||
|
||
MVMuint32 entries_to_move = find_me_a_gap - metadata; | ||
memmove(entry_raw + sizeof(struct MVMPtrHashEntry), entry_raw, | ||
sizeof(struct MVMPtrHashEntry) * entries_to_move); | ||
} | ||
|
||
*metadata = probe_distance; | ||
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw; | ||
entry->key = NULL; | ||
return entry; | ||
} | ||
|
||
if (*metadata == probe_distance) { | ||
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw; | ||
if (entry->key == key) { | ||
return entry; | ||
} | ||
} | ||
++probe_distance; | ||
++metadata; | ||
entry_raw += sizeof(struct MVMPtrHashEntry); | ||
assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE); | ||
assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items); | ||
assert(metadata < hashtable->metadata + hashtable->official_size + 256); | ||
} | ||
} | ||
|
||
struct MVMPtrHashEntry *MVM_ptr_hash_lvalue_fetch(MVMThreadContext *tc, | ||
MVMPtrHashTable *hashtable, | ||
const void *key) { | ||
if (MVM_UNLIKELY(hashtable->entries == NULL)) { | ||
hash_initial_allocate(hashtable); | ||
} | ||
else if (MVM_UNLIKELY(hashtable->cur_items >= hashtable->max_items)) { | ||
MVMuint32 true_size = hash_true_size(hashtable); | ||
char *entry_raw_orig = hashtable->entries; | ||
MVMuint8 *metadata_orig = hashtable->metadata; | ||
|
||
hash_grow(hashtable); | ||
|
||
char *entry_raw = entry_raw_orig; | ||
MVMuint8 *metadata = metadata_orig; | ||
MVMHashNumItems bucket = 0; | ||
while (bucket < true_size) { | ||
if (*metadata) { | ||
struct MVMPtrHashEntry *old_entry = (struct MVMPtrHashEntry *) entry_raw; | ||
struct MVMPtrHashEntry *new_entry = | ||
hash_insert_internal(tc, hashtable, old_entry->key); | ||
assert(new_entry->key == NULL); | ||
*new_entry = *old_entry; | ||
} | ||
++bucket; | ||
++metadata; | ||
entry_raw += sizeof(struct MVMPtrHashEntry); | ||
} | ||
} while (++bucket < bucket_end); | ||
|
||
MVM_fixed_size_free(tc, tc->instance->fsa, | ||
hashtable->num_buckets*sizeof(struct MVMPtrHashBucket), | ||
hashtable->buckets); | ||
/* We shouldn't need either of these, but make something foolproof and they | ||
invent a better fool: */ | ||
hashtable->buckets = NULL; | ||
hashtable->log2_num_buckets = 0; | ||
hashtable->num_items = 0; | ||
free(entry_raw_orig); | ||
free(metadata_orig); | ||
} | ||
struct MVMPtrHashEntry *new_entry | ||
= hash_insert_internal(tc, hashtable, key); | ||
if (!new_entry->key) { | ||
++hashtable->cur_items; | ||
} | ||
return new_entry; | ||
} | ||
|
||
/* Bucket expansion has the effect of doubling the number of buckets | ||
* and redistributing the items into the new buckets. Ideally the | ||
* items will distribute more or less evenly into the new buckets | ||
* (the extent to which this is true is a measure of the quality of | ||
* the hash function as it applies to the key domain). | ||
* | ||
* With the items distributed into more buckets, the chain length | ||
* (item count) in each bucket is reduced. Thus by expanding buckets | ||
* the hash keeps a bound on the chain length. This bounded chain | ||
* length is the essence of how a hash provides constant time lookup. | ||
* | ||
* The calculation of tbl->ideal_chain_maxlen below deserves some | ||
* explanation. First, keep in mind that we're calculating the ideal | ||
* maximum chain length based on the *new* (doubled) bucket count. | ||
* In fractions this is just n/b (n=number of items,b=new num buckets). | ||
* Since the ideal chain length is an integer, we want to calculate | ||
* ceil(n/b). We don't depend on floating point arithmetic in this | ||
* hash, so to calculate ceil(n/b) with integers we could write | ||
* | ||
* ceil(n/b) = (n/b) + ((n%b)?1:0) | ||
* | ||
* and in fact a previous version of this hash did just that. | ||
* But now we have improved things a bit by recognizing that b is | ||
* always a power of two. We keep its base 2 log handy (call it lb), | ||
* so now we can write this with a bit shift and logical AND: | ||
* | ||
* ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) | ||
* | ||
*/ | ||
void MVM_ptr_hash_expand_buckets(MVMThreadContext *tc, MVMPtrHashTable *tbl) { | ||
MVMHashBktNum he_bkt; | ||
MVMHashBktNum he_bkt_i; | ||
struct MVMPtrHashHandle *he_thh, *_he_hh_nxt; | ||
struct MVMPtrHashBucket *he_new_buckets, *_he_newbkt; | ||
MVMHashBktNum new_num_bkts = tbl->num_buckets * 2; | ||
MVMHashUInt new_log2_num_buckets = tbl->log2_num_buckets + 1; | ||
he_new_buckets = | ||
MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, | ||
new_num_bkts * sizeof(struct MVMPtrHashBucket)); | ||
tbl->ideal_chain_maxlen = | ||
(tbl->num_items >> new_log2_num_buckets) + | ||
((tbl->num_items & (new_num_bkts-1)) ? 1 : 0); | ||
tbl->nonideal_items = 0; | ||
/* Iterate the buckets */ | ||
for(he_bkt_i = 0; he_bkt_i < tbl->num_buckets; he_bkt_i++) { | ||
he_thh = tbl->buckets[ he_bkt_i ].hh_head; | ||
/* Iterate items in the bucket */ | ||
while (he_thh) { | ||
_he_hh_nxt = he_thh->hh_next; | ||
he_bkt = MVM_ptr_hash_bucket(he_thh->key, new_log2_num_buckets); | ||
_he_newbkt = &(he_new_buckets[ he_bkt ]); | ||
if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { | ||
tbl->nonideal_items++; | ||
_he_newbkt->expand_mult = _he_newbkt->count / | ||
tbl->ideal_chain_maxlen; | ||
} | ||
he_thh->hh_next = _he_newbkt->hh_head; | ||
_he_newbkt->hh_head = he_thh; | ||
he_thh = _he_hh_nxt; | ||
/* UNCONDITIONALLY creates a new hash entry with the given key and value. | ||
* Doesn't check if the key already exists. Use with care. | ||
* (well that's the official line. As you can see, the XXX suggests we currently | ||
* don't exploit the documented freedom. */ | ||
void MVM_ptr_hash_insert(MVMThreadContext *tc, | ||
MVMPtrHashTable *hashtable, | ||
const void *key, | ||
uintptr_t value) { | ||
struct MVMPtrHashEntry *new_entry = MVM_ptr_hash_lvalue_fetch(tc, hashtable, key); | ||
if (new_entry->key) { | ||
if (value != new_entry->value) { | ||
MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift; | ||
/* definately XXX - what should we do here? */ | ||
MVM_oops(tc, "insert conflict, %p is %u, %"PRIu64"x != %"PRIu64"x", | ||
key, bucket, (MVMuint64) value, (MVMuint64) new_entry->value); | ||
} | ||
} else { | ||
new_entry->key = key; | ||
new_entry->value = value; | ||
} | ||
} | ||
|
||
uintptr_t MVM_ptr_hash_fetch_and_delete(MVMThreadContext *tc, | ||
MVMPtrHashTable *hashtable, | ||
const void *key) { | ||
if (MVM_UNLIKELY(hashtable->entries == NULL)) { | ||
/* Should this be an oops? */ | ||
return 0; | ||
} | ||
MVM_fixed_size_free(tc, tc->instance->fsa, | ||
tbl->num_buckets*sizeof(struct MVMPtrHashBucket), | ||
tbl->buckets); | ||
tbl->num_buckets = new_num_bkts; | ||
tbl->log2_num_buckets = new_log2_num_buckets; | ||
tbl->buckets = he_new_buckets; | ||
tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) | ||
? (tbl->ineff_expands+1) | ||
: 0; | ||
if (tbl->ineff_expands > 1) { | ||
tbl->noexpand=1; | ||
unsigned int probe_distance = 1; | ||
MVMHashNumItems bucket = MVM_ptr_hash_code(key) >> hashtable->key_right_shift; | ||
char *entry_raw = hashtable->entries + bucket * sizeof(struct MVMPtrHashEntry); | ||
uint8_t *metadata = hashtable->metadata + bucket; | ||
while (1) { | ||
if (*metadata == probe_distance) { | ||
struct MVMPtrHashEntry *entry = (struct MVMPtrHashEntry *) entry_raw; | ||
if (entry->key == key) { | ||
/* Target acquired. */ | ||
uintptr_t retval = entry->value; | ||
|
||
uint8_t *metadata_target = metadata; | ||
/* Look at the next slot */ | ||
uint8_t old_probe_distance = metadata_target[1]; | ||
while (old_probe_distance > 1) { | ||
/* OK, we can move this one. */ | ||
*metadata_target = old_probe_distance - 1; | ||
/* Try the next one, etc */ | ||
++metadata_target; | ||
old_probe_distance = metadata_target[1]; | ||
} | ||
/* metadata_target now points to the metadata for the last thing | ||
we did move. (possibly still our target). */ | ||
|
||
uint32_t entries_to_move = metadata_target - metadata; | ||
if (entries_to_move) { | ||
memmove(entry_raw, entry_raw + sizeof(struct MVMPtrHashEntry), | ||
sizeof(struct MVMPtrHashEntry) * entries_to_move); | ||
} | ||
/* and this slot is now emtpy. */ | ||
*metadata_target = 0; | ||
--hashtable->cur_items; | ||
|
||
/* Job's a good 'un. */ | ||
return retval; | ||
} | ||
} | ||
/* There's a sentinel at the end. This will terminate: */ | ||
if (*metadata < probe_distance) { | ||
/* So, if we hit 0, the bucket is empty. "Not found". | ||
If we hit something with a lower probe distance then... | ||
consider what would have happened had this key been inserted into | ||
the hash table - it would have stolen this slot, and the key we | ||
find here now would have been displaced futher on. Hence, the key | ||
we seek can't be in the hash table. */ | ||
/* Strange. Not in the hash. Should this be an oops? */ | ||
return 0; | ||
} | ||
++probe_distance; | ||
++metadata; | ||
entry_raw += sizeof(struct MVMPtrHashEntry); | ||
assert(probe_distance <= MVM_HASH_MAX_PROBE_DISTANCE); | ||
assert(metadata < hashtable->metadata + hashtable->official_size + hashtable->max_items); | ||
assert(metadata < hashtable->metadata + hashtable->official_size + 256); | ||
} | ||
} |
Oops, something went wrong.