Permalink
Browse files

Implement SipHash, use as our hashing function w/ 64bit hashvals

This switches our hashing function from "Jenkins's Lookup2" to SipHash.

This adds a heavily modified version of https://github.com/majek/csiphash,
which now uses a struct to store the state and allows us to add 64 bits at a
time (two graphemes). It has also been modified to take 64-bit integers
instead of pointers to bytes.

Rewriting the CSipHash code to take a 64 bit integer is significant because
it gives us a massive speedup due to much better inlining and less data
copying.

Some of this SipHash code was finished a month ago, but only recently did I
optimize it enough that it is just as fast as our current hashing
function. It makes my hashing speed test file 0.5% slower, which is totally
acceptable given that SipHash has been studied extensively, while our previous
hash function has not been studied for security, to the best of my knowledge.

We now use a 128-bit hashSecret stored in two MVMuint64s. Previously we
only had a 32-bit hashSecret (stored in an MVMuint32).

The hash values are now 64 bits wide instead of 32 bits.
  • Loading branch information...
samcv committed Jul 15, 2018
1 parent a50a0b1 commit d9a3270aa290c8dd3b547d4deceb5e76dc8c8e47
@@ -413,6 +413,7 @@ HEADERS = src/moar.h \
src/instrument/line_coverage.h \
src/gen/config.h \
src/debug/debugserver.h \
src/strings/siphash/csiphash.h \
src/strings/uthash_types.h \
src/strings/uthash.h \
3rdparty/cmp/cmp.h \
@@ -516,6 +517,7 @@ install: all
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/spesh"
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/debug"
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/strings"
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/strings/siphash"
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/jit"
$(MKPATH) "$(DESTDIR)$(PREFIX)/include/moar/instrument"
$(CP) 3rdparty/*.h "$(DESTDIR)$(PREFIX)/include/moar"
@@ -533,6 +535,7 @@ install: all
$(CP) src/spesh/*.h "$(DESTDIR)$(PREFIX)/include/moar/spesh"
$(CP) src/debug/*.h "$(DESTDIR)$(PREFIX)/include/moar/debug"
$(CP) src/strings/*.h "$(DESTDIR)$(PREFIX)/include/moar/strings"
$(CP) src/strings/siphash/*.h "$(DESTDIR)$(PREFIX)/include/moar/strings/siphash"
$(CP) src/jit/*.h "$(DESTDIR)$(PREFIX)/include/moar/jit"
$(CP) src/instrument/*.h "$(DESTDIR)$(PREFIX)/include/moar/instrument"
@install@
@@ -47,7 +47,7 @@ struct MVMStringBody {
MVMuint16 storage_type;
MVMuint16 num_strands;
MVMuint32 num_graphs;
MVMhashv cached_hash_code;
MVMHashv cached_hash_code;
};
/* A strand of a string. */
@@ -509,7 +509,7 @@ struct MVMInstance {
/* Flag for if NFA debugging is enabled. */
MVMint8 nfa_debug_enabled;
/* Hash Secret which is used as the hash seed. This is to avoid denial of
/* Hash Secrets which are used as the hash seed. This is to avoid denial of
* service type attacks. */
MVMuint32 hashSecret;
MVMuint64 hashSecrets[2];
};
@@ -85,17 +85,18 @@ MVMInstance * MVM_vm_create_instance(void) {
char *jit_log, *jit_expr_disable, *jit_disable, *jit_bytecode_dir, *jit_last_frame, *jit_last_bb;
char *dynvar_log;
int init_stat;
MVMuint32 hashSecret;
MVMuint64 now = MVM_platform_now();
/* Set up instance data structure. */
instance = MVM_calloc(1, sizeof(MVMInstance));
/* Create the main thread's ThreadContext and stash it. */
instance->main_thread = MVM_tc_create(NULL, instance);
MVM_getrandom(instance->main_thread, &hashSecret, sizeof(MVMuint32));
instance->hashSecret ^= now;
instance->hashSecret ^= MVM_proc_getpid(instance->main_thread) * now;
/* Get the 128-bit hashSecret */
MVM_getrandom(instance->main_thread, instance->hashSecrets, sizeof(MVMuint64) * 2);
/* Just in case MVM_getrandom didn't work, XOR it with some poorly randomized data */
instance->hashSecrets[1] ^= now;
instance->hashSecrets[1] ^= MVM_proc_getpid(instance->main_thread) * now;
instance->main_thread->thread_id = 1;
/* Next thread to be created gets ID 2 (the main thread got ID 1). */
@@ -84,7 +84,7 @@ typedef double MVMnum64;
/* stuff for uthash */
#define uthash_fatal(msg) MVM_exception_throw_adhoc(tc, "internal hash error: " msg)
typedef uint32_t MVMhashv;
typedef MVMuint64 MVMHashv;
#include "strings/uthash_types.h"
@@ -176,14 +176,14 @@ static void saw(MVMThreadContext *tc, MVMHeapSnapshotState *ss, void *addr, MVMu
MVMHeapSnapshotSeen *seen = MVM_calloc(1, sizeof(MVMHeapSnapshotSeen));
seen->address = addr;
seen->idx = idx;
HASH_ADD_KEYPTR(hash_handle, ss->seen, (char *)&(seen->address), sizeof(void *), seen);
HASH_ADD_KEYPTR(hash_handle, ss->seen, &(seen->address), sizeof(void *), seen);
}
/* Checks for an entry in the seen hash. If we find an entry, write the index
* into the index pointer passed. */
static MVMuint32 seen(MVMThreadContext *tc, MVMHeapSnapshotState *ss, void *addr, MVMuint64 *idx) {
MVMHeapSnapshotSeen *entry;
HASH_FIND(hash_handle, ss->seen, (char *)&(addr), sizeof(void *), entry);
HASH_FIND(hash_handle, ss->seen, &addr, sizeof(void *), entry);
if (entry) {
*idx = entry->idx;
return 1;
@@ -2825,124 +2825,72 @@ MVMString * MVM_string_chr(MVMThreadContext *tc, MVMint64 cp) {
* cache field of the string. Hashing code is derived from the Jenkins hash
* implementation in uthash.h. */
typedef union {
MVMint32 graphs[3];
unsigned char bytes[12];
MVMuint32 graphs[2];
MVMuint64 u64;
} MVMJenHashGraphemeView;
/* Folds one block of three graphemes into the running hash state.
 * hash_block->graphs[0], [1] and [2] are added to *hj_i, *hj_j and *hashv
 * respectively, and HASH_JEN_MIX is then applied to the three state words.
 * The state is passed by pointer so the caller's variables are updated. */
MVM_STATIC_INLINE void MVM_hash_add_three (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv) {
*hj_i += hash_block->graphs[0];
*hj_j += hash_block->graphs[1];
*hashv += hash_block->graphs[2];
/* Mix after every full three-grapheme block. */
HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
}
/* Finishes a hash computation.
 * Adds the key length in bytes (sgraphs * sizeof(MVMGrapheme32)) to *hashv,
 * folds the trailing graphemes from hash_block (graphs_remaining is expected
 * to be 0, 1 or 2) into *hj_i / *hj_j, runs a final HASH_JEN_MIX, and then
 * nudges *hashv away from 0 by adding sgraphs if the result was 0.
 * The state words are passed by pointer; the final hash is left in *hashv. */
MVM_STATIC_INLINE void MVM_hash_finish (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv, MVMStringIndex sgraphs, MVMStringIndex graphs_remaining) {
/* Mix in key length (in bytes, not graphemes). */
*hashv += sgraphs * sizeof(MVMGrapheme32);
/* Now handle trailing graphemes (must be 2, 1, or 0). */
/* NOTE: this is weird since it changes the order in different cases. This
 * is just replicating old functionality. */
switch (graphs_remaining) {
case 2:
/* With two graphemes left, the first goes to hj_j and the second to hj_i. */
*hj_j += hash_block->graphs[0];
*hj_i += hash_block->graphs[1];
break;
/* No fallthrough here: case 2 breaks above, so case 1 only runs when
 * exactly one grapheme remains. */
case 1:
*hj_i += hash_block->graphs[0];
}
/* Final mix; this runs even when no graphemes remained. */
HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
/* Because we check if MVMString->body.cached_hash_code == 0 to tell if
 * we have not yet computed the hash code, ensure that hashv is never 0
 * by adding the length of the string to hashv iff hashv == 0. Since both
 * the hashv and MVMStringIndex are both uint32, there should never be any
 * overflow. Only problematic case is if the string is of length 0 and
 * hashv is zero, though this is very very unlikely (if possible at all)
 * and it should be very fast to calculate the hash so as to be negligible. */
if (*hashv == 0) {
*hashv += sgraphs;
}
}
/* To force little endian representation on big endian machines, set
* MVM_HASH_FORCE_LITTLE_ENDIAN in strings/siphash/csiphash.h.
* If this isn't set, MVM_MAYBE_TO_LITTLE_ENDIAN_32 does nothing (the default).
* This would mainly be useful for debugging, or if there were some other reason
* someone cared that hashes were identical on different endian platforms. */
void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) {
/* The hash algorithm works in bytes. Since we can represent strings in a
* number of ways, and we want consistent hashing, then we'll read the
* strings using the grapheme iterator in groups of 3, using 32-bit ints
* for the graphemes no matter what the string really holds them as. Then
* we'll use the bytes view of that in the hashing function. */
MVMStringIndex graphs_remaining, sgraphs;
/* Initialize hash state. */
MVMhashv hashv = tc->instance->hashSecret;
MVMuint32 hj_i, hj_j;
hj_i = hj_j = 0x9e3779b9;
graphs_remaining = sgraphs = MVM_string_graphs(tc, s);
#if defined(MVM_HASH_FORCE_LITTLE_ENDIAN)
const MVMuint64 key[2] = {
MVM_MAYBE_TO_LITTLE_ENDIAN_64(tc->instance->hashSecrets[0]),
MVM_MAYBE_TO_LITTLE_ENDIAN_64(tc->instance->hashSecrets[1])
};
#else
const MVMuint64 *key = tc->instance->hashSecrets;
#endif
MVMuint64 hash = 0;
MVMStringIndex s_len = MVM_string_graphs_nocheck(tc, s);
switch (s->body.storage_type) {
case MVM_STRING_GRAPHEME_ASCII:
case MVM_STRING_GRAPHEME_8: {
int i;
MVMJenHashGraphemeView hash_block;
for (i = 0; 3 <= sgraphs - i; i += 3) {
hash_block.graphs[0] = s->body.storage.blob_8[i];
hash_block.graphs[1] = s->body.storage.blob_8[i+1];
hash_block.graphs[2] = s->body.storage.blob_8[i+2];
MVM_hash_add_three(
&hash_block,
&hj_i, &hj_j, &hashv);
}
graphs_remaining = sgraphs - i;
switch (graphs_remaining) {
case 1:
hash_block.graphs[0] = s->body.storage.blob_8[i];
break;
case 2:
hash_block.graphs[0] = s->body.storage.blob_8[i];
hash_block.graphs[1] = s->body.storage.blob_8[i+1];
break;
case MVM_STRING_GRAPHEME_8:
case MVM_STRING_GRAPHEME_ASCII: {
size_t i;
MVMJenHashGraphemeView gv;
siphash sh;
siphashinit(&sh, s_len * sizeof(MVMGrapheme32), key);
for (i = 0; i + 1 < s_len;) {
gv.graphs[0] = MVM_MAYBE_TO_LITTLE_ENDIAN_32(s->body.storage.blob_8[i++]);
gv.graphs[1] = MVM_MAYBE_TO_LITTLE_ENDIAN_32(s->body.storage.blob_8[i++]);
siphashadd64bits(&sh, gv.u64);
}
MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
/* If there is a final 32 bit grapheme pass it through, otherwise
* pass through 0. */
hash = siphashfinish_32bits(&sh,
i < s_len
? MVM_MAYBE_TO_LITTLE_ENDIAN_32(s->body.storage.blob_8[i]) : 0);
break;
}
#if !defined(MVM_HASH_FORCE_LITTLE_ENDIAN)
case MVM_STRING_GRAPHEME_32: {
int i;
for (i = 0; 3 <= sgraphs - i; i += 3) {
MVM_hash_add_three(
(MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i),
&hj_i, &hj_j, &hashv);
}
graphs_remaining = sgraphs - i;
MVM_hash_finish((MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i), &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
hash = siphash24(
(MVMuint8*)s->body.storage.blob_32,
s_len * sizeof(MVMGrapheme32),
key);
break;
}
#endif
default: {
siphash sh;
MVMGraphemeIter gi;
MVMJenHashGraphemeView hash_block;
/* Work through the string 3 graphemes at a time. */
MVMJenHashGraphemeView gv;
size_t i;
siphashinit(&sh, s_len * sizeof(MVMGrapheme32), key);
MVM_string_gi_init(tc, &gi, s);
while (3 <= graphs_remaining) {
hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi);
MVM_hash_add_three(
&hash_block,
&hj_i, &hj_j, &hashv);
graphs_remaining -= 3;
for (i = 0; i + 1 < s_len; i += 2) {
gv.graphs[0] = MVM_MAYBE_TO_LITTLE_ENDIAN_32(MVM_string_gi_get_grapheme(tc, &gi));
gv.graphs[1] = MVM_MAYBE_TO_LITTLE_ENDIAN_32(MVM_string_gi_get_grapheme(tc, &gi));
siphashadd64bits(&sh, gv.u64);
}
/* Now handle trailing graphemes (must be 2, 1, or 0). */
switch (graphs_remaining) {
case 1:
hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
break;
case 2:
hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
break;
}
MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
hash = siphashfinish_32bits(&sh,
i < s_len
? MVM_MAYBE_TO_LITTLE_ENDIAN_32(MVM_string_gi_get_grapheme(tc, &gi))
: 0);
break;
}
}
/* Store computed hash value. */
s->body.cached_hash_code = hashv;
s->body.cached_hash_code = hash;
}
@@ -0,0 +1,4 @@
all:
gcc -O3 test.c -g -Wall -Wextra -ggdb -o siphashtest -D MVM_CAN_UNALIGNED_INT64 && ./siphashtest
gcc -O3 test.c -g -Wall -Wextra -ggdb -o siphashtest && ./siphashtest
Oops, something went wrong.

0 comments on commit d9a3270

Please sign in to comment.