Skip to content

Commit

Permalink
Only use MVMGraphemeIter_cached for strands in KMP index
Browse files Browse the repository at this point in the history
If the Haystack is a strand, use MVM_string_gi_cached_get_grapheme:
unlike MVM_string_get_grapheme_at_nocheck, it retains its grapheme
iterator across invocations and caches the previous grapheme. It is
slower for flat Haystacks, though (ever since I got
MVM_string_get_grapheme_at_nocheck to be inlined).
  • Loading branch information
samcv committed Sep 3, 2017
1 parent fec81bb commit ce76c99
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 7 deletions.
10 changes: 8 additions & 2 deletions src/strings/iter.h
Expand Up @@ -278,6 +278,9 @@ MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc,

return result;
}
/* MVMGraphemeIter_cached is used for the Knuth-Morris-Pratt algorithm,
* which often requests the same grapheme again, whereas our grapheme
* iterators can only return the next grapheme. */
struct MVMGraphemeIter_cached {
MVMGraphemeIter gi;
MVMGrapheme32 last_g;
Expand All @@ -293,7 +296,8 @@ MVM_STATIC_INLINE void MVM_string_gi_cached_init (MVMThreadContext *tc, MVMGraph
gic->string = s;
}
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_cached_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter_cached *gic, MVMint64 index) {
/* Most likely case is we are getting the next grapheme */
/* Most likely case is we are getting the next grapheme. When that happens
* we will go directly to the end. */
if (index == gic->last_location + 1) {
}
/* Second most likely is getting the cached grapheme */
Expand All @@ -309,7 +313,9 @@ MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_cached_get_grapheme(MVMThreadConte
MVM_exception_throw_adhoc(tc, "Internal error: Requested an index %"PRIi64" that was less than the last_location %"PRIu32"",
index, gic->last_location);
/* Not yet tested, but we may be able to access previous graphemes by reinitializing
* MVM_string_gi_cached_init(tc, gic, gic->string, index); */
* MVM_string_gi_cached_init(tc, gic, gic->string, index);
* MVM_string_gi_move_to(tc, &(gic->gi), index);
* MVM_string_gi_get_grapheme(tc, &(gic->gi)); */
}
gic->last_location = index;
return (gic->last_g = MVM_string_gi_get_grapheme(tc, &(gic->gi)));
Expand Down
26 changes: 21 additions & 5 deletions src/strings/ops.c
Expand Up @@ -1041,11 +1041,10 @@ static void knuth_morris_pratt_process_pattern (MVMThreadContext *tc, MVMString
}

static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset) {
MVMGraphemeIter_cached H_gic;
MVMint64 needle_offset = 0;
MVMint64 text_offset = H_offset;
MVMStringIndex Haystack_graphs = MVM_string_graphs(tc, Haystack);
MVMStringIndex needle_graphs = MVM_string_graphs(tc, needle);
MVMStringIndex Haystack_graphs = MVM_string_graphs_nocheck(tc, Haystack);
MVMStringIndex needle_graphs = MVM_string_graphs_nocheck(tc, needle);
MVMGrapheme32 *next = NULL;
MVMString *flat_needle = NULL;
assert(needle_graphs <= MVM_string_KMP_max_pattern_length);
Expand All @@ -1057,12 +1056,29 @@ static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString
flat_needle = needle->body.storage_type == MVM_STRING_STRAND
? collapse_strands(tc, needle)
: needle;
MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
/* Process the needle into a jump table put into variable 'next' */
knuth_morris_pratt_process_pattern(tc, flat_needle, next, needle_graphs);
/* If the Haystack is a strand, use MVM_string_gi_cached_get_grapheme:
* unlike MVM_string_get_grapheme_at_nocheck, it retains its grapheme
* iterator across invocations and caches the previous grapheme. It is
* slower for flat Haystacks, though. */
if (Haystack->body.storage_type == MVM_STRING_STRAND) {
MVMGraphemeIter_cached H_gic;
MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
while (text_offset < Haystack_graphs && needle_offset < needle_graphs) {
if (needle_offset == -1 || MVM_string_get_grapheme_at_nocheck(tc, flat_needle, needle_offset)
== MVM_string_gi_cached_get_grapheme(tc, &H_gic, text_offset)) {
text_offset++; needle_offset++;
if (needle_offset == needle_graphs)
return text_offset - needle_offset;
}
else needle_offset = next[needle_offset];
}
return -1;
}
while (text_offset < Haystack_graphs && needle_offset < needle_graphs) {
if (needle_offset == -1 || MVM_string_get_grapheme_at_nocheck(tc, flat_needle, needle_offset)
== MVM_string_gi_cached_get_grapheme(tc, &H_gic, text_offset)) {
== MVM_string_get_grapheme_at_nocheck(tc, Haystack, text_offset)) {
text_offset++; needle_offset++;
if (needle_offset == needle_graphs)
return text_offset - needle_offset;
Expand Down

0 comments on commit ce76c99

Please sign in to comment.