Permalink
Browse files

Add a UTF-8 decoding fast-path.

In many cases, we are reading things that are already in normal form
thanks to being in the ASCII/Latin-1 range, which we can very cheaply
check. Add a fast path to the UTF-8 decoder for this case. It doesn't
actually skip anything UTF-8 related, but rather takes a shortcut
around the full normalizer. On the benchmark reading a million lines using
the UTF-8 encoding but with everything falling under the fast path, we
save around 3216 million CPU instructions.

When reading line by line, if the line separators are controls, this
mechanism ends up just naturally resetting itself back to trying the
fast path. This means that when reading a file line by line, where just
the odd line has things needing full normalization, we'll still be able to
take the faster path on all of the other lines.
  • Loading branch information...
jnthn committed Jun 16, 2017
1 parent 8972ab2 commit a6abd3c6654413d2230470dbaa82b7b3a2b05762
Showing with 118 additions and 34 deletions.
  1. +118 −34 src/strings/utf8.c
View
@@ -323,12 +323,15 @@ MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds
MVMint32 count = 0, total = 0;
MVMint32 state = 0;
MVMCodepoint codepoint = 0;
MVMCodepoint lag_codepoint = -1;
MVMint32 bufsize;
MVMGrapheme32 *buffer;
MVMDecodeStreamBytes *cur_bytes;
MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
MVMint32 last_accept_pos, ready, at_start;
MVMDecodeStreamBytes *lag_last_accept_bytes;
MVMint32 last_accept_pos, lag_last_accept_pos, ready, at_start;
MVMuint32 reached_stopper;
MVMuint32 can_fast_path;
/* If there's no buffers, we're done. */
if (!ds->bytes_head)
@@ -339,6 +342,10 @@ MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds
if (stopper_chars && *stopper_chars == 0)
return 1;
/* If there's nothing hanging around in the normalization buffer, we can
* use the fast path. */
can_fast_path = MVM_unicode_normalizer_empty(tc, &(ds->norm));
/* Rough starting-size estimate is number of bytes in the head buffer. */
bufsize = ds->bytes_head->length;
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
@@ -363,43 +370,120 @@ MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds
}
at_start = 0;
}
while (pos < cur_bytes->length) {
switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
case UTF8_ACCEPT: {
MVMint32 first = 1;
MVMGrapheme32 g;
last_accept_bytes = cur_bytes;
last_accept_pos = pos;
ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &(ds->norm), codepoint, &g);
while (ready--) {
if (first)
first = 0;
else
g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
if (count == bufsize) {
/* Valid character, but we filled the buffer. Attach this
* one to the buffers linked list, and continue with a new
* one. */
MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
count = 0;
}
buffer[count++] = g;
total++;
if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) {
reached_stopper = 1;
goto done;
/* We have both a fast path and a slow path for UTF-8 decoding. The
* fast path covers the common case where we have no chars that are
* significant to normalization, and so we can skip the normalizer.
* This is true of the ASCII and Latin-1 ranges of UTF-8, with the
* exception of \r. Note that since the following codepoint may be
* the one that causes us to need to compose, we need a lag of 1
* codepoint. */
if (can_fast_path) {
while (pos < cur_bytes->length) {
switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
case UTF8_ACCEPT: {
/* If we hit something that needs the normalizer, we put
* any lagging codepoint into its buffer and jump to it. */
if (codepoint == '\r' || codepoint >= ds->norm.first_significant) {
if (lag_codepoint != -1) {
MVM_unicode_normalizer_push_codepoints(tc, &(ds->norm),
&lag_codepoint, 1);
lag_codepoint = -1; /* Invalidate, we used it. */
}
can_fast_path = 0;
goto slow_path;
}
else if (stopper_chars && *stopper_chars == total) {
reached_stopper = 1;
goto done;
/* If we've a lagging codepoint, and this one does not
* need normalization, then we know we can spit out the
* lagging one. */
if (lag_codepoint != -1) {
last_accept_bytes = lag_last_accept_bytes;
last_accept_pos = lag_last_accept_pos;
if (count == bufsize) {
/* Valid character, but we filled the buffer. Attach this
* one to the buffers linked list, and continue with a new
* one. */
MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
count = 0;
}
buffer[count++] = lag_codepoint;
total++;
if (MVM_string_decode_stream_maybe_sep(tc, seps, lag_codepoint)) {
reached_stopper = 1;
goto done;
}
else if (stopper_chars && *stopper_chars == total) {
reached_stopper = 1;
goto done;
}
}
/* The current state becomes the lagged state. */
lag_codepoint = codepoint;
lag_last_accept_bytes = cur_bytes;
lag_last_accept_pos = pos;
break;
}
break;
case UTF8_REJECT:
MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
break;
}
}
/* If we fall out of the loop and have a lagged codepoint, but
* no next buffer, then we fall into the slow path to process it
* correctly. */
if (lag_codepoint != -1 && !cur_bytes->next) {
codepoint = lag_codepoint;
lag_codepoint = -1;
can_fast_path = 0;
goto slow_path;
}
case UTF8_REJECT:
MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
break;
}
else {
while (pos < cur_bytes->length) {
switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
case UTF8_ACCEPT: {
MVMGrapheme32 g;
MVMint32 first;
slow_path:
first = 1;
last_accept_bytes = cur_bytes;
last_accept_pos = pos;
ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc,
&(ds->norm), codepoint, &g);
while (ready--) {
if (first)
first = 0;
else
g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
if (count == bufsize) {
/* Valid character, but we filled the buffer. Attach this
* one to the buffers linked list, and continue with a new
* one. */
MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
count = 0;
}
buffer[count++] = g;
total++;
if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) {
reached_stopper = 1;
goto done;
}
else if (stopper_chars && *stopper_chars == total) {
reached_stopper = 1;
goto done;
}
}
break;
}
case UTF8_REJECT:
MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
break;
}
}
}
cur_bytes = cur_bytes->next;

0 comments on commit a6abd3c

Please sign in to comment.