Skip to content

Commit

Permalink
Fix concat bug with utf8c8 strings, flattening utf8c8 synths
Browse files Browse the repository at this point in the history
There was a bug when a string containing utf8-c8 sythetics was
concatenated, it could cause the utf8-c8 synthetics to turn into multiple
codepoints instead. Alter the codepoint iterators so they will pass back
the utf8-c8 synthetics unchanged if supplied an argument to the init
function.
  • Loading branch information
samcv committed Sep 20, 2017
1 parent 3a2eab5 commit 2f71945
Show file tree
Hide file tree
Showing 12 changed files with 79 additions and 38 deletions.
2 changes: 1 addition & 1 deletion src/strings/ascii.c
Expand Up @@ -163,7 +163,7 @@ char * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMu
else {
MVMuint32 i = 0;
MVMCodepointIter ci;
MVM_string_ci_init(tc, &ci, str, translate_newlines);
MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
if (i == result_alloc) {
Expand Down
46 changes: 32 additions & 14 deletions src/strings/iter.h
Expand Up @@ -187,24 +187,34 @@ struct MVMCodepointIter {
MVMCodepoint base_code;
/* If we should translate newline \n into \r\n. */
MVMint32 translate_newlines;
/* Used to pass through utf8-c8 synthetics, but not any others so we can
* renomalize text without getting rid of utf8-c8 synthetics */
MVMint32 pass_utfc8_synthetics;

};

/* Initializes a code point iterator. */
MVM_STATIC_INLINE void MVM_string_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMString *s,
MVMint32 translate_newlines) {
MVMint32 translate_newlines, MVMint32 pass_utfc8_synthetics) {
/* Initialize our underlying grapheme iterator. */
MVM_string_gi_init(tc, &(ci->gi), s);

/* We've no currently active synthetic codepoint (and other fields are
* unused until we do, so leave them alone for now). */
ci->synth_codes = NULL;
ci->translate_newlines = translate_newlines;
ci->synth_codes = NULL;
ci->translate_newlines = translate_newlines;
ci->pass_utfc8_synthetics = pass_utfc8_synthetics;
};
/* Iterates on a grapheme. Returns the number of codepoints in the grapheme */
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g) {
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g, MVMint32 pass_utfc8_synthetics) {
MVMNFGSynthetic *synth = NULL;
if (g < 0) {
/* Get the synthetics info. */
MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
synth = MVM_nfg_get_synthetic_info(tc, g);
}
/* If we got a synth, but *not* if we are supposed to pass utf8-c8 synthetics
* and it is a utf8-c8 synthetic */
if (synth && !(pass_utfc8_synthetics && synth->is_utf8_c8)) {
/* Set up the iterator so in the next iteration we will start to
* hand back codepoints after the initial */
/* TODO: for now assumes synthetics start 1 after the first codepoint */
Expand All @@ -222,6 +232,7 @@ MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc
}
return ci->total_synth_codes + 1;
}
/* Only for string_grapheme_ci ops */
MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) {
MVMCodepoint result = ci->visited_synth_codes < 0
? ci->base_code
Expand All @@ -236,6 +247,7 @@ MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadCon
MVM_STATIC_INLINE MVMint32 MVM_string_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
return ci->synth_codes || MVM_string_gi_has_more(tc, &(ci->gi));
}
/* Only for use with string_grapheme_ci ops */
MVM_STATIC_INLINE MVMint32 MVM_string_grapheme_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
return ci->visited_synth_codes < ci->total_synth_codes;
}
Expand Down Expand Up @@ -270,16 +282,22 @@ MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc,
else {
/* It's a synthetic. Look it up. */
MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
/* If we have pass_utfc8_synthetics set and it's a utf_c8 codepoint
* pass it back unchanged */
if (ci->pass_utfc8_synthetics && synth->is_utf8_c8) {
result = g;
}
else {
/* Set up the iterator so in the next iteration we will start to
* hand back codepoints. */
ci->synth_codes = synth->codes + 1;
ci->visited_synth_codes = 0;
/* Emulate num_combs and subtract one from num_codes */
ci->total_synth_codes = synth->num_codes - 1;

/* Set up the iterator so in the next iteration we will start to
* hand back codepoints. */
ci->synth_codes = synth->codes + 1;
ci->visited_synth_codes = 0;
/* Emulate num_combs and subtract one from num_codes */
ci->total_synth_codes = synth->num_codes - 1;

/* Result is the first codepoint of the `codes` array */
result = synth->codes[0];
/* Result is the first codepoint of the `codes` array */
result = synth->codes[0];
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/strings/latin1.c
Expand Up @@ -172,7 +172,7 @@ char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVM
else {
MVMuint32 i = 0;
MVMCodepointIter ci;
MVM_string_ci_init(tc, &ci, str, translate_newlines);
MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
if (i == result_alloc) {
Expand Down
17 changes: 13 additions & 4 deletions src/strings/normalize.c
Expand Up @@ -154,7 +154,7 @@ void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNor
result_pos = 0;

/* Create codepoint iterator. */
MVM_string_ci_init(tc, &ci, s, 0);
MVM_string_ci_init(tc, &ci, s, 0, 0);

/* If we want NFC, just iterate, since NFG is constructed out of NFC. */
if (form == MVM_NORMALIZE_NFC) {
Expand Down Expand Up @@ -524,14 +524,23 @@ static MVMint32 is_grapheme_prepend(MVMThreadContext *tc, MVMCodepoint cp) {
* we are checking two arbitrary codepoints. If we are normalizing linearly from
* the start of the string this has no more significance than returning 1) */
MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm) {
int GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
int GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
int GCB_a, GCB_b;

/* Don't break between \r and \n, but otherwise break around \r. */
if (a == 0x0D && b == 0x0A)
return 0;
if (a == 0x0D || b == 0x0D)
return 1;

/* For utf8-c8 graphemes. These we can't request property values and act like
* control's */
if (a < 0 || b < 0) {
if ((a < 0 && MVM_nfg_get_synthetic_info(tc, a)->is_utf8_c8) || (b < 0 && MVM_nfg_get_synthetic_info(tc, b)->is_utf8_c8)) {
return 0;
}
MVM_exception_throw_adhoc(tc, "Internal error: synthetic grapheme found when computing grapheme segmentation");
}
GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
switch (GCB_a) {
case MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR:
if (2 <= norm->regional_indicator) {
Expand Down
12 changes: 11 additions & 1 deletion src/strings/normalize.h
Expand Up @@ -80,9 +80,19 @@ MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadCon
* far in normalized form without having to consider them into the
* normalization process. The exception is if we're computing NFG, and
* we got \r, which can form a grapheme in the case of \r\n. */
if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD)
if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD) {
/* For utf8-c8 synthetic graphemes. May be able to be removed after
* changing and further testing of the TODO marked below. */
if (in < 0) {
if (MVM_nfg_get_synthetic_info(tc, in)->is_utf8_c8)
return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
MVM_exception_throw_adhoc(tc, "Internal error: encountered non-utf8-c8 synthetic during normalization");
}
/* TODO this does not seem to do what the comment above shows. Likely
* needs changing */
if (!(MVM_NORMALIZE_GRAPHEME(n->form) && in == 0x0D))
return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
}

/* Fast-paths apply when the codepoint to consider is too low to have any
* interesting properties in the target normalization form AND
Expand Down
15 changes: 8 additions & 7 deletions src/strings/ops.c
Expand Up @@ -257,10 +257,10 @@ static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
* than the initial estimate, but utf8-c8 showed us otherwise. */
MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
MVMint64 out_pos = 0;

/* Iterate codepoints and normalizer. */
MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
MVM_string_ci_init(tc, &ci, in, 0);
/* Codepoint iterator that passes back utf8-c8 graphemes unchanged */
MVM_string_ci_init(tc, &ci, in, 0, 1);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMGrapheme32 g;
ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
Expand Down Expand Up @@ -649,16 +649,17 @@ MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString
};
MVMROOT(tc, a, {
MVMROOT(tc, b, {
/* Needing both to be CCC = 0 can probably be relaxed some, but be careful optimizing */
/* If both are not synthetics, we can can pass those values unchanged
* instead of iterating by codepoint */
if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) {
renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2);
consumed_a = 1; consumed_b = 1;
}
else {
MVMCodepointIter last_a_ci;
MVMCodepointIter first_b_ci;
MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci, last_a_first_b[0]);
MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1]);
MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci, last_a_first_b[0], 1);
MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1], 1);
/* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */
MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint));
MVMuint32 i = 0;
Expand Down Expand Up @@ -2002,8 +2003,8 @@ MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
* iterate the codepoints inside it */
if (g_a < 0 || g_b < 0) {
MVMCodepointIter ci_a, ci_b;
MVM_string_grapheme_ci_init(tc, &ci_a, g_a);
MVM_string_grapheme_ci_init(tc, &ci_b, g_b);
MVM_string_grapheme_ci_init(tc, &ci_a, g_a, 0);
MVM_string_grapheme_ci_init(tc, &ci_b, g_b, 0);
while (MVM_string_grapheme_ci_has_more(tc, &ci_a) && MVM_string_grapheme_ci_has_more(tc, &ci_b)) {
g_a = MVM_string_grapheme_ci_get_codepoint(tc, &ci_a);
g_b = MVM_string_grapheme_ci_get_codepoint(tc, &ci_b);
Expand Down
4 changes: 3 additions & 1 deletion src/strings/ops.h
Expand Up @@ -47,7 +47,9 @@ MVM_STATIC_INLINE MVMuint32 MVM_string_codes(MVMThreadContext *tc, MVMString *s)
MVM_string_check_arg(tc, s, "codes");
if (MVM_string_graphs_nocheck(tc, s) == 0)
return 0;
MVM_string_ci_init(tc, &ci, s, 0);
/* Maybe we should pass back utf-c8 graphemes unchanged so they count as 1
* codepoint each? For now we flatten utf8-c8 */
MVM_string_ci_init(tc, &ci, s, 0, 0);
while(MVM_string_ci_has_more(tc, &ci)) {
MVM_string_ci_get_codepoint(tc, &ci);
codes++;
Expand Down
2 changes: 1 addition & 1 deletion src/strings/parse_num.c
Expand Up @@ -281,7 +281,7 @@ MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) {
MVMCodepointIter ci;
MVMCodepoint cp;
MVMnum64 n = 123;
MVM_string_ci_init(tc, &ci, s, 0);
MVM_string_ci_init(tc, &ci, s, 0, 0);

if (get_cp(tc, &ci, &cp)) return 0;

Expand Down
11 changes: 6 additions & 5 deletions src/strings/unicode_ops.c
Expand Up @@ -556,15 +556,16 @@ MVMint64 MVM_unicode_string_compare(MVMThreadContext *tc, MVMString *a, MVMStrin
if (alen == 0 || blen == 0)
return collation_return_by_length(tc, alen, blen, collation_mode);

/* Initialize a codepoint iterator */
/* Initialize a codepoint iterator
* For now we decompose utf8-c8 synthetics. Eventually we may want to pass
* them back and choose some way to generate sorting info for them, similar
* to how Unassigned codepoints are dealt with */
MVMROOT(tc, a_ci, {
MVM_string_ci_init(tc, &a_ci, a, 0);
MVM_string_ci_init(tc, &a_ci, a, 0, 0);
MVMROOT(tc, b_ci, {
MVM_string_ci_init(tc, &b_ci, b, 0);
MVM_string_ci_init(tc, &b_ci, b, 0, 0);
});
});
MVM_string_ci_init(tc, &a_ci, a, 0);
MVM_string_ci_init(tc, &b_ci, b, 0);
init_ringbuffer(tc, &buf_a);
init_ringbuffer(tc, &buf_b);
/* The ring buffers hold the exact number of codepoints which comprise the longest
Expand Down
2 changes: 1 addition & 1 deletion src/strings/utf16.c
Expand Up @@ -124,7 +124,7 @@ char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMu
alloc_size = lengthu * 2;
result = MVM_malloc(alloc_size + 2);
result_pos = result;
MVM_string_ci_init(tc, &ci, str, translate_newlines);
MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
int bytes_needed;
MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);
Expand Down
2 changes: 1 addition & 1 deletion src/strings/utf8.c
Expand Up @@ -551,7 +551,7 @@ char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
result_pos = 0;

/* Iterate the codepoints and encode them. */
MVM_string_ci_init(tc, &ci, str, translate_newlines);
MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMint32 bytes;
MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);
Expand Down
2 changes: 1 addition & 1 deletion src/strings/windows1252.c
Expand Up @@ -284,7 +284,7 @@ char * MVM_string_windows1252_encode_substr(MVMThreadContext *tc, MVMString *str
else {
MVMuint32 i = 0;
MVMCodepointIter ci;
MVM_string_ci_init(tc, &ci, str, translate_newlines);
MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
if (i == result_alloc) {
Expand Down

0 comments on commit 2f71945

Please sign in to comment.