Fix concat bug with utf8c8 strings, flattening utf8c8 synths

There was a bug when a string containing utf8-c8 sythetics was concatenated, it could cause the utf8-c8 synthetics to turn into multiple codepoints instead. Alter the codepoint iterators so they will pass back the utf8-c8 synthetics unchanged if supplied an argument to the init function.
MoarVM · Sep 20, 2017 · 2f71945 · 2f71945
1 parent 3a2eab5
commit 2f71945
Show file tree

Hide file tree

Showing 12 changed files with 79 additions and 38 deletions.
diff --git a/src/strings/ascii.c b/src/strings/ascii.c
@@ -163,7 +163,7 @@ char * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMu
     else {
         MVMuint32 i = 0;
         MVMCodepointIter ci;
-        MVM_string_ci_init(tc, &ci, str, translate_newlines);
+        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
         while (MVM_string_ci_has_more(tc, &ci)) {
             MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
             if (i == result_alloc) {

diff --git a/src/strings/iter.h b/src/strings/iter.h
@@ -187,24 +187,34 @@ struct MVMCodepointIter {
     MVMCodepoint   base_code;
     /* If we should translate newline \n into \r\n. */
     MVMint32       translate_newlines;
+    /* Used to pass through utf8-c8 synthetics, but not any others so we can
+     * renomalize text without getting rid of utf8-c8 synthetics */
+    MVMint32       pass_utfc8_synthetics;
+
 };
 
 /* Initializes a code point iterator. */
 MVM_STATIC_INLINE void MVM_string_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMString *s,
-        MVMint32 translate_newlines) {
+        MVMint32 translate_newlines, MVMint32 pass_utfc8_synthetics) {
     /* Initialize our underlying grapheme iterator. */
     MVM_string_gi_init(tc, &(ci->gi), s);
 
     /* We've no currently active synthetic codepoint (and other fields are
      * unused until we do, so leave them alone for now). */
-    ci->synth_codes = NULL;
-    ci->translate_newlines = translate_newlines;
+    ci->synth_codes           = NULL;
+    ci->translate_newlines    = translate_newlines;
+    ci->pass_utfc8_synthetics = pass_utfc8_synthetics;
 };
 /* Iterates on a grapheme. Returns the number of codepoints in the grapheme */
-MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g) {
+MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g, MVMint32 pass_utfc8_synthetics) {
+    MVMNFGSynthetic *synth = NULL;
     if (g < 0) {
         /* Get the synthetics info. */
-        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
+        synth = MVM_nfg_get_synthetic_info(tc, g);
+    }
+    /* If we got a synth, but *not* if we are supposed to pass utf8-c8 synthetics
+     * and it is a utf8-c8 synthetic */
+    if (synth && !(pass_utfc8_synthetics && synth->is_utf8_c8)) {
         /* Set up the iterator so in the next iteration we will start to
         * hand back codepoints after the initial */
         /* TODO: for now assumes synthetics start 1 after the first codepoint */
@@ -222,6 +232,7 @@ MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc
     }
     return ci->total_synth_codes + 1;
 }
+/* Only for string_grapheme_ci ops */
 MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) {
     MVMCodepoint result = ci->visited_synth_codes < 0
         ? ci->base_code
@@ -236,6 +247,7 @@ MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadCon
 MVM_STATIC_INLINE MVMint32 MVM_string_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
     return ci->synth_codes || MVM_string_gi_has_more(tc, &(ci->gi));
 }
+/* Only for use with string_grapheme_ci ops */
 MVM_STATIC_INLINE MVMint32 MVM_string_grapheme_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
     return ci->visited_synth_codes < ci->total_synth_codes;
 }
@@ -270,16 +282,22 @@ MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc,
         else {
             /* It's a synthetic. Look it up. */
             MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
+            /* If we have pass_utfc8_synthetics set and it's a utf_c8 codepoint
+             * pass it back unchanged */
+            if (ci->pass_utfc8_synthetics && synth->is_utf8_c8) {
+                result = g;
+            }
+            else {
+                /* Set up the iterator so in the next iteration we will start to
+                * hand back codepoints. */
+                ci->synth_codes         = synth->codes + 1;
+                ci->visited_synth_codes = 0;
+                /* Emulate num_combs and subtract one from num_codes */
+                ci->total_synth_codes   = synth->num_codes - 1;
 
-            /* Set up the iterator so in the next iteration we will start to
-            * hand back codepoints. */
-            ci->synth_codes         = synth->codes + 1;
-            ci->visited_synth_codes = 0;
-            /* Emulate num_combs and subtract one from num_codes */
-            ci->total_synth_codes   = synth->num_codes - 1;
-
-            /* Result is the first codepoint of the `codes` array */
-            result = synth->codes[0];
+                /* Result is the first codepoint of the `codes` array */
+                result = synth->codes[0];
+            }
         }
     }
 

diff --git a/src/strings/latin1.c b/src/strings/latin1.c
@@ -172,7 +172,7 @@ char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVM
     else {
         MVMuint32 i = 0;
         MVMCodepointIter ci;
-        MVM_string_ci_init(tc, &ci, str, translate_newlines);
+        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
         while (MVM_string_ci_has_more(tc, &ci)) {
             MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
             if (i == result_alloc) {

diff --git a/src/strings/normalize.c b/src/strings/normalize.c
@@ -154,7 +154,7 @@ void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNor
     result_pos   = 0;
 
     /* Create codepoint iterator. */
-    MVM_string_ci_init(tc, &ci, s, 0);
+    MVM_string_ci_init(tc, &ci, s, 0, 0);
 
     /* If we want NFC, just iterate, since NFG is constructed out of NFC. */
     if (form == MVM_NORMALIZE_NFC) {
@@ -524,14 +524,23 @@ static MVMint32 is_grapheme_prepend(MVMThreadContext *tc, MVMCodepoint cp) {
  * we are checking two arbitrary codepoints. If we are normalizing linearly from
  * the start of the string this has no more significance than returning 1) */
 MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm) {
-    int GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
-    int GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
+    int GCB_a, GCB_b;
+
     /* Don't break between \r and \n, but otherwise break around \r. */
     if (a == 0x0D && b == 0x0A)
         return 0;
     if (a == 0x0D || b == 0x0D)
         return 1;
-
+    /* For utf8-c8 graphemes. These we can't request property values and act like
+     * control's */
+    if (a < 0 || b < 0) {
+        if ((a < 0 && MVM_nfg_get_synthetic_info(tc, a)->is_utf8_c8) || (b < 0 && MVM_nfg_get_synthetic_info(tc, b)->is_utf8_c8)) {
+            return 0;
+        }
+        MVM_exception_throw_adhoc(tc, "Internal error: synthetic grapheme found when computing grapheme segmentation");
+    }
+    GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
+    GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
     switch (GCB_a) {
         case MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR:
             if (2 <= norm->regional_indicator) {

diff --git a/src/strings/normalize.h b/src/strings/normalize.h
@@ -80,9 +80,19 @@ MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadCon
      * far in normalized form without having to consider them into the
      * normalization process. The exception is if we're computing NFG, and
      * we got \r, which can form a grapheme in the case of \r\n. */
-    if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD)
+    if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD) {
+        /* For utf8-c8 synthetic graphemes. May be able to be removed after
+         * changing and further testing of the TODO marked below. */
+        if (in < 0) {
+            if (MVM_nfg_get_synthetic_info(tc, in)->is_utf8_c8)
+                return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
+            MVM_exception_throw_adhoc(tc, "Internal error: encountered non-utf8-c8 synthetic during normalization");
+        }
+        /* TODO this does not seem to do what the comment above shows. Likely
+         * needs changing */
         if (!(MVM_NORMALIZE_GRAPHEME(n->form) && in == 0x0D))
             return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
+    }
 
     /* Fast-paths apply when the codepoint to consider is too low to have any
      * interesting properties in the target normalization form AND

diff --git a/src/strings/ops.c b/src/strings/ops.c
@@ -257,10 +257,10 @@ static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
      * than the initial estimate, but utf8-c8 showed us otherwise. */
     MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
     MVMint64 out_pos = 0;
-
     /* Iterate codepoints and normalizer. */
     MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
-    MVM_string_ci_init(tc, &ci, in, 0);
+    /* Codepoint iterator that passes back utf8-c8 graphemes unchanged */
+    MVM_string_ci_init(tc, &ci, in, 0, 1);
     while (MVM_string_ci_has_more(tc, &ci)) {
         MVMGrapheme32 g;
         ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
@@ -649,16 +649,17 @@ MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString
         };
         MVMROOT(tc, a, {
         MVMROOT(tc, b, {
-        /* Needing both to be CCC = 0 can probably be relaxed some, but be careful optimizing */
+        /* If both are not synthetics, we can can pass those values unchanged
+         * instead of iterating by codepoint */
         if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) {
             renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2);
             consumed_a = 1; consumed_b = 1;
         }
         else {
             MVMCodepointIter last_a_ci;
             MVMCodepointIter first_b_ci;
-            MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci,  last_a_first_b[0]);
-            MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1]);
+            MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci,  last_a_first_b[0], 1);
+            MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1], 1);
             /* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */
             MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint));
             MVMuint32 i = 0;
@@ -2002,8 +2003,8 @@ MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
              * iterate the codepoints inside it */
             if (g_a < 0 || g_b < 0) {
                 MVMCodepointIter ci_a, ci_b;
-                MVM_string_grapheme_ci_init(tc, &ci_a, g_a);
-                MVM_string_grapheme_ci_init(tc, &ci_b, g_b);
+                MVM_string_grapheme_ci_init(tc, &ci_a, g_a, 0);
+                MVM_string_grapheme_ci_init(tc, &ci_b, g_b, 0);
                 while (MVM_string_grapheme_ci_has_more(tc, &ci_a) && MVM_string_grapheme_ci_has_more(tc, &ci_b)) {
                     g_a = MVM_string_grapheme_ci_get_codepoint(tc, &ci_a);
                     g_b = MVM_string_grapheme_ci_get_codepoint(tc, &ci_b);

diff --git a/src/strings/ops.h b/src/strings/ops.h
@@ -47,7 +47,9 @@ MVM_STATIC_INLINE MVMuint32 MVM_string_codes(MVMThreadContext *tc, MVMString *s)
     MVM_string_check_arg(tc, s, "codes");
     if (MVM_string_graphs_nocheck(tc, s) == 0)
         return 0;
-    MVM_string_ci_init(tc, &ci, s, 0);
+    /* Maybe we should pass back utf-c8 graphemes unchanged so they count as 1
+     * codepoint each? For now we flatten utf8-c8 */
+    MVM_string_ci_init(tc, &ci, s, 0, 0);
     while(MVM_string_ci_has_more(tc, &ci)) {
         MVM_string_ci_get_codepoint(tc, &ci);
         codes++;

diff --git a/src/strings/parse_num.c b/src/strings/parse_num.c
@@ -281,7 +281,7 @@ MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) {
     MVMCodepointIter ci;
     MVMCodepoint cp;
     MVMnum64  n = 123;
-    MVM_string_ci_init(tc, &ci, s, 0);
+    MVM_string_ci_init(tc, &ci, s, 0, 0);
 
     if (get_cp(tc, &ci, &cp)) return 0; 
 

diff --git a/src/strings/unicode_ops.c b/src/strings/unicode_ops.c
@@ -556,15 +556,16 @@ MVMint64 MVM_unicode_string_compare(MVMThreadContext *tc, MVMString *a, MVMStrin
     if (alen == 0 || blen == 0)
         return collation_return_by_length(tc, alen, blen, collation_mode);
 
-    /* Initialize a codepoint iterator */
+    /* Initialize a codepoint iterator
+     * For now we decompose utf8-c8 synthetics. Eventually we may want to pass
+     * them back and choose some way to generate sorting info for them, similar
+     * to how Unassigned codepoints are dealt with */
     MVMROOT(tc, a_ci, {
-        MVM_string_ci_init(tc, &a_ci, a, 0);
+        MVM_string_ci_init(tc, &a_ci, a, 0, 0);
         MVMROOT(tc, b_ci, {
-            MVM_string_ci_init(tc, &b_ci, b, 0);
+            MVM_string_ci_init(tc, &b_ci, b, 0, 0);
         });
     });
-    MVM_string_ci_init(tc, &a_ci, a, 0);
-    MVM_string_ci_init(tc, &b_ci, b, 0);
     init_ringbuffer(tc, &buf_a);
     init_ringbuffer(tc, &buf_b);
     /* The ring buffers hold the exact number of codepoints which comprise the longest

diff --git a/src/strings/utf16.c b/src/strings/utf16.c
@@ -124,7 +124,7 @@ char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMu
     alloc_size = lengthu * 2;
     result = MVM_malloc(alloc_size + 2);
     result_pos = result;
-    MVM_string_ci_init(tc, &ci, str, translate_newlines);
+    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
     while (MVM_string_ci_has_more(tc, &ci)) {
         int bytes_needed;
         MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);

diff --git a/src/strings/utf8.c b/src/strings/utf8.c
@@ -551,7 +551,7 @@ char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
     result_pos   = 0;
 
     /* Iterate the codepoints and encode them. */
-    MVM_string_ci_init(tc, &ci, str, translate_newlines);
+    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
     while (MVM_string_ci_has_more(tc, &ci)) {
         MVMint32 bytes;
         MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);

diff --git a/src/strings/windows1252.c b/src/strings/windows1252.c
@@ -284,7 +284,7 @@ char * MVM_string_windows1252_encode_substr(MVMThreadContext *tc, MVMString *str
     else {
         MVMuint32 i = 0;
         MVMCodepointIter ci;
-        MVM_string_ci_init(tc, &ci, str, translate_newlines);
+        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
         while (MVM_string_ci_has_more(tc, &ci)) {
             MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
             if (i == result_alloc) {