From b0151bf30391e1696bafebcd07da2b1a2f4e02d2 Mon Sep 17 00:00:00 2001 From: Thomas Marks Date: Wed, 18 Nov 2020 20:05:55 -0500 Subject: [PATCH 1/6] Fix extended emoji + zwj combo --- utf8proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utf8proc.c b/utf8proc.c index 6591976..2a4ff4b 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -310,6 +310,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t else *state = tbc; } + else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ) ++ *state = UTF8PROC_BOUNDCLASS_E_ZWG; else *state = tbc; } From 7279d0c0a0ed16bb6e162a2b66d1042b336efa53 Mon Sep 17 00:00:00 2001 From: Thomas Marks Date: Wed, 18 Nov 2020 23:51:06 -0500 Subject: [PATCH 2/6] Patch initial repeated regional flags and extended+zwj emoj --- test/graphemetest.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ utf8proc.c | 4 +++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/test/graphemetest.c b/test/graphemetest.c index 08330fd..804524d 100644 --- a/test/graphemetest.c +++ b/test/graphemetest.c @@ -80,5 +80,51 @@ int main(int argc, char **argv) free(g); }; + /*https://github.com/JuliaLang/julia/issues/37680*/ + { + // Two swedish flags after each other + utf8proc_int32_t double_sweden[] = { + 0x0001f1f8, 0x0001f1ea, 0x0001f1f8, 0x0001f1ea + }; + // facepalm + pale skin + zwj + male sign + FE0F + utf8proc_int32_t facepalm[] ={ + 0x0001f926, 0x0001f3fc, 0x0000200d, 0x00002642, 0x0000fe0f + }; + // man face + pale skin + zwj + hand holding + zwj + man face + dark skin + utf8proc_int32_t family[] = { + 0x0001f468, 0x0001f3fb, 0x0000200d, 0x00001f91d, 0x0000200d, 0x0001f468, 0x0001f3fd + }; + bool expected_double_sweden[] = {false, true, false}; + bool expected_facepalm[] = {false, false, false, false}; + bool expected_family[] = {false, false, false, false, false, false}; + bool results_double_sweden[4]; + bool results_facepalm[5]; + bool results_family[6]; + + utf8proc_int32_t state = 0; + for (int i = 0; i < 3; i++) { + utf8proc_int32_t c1 = double_sweden[i]; + utf8proc_int32_t c2 = double_sweden[i+1]; + results_double_sweden[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); + check(results_double_sweden[i] == expected_double_sweden[i], "Incorrect grapheme break on initial repeated flags"); + } + + state = 0; + for (int i = 0; i < 4; i++) { + utf8proc_int32_t c1 = facepalm[i]; + utf8proc_int32_t c2 = facepalm[i+1]; + results_facepalm[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); + check(results_facepalm[i] == expected_facepalm[i], "Incorrect grapheme break on initial extended + zwj emoji"); + } + + state = 0; + for (int i = 0; i < 5; i++) { + utf8proc_int32_t c1 = family[i]; + utf8proc_int32_t c2 = family[i+1]; + results_family[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); + check(results_family[i] == expected_family[i], "Incorrect grapheme break on initial extended + zwj emoji"); + } + } + return 0; } diff --git a/utf8proc.c b/utf8proc.c index 2a4ff4b..b8453d1 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -301,6 +301,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t // forbidden by a different rule such as GB9). if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) *state = UTF8PROC_BOUNDCLASS_OTHER; + else if (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) + *state = UTF8PROC_BOUNDCLASS_OTHER; // Special support for GB11 (emoji extend* zwj / emoji) else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji @@ -311,7 +313,7 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state = tbc; } else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ) -+ *state = UTF8PROC_BOUNDCLASS_E_ZWG; + *state = UTF8PROC_BOUNDCLASS_E_ZWG; else *state = tbc; } From 900c5541599a30adf017dccef97c8cfa1594dea6 Mon Sep 17 00:00:00 2001 From: Thomas Marks Date: Thu, 19 Nov 2020 12:59:33 -0500 Subject: [PATCH 3/6] Merge conditions for setting breaks bt region --- utf8proc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/utf8proc.c b/utf8proc.c index b8453d1..6d8832f 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -299,10 +299,9 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break // after that character according to GB999 (unless of course such a break is // forbidden by a different rule such as GB9). - if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) - *state = UTF8PROC_BOUNDCLASS_OTHER; - else if (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) - *state = UTF8PROC_BOUNDCLASS_OTHER; + if (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && + (*state == tbc || (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR))) + *state = UTF8PROC_BOUNDCLASS_OTHER; // Special support for GB11 (emoji extend* zwj / emoji) else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji From b3bf5fdfe116a152693eb2a697b3cd28da09021a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sun, 22 Nov 2020 17:04:47 -0500 Subject: [PATCH 4/6] updated fix --- utf8proc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/utf8proc.c b/utf8proc.c index 6d8832f..5a9fbf3 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -290,8 +290,11 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) { - int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START) - ? *state : lbc); + int lbc_override; + if (*state == UTF8PROC_BOUNDCLASS_START) + *state = lbc_override = lbc; + else + lbc_override = *state; utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc); if (state) { // Special support for GB 12/13 made possible by GB999. After two RI @@ -299,9 +302,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break // after that character according to GB999 (unless of course such a break is // forbidden by a different rule such as GB9). - if (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && - (*state == tbc || (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR))) - *state = UTF8PROC_BOUNDCLASS_OTHER; + if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) + *state = UTF8PROC_BOUNDCLASS_OTHER; // Special support for GB11 (emoji extend* zwj / emoji) else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji @@ -311,8 +313,6 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t else *state = tbc; } - else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ) - *state = UTF8PROC_BOUNDCLASS_E_ZWG; else *state = tbc; } From d6ba5f6619b0401cdf1092036e35a1eba28b4207 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 23 Nov 2020 11:38:49 -0500 Subject: [PATCH 5/6] perform tests for both utf8proc_map and manual calls to utf8proc_grapheme_break_stateful --- test/graphemetest.c | 93 +++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/test/graphemetest.c b/test/graphemetest.c index 804524d..ea9d29e 100644 --- a/test/graphemetest.c +++ b/test/graphemetest.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) --si; /* no break after final grapheme */ src[si] = 0; /* NUL-terminate */ - if (si) { + if (si) { /* test utf8proc_map */ utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ size_t i = 0, j = 0; utf8proc_ssize_t glen, k; @@ -65,6 +65,27 @@ int main(int argc, char **argv) } free(g); } + + if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ + utf8proc_int32_t state = 0, prev_codepoint = 0; + size_t i = 0; + utf8proc_bool expectbreak = false; + do { + utf8proc_int32_t codepoint; + i += utf8proc_iterate(src + i, si - i, &codepoint); + check(codepoint >= 0, "invalid UTF-8 data"); + if (codepoint == 0x002F) + expectbreak = true; + else { + if (prev_codepoint != 0) { + check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), + "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); + } + expectbreak = false; + prev_codepoint = codepoint; + } + } while (i < si); + } } fclose(f); printf("Passed tests after %zd lines!\n", lineno); @@ -97,34 +118,56 @@ int main(int argc, char **argv) bool expected_double_sweden[] = {false, true, false}; bool expected_facepalm[] = {false, false, false, false}; bool expected_family[] = {false, false, false, false, false, false}; - bool results_double_sweden[4]; - bool results_facepalm[5]; - bool results_family[6]; - - utf8proc_int32_t state = 0; - for (int i = 0; i < 3; i++) { - utf8proc_int32_t c1 = double_sweden[i]; - utf8proc_int32_t c2 = double_sweden[i+1]; - results_double_sweden[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); - check(results_double_sweden[i] == expected_double_sweden[i], "Incorrect grapheme break on initial repeated flags"); - } - state = 0; - for (int i = 0; i < 4; i++) { - utf8proc_int32_t c1 = facepalm[i]; - utf8proc_int32_t c2 = facepalm[i+1]; - results_facepalm[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); - check(results_facepalm[i] == expected_facepalm[i], "Incorrect grapheme break on initial extended + zwj emoji"); - } + utf8proc_int32_t *test_codepoints[] = {double_sweden, facepalm, family}; + bool *test_expected[] = {expected_double_sweden, expected_facepalm, expected_family}; + size_t itest, test_len[] = {4, 5, 6}; + + for (itest = 0; itest < sizeof(test_len) / sizeof(size_t); ++itest) { + utf8proc_uint8_t test_str[256]; + utf8proc_int32_t j = 0, state = 0; + size_t i, break_count = 0; + utf8proc_ssize_t glen, gi; + utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ + + for (i = 0; i < test_len[itest]; ++i) + j += utf8proc_encode_char(test_codepoints[itest][i], test_str+j); + test_str[j] = 0; - state = 0; - for (int i = 0; i < 5; i++) { - utf8proc_int32_t c1 = family[i]; - utf8proc_int32_t c2 = family[i+1]; - results_family[i] = utf8proc_grapheme_break_stateful(c1, c2, &state); - check(results_family[i] == expected_family[i], "Incorrect grapheme break on initial extended + zwj emoji"); + printf("grapheme regression test for \"%s\"...\n", (char*) test_str); + + /* test manual utf8proc_grapheme_break_stateful calls: */ + for (i = 0; i < test_len[itest]-1; ++i) { + utf8proc_int32_t c1 = test_codepoints[itest][i]; + utf8proc_int32_t c2 = test_codepoints[itest][i+1]; + bool break_found = utf8proc_grapheme_break_stateful(c1, c2, &state); + break_count += break_found; + check(break_found == test_expected[itest][i], + "incorrect grapheme between 0x%04x and 0x%04x in \"%s\"", c1, c2, (char*) test_str); + } + + /* test utf8proc_map: */ + glen = utf8proc_map(test_str, j, &g, UTF8PROC_CHARBOUND); + check(glen > 0 && g[0] == 0xff, "invalid UTF-8 in test"); + for (gi = 0; gi < glen; ++gi) + g[gi] = g[gi] == 0xff ? '/' : g[gi]; /* easier to debug with /, for printing */ + gi = i = 0; + while (gi < glen && i < test_len[itest]) { + utf8proc_int32_t c; + gi += g[gi] == '/'; + gi += utf8proc_iterate(g+gi, glen - gi, &c); /* skip first char */ + if (i < test_len[itest]-1) + check(test_expected[itest][i] == (g[gi] == '/'), "incorrect break after 0x%04x in \"%s\"", c, (char*)test_str); + else + check(g[gi] == 0, "missing null terminator"); + ++i; + } + check(gi == glen && i == test_len[itest], "length mismatch %d/%d vs. %d in \"%s\" test", (int)gi, (int)glen, (int)i, (char*)test_str); + free(g); } } + printf("Passed regression tests!\n"); + return 0; } From 3965f0b6d677723826ce3c956880b52bde6a975f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 23 Nov 2020 12:15:36 -0500 Subject: [PATCH 6/6] consolidate tests --- test/graphemetest.c | 235 ++++++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 142 deletions(-) diff --git a/test/graphemetest.c b/test/graphemetest.c index ea9d29e..95e7dc0 100644 --- a/test/graphemetest.c +++ b/test/graphemetest.c @@ -1,95 +1,107 @@ #include "tests.h" +/* check one line in the format of GraphemeBreakTest.txt */ +void checkline(const char *_buf, bool verbose) { + size_t bi = 0, si = 0; + utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */ + const unsigned char *buf = (const unsigned char *) _buf; + + while (buf[bi]) { + bi = skipspaces(buf, bi); + if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */ + src[si++] = '/'; + bi += 2; + } + else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */ + bi += 2; + } + else if (buf[bi] == '#') { /* start of comments */ + break; + } + else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */ + src[si++] = '/'; + bi += 1; + } + else { /* hex-encoded codepoint */ + size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; + while (src[si]) ++si; /* advance to NUL termination */ + bi += len; + } + } + if (si && src[si-1] == '/') + --si; /* no break after final grapheme */ + src[si] = 0; /* NUL-terminate */ + + if (si) { /* test utf8proc_map */ + utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ + size_t i = 0, j = 0; + utf8proc_ssize_t glen, k; + utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ + while (i < si) { + if (src[i] != '/') + utf8[j++] = src[i++]; + else + i++; + } + glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); + if (glen == UTF8PROC_ERROR_INVALIDUTF8) { + /* the test file contains surrogate codepoints, which are only for UTF-16 */ + printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); + } + else { + check(glen >= 0, "utf8proc_map error = %s", + utf8proc_errmsg(glen)); + for (k = 0; k <= glen; ++k) + if (g[k] == 0xff) + g[k] = '/'; /* easier-to-read output (/ is not in test strings) */ + check(!strcmp((char*)g, (char*)src), + "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); + } + free(g); + } + + if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ + utf8proc_int32_t state = 0, prev_codepoint = 0; + size_t i = 0; + utf8proc_bool expectbreak = false; + do { + utf8proc_int32_t codepoint; + i += utf8proc_iterate(src + i, si - i, &codepoint); + check(codepoint >= 0, "invalid UTF-8 data"); + if (codepoint == 0x002F) + expectbreak = true; + else { + if (prev_codepoint != 0) { + check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), + "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); + } + expectbreak = false; + prev_codepoint = codepoint; + } + } while (i < si); + } + + if (verbose) + printf("passed grapheme test: \"%s\"\n", (char*) src); +} + int main(int argc, char **argv) { unsigned char buf[8192]; FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; - utf8proc_uint8_t src[1024]; check(f != NULL, "error opening GraphemeBreakTest.txt"); while (simple_getline(buf, f) > 0) { - size_t bi = 0, si = 0; - lineno += 1; - - if (lineno % 100 == 0) + if ((++lineno) % 100 == 0) printf("checking line %zd...\n", lineno); - if (buf[0] == '#') continue; - - while (buf[bi]) { - bi = skipspaces(buf, bi); - if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */ - src[si++] = '/'; - bi += 2; - } - else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */ - bi += 2; - } - else if (buf[bi] == '#') { /* start of comments */ - break; - } - else { /* hex-encoded codepoint */ - size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; - while (src[si]) ++si; /* advance to NUL termination */ - bi += len; - } - } - if (si && src[si-1] == '/') - --si; /* no break after final grapheme */ - src[si] = 0; /* NUL-terminate */ - - if (si) { /* test utf8proc_map */ - utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ - size_t i = 0, j = 0; - utf8proc_ssize_t glen, k; - utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ - while (i < si) { - if (src[i] != '/') - utf8[j++] = src[i++]; - else - i++; - } - glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); - if (glen == UTF8PROC_ERROR_INVALIDUTF8) { - /* the test file contains surrogate codepoints, which are only for UTF-16 */ - printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); - } - else { - check(glen >= 0, "utf8proc_map error = %s", - utf8proc_errmsg(glen)); - for (k = 0; k <= glen; ++k) - if (g[k] == 0xff) - g[k] = '/'; /* easier-to-read output (/ is not in test strings) */ - check(!strcmp((char*)g, (char*)src), - "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); - } - free(g); - } - - if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ - utf8proc_int32_t state = 0, prev_codepoint = 0; - size_t i = 0; - utf8proc_bool expectbreak = false; - do { - utf8proc_int32_t codepoint; - i += utf8proc_iterate(src + i, si - i, &codepoint); - check(codepoint >= 0, "invalid UTF-8 data"); - if (codepoint == 0x002F) - expectbreak = true; - else { - if (prev_codepoint != 0) { - check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), - "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); - } - expectbreak = false; - prev_codepoint = codepoint; - } - } while (i < si); - } + checkline((char *) buf, false); } fclose(f); printf("Passed tests after %zd lines!\n", lineno); + printf("Performing regression tests...\n"); + /* issue 144 */ { utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */ @@ -101,71 +113,10 @@ int main(int argc, char **argv) free(g); }; - /*https://github.com/JuliaLang/julia/issues/37680*/ - { - // Two swedish flags after each other - utf8proc_int32_t double_sweden[] = { - 0x0001f1f8, 0x0001f1ea, 0x0001f1f8, 0x0001f1ea - }; - // facepalm + pale skin + zwj + male sign + FE0F - utf8proc_int32_t facepalm[] ={ - 0x0001f926, 0x0001f3fc, 0x0000200d, 0x00002642, 0x0000fe0f - }; - // man face + pale skin + zwj + hand holding + zwj + man face + dark skin - utf8proc_int32_t family[] = { - 0x0001f468, 0x0001f3fb, 0x0000200d, 0x00001f91d, 0x0000200d, 0x0001f468, 0x0001f3fd - }; - bool expected_double_sweden[] = {false, true, false}; - bool expected_facepalm[] = {false, false, false, false}; - bool expected_family[] = {false, false, false, false, false, false}; - - utf8proc_int32_t *test_codepoints[] = {double_sweden, facepalm, family}; - bool *test_expected[] = {expected_double_sweden, expected_facepalm, expected_family}; - size_t itest, test_len[] = {4, 5, 6}; - - for (itest = 0; itest < sizeof(test_len) / sizeof(size_t); ++itest) { - utf8proc_uint8_t test_str[256]; - utf8proc_int32_t j = 0, state = 0; - size_t i, break_count = 0; - utf8proc_ssize_t glen, gi; - utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ - - for (i = 0; i < test_len[itest]; ++i) - j += utf8proc_encode_char(test_codepoints[itest][i], test_str+j); - test_str[j] = 0; - - printf("grapheme regression test for \"%s\"...\n", (char*) test_str); - - /* test manual utf8proc_grapheme_break_stateful calls: */ - for (i = 0; i < test_len[itest]-1; ++i) { - utf8proc_int32_t c1 = test_codepoints[itest][i]; - utf8proc_int32_t c2 = test_codepoints[itest][i+1]; - bool break_found = utf8proc_grapheme_break_stateful(c1, c2, &state); - break_count += break_found; - check(break_found == test_expected[itest][i], - "incorrect grapheme between 0x%04x and 0x%04x in \"%s\"", c1, c2, (char*) test_str); - } - - /* test utf8proc_map: */ - glen = utf8proc_map(test_str, j, &g, UTF8PROC_CHARBOUND); - check(glen > 0 && g[0] == 0xff, "invalid UTF-8 in test"); - for (gi = 0; gi < glen; ++gi) - g[gi] = g[gi] == 0xff ? '/' : g[gi]; /* easier to debug with /, for printing */ - gi = i = 0; - while (gi < glen && i < test_len[itest]) { - utf8proc_int32_t c; - gi += g[gi] == '/'; - gi += utf8proc_iterate(g+gi, glen - gi, &c); /* skip first char */ - if (i < test_len[itest]-1) - check(test_expected[itest][i] == (g[gi] == '/'), "incorrect break after 0x%04x in \"%s\"", c, (char*)test_str); - else - check(g[gi] == 0, "missing null terminator"); - ++i; - } - check(gi == glen && i == test_len[itest], "length mismatch %d/%d vs. %d in \"%s\" test", (int)gi, (int)glen, (int)i, (char*)test_str); - free(g); - } - } + /* https://github.com/JuliaLang/julia/issues/37680 */ + checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */ + checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */ + checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */ printf("Passed regression tests!\n");