From b0151bf30391e1696bafebcd07da2b1a2f4e02d2 Mon Sep 17 00:00:00 2001
From: Thomas Marks <marksta@umich.edu>
Date: Wed, 18 Nov 2020 20:05:55 -0500
Subject: [PATCH 1/6] Fix extended emoji + zwj combo

---
 utf8proc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utf8proc.c b/utf8proc.c
index 6591976..2a4ff4b 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -310,6 +310,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
       else
         *state = tbc;
     }
+    else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ)
++       *state = UTF8PROC_BOUNDCLASS_E_ZWG;     
     else
       *state = tbc;
   }

From 7279d0c0a0ed16bb6e162a2b66d1042b336efa53 Mon Sep 17 00:00:00 2001
From: Thomas Marks <marksta@umich.edu>
Date: Wed, 18 Nov 2020 23:51:06 -0500
Subject: [PATCH 2/6] Patch initial repeated regional flags and extended+zwj
 emoji

---
 test/graphemetest.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
 utf8proc.c          |  4 +++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/test/graphemetest.c b/test/graphemetest.c
index 08330fd..804524d 100644
--- a/test/graphemetest.c
+++ b/test/graphemetest.c
@@ -80,5 +80,51 @@ int main(int argc, char **argv)
         free(g);
     };
 
+    /*https://github.com/JuliaLang/julia/issues/37680*/
+    {
+        // Two swedish flags after each other
+        utf8proc_int32_t double_sweden[] = {
+            0x0001f1f8, 0x0001f1ea, 0x0001f1f8, 0x0001f1ea
+        };
+        // facepalm + pale skin + zwj + male sign + FE0F
+        utf8proc_int32_t facepalm[] ={
+           0x0001f926, 0x0001f3fc, 0x0000200d, 0x00002642, 0x0000fe0f
+        };
+        // man face + pale skin + zwj + hand holding + zwj + man face + dark skin
+        utf8proc_int32_t family[] = {
+            0x0001f468, 0x0001f3fb, 0x0000200d, 0x00001f91d, 0x0000200d, 0x0001f468, 0x0001f3fd
+        };
+        bool expected_double_sweden[] = {false, true, false};
+        bool expected_facepalm[] = {false, false, false, false};
+        bool expected_family[] = {false, false, false, false, false, false};
+        bool results_double_sweden[4];
+        bool results_facepalm[5];
+        bool results_family[6];
+
+        utf8proc_int32_t state = 0;
+        for (int i = 0; i < 3; i++) {
+            utf8proc_int32_t c1 = double_sweden[i];
+            utf8proc_int32_t c2 = double_sweden[i+1];
+            results_double_sweden[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
+            check(results_double_sweden[i] == expected_double_sweden[i], "Incorrect grapheme break on initial repeated flags");
+        }
+
+        state = 0;
+        for (int i = 0; i < 4; i++) {
+            utf8proc_int32_t c1 = facepalm[i];
+            utf8proc_int32_t c2 = facepalm[i+1];
+            results_facepalm[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
+            check(results_facepalm[i] == expected_facepalm[i], "Incorrect grapheme break on initial extended + zwj emoji");
+        }
+
+        state = 0;
+        for (int i = 0; i < 5; i++) {
+            utf8proc_int32_t c1 = family[i];
+            utf8proc_int32_t c2 = family[i+1];
+            results_family[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
+            check(results_family[i] == expected_family[i], "Incorrect grapheme break on initial extended + zwj emoji");
+        }
+    }
+
     return 0;
 }
diff --git a/utf8proc.c b/utf8proc.c
index 2a4ff4b..b8453d1 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -301,6 +301,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
     // forbidden by a different rule such as GB9).
     if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
       *state = UTF8PROC_BOUNDCLASS_OTHER;
+    else if (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
+       *state = UTF8PROC_BOUNDCLASS_OTHER;
     // Special support for GB11 (emoji extend* zwj / emoji)
     else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
@@ -311,7 +313,7 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
         *state = tbc;
     }
     else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ)
-+       *state = UTF8PROC_BOUNDCLASS_E_ZWG;     
+       *state = UTF8PROC_BOUNDCLASS_E_ZWG;
     else
       *state = tbc;
   }

From 900c5541599a30adf017dccef97c8cfa1594dea6 Mon Sep 17 00:00:00 2001
From: Thomas Marks <marksta@umich.edu>
Date: Thu, 19 Nov 2020 12:59:33 -0500
Subject: [PATCH 3/6] Merge conditions for setting breaks between regional indicators

---
 utf8proc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/utf8proc.c b/utf8proc.c
index b8453d1..6d8832f 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -299,10 +299,9 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
     // after that character according to GB999 (unless of course such a break is
     // forbidden by a different rule such as GB9).
-    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
-      *state = UTF8PROC_BOUNDCLASS_OTHER;
-    else if (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
-       *state = UTF8PROC_BOUNDCLASS_OTHER;
+    if (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && 
+        (*state == tbc || (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)))
+        *state = UTF8PROC_BOUNDCLASS_OTHER;
     // Special support for GB11 (emoji extend* zwj / emoji)
     else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji

From b3bf5fdfe116a152693eb2a697b3cd28da09021a Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@mit.edu>
Date: Sun, 22 Nov 2020 17:04:47 -0500
Subject: [PATCH 4/6] updated fix

---
 utf8proc.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/utf8proc.c b/utf8proc.c
index 6d8832f..5a9fbf3 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -290,8 +290,11 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
 
 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
 {
-  int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
-                      ? *state : lbc);
+  int lbc_override = lbc; // state may be NULL (see the `if (state)` guard below): then use lbc as-is
+  if (state && *state == UTF8PROC_BOUNDCLASS_START)
+    *state = lbc; // first call with a fresh state: seed it with the left bound class
+  else if (state)
+    lbc_override = *state; // later calls: the tracked class overrides the raw left class
   utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
   if (state) {
     // Special support for GB 12/13 made possible by GB999. After two RI
@@ -299,9 +302,8 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
     // after that character according to GB999 (unless of course such a break is
     // forbidden by a different rule such as GB9).
-    if (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && 
-        (*state == tbc || (*state == UTF8PROC_BOUNDCLASS_START && lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)))
-        *state = UTF8PROC_BOUNDCLASS_OTHER;
+    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
+      *state = UTF8PROC_BOUNDCLASS_OTHER;
     // Special support for GB11 (emoji extend* zwj / emoji)
     else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
@@ -311,8 +313,6 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
       else
         *state = tbc;
     }
-    else if (*state == UTF8PROC_BOUNDCLASS_EXTEND && tbc == UTF8PROC_BOUNDCLASS_ZWJ)
-       *state = UTF8PROC_BOUNDCLASS_E_ZWG;
     else
       *state = tbc;
   }

From d6ba5f6619b0401cdf1092036e35a1eba28b4207 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Mon, 23 Nov 2020 11:38:49 -0500
Subject: [PATCH 5/6] perform tests for both utf8proc_map and manual calls to
 utf8proc_grapheme_break_stateful

---
 test/graphemetest.c | 93 +++++++++++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/test/graphemetest.c b/test/graphemetest.c
index 804524d..ea9d29e 100644
--- a/test/graphemetest.c
+++ b/test/graphemetest.c
@@ -38,7 +38,7 @@ int main(int argc, char **argv)
             --si; /* no break after final grapheme */
         src[si] = 0; /* NUL-terminate */
 
-        if (si) {
+        if (si) { /* test utf8proc_map */
             utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
             size_t i = 0, j = 0;
             utf8proc_ssize_t glen, k;
@@ -65,6 +65,27 @@ int main(int argc, char **argv)
             }
             free(g);
         }
+
+        if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
+            utf8proc_int32_t state = 0, prev_codepoint = 0;
+            size_t i = 0;
+            utf8proc_bool expectbreak = false;
+            do {
+                utf8proc_int32_t codepoint;
+                i += utf8proc_iterate(src + i, si - i, &codepoint);
+                check(codepoint >= 0, "invalid UTF-8 data");
+                if (codepoint == 0x002F)
+                    expectbreak = true;
+                else {
+                    if (prev_codepoint != 0) {
+                        check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
+                              "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
+                    }
+                    expectbreak = false;
+                    prev_codepoint = codepoint;
+                }
+            } while (i < si);
+        }
     }
     fclose(f);
     printf("Passed tests after %zd lines!\n", lineno);
@@ -97,34 +118,56 @@ int main(int argc, char **argv)
         bool expected_double_sweden[] = {false, true, false};
         bool expected_facepalm[] = {false, false, false, false};
         bool expected_family[] = {false, false, false, false, false, false};
-        bool results_double_sweden[4];
-        bool results_facepalm[5];
-        bool results_family[6];
-
-        utf8proc_int32_t state = 0;
-        for (int i = 0; i < 3; i++) {
-            utf8proc_int32_t c1 = double_sweden[i];
-            utf8proc_int32_t c2 = double_sweden[i+1];
-            results_double_sweden[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
-            check(results_double_sweden[i] == expected_double_sweden[i], "Incorrect grapheme break on initial repeated flags");
-        }
 
-        state = 0;
-        for (int i = 0; i < 4; i++) {
-            utf8proc_int32_t c1 = facepalm[i];
-            utf8proc_int32_t c2 = facepalm[i+1];
-            results_facepalm[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
-            check(results_facepalm[i] == expected_facepalm[i], "Incorrect grapheme break on initial extended + zwj emoji");
-        }
+        utf8proc_int32_t *test_codepoints[] = {double_sweden, facepalm, family};
+        bool *test_expected[] = {expected_double_sweden, expected_facepalm, expected_family};
+        size_t itest, test_len[] = {4, 5, 6};
+
+        for (itest = 0; itest < sizeof(test_len) / sizeof(size_t); ++itest) {
+            utf8proc_uint8_t test_str[256];
+            utf8proc_int32_t j = 0, state = 0;
+            size_t i, break_count = 0;
+            utf8proc_ssize_t glen, gi;
+            utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
+
+            for (i = 0; i < test_len[itest]; ++i)
+                j += utf8proc_encode_char(test_codepoints[itest][i], test_str+j);
+            test_str[j] = 0;
 
-        state = 0;
-        for (int i = 0; i < 5; i++) {
-            utf8proc_int32_t c1 = family[i];
-            utf8proc_int32_t c2 = family[i+1];
-            results_family[i] = utf8proc_grapheme_break_stateful(c1, c2, &state);
-            check(results_family[i] == expected_family[i], "Incorrect grapheme break on initial extended + zwj emoji");
+            printf("grapheme regression test for \"%s\"...\n", (char*) test_str);
+
+            /* test manual utf8proc_grapheme_break_stateful calls: */
+            for (i = 0; i < test_len[itest]-1; ++i) {
+                utf8proc_int32_t c1 = test_codepoints[itest][i];
+                utf8proc_int32_t c2 = test_codepoints[itest][i+1];
+                bool break_found = utf8proc_grapheme_break_stateful(c1, c2, &state);
+                break_count += break_found;
+                check(break_found == test_expected[itest][i],
+                      "incorrect grapheme between 0x%04x and 0x%04x in \"%s\"", c1, c2, (char*) test_str);
+            }
+
+            /* test utf8proc_map: */
+            glen = utf8proc_map(test_str, j, &g, UTF8PROC_CHARBOUND);
+            check(glen > 0 && g[0] == 0xff, "invalid UTF-8 in test");
+            for (gi = 0; gi < glen; ++gi)
+                g[gi] = g[gi] == 0xff ? '/' : g[gi]; /* easier to debug with /, for printing */
+            gi = i = 0;
+            while (gi < glen && i < test_len[itest]) {
+                utf8proc_int32_t c;
+                gi += g[gi] == '/';
+                gi += utf8proc_iterate(g+gi, glen - gi, &c); /* skip first char */
+                if (i < test_len[itest]-1)
+                    check(test_expected[itest][i] == (g[gi] == '/'), "incorrect break after 0x%04x in \"%s\"", c, (char*)test_str);
+                else
+                    check(g[gi] == 0, "missing null terminator");
+                ++i;
+            }
+            check(gi == glen && i == test_len[itest], "length mismatch %d/%d vs. %d in \"%s\" test", (int)gi, (int)glen, (int)i, (char*)test_str);
+            free(g);
         }
     }
 
+    printf("Passed regression tests!\n");
+
     return 0;
 }

From 3965f0b6d677723826ce3c956880b52bde6a975f Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Mon, 23 Nov 2020 12:15:36 -0500
Subject: [PATCH 6/6] consolidate tests

---
 test/graphemetest.c | 235 ++++++++++++++++++--------------------------
 1 file changed, 93 insertions(+), 142 deletions(-)

diff --git a/test/graphemetest.c b/test/graphemetest.c
index ea9d29e..95e7dc0 100644
--- a/test/graphemetest.c
+++ b/test/graphemetest.c
@@ -1,95 +1,107 @@
 #include "tests.h"
 
+/* Check one line in the format of GraphemeBreakTest.txt: build the expected string in src (codepoints as UTF-8, '/' at each expected break), then check it against utf8proc_map with UTF8PROC_CHARBOUND and against manual utf8proc_grapheme_break_stateful calls; if verbose, print a message once the checks are done. */
+void checkline(const char *_buf, bool verbose) {
+    size_t bi = 0, si = 0; /* bi = read index into buf, si = write index into src */
+    utf8proc_uint8_t src[1024]; /* expected result; more than long enough for all of our tests */
+    const unsigned char *buf = (const unsigned char *) _buf;
+
+    while (buf[bi]) {
+        bi = skipspaces(buf, bi);
+        if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
+            src[si++] = '/';
+            bi += 2;
+        }
+        else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
+            bi += 2;
+        }
+        else if (buf[bi] == '#') { /* start of comments */
+            break;
+        }
+        else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
+            src[si++] = '/';
+            bi += 1;
+        }
+        else { /* hex-encoded codepoint */
+            size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; /* NOTE(review): encode() is defined elsewhere; presumably returns bytes of buf consumed plus one - confirm */
+            while (src[si]) ++si; /* advance to NUL termination */
+            bi += len;
+        }
+    }
+    if (si && src[si-1] == '/')
+        --si; /* no break after final grapheme */
+    src[si] = 0; /* NUL-terminate */
+
+    if (si) { /* test utf8proc_map */
+        utf8proc_uint8_t utf8[1024]; /* copy of src without the '/' grapheme separators */
+        size_t i = 0, j = 0;
+        utf8proc_ssize_t glen, k;
+        utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
+        while (i < si) {
+            if (src[i] != '/')
+                utf8[j++] = src[i++];
+            else
+                i++;
+        }
+        glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+        if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+            /* the test file contains surrogate codepoints, which are only for UTF-16 */
+            printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
+        }
+        else {
+            check(glen >= 0, "utf8proc_map error = %s",
+                utf8proc_errmsg(glen));
+            for (k = 0; k <= glen; ++k)
+                if (g[k] == 0xff)
+                    g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
+            check(!strcmp((char*)g, (char*)src),
+                "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+        }
+        free(g);
+    }
+
+    if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
+        utf8proc_int32_t state = 0, prev_codepoint = 0; /* prev_codepoint == 0 means no codepoint has been seen yet */
+        size_t i = 0;
+        utf8proc_bool expectbreak = false;
+        do {
+            utf8proc_int32_t codepoint;
+            i += utf8proc_iterate(src + i, si - i, &codepoint);
+            check(codepoint >= 0, "invalid UTF-8 data");
+            if (codepoint == 0x002F) /* '/' in src marks an expected break before the next codepoint */
+                expectbreak = true;
+            else {
+                if (prev_codepoint != 0) {
+                    check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
+                          "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
+                }
+                expectbreak = false;
+                prev_codepoint = codepoint;
+            }
+        } while (i < si);
+    }
+
+    if (verbose)
+        printf("passed grapheme test: \"%s\"\n", (char*) src);
+}
+
 int main(int argc, char **argv)
 {
     unsigned char buf[8192];
     FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
-    utf8proc_uint8_t src[1024];
 
     check(f != NULL, "error opening GraphemeBreakTest.txt");
     while (simple_getline(buf, f) > 0) {
-        size_t bi = 0, si = 0;
-        lineno += 1;
-
-        if (lineno % 100 == 0)
+        if ((++lineno) % 100 == 0)
             printf("checking line %zd...\n", lineno);
-
         if (buf[0] == '#') continue;
-
-        while (buf[bi]) {
-            bi = skipspaces(buf, bi);
-            if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
-                src[si++] = '/';
-                bi += 2;
-            }
-            else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
-                bi += 2;
-            }
-            else if (buf[bi] == '#') { /* start of comments */
-                break;
-            }
-	    else { /* hex-encoded codepoint */
-                size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
-                while (src[si]) ++si; /* advance to NUL termination */
-                bi += len;
-            }
-        }
-        if (si && src[si-1] == '/')
-            --si; /* no break after final grapheme */
-        src[si] = 0; /* NUL-terminate */
-
-        if (si) { /* test utf8proc_map */
-            utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
-            size_t i = 0, j = 0;
-            utf8proc_ssize_t glen, k;
-            utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
-            while (i < si) {
-                if (src[i] != '/')
-                    utf8[j++] = src[i++];
-                else
-                    i++;
-            }
-            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
-            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
-                 /* the test file contains surrogate codepoints, which are only for UTF-16 */
-                 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
-            }
-            else {
-                 check(glen >= 0, "utf8proc_map error = %s",
-                       utf8proc_errmsg(glen));
-                 for (k = 0; k <= glen; ++k)
-                      if (g[k] == 0xff)
-                          g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
-                 check(!strcmp((char*)g, (char*)src),
-                       "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
-            }
-            free(g);
-        }
-
-        if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
-            utf8proc_int32_t state = 0, prev_codepoint = 0;
-            size_t i = 0;
-            utf8proc_bool expectbreak = false;
-            do {
-                utf8proc_int32_t codepoint;
-                i += utf8proc_iterate(src + i, si - i, &codepoint);
-                check(codepoint >= 0, "invalid UTF-8 data");
-                if (codepoint == 0x002F)
-                    expectbreak = true;
-                else {
-                    if (prev_codepoint != 0) {
-                        check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
-                              "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
-                    }
-                    expectbreak = false;
-                    prev_codepoint = codepoint;
-                }
-            } while (i < si);
-        }
+        checkline((char *) buf, false);
     }
     fclose(f);
     printf("Passed tests after %zd lines!\n", lineno);
 
+    printf("Performing regression tests...\n");
+
     /* issue 144 */
     {
         utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
@@ -101,71 +113,10 @@ int main(int argc, char **argv)
         free(g);
     };
 
-    /*https://github.com/JuliaLang/julia/issues/37680*/
-    {
-        // Two swedish flags after each other
-        utf8proc_int32_t double_sweden[] = {
-            0x0001f1f8, 0x0001f1ea, 0x0001f1f8, 0x0001f1ea
-        };
-        // facepalm + pale skin + zwj + male sign + FE0F
-        utf8proc_int32_t facepalm[] ={
-           0x0001f926, 0x0001f3fc, 0x0000200d, 0x00002642, 0x0000fe0f
-        };
-        // man face + pale skin + zwj + hand holding + zwj + man face + dark skin
-        utf8proc_int32_t family[] = {
-            0x0001f468, 0x0001f3fb, 0x0000200d, 0x00001f91d, 0x0000200d, 0x0001f468, 0x0001f3fd
-        };
-        bool expected_double_sweden[] = {false, true, false};
-        bool expected_facepalm[] = {false, false, false, false};
-        bool expected_family[] = {false, false, false, false, false, false};
-
-        utf8proc_int32_t *test_codepoints[] = {double_sweden, facepalm, family};
-        bool *test_expected[] = {expected_double_sweden, expected_facepalm, expected_family};
-        size_t itest, test_len[] = {4, 5, 6};
-
-        for (itest = 0; itest < sizeof(test_len) / sizeof(size_t); ++itest) {
-            utf8proc_uint8_t test_str[256];
-            utf8proc_int32_t j = 0, state = 0;
-            size_t i, break_count = 0;
-            utf8proc_ssize_t glen, gi;
-            utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
-
-            for (i = 0; i < test_len[itest]; ++i)
-                j += utf8proc_encode_char(test_codepoints[itest][i], test_str+j);
-            test_str[j] = 0;
-
-            printf("grapheme regression test for \"%s\"...\n", (char*) test_str);
-
-            /* test manual utf8proc_grapheme_break_stateful calls: */
-            for (i = 0; i < test_len[itest]-1; ++i) {
-                utf8proc_int32_t c1 = test_codepoints[itest][i];
-                utf8proc_int32_t c2 = test_codepoints[itest][i+1];
-                bool break_found = utf8proc_grapheme_break_stateful(c1, c2, &state);
-                break_count += break_found;
-                check(break_found == test_expected[itest][i],
-                      "incorrect grapheme between 0x%04x and 0x%04x in \"%s\"", c1, c2, (char*) test_str);
-            }
-
-            /* test utf8proc_map: */
-            glen = utf8proc_map(test_str, j, &g, UTF8PROC_CHARBOUND);
-            check(glen > 0 && g[0] == 0xff, "invalid UTF-8 in test");
-            for (gi = 0; gi < glen; ++gi)
-                g[gi] = g[gi] == 0xff ? '/' : g[gi]; /* easier to debug with /, for printing */
-            gi = i = 0;
-            while (gi < glen && i < test_len[itest]) {
-                utf8proc_int32_t c;
-                gi += g[gi] == '/';
-                gi += utf8proc_iterate(g+gi, glen - gi, &c); /* skip first char */
-                if (i < test_len[itest]-1)
-                    check(test_expected[itest][i] == (g[gi] == '/'), "incorrect break after 0x%04x in \"%s\"", c, (char*)test_str);
-                else
-                    check(g[gi] == 0, "missing null terminator");
-                ++i;
-            }
-            check(gi == glen && i == test_len[itest], "length mismatch %d/%d vs. %d in \"%s\" test", (int)gi, (int)glen, (int)i, (char*)test_str);
-            free(g);
-        }
-    }
+    /* https://github.com/JuliaLang/julia/issues/37680 */
+    checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
+    checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
+    checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
 
     printf("Passed regression tests!\n");