Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix grapheme breaks on string-initial #205

Merged
merged 6 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 96 additions & 56 deletions test/graphemetest.c
Original file line number Diff line number Diff line change
@@ -1,74 +1,107 @@
#include "tests.h"

/* check one line in the format of GraphemeBreakTest.txt */
void checkline(const char *_buf, bool verbose) {
size_t bi = 0, si = 0;
utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */
const unsigned char *buf = (const unsigned char *) _buf;

while (buf[bi]) {
bi = skipspaces(buf, bi);
if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
src[si++] = '/';
bi += 2;
}
else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
bi += 2;
}
else if (buf[bi] == '#') { /* start of comments */
break;
}
else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
src[si++] = '/';
bi += 1;
}
else { /* hex-encoded codepoint */
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
while (src[si]) ++si; /* advance to NUL termination */
bi += len;
}
}
if (si && src[si-1] == '/')
--si; /* no break after final grapheme */
src[si] = 0; /* NUL-terminate */

if (si) { /* test utf8proc_map */
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0;
utf8proc_ssize_t glen, k;
utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
while (i < si) {
if (src[i] != '/')
utf8[j++] = src[i++];
else
i++;
}
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
/* the test file contains surrogate codepoints, which are only for UTF-16 */
printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
}
else {
check(glen >= 0, "utf8proc_map error = %s",
utf8proc_errmsg(glen));
for (k = 0; k <= glen; ++k)
if (g[k] == 0xff)
g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
check(!strcmp((char*)g, (char*)src),
"grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
}
free(g);
}

if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
utf8proc_int32_t state = 0, prev_codepoint = 0;
size_t i = 0;
utf8proc_bool expectbreak = false;
do {
utf8proc_int32_t codepoint;
i += utf8proc_iterate(src + i, si - i, &codepoint);
check(codepoint >= 0, "invalid UTF-8 data");
if (codepoint == 0x002F)
expectbreak = true;
else {
if (prev_codepoint != 0) {
check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
"grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
}
expectbreak = false;
prev_codepoint = codepoint;
}
} while (i < si);
}

if (verbose)
printf("passed grapheme test: \"%s\"\n", (char*) src);
}

int main(int argc, char **argv)
{
unsigned char buf[8192];
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
utf8proc_uint8_t src[1024];

check(f != NULL, "error opening GraphemeBreakTest.txt");
while (simple_getline(buf, f) > 0) {
size_t bi = 0, si = 0;
lineno += 1;

if (lineno % 100 == 0)
if ((++lineno) % 100 == 0)
printf("checking line %zd...\n", lineno);

if (buf[0] == '#') continue;

while (buf[bi]) {
bi = skipspaces(buf, bi);
if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
src[si++] = '/';
bi += 2;
}
else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
bi += 2;
}
else if (buf[bi] == '#') { /* start of comments */
break;
}
else { /* hex-encoded codepoint */
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
while (src[si]) ++si; /* advance to NUL termination */
bi += len;
}
}
if (si && src[si-1] == '/')
--si; /* no break after final grapheme */
src[si] = 0; /* NUL-terminate */

if (si) {
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0;
utf8proc_ssize_t glen, k;
utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
while (i < si) {
if (src[i] != '/')
utf8[j++] = src[i++];
else
i++;
}
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
/* the test file contains surrogate codepoints, which are only for UTF-16 */
printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
}
else {
check(glen >= 0, "utf8proc_map error = %s",
utf8proc_errmsg(glen));
for (k = 0; k <= glen; ++k)
if (g[k] == 0xff)
g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
check(!strcmp((char*)g, (char*)src),
"grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
}
free(g);
}
checkline((char *) buf, false);
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);

printf("Performing regression tests...\n");

/* issue 144 */
{
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
Expand All @@ -80,5 +113,12 @@ int main(int argc, char **argv)
free(g);
};

/* https://github.com/JuliaLang/julia/issues/37680 */
checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */

printf("Passed regression tests!\n");

return 0;
}
7 changes: 5 additions & 2 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,11 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {

static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
? *state : lbc);
int lbc_override;
if (*state == UTF8PROC_BOUNDCLASS_START)
*state = lbc_override = lbc;
else
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
if (state) {
// Special support for GB 12/13 made possible by GB999. After two RI
Expand Down