Skip to content

Commit

Permalink
fix CHARBOUND option for non-characters
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Mar 30, 2019
1 parent 2d9b2d2 commit f59538b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 15 deletions.
22 changes: 17 additions & 5 deletions test/graphemetest.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ int main(int argc, char **argv)
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
utf8proc_uint8_t src[1024];
int len;

check(f != NULL, "error opening GraphemeBreakTest.txt");
while (getline(&buf, &bufsize, f) > 0) {
size_t bi = 0, si = 0;
lineno += 1;

if (lineno % 100 == 0)
printf("checking line %zd...\n", lineno);

if (buf[0] == '#') continue;

while (buf[bi]) {
bi = skipspaces(buf, bi);
if (buf[bi] == '/') { /* grapheme break */
Expand All @@ -39,7 +39,7 @@ int main(int argc, char **argv)
if (si && src[si-1] == '/')
--si; /* no break after final grapheme */
src[si] = 0; /* NUL-terminate */

if (si) {
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0;
Expand Down Expand Up @@ -70,5 +70,17 @@ int main(int argc, char **argv)
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);

/* issue 144 */
{
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
utf8proc_ssize_t glen;
utf8proc_uint8_t *g;
glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
free(g);
};

return 0;
}
18 changes: 8 additions & 10 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
} else return 0;
}

/* internal "unsafe" version that does not check whether uc is in range */
static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
/* internal version used for inserting 0xff bytes between graphemes */
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) {
if (uc == -1) { /* internal value used for grapheme breaks */
dst[0] = (utf8proc_uint8_t)0xFF;
return 1;
}
return 0;
} else if (uc < 0x80) {
dst[0] = (utf8proc_uint8_t)uc;
Expand All @@ -207,12 +211,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2;
} else if (uc == 0xFFFF) {
dst[0] = (utf8proc_uint8_t)0xFF;
return 1;
} else if (uc == 0xFFFE) {
dst[0] = (utf8proc_uint8_t)0xFE;
return 1;
} else if (uc < 0x10000) {
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
Expand Down Expand Up @@ -480,7 +478,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
return 2;
}
Expand Down Expand Up @@ -686,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
if (options & UTF8PROC_CHARBOUND) {
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
}
} else {
for (rpos = 0; rpos < length; rpos++) {
Expand Down

0 comments on commit f59538b

Please sign in to comment.