From cfee1044cbf66084b069e937cb21dc55609a38f6 Mon Sep 17 00:00:00 2001 From: Diego Frias Date: Fri, 21 Nov 2025 20:27:58 -0800 Subject: [PATCH 1/4] Fix attempting to combine Hangul Jamo 0x11a7 0x11a7 is not a valid Hangul trailing consonant (T) jamo despite being equal to T_BASE. This is because, per the Unicode spec: TCount is set to one more than the number of trailing consonants relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1 So the first valid Hangul trailing consonant (T) jamo is 0x11a8. Also see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G59434 for where the spec describes the usage of 0x11a8, not 0x11a7, during composition. --- utf8proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utf8proc.c b/utf8proc.c index c59bad2..453ea23 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -684,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { utf8proc_int32_t hangul_tindex; hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; - if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { + if (hangul_tindex >= 1 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { *starter += hangul_tindex; starter_property = NULL; continue; From 14f6b935bb0c274f37f202655621caa02ce18b5d Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 22 Nov 2025 09:34:10 -0500 Subject: [PATCH 2/4] document that utf8proc_map simply wraps utf8proc_decompose and utf8proc_reencode (#312) --- utf8proc.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utf8proc.h b/utf8proc.h index 7619949..b43904f 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -750,6 +750,10 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi * * @note The memory of the new UTF-8 string will have been allocated * with `malloc`, and should therefore be deallocated with `free`. 
+ * + * @note `utf8proc_map` simply calls `utf8proc_decompose` followed by `utf8proc_reencode`, + * and applications requiring greater control over memory allocation should instead call + * those two functions directly. */ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options From 164b9f66fb473347de912e4e68208bebc7640532 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 22 Nov 2025 09:40:53 -0500 Subject: [PATCH 3/4] test code refactoring (#318) --- test/custom.c | 5 +---- test/misc.c | 27 +++++++-------------------- test/tests.c | 42 +++++++++++++++++++++++++++++++++++++++++- test/tests.h | 5 +++++ 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/test/custom.c b/test/custom.c index fe4239d..c06bfc6 100644 --- a/test/custom.c +++ b/test/custom.c @@ -19,10 +19,7 @@ int main(void) utf8proc_uint8_t *output; utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM, custom, &thunk_test); - printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output); - check(strlen((char*) output) == 6, "incorrect output length"); - check(!memcmp(correct, output, 7), "incorrect output data"); - free(output); + check_compare("map_custom", input, correct, output, 1); printf("map_custom tests SUCCEEDED.\n"); return 0; } diff --git a/test/misc.c b/test/misc.c index 9156f95..bff793d 100644 --- a/test/misc.c +++ b/test/misc.c @@ -7,35 +7,22 @@ static void issue128(void) /* #128 */ utf8proc_uint8_t input[] = {0x72, 0xcc, 0x87, 0xcc, 0xa3, 0x00}; /* "r\u0307\u0323" */ utf8proc_uint8_t nfc[] = {0xe1, 0xb9, 0x9b, 0xcc, 0x87, 0x00}; /* "\u1E5B\u0307" */ utf8proc_uint8_t nfd[] = {0x72, 0xcc, 0xa3, 0xcc, 0x87, 0x00}; /* "r\u0323\u0307" */ - utf8proc_uint8_t *nfc_out, *nfd_out; - nfc_out = utf8proc_NFC(input); - printf("NFC \"%s\" -> \"%s\" vs. 
\"%s\"\n", (char*)input, (char*)nfc_out, (char*)nfc); - check(strlen((char*) nfc_out) == 5, "incorrect nfc length"); - check(!memcmp(nfc, nfc_out, 6), "incorrect nfc data"); - nfd_out = utf8proc_NFD(input); - printf("NFD \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfd_out, (char*)nfd); - check(strlen((char*) nfd_out) == 5, "incorrect nfd length"); - check(!memcmp(nfd, nfd_out, 6), "incorrect nfd data"); - free(nfd_out); free(nfc_out); + + check_compare("NFC", input, nfc, utf8proc_NFC(input), 1); + check_compare("NFD", input, nfd, utf8proc_NFD(input), 1); } -static void issue102(void) /* #128 */ +static void issue102(void) /* #102 */ { utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */ utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */ utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */ utf8proc_uint8_t *output; + utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA); - printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna); - check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length"); - check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data"); - free(output); - output = utf8proc_NFKC_Casefold(input); - printf("NFKC_Casefold \"%s\" -> \"%s\" vs. 
\"%s\"\n", (char*)input, (char*)output, (char*)correct); - check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length"); - check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data"); - free(output); + check_compare("NFKC_Casefold+stripna", input, stripna, output, 1); + check_compare("NFKC_Casefold", input, correct, utf8proc_NFKC_Casefold(input), 1); } int main(void) diff --git a/test/tests.c b/test/tests.c index 147ac39..8a47b85 100644 --- a/test/tests.c +++ b/test/tests.c @@ -8,7 +8,10 @@ void check(int cond, const char *format, ...) { if (!cond) { va_list args; - fprintf(stderr, "line %zd: ", lineno); + if (lineno) + fprintf(stderr, "FAILED at line %zd: ", lineno); + else + fprintf(stderr, "FAILED: "); va_start(args, format); vfprintf(stderr, format, args); va_end(args); @@ -58,3 +61,40 @@ size_t simple_getline(unsigned char buf[8192], FILE *f) { buf[i] = 0; return i; } + +void print_escaped(FILE* f, const utf8proc_uint8_t *utf8) { + fprintf(f, "\""); + while (*utf8) { + utf8proc_int32_t codepoint; + utf8 += utf8proc_iterate(utf8, -1, &codepoint); + if (codepoint < 0x10000) + fprintf(f, "\\u%04x", codepoint); + else + fprintf(f, "\\U%06x", codepoint); + } + fprintf(f, "\""); +} + +void print_string_and_escaped(FILE* f, const utf8proc_uint8_t *utf8) { + fprintf(f, "\"%s\" (", (const char *) utf8); + print_escaped(f, utf8); + fprintf(f, ")"); +} + +void check_compare(const char *transformation, + const utf8proc_uint8_t *input, const utf8proc_uint8_t *expected, + utf8proc_uint8_t *received, int free_received) { + int passed = !strcmp((const char *) received, (const char *) expected); + FILE *f = passed ? stdout : stderr; + fprintf(f, "%s: %s ", passed ? 
"PASSED" : "FAILED", transformation); + print_string_and_escaped(f, input); + fprintf(f, " -> "); + print_string_and_escaped(f, received); + if (!passed) { + fprintf(f, " != expected "); + print_string_and_escaped(f, expected); + } + fprintf(f, "\n"); + if (free_received) free(received); + if (!passed) exit(1); +} diff --git a/test/tests.h b/test/tests.h index acda329..d55779f 100644 --- a/test/tests.h +++ b/test/tests.h @@ -25,3 +25,8 @@ void check(int cond, const char *format, ...); size_t skipspaces(const unsigned char *buf, size_t i); size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf); size_t simple_getline(unsigned char buf[8192], FILE *f); +void print_escaped(FILE* f, const utf8proc_uint8_t *utf8); +void print_string_and_escaped(FILE* f, const utf8proc_uint8_t *utf8); +void check_compare(const char *transformation, + const utf8proc_uint8_t *input, const utf8proc_uint8_t *expected, + utf8proc_uint8_t *received, int free_received); From 57c7a2d3f56f0ae0f37e7e6e054198f1197734b6 Mon Sep 17 00:00:00 2001 From: Diego Frias Date: Sat, 22 Nov 2025 08:01:29 -0800 Subject: [PATCH 4/4] Write regression test for #317 --- test/misc.c | 20 ++++++++++++++++++++ utf8proc.c | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/test/misc.c b/test/misc.c index bff793d..7ea2ebc 100644 --- a/test/misc.c +++ b/test/misc.c @@ -25,10 +25,30 @@ static void issue102(void) /* #102 */ check_compare("NFKC_Casefold", input, correct, utf8proc_NFKC_Casefold(input), 1); } +static void issue317(void) /* #317 */ +{ + utf8proc_uint8_t input[] = {0xec, 0xa3, 0xa0, 0xe1, 0x86, 0xa7, 0x00}; /* "\uc8e0\u11a7" */ + utf8proc_uint8_t combined[] = {0xec, 0xa3, 0xa, 0x00}; /* "\uc8e1" */ + utf8proc_int32_t codepoint; + + /* inputs that should *not* be combined* */ + check_compare("NFC", input, input, utf8proc_NFC(input), 1); + utf8proc_encode_char(0x11c3, input+3); + check_compare("NFC", input, input, utf8proc_NFC(input), 1); + + /* inputs that *should* be 
combined (TCOUNT-1 chars starting at TBASE+1) */ + for (codepoint = 0x11a8; codepoint < 0x11c3; ++codepoint) { + utf8proc_encode_char(codepoint, input+3); + utf8proc_encode_char(0xc8e0 + (codepoint - 0x11a7), combined); + check_compare("NFC", input, combined, utf8proc_NFC(input), 1); + } +} + int main(void) { issue128(); issue102(); + issue317(); #ifdef UNICODE_VERSION printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version()); check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch"); diff --git a/utf8proc.c b/utf8proc.c index 453ea23..b9877c0 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -684,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { utf8proc_int32_t hangul_tindex; hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; - if (hangul_tindex >= 1 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { + if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { *starter += hangul_tindex; starter_property = NULL; continue;