Unicode 15.1 support (#253)

* Unicode 15.1 support * always update state * fix GB9c logic * print indic_conjunct_break in printproperty * fix grapheme test * update utf8proc_decompose_char docs * more GB9c tests
JuliaStrings · Oct 20, 2023 · 46a442b · 46a442b
1 parent 1cb28a6
commit 46a442b
Show file tree

Hide file tree

Showing 9 changed files with 11,000 additions and 10,896 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -68,7 +68,7 @@ endif()
 if(UTF8PROC_ENABLE_TESTING)
   enable_testing()
   file(MAKE_DIRECTORY data)
-  set(UNICODE_VERSION 15.0.0)
+  set(UNICODE_VERSION 15.1.0)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
   add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)

diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
 and is named `libutf8proc.a` (for the static library) and
 `libutf8proc.so` (for the dynamic library).
 
-The Unicode version supported is 15.0.0.
+The Unicode version supported is 15.1.0.
 
 For Unicode normalizations, the following options are used:
 

diff --git a/data/Makefile b/data/Makefile
@@ -22,7 +22,7 @@ CharWidths.txt: charwidths.jl EastAsianWidth.txt
 	$(JULIA) charwidths.jl > $@
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=15.0.0
+UNICODE_VERSION=15.1.0
 
 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

diff --git a/data/data_generator.rb b/data/data_generator.rb
@@ -97,6 +97,32 @@
   end
 end
 
+$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
+$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
+$icb_linker_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
+  elsif entry =~ /^[0-9A-F]+/
+    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
+  end
+end
+$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
+$icb_consonant_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
+  elsif entry =~ /^[0-9A-F]+/
+    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
+  end
+end
+$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
+$icb_extend_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
+  elsif entry =~ /^[0-9A-F]+/
+    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
+  end
+end
+
 $grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
 $grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
 $grapheme_boundclass_list.each_line do |entry|
@@ -174,7 +200,7 @@ def cpary2c(array)
   return "UINT16_MAX" if array.nil? || array.length == 0
   lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
   array = cpary2utf16encoded(array)
-  if lencode >= 3 #we have only 2 bits for the length 
+  if lencode >= 3 #we have only 2 bits for the length
     array = [lencode] + array
     lencode = 3
   end
@@ -249,7 +275,8 @@ def c_entry(comb_indicies)
     "#{$ignorable.include?(code)}, " <<
     "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
     "#{$charwidth[code]}, 0, " <<
-    "#{$grapheme_boundclass[code]}},\n"
+    "#{$grapheme_boundclass[code]}, " <<
+    "#{$icb[code]}},\n"
   end
 end
 
@@ -415,7 +442,7 @@ def c_entry(comb_indicies)
 $stdout << "};\n\n"
 
 $stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
+$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
 properties.each { |line|
   $stdout << line
 }

diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -119,6 +119,13 @@ int main(int argc, char **argv)
     checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
     checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
 
+    /* more GB9c tests */
+    checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
+    checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
+    checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
+    checkline("/ 0915 0300 094d 0300 / 0078 /", true);
+    checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
+
     check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
     check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
 

diff --git a/test/printproperty.c b/test/printproperty.c
@@ -39,6 +39,7 @@ int main(int argc, char **argv)
             "  ignorable = %d\n"
             "  control_boundary = %d\n"
             "  boundclass = %d\n"
+            "  indic_conjunct_break = %d\n"
             "  charwidth = %d\n",
         argv[i], (char*) cstr,
         utf8proc_category_string(c),
@@ -55,6 +56,7 @@ int main(int argc, char **argv)
         p->ignorable,
         p->control_boundary,
         p->boundclass,
+        p->indic_conjunct_break,
         utf8proc_charwidth(c));
         free(map);
     }

diff --git a/utf8proc.c b/utf8proc.c
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
-  return "15.0.0";
+  return "15.1.0";
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
@@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
     true; // GB999
 }
 
-static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
 {
   if (state) {
-    int lbc_override;
-    if (*state == UTF8PROC_BOUNDCLASS_START)
-      *state = lbc_override = lbc;
-    else
-      lbc_override = *state;
-    utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
+    int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
+    if (*state == 0) { /* state initialization */
+      state_bc = lbc;
+      state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
+    }
+    else { /* lbc and licb are already encoded in *state */
+      state_bc = *state & 0xff;  // 1st byte of state is bound class
+      state_icb = *state >> 8;   // 2nd byte of state is indic conjunct break
+    }
+
+    utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
+       !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
+        && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
+
+    // Special support for GB9c.  Don't break between two consonants
+    // separated by 1+ linker characters and 0+ extend characters in any order.
+    // After a consonant, we enter LINKER state after at least one linker.
+    if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
+        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
+        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
+      state_icb = ticb;
+    else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
+      state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
+                  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
 
     // Special support for GB 12/13 made possible by GB999. After two RI
     // class codepoints we want to force a break. Do this by resetting the
     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
     // after that character according to GB999 (unless of course such a break is
     // forbidden by a different rule such as GB9).
-    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
-      *state = UTF8PROC_BOUNDCLASS_OTHER;
+    if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
+      state_bc = UTF8PROC_BOUNDCLASS_OTHER;
     // Special support for GB11 (emoji extend* zwj / emoji)
-    else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
+    else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
-        *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
+        state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
       else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
-        *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
+        state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
       else
-        *state = tbc;
+        state_bc = tbc;
     }
     else
-      *state = tbc;
+      state_bc = tbc;
 
+    *state = state_bc + (state_icb << 8);
     return break_permitted;
   }
   else
@@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
     utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
 
-  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
-                                 utf8proc_get_property(c2)->boundclass,
+  const utf8proc_property_t *p1 = utf8proc_get_property(c1);
+  const utf8proc_property_t *p2 = utf8proc_get_property(c2);
+  return grapheme_break_extended(p1->boundclass,
+                                 p2->boundclass,
+                                 p1->indic_conjunct_break,
+                                 p2->indic_conjunct_break,
                                  state);
 }
 
@@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
   }
   if (options & UTF8PROC_CHARBOUND) {
     utf8proc_bool boundary;
-    int tbc = property->boundclass;
-    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
+    boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
+                                       last_boundclass);
     if (boundary) {
       if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
       if (bufsize >= 2) dst[1] = uc;

diff --git a/utf8proc.h b/utf8proc.h
@@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
    * Boundclass.
    * @see utf8proc_boundclass_t.
    */
-  unsigned boundclass:8;
+  unsigned boundclass:6;
+  unsigned indic_conjunct_break:2;
 } utf8proc_property_t;
 
 /** Unicode categories. */
@@ -388,6 +389,14 @@ typedef enum {
   UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
 } utf8proc_boundclass_t;
 
+/** Indic_Conjunct_Break property. (TR44) */
+typedef enum {
+  UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
+  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
+  UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
+  UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
+} utf8proc_indic_conjunct_break_t;
+
 /**
  * Function pointer type passed to @ref utf8proc_map_custom and
  * @ref utf8proc_decompose_custom, which is used to specify a user-defined
@@ -481,8 +490,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
  * @param last_boundclass
  * Pointer to an integer variable containing
- * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
- * option is used.  Otherwise, this parameter is ignored.
+ * the previous codepoint's (boundclass + (indic_conjunct_break << 8)) if the @ref UTF8PROC_CHARBOUND
+ * option is used.  If the string is being processed in order, this can be initialized to 0 for
+ * the beginning of the string, and is thereafter updated automatically.  If @ref UTF8PROC_CHARBOUND is not used, this parameter is ignored.
  *
  * @return
  * In case of success, the number of codepoints written is returned; in case