MDEV-31069 Reuse duplicate char-to-weight conversion code in ctype-utf8.c and ctype-ucs2.c

abarkov · abarkov · commit 2ad287caad59 · 2023-04-18T10:24:05.000+04:00
Removing similar functions from ctype-utf8.c and ctype-ucs2.c:

- my_tosort_utf16()
- my_tosort_utf32()
- my_tosort_ucs2()
- my_tosort_unicode()      - the old version with the extra "flags" argument

Adding new shared functions into ctype-unidata.h:

- my_tosort_unicode_bmp()  - reused for utf8mb3, ucs2
- my_tosort_unicode()      - reused for utf8mb4, utf16, utf32

For simplicity, the new versions of my_tosort_unicode*()
do not include the code handling the MY_CS_LOWER_SORT flag, because:
- it affects performance negatively
- we don't have any collations with this flag yet anyway
(This code was most likely merged by mistake from
MySQL's utf8_tolower_ci at some point.)
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
@@ -1284,22 +1284,6 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
 const char charset_name_utf16le[]= "utf16le";
 #define charset_name_utf16le_length (sizeof(charset_name_utf16le)-1)
 
-static inline void
-my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-
-
 
 static size_t
 my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
@@ -1341,7 +1325,7 @@ my_hash_sort_utf16_nopad(CHARSET_INFO *cs,
 
   while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
   {
-    my_tosort_utf16(uni_plane, &wc);
+    my_tosort_unicode(uni_plane, &wc);
     MY_HASH_ADD_16(m1, m2, wc);
     s+= res;
   }
@@ -2178,22 +2162,6 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static inline void
-my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-
-
 static size_t
 my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
                   const char *ptr, size_t length)
@@ -2242,7 +2210,7 @@ my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
 
   while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
   {
-    my_tosort_utf32(uni_plane, &wc);
+    my_tosort_unicode(uni_plane, &wc);
     MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
     MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
     MY_HASH_ADD(m1, m2, (uint) (wc >> 8)  & 0xFF);
@@ -3082,14 +3050,6 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
 }
 
 
-static inline void
-my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  MY_UNICASE_CHARACTER *page;
-  if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
-    *wc= page[*wc & 0xFF].sort;
-}
-
 static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                            char *dst, size_t dstlen)
 {
@@ -3125,7 +3085,7 @@ my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
 
   while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
   {
-    my_tosort_ucs2(uni_plane, &wc);
+    my_tosort_unicode_bmp(uni_plane, &wc);
     MY_HASH_ADD_16(m1, m2, wc);
     s+=res;
   }
diff --git a/strings/ctype-unidata.h b/strings/ctype-unidata.h
@@ -36,6 +36,32 @@ static inline my_wc_t my_u300_toupper_7bit(uchar ch)
 }
 
 
+static inline void my_tosort_unicode_bmp(MY_UNICASE_INFO *uni_plane,
+                                         my_wc_t *wc)
+{
+  const MY_UNICASE_CHARACTER *page;
+  DBUG_ASSERT(*wc <= uni_plane->maxchar);
+  if ((page= uni_plane->page[*wc >> 8]))
+    *wc= page[*wc & 0xFF].sort;
+}
+
+
+static inline void my_tosort_unicode(MY_UNICASE_INFO *uni_plane,
+                                     my_wc_t *wc)
+{
+  if (*wc <= uni_plane->maxchar)
+  {
+    const MY_UNICASE_CHARACTER *page;
+    if ((page= uni_plane->page[*wc >> 8]))
+      *wc= page[*wc & 0xFF].sort;
+  }
+  else
+  {
+    *wc= MY_CS_REPLACEMENT_CHARACTER;
+  }
+}
+
+
 static inline void
 my_tolower_unicode_bmp(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
 {
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
@@ -4638,23 +4638,6 @@ MY_UNICASE_INFO my_unicase_unicode520=
 };
 
 
-static inline void
-my_tosort_unicode(MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= (flags & MY_CS_LOWER_SORT) ?
-           page[*wc & 0xFF].tolower :
-           page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-
 
 static uint
 my_casefold_multiply_utf8mbx(CHARSET_INFO *cs)
@@ -4734,8 +4717,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
       {
         if (weights)
         {
-          my_tosort_unicode(weights, &s_wc, cs->state);
-          my_tosort_unicode(weights, &w_wc, cs->state);
+          my_tosort_unicode(weights, &s_wc);
+          my_tosort_unicode(weights, &w_wc);
         }
         if (s_wc != w_wc)
           return 1;                               /* No match */
@@ -4803,8 +4786,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
             return 1;
           if (weights)
           {
-            my_tosort_unicode(weights, &s_wc, cs->state);
-            my_tosort_unicode(weights, &w_wc, cs->state);
+            my_tosort_unicode(weights, &s_wc);
+            my_tosort_unicode(weights, &w_wc);
           }
 
           if (s_wc == w_wc)
@@ -5242,7 +5225,7 @@ static void my_hash_sort_utf8mb3_nopad(CHARSET_INFO *cs, const uchar *s, size_t
 
   while ((s < e) && (res=my_utf8mb3_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
   {
-    my_tosort_unicode(uni_plane, &wc, cs->state);
+    my_tosort_unicode(uni_plane, &wc);
     MY_HASH_ADD_16(m1, m2, wc);
     s+= res;
   }
@@ -5976,8 +5959,8 @@ static int my_strnncoll_utf8mb3_cs(CHARSET_INFO *cs,
       save_diff = ((int)s_wc) - ((int)t_wc);
     }
 
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode(uni_plane, &s_wc);
+    my_tosort_unicode(uni_plane, &t_wc);
 
     if ( s_wc != t_wc )
     {
@@ -6018,8 +6001,8 @@ static int my_strnncollsp_utf8mb3_cs(CHARSET_INFO *cs,
       save_diff = ((int)s_wc) - ((int)t_wc);
     }
 
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode(uni_plane, &s_wc);
+    my_tosort_unicode(uni_plane, &t_wc);
 
     if ( s_wc != t_wc )
     {
@@ -7697,7 +7680,7 @@ my_hash_sort_utf8mb4_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
 
   while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
   {
-    my_tosort_unicode(uni_plane, &wc, cs->state);
+    my_tosort_unicode(uni_plane, &wc);
     MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF));
     if (wc > 0xFFFF)
     {