Skip to content

Commit

Permalink
MDEV-31069 Reuse duplicate char-to-weight conversion code in ctype-utf8.c and ctype-ucs2.c
Browse files Browse the repository at this point in the history

Removing similar functions from ctype-utf8.c and ctype-ucs2.c:

- my_tosort_utf16()
- my_tosort_utf32()
- my_tosort_ucs2()
- my_tosort_unicode()

Adding new shared functions into ctype-unidata.h:

- my_tosort_unicode_bmp()  - reused for utf8mb3, ucs2
- my_tosort_unicode()      - reused for utf8mb4, utf16, utf32

For simplicity, the new version of my_tosort_unicode*()
does not include the code handling the MY_CS_LOWER_SORT flag because:
- it affects performance negatively
- we don't have any collations with this flag yet anyway
(This code was most likely merged erroneously from
MySQL's utf8_tolower_ci at some earlier point.)
  • Loading branch information
abarkov committed Apr 18, 2023
1 parent 30b4bb4 commit 2ad287c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 70 deletions.
46 changes: 3 additions & 43 deletions strings/ctype-ucs2.c
Expand Up @@ -1284,22 +1284,6 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
const char charset_name_utf16le[]= "utf16le";
#define charset_name_utf16le_length (sizeof(charset_name_utf16le)-1)

/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information: maxchar plus a table of
                    256-entry pages indexed by the high bits of the code.
  @param wc         IN/OUT the character code to convert.

  Codes above uni_plane->maxchar become MY_CS_REPLACEMENT_CHARACTER.
  Codes whose page pointer is NULL are left unchanged.
*/
static inline void
my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  MY_UNICASE_CHARACTER *weights;

  /* Out-of-range codes have no table entry at all */
  if (*wc > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  weights= uni_plane->page[*wc >> 8];
  if (weights)
    *wc= weights[*wc & 0xFF].sort;
}



static size_t
my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
Expand Down Expand Up @@ -1341,7 +1325,7 @@ my_hash_sort_utf16_nopad(CHARSET_INFO *cs,

while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
{
my_tosort_utf16(uni_plane, &wc);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+= res;
}
Expand Down Expand Up @@ -2178,22 +2162,6 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
}


/*
  Replace *wc with the "sort" value stored for it in uni_plane.

  @param uni_plane  Case/sort information (maxchar and per-page tables).
  @param wc         IN/OUT the character code.

  A code greater than uni_plane->maxchar is replaced with
  MY_CS_REPLACEMENT_CHARACTER. A code that falls on a page with
  no table (NULL page pointer) is kept as is.
*/
static inline void
my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *tab;

  if (code > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  /* High bits select the page, low 8 bits select the entry */
  tab= uni_plane->page[code >> 8];
  if (tab)
    *wc= tab[code & 0xFF].sort;
}


static size_t
my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *ptr, size_t length)
Expand Down Expand Up @@ -2242,7 +2210,7 @@ my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_utf32(uni_plane, &wc);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF);
Expand Down Expand Up @@ -3082,14 +3050,6 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
}


/*
  Replace *wc with the "sort" value stored for it in uni_plane.

  @param uni_plane  Case/sort information with per-page tables.
  @param wc         IN/OUT the character code.

  The page index is masked to 8 bits, so only the low 16 bits of
  the code take part in the lookup. No range check against maxchar
  is performed. A code on a page with a NULL table is left unchanged.
*/
static inline void
my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *weights= uni_plane->page[(code >> 8) & 0xFF];

  if (weights)
    *wc= weights[code & 0xFF].sort;
}

static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
Expand Down Expand Up @@ -3125,7 +3085,7 @@ my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
{
my_tosort_ucs2(uni_plane, &wc);
my_tosort_unicode_bmp(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+=res;
}
Expand Down
26 changes: 26 additions & 0 deletions strings/ctype-unidata.h
Expand Up @@ -36,6 +36,32 @@ static inline my_wc_t my_u300_toupper_7bit(uchar ch)
}


/*
  Replace *wc with the "sort" value stored for it in uni_plane,
  without a run-time range check.

  @param uni_plane  Case/sort information (maxchar and per-page tables).
  @param wc         IN/OUT the character code.

  The caller must guarantee *wc <= uni_plane->maxchar; this is only
  verified through DBUG_ASSERT. A code on a page with a NULL table
  is left unchanged.
*/
static inline void my_tosort_unicode_bmp(MY_UNICASE_INFO *uni_plane,
                                         my_wc_t *wc)
{
  const MY_UNICASE_CHARACTER *weights;

  DBUG_ASSERT(*wc <= uni_plane->maxchar);

  weights= uni_plane->page[*wc >> 8];
  if (weights)
    *wc= weights[*wc & 0xFF].sort;
}


/*
  Replace *wc with the "sort" value stored for it in uni_plane,
  with a run-time range check.

  @param uni_plane  Case/sort information (maxchar and per-page tables).
  @param wc         IN/OUT the character code.

  A code greater than uni_plane->maxchar is replaced with
  MY_CS_REPLACEMENT_CHARACTER. A code on a page with a NULL table
  is left unchanged.
*/
static inline void my_tosort_unicode(MY_UNICASE_INFO *uni_plane,
                                     my_wc_t *wc)
{
  const MY_UNICASE_CHARACTER *weights;

  /* Nothing to look up for codes beyond the table range */
  if (*wc > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  weights= uni_plane->page[*wc >> 8];
  if (weights)
    *wc= weights[*wc & 0xFF].sort;
}


static inline void
my_tolower_unicode_bmp(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
Expand Down
37 changes: 10 additions & 27 deletions strings/ctype-utf8.c
Expand Up @@ -4638,23 +4638,6 @@ MY_UNICASE_INFO my_unicase_unicode520=
};


/*
  Replace *wc with a value looked up in uni_plane, in place.

  @param uni_plane  Case/sort information (maxchar and per-page tables).
  @param wc         IN/OUT the character code.
  @param flags      If MY_CS_LOWER_SORT is set, the "tolower" member
                    of the table entry is used instead of "sort".

  A code greater than uni_plane->maxchar is replaced with
  MY_CS_REPLACEMENT_CHARACTER. A code on a page with a NULL table
  is left unchanged.
*/
static inline void
my_tosort_unicode(MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
{
  MY_UNICASE_CHARACTER *entry;

  if (*wc > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  entry= uni_plane->page[*wc >> 8];
  if (!entry)
    return;

  /* Step from the start of the page to the entry for this code */
  entry+= *wc & 0xFF;
  if (flags & MY_CS_LOWER_SORT)
    *wc= entry->tolower;
  else
    *wc= entry->sort;
}


static uint
my_casefold_multiply_utf8mbx(CHARSET_INFO *cs)
Expand Down Expand Up @@ -4734,8 +4717,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
{
if (weights)
{
my_tosort_unicode(weights, &s_wc, cs->state);
my_tosort_unicode(weights, &w_wc, cs->state);
my_tosort_unicode(weights, &s_wc);
my_tosort_unicode(weights, &w_wc);
}
if (s_wc != w_wc)
return 1; /* No match */
Expand Down Expand Up @@ -4803,8 +4786,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
return 1;
if (weights)
{
my_tosort_unicode(weights, &s_wc, cs->state);
my_tosort_unicode(weights, &w_wc, cs->state);
my_tosort_unicode(weights, &s_wc);
my_tosort_unicode(weights, &w_wc);
}

if (s_wc == w_wc)
Expand Down Expand Up @@ -5242,7 +5225,7 @@ static void my_hash_sort_utf8mb3_nopad(CHARSET_INFO *cs, const uchar *s, size_t

while ((s < e) && (res=my_utf8mb3_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
{
my_tosort_unicode(uni_plane, &wc, cs->state);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+= res;
}
Expand Down Expand Up @@ -5976,8 +5959,8 @@ static int my_strnncoll_utf8mb3_cs(CHARSET_INFO *cs,
save_diff = ((int)s_wc) - ((int)t_wc);
}

my_tosort_unicode(uni_plane, &s_wc, cs->state);
my_tosort_unicode(uni_plane, &t_wc, cs->state);
my_tosort_unicode(uni_plane, &s_wc);
my_tosort_unicode(uni_plane, &t_wc);

if ( s_wc != t_wc )
{
Expand Down Expand Up @@ -6018,8 +6001,8 @@ static int my_strnncollsp_utf8mb3_cs(CHARSET_INFO *cs,
save_diff = ((int)s_wc) - ((int)t_wc);
}

my_tosort_unicode(uni_plane, &s_wc, cs->state);
my_tosort_unicode(uni_plane, &t_wc, cs->state);
my_tosort_unicode(uni_plane, &s_wc);
my_tosort_unicode(uni_plane, &t_wc);

if ( s_wc != t_wc )
{
Expand Down Expand Up @@ -7697,7 +7680,7 @@ my_hash_sort_utf8mb4_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_unicode(uni_plane, &wc, cs->state);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF));
if (wc > 0xFFFF)
{
Expand Down

0 comments on commit 2ad287c

Please sign in to comment.