Skip to content

Commit

Permalink
MDEV-31071 Refactor case folding data types in Unicode collations
Browse files Browse the repository at this point in the history
This is a non-functional change. It changes how case folding data
and weight data (for simple Unicode collations) are stored:

- Removing data types MY_UNICASE_CHARACTER, MY_UNICASE_INFO
- Using data types MY_CASEFOLD_CHARACTER, MY_CASEFOLD_INFO instead.

This patch changes simple Unicode collations in the same way
that MDEV-30695 previously changed Asian collations.

No new MTR tests are needed. The underlying code is thoroughly
covered by a number of ctype_*_ws.test and ctype_*_casefold.test
files, which were added recently in preparation
for this change.

Old and new Unicode data layout
-------------------------------

Case folding data is now stored in separate tables
consisting of MY_CASEFOLD_CHARACTER elements with two members:

    typedef struct casefold_info_char_t
    {
      uint32 toupper;
      uint32 tolower;
    } MY_CASEFOLD_CHARACTER;

while weight data (for simple non-UCA collations xxx_general_ci
and xxx_general_mysql500_ci) is stored in separate arrays of
uint16 elements.

Before this change, case folding data and simple weight data were
stored together, in tables of elements with the following three members:

    typedef struct unicase_info_char_st
    {
      uint32 toupper;
      uint32 tolower;
      uint32 sort;          /* weights for simple collations */
    } MY_UNICASE_CHARACTER;

This data format was redundant, because weights (the "sort" member) were
needed only for these two simple Unicode collations:
- xxx_general_ci
- xxx_general_mysql500_ci

Adding case folding information for Unicode-14.0.0 using the old
format would needlessly waste memory.

Detailed changes
----------------
- Changing the underlying data types as described above

- Including unidata-dump.c into the sources.
  This program was earlier used to dump UnicodeData.txt
  (e.g. https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt)
  into MySQL / MariaDB source files.
  It was originally written in 2002, but until now it has not been
  distributed together with the MySQL / MariaDB sources.

- Removing the old-format Unicode data previously dumped from UnicodeData.txt
  (versions 3.0.0 and 5.2.0) from ctype-utf8.c.
  Adding Unicode data in the new format into separate header files,
  to make the code easier to maintain:

    - ctype-unicode300-casefold.h
    - ctype-unicode300-casefold-tr.h
    - ctype-unicode300-general_ci.h
    - ctype-unicode300-general_mysql500_ci.h
    - ctype-unicode520-casefold.h

- Adding a new file ctype-unidata.c as an aggregator for
  the header files listed above.
  • Loading branch information
abarkov committed Apr 18, 2023
1 parent 2ad287c commit 6075f12
Show file tree
Hide file tree
Showing 29 changed files with 7,471 additions and 5,195 deletions.
25 changes: 2 additions & 23 deletions include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ extern "C" {
typedef const struct my_charset_handler_st MY_CHARSET_HANDLER;
typedef const struct my_collation_handler_st MY_COLLATION_HANDLER;

typedef const struct unicase_info_st MY_UNICASE_INFO;
typedef const struct casefold_info_st MY_CASEFOLD_INFO;
typedef const struct uni_ctype_st MY_UNI_CTYPE;
typedef const struct my_uni_idx_st MY_UNI_IDX;
Expand All @@ -97,29 +96,10 @@ struct casefold_info_st
{
my_wc_t maxchar;
const MY_CASEFOLD_CHARACTER * const *page;
const uint16 * const *simple_weight; /* For general_ci-alike collations */
};


typedef struct unicase_info_char_st
{
uint32 toupper;
uint32 tolower;
uint32 sort;
} MY_UNICASE_CHARACTER;


struct unicase_info_st
{
my_wc_t maxchar;
MY_UNICASE_CHARACTER **page;
};


extern MY_UNICASE_INFO my_unicase_default;
extern MY_UNICASE_INFO my_unicase_turkish;
extern MY_UNICASE_INFO my_unicase_mysql500;
extern MY_UNICASE_INFO my_unicase_unicode520;

#define MY_UCA_MAX_CONTRACTION 6
/*
The DUCET tables in ctype-uca.c are dumped with a limit of 8 weights
Expand Down Expand Up @@ -795,7 +775,6 @@ struct charset_info_st
const uint16 *tab_to_uni;
MY_UNI_IDX *tab_from_uni;
MY_CASEFOLD_INFO *casefold;
MY_UNICASE_INFO *caseinfo;
const uchar *state_map;
const uchar *ident_map;
uint strxfrm_multiply;
Expand Down Expand Up @@ -1691,7 +1670,7 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str, const char *str_end,
const char *wildstr, const char *wildend,
int escape, int w_one, int w_many,
MY_UNICASE_INFO *weights);
MY_CASEFOLD_INFO *weights);

extern my_bool my_parse_charset_xml(MY_CHARSET_LOADER *loader,
const char *buf, size_t buflen);
Expand Down
1 change: 1 addition & 0 deletions strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ SET(STRINGS_SOURCES bchange.c bmove_upp.c ctype-big5.c ctype-bin.c ctype-cp932.c
ctype-czech.c ctype-euc_kr.c ctype-eucjpms.c ctype-extra.c ctype-gb2312.c ctype-gbk.c
ctype-latin1.c ctype-mb.c ctype-simple.c ctype-sjis.c ctype-tis620.c ctype-uca.c
ctype-ucs2.c ctype-ujis.c ctype-utf8.c ctype-win1250ch.c ctype.c decimal.c dtoa.c int2str.c
ctype-unidata.c
is_prefix.c llstr.c longlong2str.c my_strtoll10.c my_vsnprintf.c
str2int.c strcend.c strend.c strfill.c strmake.c strmov.c strnmov.c
strxmov.c strxnmov.c xml.c
Expand Down
1 change: 0 additions & 1 deletion strings/conf_to_src.c
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,6 @@ void dispcset(FILE *f,CHARSET_INFO *cs)

fprintf(f," NULL, /* from_uni */\n");
fprintf(f," NULL, /* casefold */\n");
fprintf(f," &my_unicase_default, /* caseinfo */\n");
fprintf(f," NULL, /* state map */\n");
fprintf(f," NULL, /* ident map */\n");
fprintf(f," 1, /* strxfrm_multiply*/\n");
Expand Down
7 changes: 2 additions & 5 deletions strings/ctype-big5.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_big5[256]=
static MY_CASEFOLD_INFO my_casefold_big5=
{
0xFFFF,
my_casefold_pages_big5
my_casefold_pages_big5,
NULL /* ws */
};


Expand Down Expand Up @@ -6847,7 +6848,6 @@ struct charset_info_st my_charset_big5_chinese_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_big5, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -6879,7 +6879,6 @@ struct charset_info_st my_charset_big5_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_big5, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -6911,7 +6910,6 @@ struct charset_info_st my_charset_big5_chinese_nopad_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_big5, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -6943,7 +6941,6 @@ struct charset_info_st my_charset_big5_nopad_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_big5, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down
1 change: 0 additions & 1 deletion strings/ctype-bin.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,6 @@ struct charset_info_st my_charset_bin =
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
NULL, /* casefold */
&my_unicase_default, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down
7 changes: 2 additions & 5 deletions strings/ctype-cp932.c
Original file line number Diff line number Diff line change
Expand Up @@ -1706,7 +1706,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_cp932[256]=
MY_CASEFOLD_INFO my_casefold_cp932=
{
0xFFFF,
my_casefold_pages_cp932
my_casefold_pages_cp932,
NULL /* ws */
};


Expand Down Expand Up @@ -34805,7 +34806,6 @@ struct charset_info_st my_charset_cp932_japanese_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_cp932, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -34836,7 +34836,6 @@ struct charset_info_st my_charset_cp932_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_cp932, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -34868,7 +34867,6 @@ struct charset_info_st my_charset_cp932_japanese_nopad_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_cp932, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -34899,7 +34897,6 @@ struct charset_info_st my_charset_cp932_nopad_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_cp932, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down
1 change: 0 additions & 1 deletion strings/ctype-czech.c
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,6 @@ struct charset_info_st my_charset_latin2_czech_cs =
tab_8859_2_uni, /* tab_to_uni */
idx_uni_8859_2, /* tab_from_uni */
NULL, /* casefold */
&my_unicase_default,/* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
4, /* strxfrm_multiply */
Expand Down
7 changes: 2 additions & 5 deletions strings/ctype-euc_kr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1483,7 +1483,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_euckr[256]=
static MY_CASEFOLD_INFO my_casefold_euckr=
{
0xFFFF,
my_casefold_pages_euckr
my_casefold_pages_euckr,
NULL /* ws */
};


Expand Down Expand Up @@ -10095,7 +10096,6 @@ struct charset_info_st my_charset_euckr_korean_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_euckr, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -10127,7 +10127,6 @@ struct charset_info_st my_charset_euckr_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_euckr, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -10159,7 +10158,6 @@ struct charset_info_st my_charset_euckr_korean_nopad_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_euckr, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -10191,7 +10189,6 @@ struct charset_info_st my_charset_euckr_nopad_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_euckr, /* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down
7 changes: 2 additions & 5 deletions strings/ctype-eucjpms.c
Original file line number Diff line number Diff line change
Expand Up @@ -1779,7 +1779,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_eucjpms[512]=
static MY_CASEFOLD_INFO my_casefold_eucjpms=
{
0x0FFFF,
my_casefold_pages_eucjpms
my_casefold_pages_eucjpms,
NULL /* ws */
};


Expand Down Expand Up @@ -67634,7 +67635,6 @@ struct charset_info_st my_charset_eucjpms_japanese_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_eucjpms,/* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -67666,7 +67666,6 @@ struct charset_info_st my_charset_eucjpms_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_eucjpms,/* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -67698,7 +67697,6 @@ struct charset_info_st my_charset_eucjpms_japanese_nopad_ci=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_eucjpms,/* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down Expand Up @@ -67730,7 +67728,6 @@ struct charset_info_st my_charset_eucjpms_nopad_bin=
NULL, /* tab_to_uni */
NULL, /* tab_from_uni */
&my_casefold_eucjpms,/* casefold */
NULL, /* caseinfo */
NULL, /* state_map */
NULL, /* ident_map */
1, /* strxfrm_multiply */
Expand Down
Loading

0 comments on commit 6075f12

Please sign in to comment.