Skip to content

Commit 6075f12

Browse files
committed
MDEV-31071 Refactor case folding data types in Unicode collations
This is a non-functional change. It changes how case folding data and weight data (for simple Unicode collations) are stored:
- Removing data types MY_UNICASE_CHARACTER, MY_UNICASE_INFO
- Using data types MY_CASEFOLD_CHARACTER, MY_CASEFOLD_INFO instead.

This patch changes simple Unicode collations in the same way that MDEV-30695 previously changed Asian collations.

No new MTR tests are needed. The underlying code is thoroughly covered by a number of ctype_*_ws.test and ctype_*_casefold.test files, which were added recently in preparation for this change.

Old and new Unicode data layout
-------------------------------

Case folding data is now stored in separate tables consisting of MY_CASEFOLD_CHARACTER elements with two members:

  typedef struct casefold_info_char_t
  {
    uint32 toupper;
    uint32 tolower;
  } MY_CASEFOLD_CHARACTER;

while weight data (for the simple non-UCA collations xxx_general_ci and xxx_general_mysql500_ci) is stored in separate arrays of uint16 elements.

Before this change, case folding data and simple weight data were stored together, in tables of elements with three members:

  typedef struct unicase_info_char_st
  {
    uint32 toupper;
    uint32 tolower;
    uint32 sort; /* weights for simple collations */
  } MY_UNICASE_CHARACTER;

This data format was redundant, because weights (the "sort" member) were needed only for these two simple Unicode collations:
- xxx_general_ci
- xxx_general_mysql500_ci

Adding case folding information for Unicode-14.0.0 using the old format would waste memory for no benefit.

Detailed changes
----------------
- Changing the underlying data types as described above
- Including unidata-dump.c into the sources. This program was earlier used to dump UnicodeData.txt (e.g. https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt) into MySQL / MariaDB source files. It was originally written in 2002, but had not previously been distributed together with the MySQL / MariaDB sources.
- Removing the old-format Unicode data earlier dumped from UnicodeData.txt (versions 3.0.0 and 5.2.0) from ctype-utf8.c. Adding Unicode data in the new format into separate header files, to make the code easier to maintain:
  - ctype-unicode300-casefold.h
  - ctype-unicode300-casefold-tr.h
  - ctype-unicode300-general_ci.h
  - ctype-unicode300-general_mysql500_ci.h
  - ctype-unicode520-casefold.h
- Adding a new file ctype-unidata.c as an aggregator for the header files listed above.
1 parent 2ad287c commit 6075f12

29 files changed

+7471
-5195
lines changed

include/m_ctype.h

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ extern "C" {
7979
typedef const struct my_charset_handler_st MY_CHARSET_HANDLER;
8080
typedef const struct my_collation_handler_st MY_COLLATION_HANDLER;
8181

82-
typedef const struct unicase_info_st MY_UNICASE_INFO;
8382
typedef const struct casefold_info_st MY_CASEFOLD_INFO;
8483
typedef const struct uni_ctype_st MY_UNI_CTYPE;
8584
typedef const struct my_uni_idx_st MY_UNI_IDX;
@@ -97,29 +96,10 @@ struct casefold_info_st
9796
{
9897
my_wc_t maxchar;
9998
const MY_CASEFOLD_CHARACTER * const *page;
99+
const uint16 * const *simple_weight; /* For general_ci-alike collations */
100100
};
101101

102102

103-
typedef struct unicase_info_char_st
104-
{
105-
uint32 toupper;
106-
uint32 tolower;
107-
uint32 sort;
108-
} MY_UNICASE_CHARACTER;
109-
110-
111-
struct unicase_info_st
112-
{
113-
my_wc_t maxchar;
114-
MY_UNICASE_CHARACTER **page;
115-
};
116-
117-
118-
extern MY_UNICASE_INFO my_unicase_default;
119-
extern MY_UNICASE_INFO my_unicase_turkish;
120-
extern MY_UNICASE_INFO my_unicase_mysql500;
121-
extern MY_UNICASE_INFO my_unicase_unicode520;
122-
123103
#define MY_UCA_MAX_CONTRACTION 6
124104
/*
125105
The DUCET tables in ctype-uca.c are dumped with a limit of 8 weights
@@ -795,7 +775,6 @@ struct charset_info_st
795775
const uint16 *tab_to_uni;
796776
MY_UNI_IDX *tab_from_uni;
797777
MY_CASEFOLD_INFO *casefold;
798-
MY_UNICASE_INFO *caseinfo;
799778
const uchar *state_map;
800779
const uchar *ident_map;
801780
uint strxfrm_multiply;
@@ -1691,7 +1670,7 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
16911670
const char *str, const char *str_end,
16921671
const char *wildstr, const char *wildend,
16931672
int escape, int w_one, int w_many,
1694-
MY_UNICASE_INFO *weights);
1673+
MY_CASEFOLD_INFO *weights);
16951674

16961675
extern my_bool my_parse_charset_xml(MY_CHARSET_LOADER *loader,
16971676
const char *buf, size_t buflen);

strings/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ SET(STRINGS_SOURCES bchange.c bmove_upp.c ctype-big5.c ctype-bin.c ctype-cp932.c
2222
ctype-czech.c ctype-euc_kr.c ctype-eucjpms.c ctype-extra.c ctype-gb2312.c ctype-gbk.c
2323
ctype-latin1.c ctype-mb.c ctype-simple.c ctype-sjis.c ctype-tis620.c ctype-uca.c
2424
ctype-ucs2.c ctype-ujis.c ctype-utf8.c ctype-win1250ch.c ctype.c decimal.c dtoa.c int2str.c
25+
ctype-unidata.c
2526
is_prefix.c llstr.c longlong2str.c my_strtoll10.c my_vsnprintf.c
2627
str2int.c strcend.c strend.c strfill.c strmake.c strmov.c strnmov.c
2728
strxmov.c strxnmov.c xml.c

strings/conf_to_src.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,6 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
409409

410410
fprintf(f," NULL, /* from_uni */\n");
411411
fprintf(f," NULL, /* casefold */\n");
412-
fprintf(f," &my_unicase_default, /* caseinfo */\n");
413412
fprintf(f," NULL, /* state map */\n");
414413
fprintf(f," NULL, /* ident map */\n");
415414
fprintf(f," 1, /* strxfrm_multiply*/\n");

strings/ctype-big5.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_big5[256]=
806806
static MY_CASEFOLD_INFO my_casefold_big5=
807807
{
808808
0xFFFF,
809-
my_casefold_pages_big5
809+
my_casefold_pages_big5,
810+
NULL /* ws */
810811
};
811812

812813

@@ -6847,7 +6848,6 @@ struct charset_info_st my_charset_big5_chinese_ci=
68476848
NULL, /* tab_to_uni */
68486849
NULL, /* tab_from_uni */
68496850
&my_casefold_big5, /* casefold */
6850-
NULL, /* caseinfo */
68516851
NULL, /* state_map */
68526852
NULL, /* ident_map */
68536853
1, /* strxfrm_multiply */
@@ -6879,7 +6879,6 @@ struct charset_info_st my_charset_big5_bin=
68796879
NULL, /* tab_to_uni */
68806880
NULL, /* tab_from_uni */
68816881
&my_casefold_big5, /* casefold */
6882-
NULL, /* caseinfo */
68836882
NULL, /* state_map */
68846883
NULL, /* ident_map */
68856884
1, /* strxfrm_multiply */
@@ -6911,7 +6910,6 @@ struct charset_info_st my_charset_big5_chinese_nopad_ci=
69116910
NULL, /* tab_to_uni */
69126911
NULL, /* tab_from_uni */
69136912
&my_casefold_big5, /* casefold */
6914-
NULL, /* caseinfo */
69156913
NULL, /* state_map */
69166914
NULL, /* ident_map */
69176915
1, /* strxfrm_multiply */
@@ -6943,7 +6941,6 @@ struct charset_info_st my_charset_big5_nopad_bin=
69436941
NULL, /* tab_to_uni */
69446942
NULL, /* tab_from_uni */
69456943
&my_casefold_big5, /* casefold */
6946-
NULL, /* caseinfo */
69476944
NULL, /* state_map */
69486945
NULL, /* ident_map */
69496946
1, /* strxfrm_multiply */

strings/ctype-bin.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,6 @@ struct charset_info_st my_charset_bin =
625625
NULL, /* tab_to_uni */
626626
NULL, /* tab_from_uni */
627627
NULL, /* casefold */
628-
&my_unicase_default, /* caseinfo */
629628
NULL, /* state_map */
630629
NULL, /* ident_map */
631630
1, /* strxfrm_multiply */

strings/ctype-cp932.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_cp932[256]=
17061706
MY_CASEFOLD_INFO my_casefold_cp932=
17071707
{
17081708
0xFFFF,
1709-
my_casefold_pages_cp932
1709+
my_casefold_pages_cp932,
1710+
NULL /* ws */
17101711
};
17111712

17121713

@@ -34805,7 +34806,6 @@ struct charset_info_st my_charset_cp932_japanese_ci=
3480534806
NULL, /* tab_to_uni */
3480634807
NULL, /* tab_from_uni */
3480734808
&my_casefold_cp932, /* casefold */
34808-
NULL, /* caseinfo */
3480934809
NULL, /* state_map */
3481034810
NULL, /* ident_map */
3481134811
1, /* strxfrm_multiply */
@@ -34836,7 +34836,6 @@ struct charset_info_st my_charset_cp932_bin=
3483634836
NULL, /* tab_to_uni */
3483734837
NULL, /* tab_from_uni */
3483834838
&my_casefold_cp932, /* casefold */
34839-
NULL, /* caseinfo */
3484034839
NULL, /* state_map */
3484134840
NULL, /* ident_map */
3484234841
1, /* strxfrm_multiply */
@@ -34868,7 +34867,6 @@ struct charset_info_st my_charset_cp932_japanese_nopad_ci=
3486834867
NULL, /* tab_to_uni */
3486934868
NULL, /* tab_from_uni */
3487034869
&my_casefold_cp932, /* casefold */
34871-
NULL, /* caseinfo */
3487234870
NULL, /* state_map */
3487334871
NULL, /* ident_map */
3487434872
1, /* strxfrm_multiply */
@@ -34899,7 +34897,6 @@ struct charset_info_st my_charset_cp932_nopad_bin=
3489934897
NULL, /* tab_to_uni */
3490034898
NULL, /* tab_from_uni */
3490134899
&my_casefold_cp932, /* casefold */
34902-
NULL, /* caseinfo */
3490334900
NULL, /* state_map */
3490434901
NULL, /* ident_map */
3490534902
1, /* strxfrm_multiply */

strings/ctype-czech.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,6 @@ struct charset_info_st my_charset_latin2_czech_cs =
618618
tab_8859_2_uni, /* tab_to_uni */
619619
idx_uni_8859_2, /* tab_from_uni */
620620
NULL, /* casefold */
621-
&my_unicase_default,/* caseinfo */
622621
NULL, /* state_map */
623622
NULL, /* ident_map */
624623
4, /* strxfrm_multiply */

strings/ctype-euc_kr.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1483,7 +1483,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_euckr[256]=
14831483
static MY_CASEFOLD_INFO my_casefold_euckr=
14841484
{
14851485
0xFFFF,
1486-
my_casefold_pages_euckr
1486+
my_casefold_pages_euckr,
1487+
NULL /* ws */
14871488
};
14881489

14891490

@@ -10095,7 +10096,6 @@ struct charset_info_st my_charset_euckr_korean_ci=
1009510096
NULL, /* tab_to_uni */
1009610097
NULL, /* tab_from_uni */
1009710098
&my_casefold_euckr, /* casefold */
10098-
NULL, /* caseinfo */
1009910099
NULL, /* state_map */
1010010100
NULL, /* ident_map */
1010110101
1, /* strxfrm_multiply */
@@ -10127,7 +10127,6 @@ struct charset_info_st my_charset_euckr_bin=
1012710127
NULL, /* tab_to_uni */
1012810128
NULL, /* tab_from_uni */
1012910129
&my_casefold_euckr, /* casefold */
10130-
NULL, /* caseinfo */
1013110130
NULL, /* state_map */
1013210131
NULL, /* ident_map */
1013310132
1, /* strxfrm_multiply */
@@ -10159,7 +10158,6 @@ struct charset_info_st my_charset_euckr_korean_nopad_ci=
1015910158
NULL, /* tab_to_uni */
1016010159
NULL, /* tab_from_uni */
1016110160
&my_casefold_euckr, /* casefold */
10162-
NULL, /* caseinfo */
1016310161
NULL, /* state_map */
1016410162
NULL, /* ident_map */
1016510163
1, /* strxfrm_multiply */
@@ -10191,7 +10189,6 @@ struct charset_info_st my_charset_euckr_nopad_bin=
1019110189
NULL, /* tab_to_uni */
1019210190
NULL, /* tab_from_uni */
1019310191
&my_casefold_euckr, /* casefold */
10194-
NULL, /* caseinfo */
1019510192
NULL, /* state_map */
1019610193
NULL, /* ident_map */
1019710194
1, /* strxfrm_multiply */

strings/ctype-eucjpms.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,7 +1779,8 @@ static const MY_CASEFOLD_CHARACTER *my_casefold_pages_eucjpms[512]=
17791779
static MY_CASEFOLD_INFO my_casefold_eucjpms=
17801780
{
17811781
0x0FFFF,
1782-
my_casefold_pages_eucjpms
1782+
my_casefold_pages_eucjpms,
1783+
NULL /* ws */
17831784
};
17841785

17851786

@@ -67634,7 +67635,6 @@ struct charset_info_st my_charset_eucjpms_japanese_ci=
6763467635
NULL, /* tab_to_uni */
6763567636
NULL, /* tab_from_uni */
6763667637
&my_casefold_eucjpms,/* casefold */
67637-
NULL, /* caseinfo */
6763867638
NULL, /* state_map */
6763967639
NULL, /* ident_map */
6764067640
1, /* strxfrm_multiply */
@@ -67666,7 +67666,6 @@ struct charset_info_st my_charset_eucjpms_bin=
6766667666
NULL, /* tab_to_uni */
6766767667
NULL, /* tab_from_uni */
6766867668
&my_casefold_eucjpms,/* casefold */
67669-
NULL, /* caseinfo */
6767067669
NULL, /* state_map */
6767167670
NULL, /* ident_map */
6767267671
1, /* strxfrm_multiply */
@@ -67698,7 +67697,6 @@ struct charset_info_st my_charset_eucjpms_japanese_nopad_ci=
6769867697
NULL, /* tab_to_uni */
6769967698
NULL, /* tab_from_uni */
6770067699
&my_casefold_eucjpms,/* casefold */
67701-
NULL, /* caseinfo */
6770267700
NULL, /* state_map */
6770367701
NULL, /* ident_map */
6770467702
1, /* strxfrm_multiply */
@@ -67730,7 +67728,6 @@ struct charset_info_st my_charset_eucjpms_nopad_bin=
6773067728
NULL, /* tab_to_uni */
6773167729
NULL, /* tab_from_uni */
6773267730
&my_casefold_eucjpms,/* casefold */
67733-
NULL, /* caseinfo */
6773467731
NULL, /* state_map */
6773567732
NULL, /* ident_map */
6773667733
1, /* strxfrm_multiply */

0 commit comments

Comments
 (0)