Skip to content

Commit 0736c03

Browse files
abarkov authored and sanja-byelkin committed
MDEV-27009 Add UCA-14.0.0 collations - Adding implicit weight handling for Unicode-14.0.0
1. Adding separate functions for different Unicode versions:
   - my_uca_520_implicit_weight_primary()
     It calculates implicit weights according to the old algorithm
     that we used to dump Unicode-5.2.0 weights.
   - my_uca_1400_implicit_weight_primary()
     It calculates implicit weights according to
     https://unicode.org/reports/tr10/#Values_For_Base_Table
     as of November 2021, Unicode version 14.0.0.
2. Adding the "@Version" line recognition when dumping allkeys.txt.
   Implicit weights are dumped according to @Version.
3. Dumping the scanned version as a "#define".
4. Removing dumping MY_UCA_NPAGES, MY_UCA_NCHARS, MY_UCA_CMASK,
   MY_UCA_PSHIFT, as they are defined in ctype-uca.c.
   Removing dumping of "main()", it's not needed. The intent is to
   generate an *.h file which can be put directly into the MariaDB
   source tree.
5. Adding a structure MY_DUCET. It now contains weights for single
   characters and version related members. Later we'll add
   contractions and logical positions in here.
1 parent bb84f61 commit 0736c03

File tree

4 files changed

+316
-37
lines changed

4 files changed

+316
-37
lines changed

strings/ctype-uca.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31694,7 +31694,7 @@ static inline void
3169431694
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
3169531695
{
3169631696
MY_UCA_IMPLICIT_WEIGHT weight;
31697-
weight= my_uca_520_implicit_weight_on_level(code, level);
31697+
weight= my_uca_implicit_weight_on_level(520, code, level);
3169831698
to[0]= weight.weight[0];
3169931699
to[1]= weight.weight[1];
3170031700
to[2]= 0;

strings/ctype-uca.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
1818
MA 02110-1335 USA */
1919

20+
/*
  Pack a three-component version x.y.z into one unsigned number,
  computed as x*100 + y*10 + z. For example (5,2,0) gives 520 and
  (14,0,0) gives 1400.
  NOTE(review): different versions yield different IDs only while
  y and z stay below 10 - confirm this holds for every version used.
*/
#define MY_UCA_VERSION_ID(x,y,z) ((uint) ((x) * 100 + (y) * 10 + (z)))
2021

2122
/*
2223
Implicit weight handling is done according to
@@ -105,6 +106,18 @@ my_uca_520_implicit_weight_primary(my_wc_t code)
105106
}
106107

107108

109+
#include "ctype-uca1400.h"
110+
111+
112+
static inline MY_UCA_IMPLICIT_WEIGHT
113+
my_uca_implicit_weight_primary(uint version, my_wc_t code)
114+
{
115+
return version >= 1400 ?
116+
my_uca_1400_implicit_weight_primary(code) :
117+
my_uca_520_implicit_weight_primary(code);
118+
}
119+
120+
108121
static inline MY_UCA_IMPLICIT_WEIGHT
109122
my_uca_implicit_weight_secondary()
110123
{
@@ -136,11 +149,11 @@ my_uca_implicit_weight_quaternary()
136149

137150

138151
static inline MY_UCA_IMPLICIT_WEIGHT
139-
my_uca_520_implicit_weight_on_level(my_wc_t code, uint level)
152+
my_uca_implicit_weight_on_level(uint version, my_wc_t code, uint level)
140153
{
141154
switch (level) {
142155
case 0:
143-
return my_uca_520_implicit_weight_primary(code);
156+
return my_uca_implicit_weight_primary(version, code);
144157
case 1:
145158
return my_uca_implicit_weight_secondary();
146159
case 2:

strings/ctype-uca1400.h

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#ifndef CTYPE_UCA_1400_H
2+
#define CTYPE_UCA_1400_H
3+
/* Copyright (c) 2021, MariaDB
4+
5+
This library is free software; you can redistribute it and/or
6+
modify it under the terms of the GNU Library General Public
7+
License as published by the Free Software Foundation; version 2
8+
of the License.
9+
10+
This library is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13+
Library General Public License for more details.
14+
15+
You should have received a copy of the GNU Library General Public
16+
License along with this library; if not, write to the Free
17+
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
18+
MA 02110-1335 USA */
19+
20+
21+
/*
22+
17000..187FF; Tangut [6144]
23+
18800..18AFF; Tangut Components [768]
24+
18D00..18D7F; Tangut Supplement [128]
25+
*/
26+
static inline my_bool
27+
my_uca_1400_is_assigned_tangut(my_wc_t code)
28+
{
29+
return (code >= 0x17000 && code <= 0x187FF) ||
30+
(code >= 0x18800 && code <= 0x18AFF) ||
31+
(code >= 0x18D00 && code <= 0x18D7F);
32+
}
33+
34+
static inline MY_UCA_IMPLICIT_WEIGHT
35+
my_uca_1400_implicit_weight_primary_tangut(my_wc_t code)
36+
{
37+
MY_UCA_IMPLICIT_WEIGHT res;
38+
res.weight[0]= 0xFB00;
39+
res.weight[1]= (uint16) (code - 0x17000) | 0x8000;
40+
return res;
41+
}
42+
43+
44+
/*
45+
1B170..1B2FF; Nushu [400]
46+
*/
47+
static inline my_bool
48+
my_uca_1400_is_assigned_nushu(my_wc_t code)
49+
{
50+
return code >= 0x1B170 && code <= 0x1B2FF;
51+
}
52+
53+
static inline MY_UCA_IMPLICIT_WEIGHT
54+
my_uca_1400_implicit_weight_primary_nushu(my_wc_t code)
55+
{
56+
MY_UCA_IMPLICIT_WEIGHT res;
57+
res.weight[0]= 0xFB01;
58+
res.weight[1]= (uint16) (code - 0x1B170) | 0x8000;
59+
return res;
60+
}
61+
62+
63+
/*
64+
18B00..18CFF; Khitan Small Script [512]
65+
*/
66+
static inline my_bool
67+
my_uca_1400_is_assigned_khitan_small_script(my_wc_t code)
68+
{
69+
return code >= 0x18B00 && code <= 0x18CFF;
70+
}
71+
72+
static inline MY_UCA_IMPLICIT_WEIGHT
73+
my_uca_1400_implicit_weight_primary_khitan(my_wc_t code)
74+
{
75+
MY_UCA_IMPLICIT_WEIGHT res;
76+
res.weight[0]= 0xFB02;
77+
res.weight[1]= (uint16) (code - 0x18B00) | 0x8000;
78+
return res;
79+
}
80+
81+
82+
/*
83+
Unified_Ideograph=True AND
84+
((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs))
85+
86+
https://www.unicode.org/Public/14.0.0/ucd/Blocks.txt
87+
88+
4E00..9FFF; CJK Unified Ideographs
89+
F900..FAFF; CJK Compatibility Ideographs
90+
91+
https://www.unicode.org/Public/14.0.0/ucd/PropList.txt
92+
93+
4E00..9FFF ; Unified_Ideograph # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
94+
FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F
95+
FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11
96+
FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14
97+
FA1F ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA1F
98+
FA21 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA21
99+
FA23..FA24 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPATIBILITY IDEOGRAPH-FA24
100+
FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPATIBILITY IDEOGRAPH-FA29
101+
*/
102+
static inline my_bool
103+
my_uca_1400_is_core_han_unified_ideograph(my_wc_t code)
104+
{
105+
return (code >= 0x4E00 && code <= 0x9FFF) ||
106+
(code >= 0xFA0E && code <= 0xFA0F) ||
107+
(code == 0xFA11) ||
108+
(code >= 0xFA13 && code <= 0xFA14) ||
109+
(code == 0xFA1F) ||
110+
(code == 0xFA21) ||
111+
(code >= 0xFA23 && code <= 0xFA24) ||
112+
(code >= 0xFA27 && code <= 0xFA29);
113+
}
114+
115+
116+
/*
117+
Unified_Ideograph=True AND NOT
118+
((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs))
119+
120+
https://www.unicode.org/Public/14.0.0/ucd/Blocks.txt
121+
122+
3400..4DBF; CJK Unified Ideographs Extension A
123+
20000..2A6DF; CJK Unified Ideographs Extension B
124+
2A700..2B73F; CJK Unified Ideographs Extension C
125+
2B740..2B81F; CJK Unified Ideographs Extension D
126+
2B820..2CEAF; CJK Unified Ideographs Extension E
127+
2CEB0..2EBEF; CJK Unified Ideographs Extension F
128+
30000..3134F; CJK Unified Ideographs Extension G
129+
130+
https://www.unicode.org/Public/14.0.0/ucd/PropList.txt
131+
132+
3400..4DBF ; Unified_Ideograph # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
133+
20000..2A6DF ; Unified_Ideograph # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
134+
2A700..2B738 ; Unified_Ideograph # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
135+
2B740..2B81D ; Unified_Ideograph # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
136+
2B820..2CEA1 ; Unified_Ideograph # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
137+
2CEB0..2EBE0 ; Unified_Ideograph # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
138+
30000..3134A ; Unified_Ideograph # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
139+
*/
140+
static inline my_bool
141+
my_uca_1400_is_other_han_unified_ideograph(my_wc_t code)
142+
{
143+
return (code >= 0x3400 && code <= 0x4DBF) ||
144+
(code >= 0x20000 && code <= 0x2A6DF) ||
145+
(code >= 0x2A700 && code <= 0x2B738) ||
146+
(code >= 0x2B740 && code <= 0x2B81D) ||
147+
(code >= 0x2B820 && code <= 0x2CEA1) ||
148+
(code >= 0x2CEB0 && code <= 0x2EBE0) ||
149+
(code >= 0x30000 && code <= 0x3134A);
150+
}
151+
152+
153+
/*
154+
See section "Computing Implicit Weights" in
155+
https://unicode.org/reports/tr10/#Values_For_Base_Table
156+
*/
157+
static inline MY_UCA_IMPLICIT_WEIGHT
158+
my_uca_1400_implicit_weight_primary(my_wc_t code)
159+
{
160+
if (my_uca_1400_is_core_han_unified_ideograph(code))
161+
return my_uca_implicit_weight_primary_default(0xFB40, code);
162+
163+
if (my_uca_1400_is_other_han_unified_ideograph(code))
164+
return my_uca_implicit_weight_primary_default(0xFB80, code);
165+
166+
if (my_uca_1400_is_assigned_tangut(code))
167+
return my_uca_1400_implicit_weight_primary_tangut(code);
168+
169+
if (my_uca_1400_is_assigned_nushu(code))
170+
return my_uca_1400_implicit_weight_primary_nushu(code);
171+
172+
if (my_uca_1400_is_assigned_khitan_small_script(code))
173+
return my_uca_1400_implicit_weight_primary_khitan(code);
174+
175+
/* Unassigned - Any other code point */
176+
return my_uca_implicit_weight_primary_default(0xFBC0, code);
177+
}
178+
179+
#endif /* CTYPE_UCA_1400_H */

0 commit comments

Comments
 (0)