Skip to content

Commit 02de93d

Browse files
committed
MDEV-27154 allkeys.txt based tests for Unicode-4.0.0 and 5.2.0
1 parent 897d8c5 commit 02de93d

File tree

7 files changed

+38031
-0
lines changed

7 files changed

+38031
-0
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#
2+
# Make a table with all Unicode characters
3+
# in the range U+0000 .. U+10FFFF
4+
#
5+
6+
# Create an empty table (LIMIT 0 copies no rows) whose column types are
# derived from the SELECT literals: `code` from the integer 1 and `str` from
# the one-character string ' '.  The recorded results show `str` taking the
# connection character set and collation, so the including test is expected
# to issue SET NAMES before sourcing this file.
# SHOW CREATE TABLE records the resulting definition in the .result file.
CREATE TABLE allchars AS SELECT 1 AS code, ' ' AS str LIMIT 0;
7+
SHOW CREATE TABLE allchars;
8+
9+
# Helper table holding the integers 0..0xFFF (4096 rows), one INSERT per
# loop iteration.  The FOR ... END FOR body contains ';', so the mysqltest
# delimiter is switched to $$ while the loop is sent and restored to ';'
# afterwards.
CREATE TABLE t1tmp (a INT NOT NULL);
10+
DELIMITER $$;
11+
FOR i IN 0..0xFFF
12+
DO
13+
INSERT INTO t1tmp VALUES (i);
14+
END FOR;
15+
$$
16+
DELIMITER ;$$
17+
# Populate allchars with one row per code value 0x000000..0x10FFFF.
# t1tmp is joined with itself: t2.a supplies the low 12 bits (0..0xFFF) and
# t1.a, restricted to 0..0x10F, the high part, so code = t1.a*0x1000 + t2.a.
# That gives 0x110 * 0x1000 = 1114112 rows, which the final COUNT(*) records.
# CHAR(... USING utf32) builds the string for the numeric code as utf32.
# The helper table is dropped once allchars is filled.
INSERT INTO allchars SELECT
18+
t1.a*0x1000+t2.a,
19+
CHAR(t1.a*0x1000+t2.a USING utf32)
20+
FROM t1tmp t1, t1tmp t2
21+
WHERE t1.a BETWEEN 0 AND 0x10F;
22+
DROP TABLE t1tmp;
23+
SELECT COUNT(*) FROM allchars;
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#
2+
# Start of 10.8 tests
3+
#
4+
SET NAMES utf8mb4 COLLATE utf8mb4_bin;
5+
CREATE TABLE allchars AS SELECT 1 AS code, ' ' AS str LIMIT 0;
6+
SHOW CREATE TABLE allchars;
7+
Table Create Table
8+
allchars CREATE TABLE `allchars` (
9+
`code` int(1) NOT NULL,
10+
`str` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL
11+
) ENGINE=MyISAM DEFAULT CHARSET=latin1
12+
CREATE TABLE t1tmp (a INT NOT NULL);
13+
FOR i IN 0..0xFFF
14+
DO
15+
INSERT INTO t1tmp VALUES (i);
16+
END FOR;
17+
$$
18+
INSERT INTO allchars SELECT
19+
t1.a*0x1000+t2.a,
20+
CHAR(t1.a*0x1000+t2.a USING utf32)
21+
FROM t1tmp t1, t1tmp t2
22+
WHERE t1.a BETWEEN 0 AND 0x10F;
23+
DROP TABLE t1tmp;
24+
SELECT COUNT(*) FROM allchars;
25+
COUNT(*)
26+
1114112
27+
CREATE TABLE allkeys_txt (a TEXT, b TEXT, c TEXT) ENGINE=MyISAM;
28+
LOAD DATA INFILE '../../std_data/unicode/allkeys400.txt'
29+
INTO TABLE allkeys_txt FIELDS TERMINATED BY ';' (@a,@b,@qq)
30+
SET a=TRIM(@a), b=TRIM(REGEXP_SUBSTR(@b,'^[^#]*')), c=TRIM(REGEXP_SUBSTR(@b, '#.*$'));
31+
CREATE TABLE allkeys AS
32+
SELECT
33+
a,
34+
CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_bin AS str,
35+
HEX(WEIGHT_STRING(CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_unicode_ci)) as ws,
36+
REPLACE(REPLACE(REGEXP_REPLACE(b,'[[][.*](....)[.]....[.]....[.].{4,5}]','-\\1-'),'-0000-',''),'-','') AS wd,
37+
c
38+
FROM allkeys_txt
39+
WHERE a RLIKE '^[0-9A-Z]';
40+
ALTER TABLE allkeys ADD KEY(str(3));
41+
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE OCTET_LENGTH(str)<=3;
42+
COUNT(*) SUM(ws<>wd)
43+
12073 1
44+
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND OCTET_LENGTH(str)<=3;
45+
a ws wd
46+
FDFA FBC1FDFA 138713AB13C70209135013AB13AB13B70209138F13AB13C813B7020913BD138113AB13B0
47+
SELECT
48+
HEX(code),
49+
HEX(WEIGHT_STRING(str COLLATE utf8mb4_unicode_ci)) AS ws,
50+
CASE
51+
WHEN code >= 0x10000 THEN 'FFFD'
52+
WHEN code >= 0x3400 AND code <= 0x4DB5 THEN
53+
CONCAT(LPAD(HEX(0xFB80 + (code >> 15)),4,'0'),
54+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
55+
WHEN code >= 0x4E00 AND code <= 0x9FA5 THEN
56+
CONCAT(LPAD(HEX(0xFB40 + (code >> 15)),4,'0'),
57+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
58+
ELSE
59+
CONCAT(LPAD(HEX(0xFBC0 + (code >> 15)),4,'0'),
60+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
61+
END AS wd
62+
FROM allchars
63+
LEFT OUTER JOIN allkeys USING (str)
64+
WHERE allkeys.str IS NULL
65+
HAVING ws<>wd
66+
ORDER BY HEX(str);
67+
HEX(code) ws wd
68+
DROP TABLE allkeys_txt;
69+
DROP TABLE allkeys;
70+
DROP TABLE allchars;
71+
#
72+
# End of 10.8 tests
73+
#
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
--source include/have_utf32.inc
2+
--source include/have_utf8mb4.inc
3+
4+
--echo #
5+
--echo # Start of 10.8 tests
6+
--echo #
7+
8+
9+
# Set the connection character set/collation before sourcing the include:
# the include creates allchars.str from a string literal and, per the
# recorded result, the column ends up as utf8mb4 / utf8mb4_bin.
SET NAMES utf8mb4 COLLATE utf8mb4_bin;
10+
--source include/ctype_unicode_allchars.inc
11+
12+
#
13+
# Load allkeys.txt from Unicode-4.0.0
14+
#
15+
# The 4.0.0 file has four weight levels and an optional extra field
16+
# after the character name, e.g. "; QQK"
17+
#00A0 ; [*0209.0020.001B.00A0] # NO-BREAK SPACE; QQK
18+
#
19+
20+
# Stage the raw file.  Each input line is split on ';' into three user
# variables: @a (the code point field), @b (the collation elements plus the
# trailing '#' comment) and @qq (the optional extra field; read but not
# stored).  The table columns are then filled as:
#   a = @a, trimmed
#   b = the part of @b before the first '#', trimmed
#   c = the part of @b from the first '#' to the end, trimmed
CREATE TABLE allkeys_txt (a TEXT, b TEXT, c TEXT) ENGINE=MyISAM;
21+
LOAD DATA INFILE '../../std_data/unicode/allkeys400.txt'
22+
INTO TABLE allkeys_txt FIELDS TERMINATED BY ';' (@a,@b,@qq)
23+
SET a=TRIM(@a), b=TRIM(REGEXP_SUBSTR(@b,'^[^#]*')), c=TRIM(REGEXP_SUBSTR(@b, '#.*$'));
24+
# Derive the comparison table from the staged lines.  Only lines whose first
# field starts with a digit or an upper-case letter are kept (WHERE a RLIKE).
#   a   - the code point field as read from the file
#   str - the character(s) themselves: every 4-digit hex code is left-padded
#         with 0000 and every 5-digit code with 000 (8 hex digits each), the
#         spaces and '-' markers are removed, and the UNHEX-ed bytes are
#         read as utf32 and converted to utf8mb4 (utf8mb4_bin)
#   ws  - HEX of the weight string the server produces for that same string
#         under utf8mb4_unicode_ci
#   wd  - the weights expected from the file: the first 4-character field of
#         every [....] collation element in b, concatenated, with 0000
#         fields dropped
#   c   - the '#' comment part of the line
# The statement text is echoed into the .result file, so the str expression
# is repeated verbatim inside ws rather than factored out.
# NOTE(review): the KEY(str(3)) prefix index is presumably there to speed up
# the later join on str - confirm.
CREATE TABLE allkeys AS
25+
SELECT
26+
a,
27+
CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_bin AS str,
28+
HEX(WEIGHT_STRING(CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_unicode_ci)) as ws,
29+
REPLACE(REPLACE(REGEXP_REPLACE(b,'[[][.*](....)[.]....[.]....[.].{4,5}]','-\\1-'),'-0000-',''),'-','') AS wd,
30+
c
31+
FROM allkeys_txt
32+
WHERE a RLIKE '^[0-9A-Z]';
33+
ALTER TABLE allkeys ADD KEY(str(3));
34+
35+
#
36+
# Test explicit weights
37+
# utf8mb4_unicode_ci supports only BMP characters.
38+
# Built-in default contractions are not supported.
39+
# The (OCTET_LENGTH(str)<=3) part of the condition filters out
40+
# characters outside BMP and contractions.
41+
42+
# First query: number of checked entries and number of entries whose server
# weights (ws) differ from the weights listed in the file (wd).
# Second query: list those mismatches.
# Per the recorded result there is exactly one, FDFA: the file lists a long
# expansion for it while the server returns FBC1FDFA, which has the shape of
# the generic implicit weight used in the implicit-weight query of this test.
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE OCTET_LENGTH(str)<=3;
43+
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND OCTET_LENGTH(str)<=3;
44+
45+
#
46+
# Test implicit weights
47+
# Non-BMP characters all have the same weight FFFD.
48+
#
49+
50+
# Check every code point of allchars that has no entry in allkeys
# (anti-join: LEFT OUTER JOIN ... WHERE allkeys.str IS NULL).
#   ws - HEX of the weight string the server produces under utf8mb4_unicode_ci
#   wd - the expected implicit weight:
#          code >= 0x10000            -> the single weight FFFD
#          otherwise two 4-hex-digit weights,
#            BASE + (code >> 15)  followed by  0x8000 | (code & 0x7FFF)
#          with BASE = FB80 for 3400..4DB5, FB40 for 4E00..9FA5 and FBC0 for
#          everything else
# ws and wd are select-list aliases, so the mismatch filter is written in
# HAVING.  Only mismatching rows are returned; the recorded result is empty.
SELECT
51+
HEX(code),
52+
HEX(WEIGHT_STRING(str COLLATE utf8mb4_unicode_ci)) AS ws,
53+
CASE
54+
WHEN code >= 0x10000 THEN 'FFFD'
55+
WHEN code >= 0x3400 AND code <= 0x4DB5 THEN
56+
CONCAT(LPAD(HEX(0xFB80 + (code >> 15)),4,'0'),
57+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
58+
WHEN code >= 0x4E00 AND code <= 0x9FA5 THEN
59+
CONCAT(LPAD(HEX(0xFB40 + (code >> 15)),4,'0'),
60+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
61+
ELSE
62+
CONCAT(LPAD(HEX(0xFBC0 + (code >> 15)),4,'0'),
63+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
64+
END AS wd
65+
FROM allchars
66+
LEFT OUTER JOIN allkeys USING (str)
67+
WHERE allkeys.str IS NULL
68+
HAVING ws<>wd
69+
ORDER BY HEX(str);
70+
71+
# Cleanup: drop the two tables created by this test and the allchars table
# created by include/ctype_unicode_allchars.inc.
DROP TABLE allkeys_txt;
72+
DROP TABLE allkeys;
73+
DROP TABLE allchars;
74+
75+
--echo #
76+
--echo # End of 10.8 tests
77+
--echo #
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#
2+
# Start of 10.8 tests
3+
#
4+
SET NAMES utf8mb4 COLLATE utf8mb4_bin;
5+
CREATE TABLE allchars AS SELECT 1 AS code, ' ' AS str LIMIT 0;
6+
SHOW CREATE TABLE allchars;
7+
Table Create Table
8+
allchars CREATE TABLE `allchars` (
9+
`code` int(1) NOT NULL,
10+
`str` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL
11+
) ENGINE=MyISAM DEFAULT CHARSET=latin1
12+
CREATE TABLE t1tmp (a INT NOT NULL);
13+
FOR i IN 0..0xFFF
14+
DO
15+
INSERT INTO t1tmp VALUES (i);
16+
END FOR;
17+
$$
18+
INSERT INTO allchars SELECT
19+
t1.a*0x1000+t2.a,
20+
CHAR(t1.a*0x1000+t2.a USING utf32)
21+
FROM t1tmp t1, t1tmp t2
22+
WHERE t1.a BETWEEN 0 AND 0x10F;
23+
DROP TABLE t1tmp;
24+
SELECT COUNT(*) FROM allchars;
25+
COUNT(*)
26+
1114112
27+
CREATE TABLE allkeys_txt (a TEXT, b TEXT, c TEXT) ENGINE=MyISAM;
28+
LOAD DATA INFILE '../../std_data/unicode/allkeys520.txt'
29+
INTO TABLE allkeys_txt FIELDS TERMINATED BY ';' (@a,@b,@qq)
30+
SET a=TRIM(@a), b=TRIM(REGEXP_SUBSTR(@b,'^[^#]*')), c=TRIM(REGEXP_SUBSTR(@b, '#.*$'));
31+
CREATE TABLE allkeys AS
32+
SELECT
33+
a,
34+
CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_bin AS str,
35+
HEX(WEIGHT_STRING(CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_unicode_520_ci)) as ws,
36+
REPLACE(REPLACE(REGEXP_REPLACE(b,'[[][.*](....)[.]....[.]....[.].{4,5}]','-\\1-'),'-0000-',''),'-','') AS wd,
37+
c
38+
FROM allkeys_txt
39+
WHERE a RLIKE '^[0-9A-Z]';
40+
ALTER TABLE allkeys ADD KEY(str(3));
41+
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE a NOT LIKE '% %';
42+
COUNT(*) SUM(ws<>wd)
43+
21807 1
44+
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND a NOT LIKE '% %';
45+
a ws wd
46+
FDFA 18FC192B194F020A18AD192B192B193D 18FC192B194F020A18AD192B192B193D020A1904192B1950193D020A194318F1192B1931
47+
SELECT
48+
HEX(code),
49+
HEX(WEIGHT_STRING(str COLLATE utf8mb4_unicode_520_ci)) AS ws,
50+
CASE
51+
WHEN code >= 0x3400 AND code <= 0x4DB5 THEN
52+
CONCAT(LPAD(HEX(0xFB80 + (code >> 15)),4,'0'),
53+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
54+
WHEN code >= 0x4E00 AND code <= 0x9FA5 THEN
55+
CONCAT(LPAD(HEX(0xFB40 + (code >> 15)),4,'0'),
56+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
57+
ELSE
58+
CONCAT(LPAD(HEX(0xFBC0 + (code >> 15)),4,'0'),
59+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
60+
END AS wd
61+
FROM allchars
62+
LEFT OUTER JOIN allkeys USING (str)
63+
WHERE allkeys.str IS NULL
64+
HAVING ws<>wd
65+
ORDER BY HEX(str);
66+
HEX(code) ws wd
67+
DROP TABLE allkeys_txt;
68+
DROP TABLE allkeys;
69+
DROP TABLE allchars;
70+
#
71+
# End of 10.8 tests
72+
#
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
--source include/have_utf32.inc
2+
--source include/have_utf8mb4.inc
3+
4+
--echo #
5+
--echo # Start of 10.8 tests
6+
--echo #
7+
8+
9+
# Set the connection character set/collation before sourcing the include:
# the include creates allchars.str from a string literal and, per the
# recorded result, the column ends up as utf8mb4 / utf8mb4_bin.
SET NAMES utf8mb4 COLLATE utf8mb4_bin;
10+
--source include/ctype_unicode_allchars.inc
11+
12+
#
13+
# Load allkeys.txt from Unicode-5.2.0
14+
#
15+
# The 5.2.0 file has four weight levels and an optional extra field
16+
# after the character name, e.g. "; QQK"
17+
#00A0 ; [*020A.0020.001B.00A0] # NO-BREAK SPACE; QQK
18+
#
19+
20+
# Stage the raw file.  Each input line is split on ';' into three user
# variables: @a (the code point field), @b (the collation elements plus the
# trailing '#' comment) and @qq (the optional extra field; read but not
# stored).  The table columns are then filled as:
#   a = @a, trimmed
#   b = the part of @b before the first '#', trimmed
#   c = the part of @b from the first '#' to the end, trimmed
CREATE TABLE allkeys_txt (a TEXT, b TEXT, c TEXT) ENGINE=MyISAM;
21+
LOAD DATA INFILE '../../std_data/unicode/allkeys520.txt'
22+
INTO TABLE allkeys_txt FIELDS TERMINATED BY ';' (@a,@b,@qq)
23+
SET a=TRIM(@a), b=TRIM(REGEXP_SUBSTR(@b,'^[^#]*')), c=TRIM(REGEXP_SUBSTR(@b, '#.*$'));
24+
# Derive the comparison table from the staged lines.  Only lines whose first
# field starts with a digit or an upper-case letter are kept (WHERE a RLIKE).
#   a   - the code point field as read from the file
#   str - the character(s) themselves: every 4-digit hex code is left-padded
#         with 0000 and every 5-digit code with 000 (8 hex digits each), the
#         spaces and '-' markers are removed, and the UNHEX-ed bytes are
#         read as utf32 and converted to utf8mb4 (utf8mb4_bin)
#   ws  - HEX of the weight string the server produces for that same string
#         under utf8mb4_unicode_520_ci
#   wd  - the weights expected from the file: the first 4-character field of
#         every [....] collation element in b, concatenated, with 0000
#         fields dropped
#   c   - the '#' comment part of the line
# The statement text is echoed into the .result file, so the str expression
# is repeated verbatim inside ws rather than factored out.
# NOTE(review): the KEY(str(3)) prefix index is presumably there to speed up
# the later join on str - confirm.
CREATE TABLE allkeys AS
25+
SELECT
26+
a,
27+
CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_bin AS str,
28+
HEX(WEIGHT_STRING(CONVERT(CAST(UNHEX(regexp_replace(regexp_replace(regexp_replace(a,'(\\b[0-9A-Z]{4}\\b)','-0000\\1-'),'(\\b[0-9A-Z]{5}\\b)','-000\\1-'),'[ -]','')) AS CHAR CHARACTER SET utf32) USING utf8mb4) COLLATE utf8mb4_unicode_520_ci)) as ws,
29+
REPLACE(REPLACE(REGEXP_REPLACE(b,'[[][.*](....)[.]....[.]....[.].{4,5}]','-\\1-'),'-0000-',''),'-','') AS wd,
30+
c
31+
FROM allkeys_txt
32+
WHERE a RLIKE '^[0-9A-Z]';
33+
ALTER TABLE allkeys ADD KEY(str(3));
34+
35+
#
36+
# Test explicit weights
37+
# Built-in default contractions are not supported.
38+
# The (NOT LIKE '% %') part of the condition filters out contractions.
39+
40+
# First query: number of checked single-code-point entries and number of
# entries whose server weights (ws) differ from the weights listed in the
# file (wd).  Second query: list those mismatches.
# Per the recorded result there is exactly one, FDFA: the server's weight
# string equals the first 8 of the 18 weights listed in the file.
SELECT COUNT(*), SUM(ws<>wd) FROM allkeys WHERE a NOT LIKE '% %';
41+
SELECT a, ws, wd FROM allkeys WHERE ws<>wd AND a NOT LIKE '% %';
42+
43+
44+
#
45+
# Test implicit weights
46+
# Non-BMP characters are not collapsed to FFFD here; they use the generic FBC0-based formula.
47+
#
48+
49+
# Check every code point of allchars that has no entry in allkeys
# (anti-join: LEFT OUTER JOIN ... WHERE allkeys.str IS NULL).
#   ws - HEX of the weight string the server produces under
#        utf8mb4_unicode_520_ci
#   wd - the expected implicit weight, two 4-hex-digit weights:
#          BASE + (code >> 15)  followed by  0x8000 | (code & 0x7FFF)
#        with BASE = FB80 for 3400..4DB5, FB40 for 4E00..9FA5 and FBC0 for
#        everything else.  There is no separate branch for code >= 0x10000,
#        so non-BMP code points are checked against the FBC0 formula too.
# ws and wd are select-list aliases, so the mismatch filter is written in
# HAVING.  Only mismatching rows are returned; the recorded result is empty.
SELECT
50+
HEX(code),
51+
HEX(WEIGHT_STRING(str COLLATE utf8mb4_unicode_520_ci)) AS ws,
52+
CASE
53+
WHEN code >= 0x3400 AND code <= 0x4DB5 THEN
54+
CONCAT(LPAD(HEX(0xFB80 + (code >> 15)),4,'0'),
55+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
56+
WHEN code >= 0x4E00 AND code <= 0x9FA5 THEN
57+
CONCAT(LPAD(HEX(0xFB40 + (code >> 15)),4,'0'),
58+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
59+
ELSE
60+
CONCAT(LPAD(HEX(0xFBC0 + (code >> 15)),4,'0'),
61+
LPAD(HEX(0x8000 | (code & 0x7FFF)),4,'0'))
62+
END AS wd
63+
FROM allchars
64+
LEFT OUTER JOIN allkeys USING (str)
65+
WHERE allkeys.str IS NULL
66+
HAVING ws<>wd
67+
ORDER BY HEX(str);
68+
69+
# Cleanup: drop the two tables created by this test and the allchars table
# created by include/ctype_unicode_allchars.inc.
DROP TABLE allkeys_txt;
70+
DROP TABLE allkeys;
71+
DROP TABLE allchars;
72+
73+
--echo #
74+
--echo # End of 10.8 tests
75+
--echo #

0 commit comments

Comments
 (0)