Skip to content

Commit 00917fa

Browse files
author
Alexander Barkov
committed
MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters
- Moving the new my_charlen()-based code handling multi-byte characters from READ_INFO::read_field() to a new method READ_INFO::read_mbtail() - Reusing read_mbtail() in READ_INFO::read_value(), instead of the old my_mbcharlen()-based code which did not catch broken byte sequences
1 parent d516a2a commit 00917fa

File tree

4 files changed

+91
-47
lines changed

4 files changed

+91
-47
lines changed

mysql-test/r/ctype_utf8.result

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10448,5 +10448,16 @@ HEX(a)
1044810448
3F3F3F
1044910449
DROP TABLE t1;
1045010450
#
10451+
# MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters
10452+
#
10453+
CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
10454+
LOAD XML INFILE '../../std_data/loaddata/mdev9874.xml' INTO TABLE t1 CHARACTER SET utf8 ROWS IDENTIFIED BY '<row>';
10455+
Warnings:
10456+
Warning 1366 Incorrect string value: '\xD0' for column 'a' at row 1
10457+
SELECT HEX(a) FROM t1;
10458+
HEX(a)
10459+
613F
10460+
DROP TABLE t1;
10461+
#
1045110462
# End of 10.2 tests
1045210463
#
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<table><row><a>aĐ</a></row></table>

mysql-test/t/ctype_utf8.test

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1974,6 +1974,14 @@ LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CH
19741974
SELECT HEX(a) FROM t1;
19751975
DROP TABLE t1;
19761976

1977+
--echo #
1978+
--echo # MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters
1979+
--echo #
1980+
CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
1981+
LOAD XML INFILE '../../std_data/loaddata/mdev9874.xml' INTO TABLE t1 CHARACTER SET utf8 ROWS IDENTIFIED BY '<row>';
1982+
SELECT HEX(a) FROM t1;
1983+
DROP TABLE t1;
1984+
19771985
--echo #
19781986
--echo # End of 10.2 tests
19791987
--echo #

sql/sql_load.cc

Lines changed: 71 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,70 @@ class READ_INFO {
119119
*to= chr;
120120
return false;
121121
}
122+
123+
/**
124+
Read a tail of a multi-byte character.
125+
The first byte of the character is assumed to be already
126+
read from the file and appended to "str".
127+
128+
@returns true - if EOF happened unexpectedly
129+
@returns false - no EOF happened: found a good multi-byte character,
130+
or a bad byte sequence
131+
132+
Note:
133+
The return value depends only on EOF:
134+
- read_mbtail() returns "false" if a good character was read, but also
135+
- read_mbtail() returns "false" if an incomplete byte sequence was found
136+
and no EOF happened.
137+
138+
For example, suppose we have an ujis file with bytes 0x8FA10A, where:
139+
- 0x8FA1 is an incomplete prefix of a 3-byte character
140+
(it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
141+
- 0x0A is a line demiliter
142+
This file has some broken data, the trailing [A1-FE] is missing.
143+
144+
In this example it works as follows:
145+
- 0x8F is read from the file and put into "data" before the call
146+
for read_mbtail()
147+
- 0xA1 is read from the file and put into "data" by read_mbtail()
148+
- 0x0A is kept in the read queue, so the next read iteration after
149+
the current read_mbtail() call will normally find it and recognize as
150+
a line delimiter
151+
- the current call for read_mbtail() returns "false",
152+
because no EOF happened
153+
*/
154+
bool read_mbtail(String *str)
155+
{
156+
int chlen;
157+
// Classify the last byte of "str" (the character's first byte) on its own.
if ((chlen= my_charlen(read_charset, str->end() - 1, str->end())) == 1)
158+
return false; // Single byte character found
159+
// length0 is the offset in "str" of the character's first byte.
// Keep pulling bytes while the sequence collected so far is reported
// as too short; the loop is skipped entirely when chlen is anything else.
for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
160+
{
161+
int chr= GET;
162+
if (chr == my_b_EOF)
163+
{
164+
DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
165+
return true; // EOF
166+
}
167+
str->append(chr);
168+
// Re-check the whole candidate character, from its first byte to the
// byte just appended.
chlen= my_charlen(read_charset, str->ptr() + length0, str->end());
169+
if (chlen == MY_CS_ILSEQ)
170+
{
171+
/**
172+
It has been an incomplete (but a valid) sequence so far,
173+
but the last byte turned it into a bad byte sequence.
174+
Unget the very last byte.
175+
*/
176+
// Drop the offending byte from "str" and hand it back via PUSH so that
// a later GET can see it again.
str->length(str->length() - 1);
177+
PUSH(chr);
178+
DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
179+
return false; // Bad byte sequence
180+
}
181+
}
182+
// NOTE(review): this point is also reached without entering the loop when
// the initial my_charlen() result was neither 1 nor "too small" (presumably
// an illegal first byte), so chlen is not necessarily a valid length here
// -- confirm against my_charlen() semantics.
DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
183+
return false; // Good multi-byte character
184+
}
185+
122186
public:
123187
bool error,line_cuted,found_null,enclosed;
124188
uchar *row_start, /* Found row starts here */
@@ -1590,33 +1654,8 @@ int READ_INFO::read_field()
15901654
}
15911655
}
15921656
data.append(chr);
1593-
if (use_mb(read_charset))
1594-
{
1595-
int chlen;
1596-
if ((chlen= my_charlen(read_charset, data.end() - 1,
1597-
data.end())) != 1)
1598-
{
1599-
for (uint32 length0= data.length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
1600-
{
1601-
chr= GET;
1602-
if (chr == my_b_EOF)
1603-
goto found_eof;
1604-
data.append(chr);
1605-
chlen= my_charlen(read_charset, data.ptr() + length0, data.end());
1606-
if (chlen == MY_CS_ILSEQ)
1607-
{
1608-
/**
1609-
It has been an incomplete (but a valid) sequence so far,
1610-
but the last byte turned it into a bad byte sequence.
1611-
Unget the very last byte.
1612-
*/
1613-
data.length(data.length() - 1);
1614-
PUSH(chr);
1615-
break;
1616-
}
1617-
}
1618-
}
1619-
}
1657+
if (use_mb(read_charset) && read_mbtail(&data))
1658+
goto found_eof;
16201659
}
16211660
/*
16221661
** We come here if buffer is too small. Enlarge it and continue
@@ -1872,26 +1911,8 @@ int READ_INFO::read_value(int delim, String *val)
18721911
int chr;
18731912
String tmp;
18741913

1875-
for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF;)
1914+
for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF; chr= GET)
18761915
{
1877-
#ifdef USE_MB
1878-
if (my_mbcharlen(read_charset, chr) > 1)
1879-
{
1880-
DBUG_PRINT("read_xml",("multi byte"));
1881-
int i, ml= my_mbcharlen(read_charset, chr);
1882-
for (i= 1; i < ml; i++)
1883-
{
1884-
val->append(chr);
1885-
/*
1886-
Don't use my_tospace() in the middle of a multi-byte character
1887-
TODO: check that the multi-byte sequence is valid.
1888-
*/
1889-
chr= GET;
1890-
if (chr == my_b_EOF)
1891-
return chr;
1892-
}
1893-
}
1894-
#endif
18951916
if(chr == '&')
18961917
{
18971918
tmp.length(0);
@@ -1911,8 +1932,11 @@ int READ_INFO::read_value(int delim, String *val)
19111932
}
19121933
}
19131934
else
1935+
{
19141936
val->append(chr);
1915-
chr= GET;
1937+
if (use_mb(read_charset) && read_mbtail(val))
1938+
return my_b_EOF;
1939+
}
19161940
}
19171941
return my_tospace(chr);
19181942
}

0 commit comments

Comments
 (0)