Skip to content

Commit 3fc6a8b

Browse files
author
Alexander Barkov
committed
MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
1 parent 1d73005 commit 3fc6a8b

File tree

9 files changed

+165
-21
lines changed

9 files changed

+165
-21
lines changed

include/m_ctype.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
180180
/* A helper macro for "need at least n bytes" */
181181
#define MY_CS_TOOSMALLN(n) (-100-(n))
182182

183+
#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */
184+
/*
  True when rc lies in the inclusive range [MY_CS_TOOSMALL6, MY_CS_TOOSMALL].
  NOTE(review): presumably these bounds are the "need at least n bytes"
  return codes for n = 6 down to the plain "too small" code - confirm
  against their definitions, which are not visible here.
  rc is evaluated twice, so do not pass an expression with side effects.
*/
#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
185+
186+
183187
#define MY_SEQ_INTTAIL 1
184188
#define MY_SEQ_SPACES 2
185189

mysql-test/r/ctype_gbk.result

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5926,3 +5926,24 @@ Warning 1300 Invalid gb2312 character string: '\xA3A'
59265926
#
59275927
# End of 10.1 tests
59285928
#
5929+
#
5930+
# Start of 10.2 tests
5931+
#
5932+
#
5933+
# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
5934+
#
5935+
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
5936+
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
5937+
SELECT HEX(a) FROM t1;
5938+
HEX(a)
5939+
B04061B041
5940+
B042
5941+
DELETE FROM t1;
5942+
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
5943+
SELECT HEX(a) FROM t1;
5944+
HEX(a)
5945+
B042
5946+
DROP TABLE t1;
5947+
#
5948+
# End of 10.2 tests
5949+
#

mysql-test/r/ctype_utf8.result

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default;
1040110401
#
1040210402
# End of 10.1 tests
1040310403
#
10404+
#
10405+
# Start of 10.2 tests
10406+
#
10407+
#
10408+
# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
10409+
#
10410+
CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
10411+
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
10412+
Warnings:
10413+
Warning 1638 Non-ASCII separator arguments are not fully supported
10414+
SELECT c1 FROM t1;
10415+
c1
10416+
a
10417+
b
10418+
c
10419+
DELETE FROM t1;
10420+
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
10421+
Warnings:
10422+
Warning 1638 Non-ASCII separator arguments are not fully supported
10423+
SELECT c1 FROM t1;
10424+
c1
10425+
b
10426+
c
10427+
DROP TABLE t1;
10428+
#
10429+
# End of 10.2 tests
10430+
#
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�@a�A@�B@
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
aёёbёёcёё

mysql-test/t/ctype_gbk.test

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8));
435435
--echo #
436436
--echo # End of 10.1 tests
437437
--echo #
438+
439+
--echo #
440+
--echo # Start of 10.2 tests
441+
--echo #
442+
443+
--echo #
444+
--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
445+
--echo #
446+
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk);
447+
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@';
448+
SELECT HEX(a) FROM t1;
449+
DELETE FROM t1;
450+
LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES;
451+
SELECT HEX(a) FROM t1;
452+
DROP TABLE t1;
453+
454+
--echo #
455+
--echo # End of 10.2 tests
456+
--echo #

mysql-test/t/ctype_utf8.test

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default;
19501950
--echo #
19511951
--echo # End of 10.1 tests
19521952
--echo #
1953+
1954+
--echo #
1955+
--echo # Start of 10.2 tests
1956+
--echo #
1957+
1958+
--echo #
1959+
--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
1960+
--echo #
1961+
CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8);
1962+
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё';
1963+
SELECT c1 FROM t1;
1964+
DELETE FROM t1;
1965+
LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ёё' IGNORE 1 LINES;
1966+
SELECT c1 FROM t1;
1967+
DROP TABLE t1;
1968+
1969+
--echo #
1970+
--echo # End of 10.2 tests
1971+
--echo #

mysys/charset.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,7 @@ static void init_available_charsets(void)
545545
{
546546
if (*cs)
547547
{
548+
DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
548549
if (cs[0]->ctype)
549550
if (init_state_maps(*cs))
550551
*cs= NULL;

sql/sql_load.cc

Lines changed: 72 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ class READ_INFO {
7979
NET *io_net;
8080
int level; /* for load xml */
8181

82+
/*
  Fetch one byte of input through GET and hand it back to the caller.

  @param to  [out] Receives the byte that was read. It is not written
                   when the end of input is hit.

  @return false  A byte was read and stored in *to.
  @return true   GET produced my_b_EOF; the eof member has been set to true.
*/
bool getbyte(char *to)
83+
{
84+
int chr= GET;
85+
if (chr == my_b_EOF)
86+
return (eof= true); // Remember the EOF condition and report it in one step
87+
*to= chr; // The int result of GET is narrowed to a char here
88+
return false;
89+
}
8290
public:
8391
bool error,line_cuted,found_null,enclosed;
8492
uchar *row_start, /* Found row starts here */
@@ -1706,33 +1714,76 @@ int READ_INFO::next_line()
17061714
return 0; // No lines
17071715
for (;;)
17081716
{
1709-
int chr = GET;
1710-
#ifdef USE_MB
1711-
if (my_mbcharlen(read_charset, chr) > 1)
1712-
{
1713-
for (uint i=1;
1714-
chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
1715-
i++)
1716-
chr = GET;
1717-
if (chr == escape_char)
1718-
continue;
1719-
}
1720-
#endif
1721-
if (chr == my_b_EOF)
1722-
{
1723-
eof=1;
1724-
return 1;
1717+
int chlen;
1718+
char buf[MY_CS_MBMAXLEN];
1719+
1720+
if (getbyte(&buf[0]))
1721+
return 1; // EOF
1722+
1723+
if (use_mb(read_charset) &&
1724+
(chlen= my_charlen(read_charset, buf, buf + 1)) != 1)
1725+
{
1726+
uint i;
1727+
for (i= 1; MY_CS_IS_TOOSMALL(chlen); )
1728+
{
1729+
DBUG_ASSERT(i < sizeof(buf));
1730+
DBUG_ASSERT(chlen != 1);
1731+
if (getbyte(&buf[i++]))
1732+
return 1; // EOF
1733+
chlen= my_charlen(read_charset, buf, buf + i);
1734+
}
1735+
1736+
/*
1737+
Either a complete multi-byte sequence,
1738+
or a broken byte sequence was found.
1739+
Check if the sequence is a prefix of the "LINES TERMINATED BY" string.
1740+
*/
1741+
if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
1742+
!memcmp(buf, line_term_ptr, i))
1743+
{
1744+
if (line_term_length == i)
1745+
{
1746+
/*
1747+
We found a "LINES TERMINATED BY" string that consists
1748+
of a single multi-byte character.
1749+
*/
1750+
return 0;
1751+
}
1752+
/*
1753+
buf[] is a prefix of "LINES TERMINATED BY".
1754+
Now check the suffix. Length of the suffix of line_term_ptr
1755+
that still needs to be checked is (line_term_length - i).
1756+
Note, READ_INFO::terminator() assumes that the leftmost byte of the
1757+
argument is already scanned from the file and is checked to
1758+
be a known prefix (e.g. against line_term_char).
1759+
So we need to pass one extra byte.
1760+
*/
1761+
if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
1762+
return 0;
1763+
}
1764+
/*
1765+
Here we have a good multi-byte sequence or a broken byte sequence,
1766+
and the sequence is not equal to "LINES TERMINATED BY".
1767+
No need to check for escape_char, because:
1768+
- multi-byte escape characters in "FIELDS ESCAPED BY" are not
1769+
supported and are rejected at parse time.
1770+
- broken single-byte sequences are not recognized as escapes,
1771+
they are considered to be a part of the data and are converted to
1772+
question marks.
1773+
*/
1774+
line_cuted= true;
1775+
continue;
17251776
}
1726-
if (chr == escape_char)
1777+
if (buf[0] == escape_char)
17271778
{
1728-
line_cuted=1;
1779+
line_cuted= true;
17291780
if (GET == my_b_EOF)
1730-
return 1;
1781+
return 1;
17311782
continue;
17321783
}
1733-
if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
1784+
if (buf[0] == line_term_char && terminator(line_term_ptr,line_term_length))
17341785
return 0;
1735-
line_cuted=1;
1786+
line_cuted= true;
17361787
}
17371788
}
17381789

0 commit comments

Comments
 (0)