Skip to content

Commit 6a576f1

Browse files
author
Alexander Barkov
committed
Adding thorough tests covering what happens with escaped sequences
in the SQL parser. Various backslash escapes and quote-quote escaped sequences are covered in combination with single and multi-byte characters. This is especially important for the character sets that can have 0x5C as the second byte in a multi-byte character (big5, cp932, gbk, sjis). swe7 is also a special character set, because in swe7 0x5C is used for both escape character and for "LATIN CAPITAL LETTER O WITH DIAERESIS".
1 parent d2ae40a commit 6a576f1

17 files changed

+21909
-0
lines changed

mysql-test/include/bytes.inc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#
2+
# Create a table with all byte values
3+
#
4+
# Scratch table holding the 16 possible half-byte (nibble) values 0x0..0xF.
CREATE TABLE halfs (a INT);
5+
INSERT INTO halfs VALUES (0x00),(0x01),(0x02),(0x03),(0x04),(0x05),(0x06),(0x07);
6+
INSERT INTO halfs VALUES (0x08),(0x09),(0x0A),(0x0B),(0x0C),(0x0D),(0x0E),(0x0F);
7+
# Result table: one BINARY(1) row per byte value, with an index on "a".
CREATE TEMPORARY TABLE bytes (a BINARY(1), KEY(a)) ENGINE=MyISAM;
8+
# Cross join of "halfs" with itself: (high nibble << 4) | low nibble
# enumerates all 16*16 = 256 byte values, each turned into a one-byte
# binary string by CHAR(... USING BINARY).
INSERT INTO bytes SELECT CHAR((t1.a << 4) | t2.a USING BINARY) FROM halfs t1, halfs t2;
9+
# Drop the scratch table; only the temporary table "bytes" is left behind.
DROP TABLE halfs;

mysql-test/include/bytes2.inc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#
2+
# Create a table with all 2 byte sequence values
3+
#
4+
5+
# Creates the temporary table "bytes" (one row per single-byte value).
--source include/bytes.inc
6+
7+
CREATE TABLE halfs (a BINARY(1));
8+
# "bytes" is a temporary table, and the INSERT ... SELECT below joins its
# source table with itself (aliases t1 and t2).
# NOTE(review): presumably a TEMPORARY table cannot be referenced twice
# in one query on the target server - confirm.
9+
# Create a non-temporary copy to use in that self join.
10+
INSERT INTO halfs SELECT * FROM bytes;
11+
# Result table:
#   a  - the two-byte sequence, CONCAT(hi, lo)
#   hi - the first byte
#   lo - the second byte
CREATE TEMPORARY TABLE bytes2 (
12+
a BINARY(2),
13+
hi BINARY(1),
14+
lo BINARY(1),
15+
KEY(a),
16+
KEY(lo)
17+
) ENGINE=MyISAM;
18+
# Cross join of the copy with itself: every (hi, lo) pair,
# selected in (hi, lo) byte order.
INSERT INTO bytes2
19+
SELECT CONCAT(t1.a, t2.a), t1.a, t2.a FROM halfs t1, halfs t2
20+
ORDER BY t1.a, t2.a;
21+
# Drop the non-temporary copy; "bytes" and "bytes2" are left behind.
DROP TABLE halfs;

mysql-test/include/ctype_unescape.inc

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
--echo # Start of ctype_unescape.inc
2+
3+
#
4+
# Testing how string literals with backslash and quote-quote are unescaped.
5+
# The tests assume that single quote (') is used as a delimiter.
6+
#
7+
8+
#
9+
# Make sure that the parser really works using the character set we need.
10+
# We use binary strings to compose strings, to be able to test malformed
11+
# sequences, which are possible as a result of mysql_real_escape_string().
12+
# The important thing for this test is to make the parser unescape using
13+
# the client character set, rather than binary. Currently it works exactly
14+
# that way by default, so the query below should return @@character_set_client
15+
#
16+
SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
17+
PREPARE stmt FROM @query;
18+
EXECUTE stmt;
19+
DEALLOCATE PREPARE stmt;
20+
21+
let $CHARSET=`SELECT @@character_set_connection`;
22+
23+
CREATE TABLE allbytes (a VARBINARY(10));
24+
25+
#
26+
# Create various byte sequences to test. Testing the full bunch of
27+
# possible combinations takes about 2 minutes. So this test provides
28+
# variants to run with:
29+
# - the full set of possible combinations
30+
# - a reduced test of combinations for selected bytes only
31+
#
32+
33+
# Create selected byte combinations
34+
if ($ctype_unescape_combinations == 'selected')
35+
{
36+
--echo # Using selected bytes combinations
37+
--source include/bytes.inc
38+
#
39+
# Populate "selected_bytes" with bytes that have a special meaning.
40+
# We'll use "selected_bytes" to generate byte sequences,
41+
# instead of the full possible byte combinations, to reduce test time.
42+
#
43+
CREATE TABLE selected_bytes (a VARBINARY(10));
44+
45+
# Bytes that have a special meaning in all character sets:
46+
# 0x00 - mysql_real_escape_string() quotes this to '\0'
47+
# 0x0D - mysql_real_escape_string() quotes this to '\r'
48+
# 0x0A - mysql_real_escape_string() quotes this to '\n'
49+
# 0x1A - mysql_real_escape_string() quotes this to '\Z'
50+
# 0x08 - mysql_real_escape_string() does not quote this,
51+
# but '\b' is unescaped to 0x08.
52+
# 0x09 - mysql_real_escape_string() does not quote this,
53+
# but '\t' is unescaped to 0x09.
54+
# 0x30 - '0', as in '\0'
55+
# 0x5A - 'Z', as in '\Z'
56+
# 0x62 - 'b', as in '\b'
57+
# 0x6E - 'n', as in '\n'
58+
# 0x72 - 'r', as in '\r'
59+
# 0x74 - 't', as in '\t'
60+
61+
INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
62+
INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
63+
64+
# 0x22 - double quote
65+
# 0x25 - percent sign, '\%' is preserved as is for LIKE.
66+
# 0x27 - single quote
67+
# 0x5C - backslash
68+
# 0x5F - underscore, '\_' is preserved as is for LIKE.
69+
INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
70+
71+
# Some bytes do not have any special meaning, for example basic Latin letters.
72+
# Let's add one; that should be enough for good coverage.
73+
INSERT INTO selected_bytes (a) VALUES ('a');
74+
75+
#
76+
# This map summarizes bytes that have a special
77+
# meaning in various character sets:
78+
#
79+
# MBHEAD MBTAIL NONASCII-8BIT BAD
80+
# ------ ------ -------------- ----------
81+
# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF]
82+
# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
83+
# gbk: [81..FE] [40..7E,80..FE] N/A [FF]
84+
# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
85+
# swe7: N/A N/A [5B..5E,7B..7E] [80..FF]
86+
#
87+
88+
INSERT INTO selected_bytes (a) VALUES
89+
(0x3F), # 7bit
90+
(0x40), # 7bit mbtail
91+
(0x7E), # 7bit mbtail nonascii-8bit
92+
(0x7F), # 7bit nonascii-8bit
93+
(0x80), # mbtail bad-mb
94+
(0x81), # mbhead mbtail
95+
(0x9F), # mbhead mbtail bad-mb
96+
(0xA0), # mbhead mbtail bad-mb
97+
(0xA1), # mbhead mbtail nonascii-8bit
98+
(0xE0), # mbhead mbtai
99+
(0xEF), # mbhead mbtail
100+
(0xF9), # mbhead mbtail
101+
(0xFA), # mbhead mbtail bad-mb
102+
(0xFC), # mbhead mbtail bad-mb
103+
(0xFD), # mbhead mbtail bad-mb
104+
(0xFE), # mbhead mbtial bad-mb
105+
(0xFF); # bad-mb
106+
107+
#
108+
# Now populate the test table
109+
#
110+
111+
# Use all single bytes, this is cheap, there are only 256 values.
112+
INSERT INTO allbytes (a) SELECT a FROM bytes;
113+
114+
# Add selected bytes combinations
115+
INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
116+
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
117+
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
118+
DROP TABLE selected_bytes;
119+
120+
# Delete all non-single byte sequences that do not have
121+
# backslashes or quotes at all. There is nothing special with these strings.
122+
DELETE FROM allbytes WHERE
123+
OCTET_LENGTH(a)>1 AND
124+
LOCATE(0x5C,a)=0 AND
125+
a NOT LIKE '%\'%' AND
126+
a NOT LIKE '%"%';
127+
128+
}
129+
130+
if ($ctype_unescape_combinations=='')
131+
{
132+
--echo # Using full byte combinations
133+
--source include/bytes2.inc
134+
INSERT INTO allbytes (a) SELECT a FROM bytes;
135+
INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
136+
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
137+
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
138+
}
139+
140+
141+
DELIMITER //;
142+
143+
#
144+
# A procedure that makes an SQL query using 'val' as a string literal.
145+
# The result of the query execution is written into the table 't1':
#   t1.a - the original bytes, passed as the hex literal 0x<HEX(val)>,
#          so they do not go through string literal unescaping;
#   t1.b - 'val' spliced as is between single quotes, i.e. whatever
#          the parser makes of these bytes as a string literal.
146+
# NULL in t1.b means that query failed due to syntax error,
147+
# typically because of mis-interpreted closing quote delimiter.
# That row is inserted by the EXIT handler for SQLSTATE '42000'.
148+
#
149+
CREATE PROCEDURE p1(val VARBINARY(10))
150+
BEGIN
151+
DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
152+
SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
153+
PREPARE stmt FROM @query;
154+
EXECUTE stmt;
155+
DEALLOCATE PREPARE stmt;
156+
END//
157+
158+
#
159+
# A procedure that iterates through all records in "allbytes"
160+
# and runs p1() for every record.
# The loop ends when FETCH finds no more rows: the NOT FOUND handler
# sets 'done', which is tested right after every FETCH.
161+
#
162+
CREATE PROCEDURE p2()
163+
BEGIN
164+
DECLARE val VARBINARY(10);
165+
DECLARE done INT DEFAULT FALSE;
166+
DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
167+
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
168+
OPEN stmt;
169+
read_loop1: LOOP
170+
FETCH stmt INTO val;
171+
IF done THEN
172+
LEAVE read_loop1;
173+
END IF;
174+
CALL p1(val);
175+
END LOOP;
176+
CLOSE stmt;
177+
END//
178+
179+
180+
# A function that converts the value from binary to $CHARSET
181+
# and checks if it has changed. CONVERT() fixes malformed strings.
182+
# So if the string changes in CONVERT(), it means it was not wellformed.
# Returns the result of the comparison a = BINARY CONVERT(a USING $CHARSET).
# $CHARSET is the mysqltest variable set from @@character_set_connection
# earlier in this file; --eval is used so that it is expanded in the statement.
183+
--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
184+
185+
#
186+
# A function that approximately reproduces how the SQL parser
187+
# would unescape a binary string.
#
# @param a - the bytes as they were put between the quotes of a literal
# Returns the bytes expected after unescaping.
#
# It works in three passes:
# 1. Every backslash followed by a character other than _ and %,
#    and every quote-quote pair, is wrapped into '{{...}}'.
# 2. The wrapped standard sequences (\0 \b \t \r \n \Z \' and quote-quote)
#    are replaced with the bytes they stand for.
# 3. Every remaining wrapped backslash+x pair is replaced with just x.
#
# NOTE(review): the comments inside the body below are part of the
# CREATE FUNCTION statement text; presumably they are echoed into the
# recorded .result files - confirm before editing them.
188+
#
189+
CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
190+
BEGIN
191+
# We need to do it in a way to avoid producing new escape sequences
192+
# First, enclose all known escsape sequences to '{{xx}}'
193+
# - Backslash not followed by a LIKE pattern characters _ and %
194+
# - Double escapes
195+
# This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
196+
# So '\\1' in the last argument always means the match, no matter
197+
# which alternative it came from.
198+
SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
199+
# Now unescape all enclosed standard escape sequences
200+
SET a=REPLACE(a,'{{\\0}}', '\0');
201+
SET a=REPLACE(a,'{{\\b}}', '\b');
202+
SET a=REPLACE(a,'{{\\t}}', '\t');
203+
SET a=REPLACE(a,'{{\\r}}', '\r');
204+
SET a=REPLACE(a,'{{\\n}}', '\n');
205+
SET a=REPLACE(a,'{{\\Z}}', '\Z');
206+
SET a=REPLACE(a,'{{\\\'}}', '\'');
207+
# Unescape double quotes
208+
SET a=REPLACE(a,'{{\'\'}}', '\'');
209+
# Unescape the rest: all other \x sequences mean just 'x'
210+
SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
211+
RETURN a;
212+
END//
213+
214+
215+
#
216+
# A function that checks what happened during unescaping.
217+
#
218+
# @param a - the value before unescaping
219+
# @param b - the value after unescaping
220+
#
221+
# The following return values are possible (tested in this order):
222+
# - [SyntErr] - b IS NULL, which means syntax error happened in p1().
223+
# - [Preserve...] - the value was not modified during unescaping (a=b).
224+
# This is possible if 0x5C was treated as mbtail.
225+
# Or only LIKE escape sequences were found: '\_' and '\%'.
# The exact label tells the sub-case, again tested in this order:
#   [Preserve]        - the value is a single byte
#   [Preserve][LIKE]  - the value contains '\_' or '\%'
#   [Preserve][ASCII] - the value consists of ASCII characters only
#   [Preserv][MB]     - anything else (this label is spelled without 'e')
226+
# - [Trivial] - only 0x5C were removed: a with all 0x5C bytes deleted equals b.
227+
# - [Regular] - the value was unescaped like a binary string: UNESCAPE(a)=b.
228+
# Some standard escape sequences were found.
229+
# No special multi-byte handling happened.
230+
# - [Special] - Something else happened. Should not happen.
231+
#
232+
CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
233+
BEGIN
234+
RETURN CASE
235+
WHEN b IS NULL THEN '[SyntErr]'
236+
WHEN a=b THEN CASE
237+
WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
238+
WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
239+
WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
240+
ELSE '[Preserv][MB]' END
241+
WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
242+
WHEN UNESCAPE(a)=b THEN '[Regular]'
243+
ELSE '[Special]' END;
244+
END//
245+
246+
247+
#
248+
# Check what happened with wellformedness during unescaping
249+
# @param a - the value before unescaping
250+
# @param b - the value after unescaping
251+
#
252+
# Returned values:
253+
# [FIXED] - the value was malformed and became wellformed after unescaping
254+
# [BROKE] - the value was wellformed and became malformed after unescaping
255+
# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
256+
# '' - both values are wellformed, or b IS NULL (nothing to compare)
# Wellformedness is tested with iswellformed().
257+
#
258+
CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
259+
RETURNS VARBINARY(256)
260+
BEGIN
261+
RETURN CASE
262+
WHEN b IS NULL THEN ''
263+
WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
264+
WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
265+
WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
266+
ELSE ''
267+
END;
268+
END//
269+
270+
271+
#
272+
# Check if the value could be generated by mysql_real_escape_string(),
273+
# or can only come from a direct user input.
274+
#
275+
# @param a - the value before unescaping
276+
#
277+
# Returns:
278+
# [USER] - if the value could not be generated by mysql_real_escape_string()
279+
# '' - if the value was possibly generated by mysql_real_escape_string()
280+
#
# Only the first four bytes (a1..a4) are examined.
# [USER] is returned when all of the following hold:
# - a1 and a3 are backslashes (0x5C),
# - a2 is a byte above 0x7F,
# - a4 is not one of: _ % 0 t r n Z
# - a2 followed by a4 is wellformed according to iswellformed().
281+
#
282+
CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
283+
RETURNS VARBINARY(256)
284+
BEGIN
285+
DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
286+
DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
287+
DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
288+
DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
289+
DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
290+
RETURN CASE
291+
WHEN (a1=0x5C) AND
292+
(a3=0x5C) AND
293+
(a2>0x7F) AND
294+
(a4 NOT IN ('_','%','0','t','r','n','Z')) AND
295+
iswellformed(a2a4) THEN '[USER]'
296+
ELSE ''
297+
END;
298+
END//
299+
300+
DELIMITER ;//
301+
302+
303+
# t1.a - the tested bytes, t1.b - what the parser made of them
# (NULL if the generated query failed with a syntax error, see p1()).
CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
304+
# Run p1() for every record of "allbytes", filling t1.
CALL p2();
305+
# Avoid "Invalid XXX character string" warnings
306+
# We mark malformed strings in the output anyway
307+
--disable_warnings
308+
# All records marked with '[BAD...]' mean that the string was unescaped
309+
# in an unexpected way, which means there is a bug in UNESCAPE() above.
# The marker carries HEX(UNESCAPE(a)), i.e. the value UNESCAPE() predicted.
310+
SELECT HEX(a),HEX(b),
311+
CONCAT(unescape_type(a,b),
312+
wellformedness(a,b),
313+
mysql_real_escape_string_generated(a),
314+
IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
315+
FROM t1 ORDER BY LENGTH(a),a;
316+
--enable_warnings
317+
# Clean up everything created by this file.
DROP TABLE t1;
318+
DROP PROCEDURE p1;
319+
DROP PROCEDURE p2;
320+
DROP FUNCTION unescape;
321+
DROP FUNCTION unescape_type;
322+
DROP FUNCTION wellformedness;
323+
DROP FUNCTION mysql_real_escape_string_generated;
324+
DROP FUNCTION iswellformed;
325+
DROP TABLE allbytes;
326+
327+
# NOTE(review): the banner below names ctype_backslash.inc, while the start
# banner of this file names ctype_unescape.inc. The text is echoed by the
# test, so it is left unchanged here - fix together with the recorded results.
--echo # End of ctype_backslash.inc

0 commit comments

Comments
 (0)