|
| 1 | +--echo # Start of ctype_unescape.inc |
| 2 | + |
| 3 | +# |
| 4 | +# Testing how string literals with backslash and quote-quote are unescaped. |
| 5 | +# The tests assume that single quote (') is used as a delimiter. |
| 6 | +# |
| 7 | + |
| 8 | +# |
| 9 | +# Make sure that the parser really works using the character set we need. |
| 10 | +# We use binary strings to compose strings, to be able to test malformed |
| 11 | +# sequences, which are possible as a result of mysql_real_escape_string(). |
| 12 | +# The important thing for this test is to make the parser unescape using |
| 13 | +# the client character set, rather than binary. Currently it works exactly |
| 14 | +# that way by default, so the query below should return @@character_set_client |
| 15 | +# |
| 16 | +SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection'; |
| 17 | +PREPARE stmt FROM @query; |
| 18 | +EXECUTE stmt; |
| 19 | +DEALLOCATE PREPARE stmt; |
| 20 | + |
| 21 | +let $CHARSET=`SELECT @@character_set_connection`; |
| 22 | + |
| 23 | +CREATE TABLE allbytes (a VARBINARY(10)); |
| 24 | + |
| 25 | +# |
| 26 | +# Create various byte sequences to test. Testing the full bunch of |
| 27 | +# possible combinations takes about 2 minutes. So this test provides |
| 28 | +# variants to run with: |
| 29 | +# - the full set of possible combinations |
| 30 | +# - a reduced set of combinations for selected bytes only |
| 31 | +# |
| 32 | + |
| 33 | +# Create selected byte combinations |
| 34 | +if ($ctype_unescape_combinations == 'selected') |
| 35 | +{ |
| 36 | +--echo # Using selected bytes combinations |
| 37 | +--source include/bytes.inc |
| 38 | +# |
| 39 | +# Populate "selected_bytes" with bytes that have a special meaning. |
| 40 | +# We'll use "selected_bytes" to generate byte sequences, |
| 41 | +# instead of the full possible byte combinations, to reduce test time. |
| 42 | +# |
| 43 | +CREATE TABLE selected_bytes (a VARBINARY(10)); |
| 44 | + |
| 45 | +# Bytes that have a special meaning in all character sets: |
| 46 | +# 0x00 - mysql_real_escape_string() quotes this to '\0' |
| 47 | +# 0x0D - mysql_real_escape_string() quotes this to '\r' |
| 48 | +# 0x0A - mysql_real_escape_string() quotes this to '\n' |
| 49 | +# 0x1A - mysql_real_escape_string() quotes this to '\Z' |
| 50 | +# 0x08 - mysql_real_escape_string() does not quote this, |
| 51 | +# but '\b' is unescaped to 0x08. |
| 52 | +# 0x09 - mysql_real_escape_string() does not quote this, |
| 53 | +# but '\t' is unescaped to 0x09. |
| 54 | +# 0x30 - '0', as in '\0' |
| 55 | +# 0x5A - 'Z', as in '\Z' |
| 56 | +# 0x62 - 'b', as in '\b' |
| 57 | +# 0x6E - 'n', as in '\n' |
| 58 | +# 0x72 - 'r', as in '\r' |
| 59 | +# 0x74 - 't', as in '\t' |
| 60 | + |
| 61 | +INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z'); |
| 62 | +INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z'); |
| 63 | + |
| 64 | +# 0x22 - double quote |
| 65 | +# 0x25 - percent sign, '\%' is preserved as is for LIKE. |
| 66 | +# 0x27 - single quote |
| 67 | +# 0x5C - backslash |
| 68 | +# 0x5F - underscore, '\_' is preserved as is for LIKE. |
| 69 | +INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27); |
| 70 | + |
| 71 | +# Some bytes do not have any special meaning, for example basic Latin letters. |
| 72 | +# Let's add one; it should be enough for good enough coverage. |
| 73 | +INSERT INTO selected_bytes (a) VALUES ('a'); |
| 74 | + |
| 75 | +# |
| 76 | +# This map summarizes bytes that have a special |
| 77 | +# meaning in various character sets: |
| 78 | +# |
| 79 | +# MBHEAD MBTAIL NONASCII-8BIT BAD |
| 80 | +# ------ ------ -------------- ---------- |
| 81 | +# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF] |
| 82 | +# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF] |
| 83 | +# gbk: [81..FE] [40..7E,80..FE] N/A [FF] |
| 84 | +# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF] |
| 85 | +# swe7: N/A N/A [5B..5E,7B..7E] [80..FF] |
| 86 | +# |
| 87 | + |
| 88 | +INSERT INTO selected_bytes (a) VALUES |
| 89 | +(0x3F), # 7bit |
| 90 | +(0x40), # 7bit mbtail |
| 91 | +(0x7E), # 7bit mbtail nonascii-8bit |
| 92 | +(0x7F), # 7bit nonascii-8bit |
| 93 | +(0x80), # mbtail bad-mb |
| 94 | +(0x81), # mbhead mbtail |
| 95 | +(0x9F), # mbhead mbtail bad-mb |
| 96 | +(0xA0), # mbhead mbtail bad-mb |
| 97 | +(0xA1), # mbhead mbtail nonascii-8bit |
| 98 | +(0xE0), # mbhead mbtai |
| 99 | +(0xEF), # mbhead mbtail |
| 100 | +(0xF9), # mbhead mbtail |
| 101 | +(0xFA), # mbhead mbtail bad-mb |
| 102 | +(0xFC), # mbhead mbtail bad-mb |
| 103 | +(0xFD), # mbhead mbtail bad-mb |
| 104 | +(0xFE), # mbhead mbtial bad-mb |
| 105 | +(0xFF); # bad-mb |
| 106 | + |
| 107 | +# |
| 108 | +# Now populate the test table |
| 109 | +# |
| 110 | + |
| 111 | +# Use all single bytes, this is cheap, there are only 256 values. |
| 112 | +INSERT INTO allbytes (a) SELECT a FROM bytes; |
| 113 | + |
| 114 | +# Add selected bytes combinations |
| 115 | +INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2; |
| 116 | +INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2; |
| 117 | +INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2; |
| 118 | +DROP TABLE selected_bytes; |
| 119 | + |
| 120 | +# Delete all non-single byte sequences that do not have |
| 121 | +# backslashes or quotes at all. There is nothing special with these strings. |
| 122 | +DELETE FROM allbytes WHERE |
| 123 | + OCTET_LENGTH(a)>1 AND |
| 124 | + LOCATE(0x5C,a)=0 AND |
| 125 | + a NOT LIKE '%\'%' AND |
| 126 | + a NOT LIKE '%"%'; |
| 127 | + |
| 128 | +} |
| 129 | + |
| 130 | +if ($ctype_unescape_combinations=='') |
| 131 | +{ |
| 132 | +--echo # Using full byte combinations |
| 133 | +--source include/bytes2.inc |
| 134 | +INSERT INTO allbytes (a) SELECT a FROM bytes; |
| 135 | +INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2; |
| 136 | +INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2; |
| 137 | +INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2; |
| 138 | +} |
| 139 | + |
| 140 | + |
| 141 | +DELIMITER //; |
| 142 | + |
| 143 | +# |
| 144 | +# p1(val): builds and runs "INSERT INTO t1 (a,b) VALUES (0x<HEX(val)>,'<val>')", |
| 145 | +# so t1.a gets the raw bytes and t1.b gets 'val' used as a string literal. |
| 146 | +# If the statement fails with SQLSTATE 42000 (a syntax error, typically because |
| 147 | +# of a mis-interpreted closing quote), the handler stores (val,NULL) instead. |
| 148 | +# |
| 149 | +CREATE PROCEDURE p1(val VARBINARY(10)) |
| 150 | +BEGIN |
| 151 | + DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL); |
| 152 | + SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')"); |
| 153 | + PREPARE stmt FROM @query; |
| 154 | + EXECUTE stmt; |
| 155 | + DEALLOCATE PREPARE stmt; |
| 156 | +END// |
| 157 | + |
| 158 | +# |
| 159 | +# p2(): opens a cursor over allbytes.a and calls p1() for every fetched value. |
| 160 | +# The NOT FOUND handler sets 'done', which ends the loop after the last row. |
| 161 | +# |
| 162 | +CREATE PROCEDURE p2() |
| 163 | +BEGIN |
| 164 | + DECLARE val VARBINARY(10); |
| 165 | + DECLARE done INT DEFAULT FALSE; |
| 166 | + DECLARE stmt CURSOR FOR SELECT a FROM allbytes; |
| 167 | + DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE; |
| 168 | + OPEN stmt; |
| 169 | +read_loop1: LOOP |
| 170 | + FETCH stmt INTO val; |
| 171 | + IF done THEN |
| 172 | + LEAVE read_loop1; |
| 173 | + END IF; |
| 174 | + CALL p1(val); |
| 175 | +END LOOP; |
| 176 | + CLOSE stmt; |
| 177 | +END// |
| 178 | + |
| 179 | + |
| 180 | +# iswellformed(a): converts the value from binary to $CHARSET and checks |
| 181 | +# whether it has changed. CONVERT() fixes malformed strings, so if the string |
| 182 | +# changes in CONVERT(), it was not wellformed. Returns 1 if unchanged, else 0. |
| 183 | +--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET); |
| 184 | + |
| 185 | +# |
| 186 | +# unescape(a): approximately reproduces how the SQL parser would unescape |
| 187 | +# a binary string; callers compare UNESCAPE(a) with the parser's result. |
| 188 | +# |
| 189 | +CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256) |
| 190 | +BEGIN |
| 191 | + # We need to do it in a way to avoid producing new escape sequences |
| 192 | + # First, enclose all known escsape sequences to '{{xx}}' |
| 193 | + # - Backslash not followed by a LIKE pattern characters _ and % |
| 194 | + # - Double escapes |
| 195 | + # This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)). |
| 196 | + # So '\\1' in the last argument always means the match, no matter |
| 197 | + # which alternative it came from. |
| 198 | + SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}'); |
| 199 | + # Now unescape all enclosed standard escape sequences |
| 200 | + SET a=REPLACE(a,'{{\\0}}', '\0'); |
| 201 | + SET a=REPLACE(a,'{{\\b}}', '\b'); |
| 202 | + SET a=REPLACE(a,'{{\\t}}', '\t'); |
| 203 | + SET a=REPLACE(a,'{{\\r}}', '\r'); |
| 204 | + SET a=REPLACE(a,'{{\\n}}', '\n'); |
| 205 | + SET a=REPLACE(a,'{{\\Z}}', '\Z'); |
| 206 | + SET a=REPLACE(a,'{{\\\'}}', '\''); |
| 207 | + # Unescape double quotes |
| 208 | + SET a=REPLACE(a,'{{\'\'}}', '\''); |
| 209 | + # Unescape the rest: all other \x sequences mean just 'x' |
| 210 | + SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1'); |
| 211 | + RETURN a; |
| 212 | +END// |
| 213 | + |
| 214 | + |
| 215 | +# |
| 216 | +# A function that classifies what happened during unescaping. |
| 217 | +# |
| 218 | +# @param a - the value before unescaping |
| 219 | +# @param b - the value after unescaping (NULL if p1() got a syntax error) |
| 220 | +# |
| 221 | +# The following return values are possible, checked in this order: |
| 222 | +# - [SyntErr] - b IS NULL, which means a syntax error happened in p1(). |
| 223 | +# - [Preserve], [Preserve][LIKE], [Preserve][ASCII], [Preserv][MB] - a=b, |
| 224 | +# the value was not modified during unescaping. This is possible if 0x5C |
| 225 | +# was treated as mbtail, or only LIKE escapes were found: '\_' and '\%'. |
| 226 | +# - [Trivial] - only 0x5C bytes were removed. |
| 227 | +# - [Regular] - the value was unescaped like a binary string: standard escape |
| 228 | +# sequences were found, no special multi-byte handling happened. |
| 229 | +# - [Special] - something else happened. Should not happen. |
| 230 | +# NOTE(review): '[Preserv][MB]' is spelled without 'e', unlike the other tags. |
| 231 | +# |
| 232 | +CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256) |
| 233 | +BEGIN |
| 234 | + RETURN CASE |
| 235 | + WHEN b IS NULL THEN '[SyntErr]' |
| 236 | + WHEN a=b THEN CASE |
| 237 | + WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]' |
| 238 | + WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]' |
| 239 | + WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]' |
| 240 | + ELSE '[Preserv][MB]' END |
| 241 | + WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]' |
| 242 | + WHEN UNESCAPE(a)=b THEN '[Regular]' |
| 243 | + ELSE '[Special]' END; |
| 244 | +END// |
| 245 | + |
| 246 | + |
| 247 | +# |
| 248 | +# Check what happened with wellformedness during unescaping. |
| 249 | +# @param a - the value before unescaping |
| 250 | +# @param b - the value after unescaping |
| 251 | +# |
| 252 | +# Returned values: |
| 253 | +# [FIXED] - the value was malformed and became wellformed after unescaping |
| 254 | +# [BROKE] - the value was wellformed and became malformed after unescaping |
| 255 | +# [ILSEQ] - both values (before unescaping and after unescaping) are malformed |
| 256 | +# '' - both values are wellformed, or b IS NULL (syntax error in p1()) |
| 257 | +# |
| 258 | +CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256)) |
| 259 | + RETURNS VARBINARY(256) |
| 260 | +BEGIN |
| 261 | + RETURN CASE |
| 262 | + WHEN b IS NULL THEN '' |
| 263 | + WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]' |
| 264 | + WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]' |
| 265 | + WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]' |
| 266 | + ELSE '' |
| 267 | + END; |
| 268 | +END// |
| 269 | + |
| 270 | + |
| 271 | +# |
| 272 | +# Check if the value could be generated by mysql_real_escape_string(), |
| 273 | +# or can only come from direct user input. |
| 274 | +# |
| 275 | +# @param a - the value before unescaping |
| 276 | +# |
| 277 | +# Returns: |
| 278 | +# [USER] - if the value could not be generated by mysql_real_escape_string() |
| 279 | +# '' - if the value was possibly generated by mysql_real_escape_string() |
| 280 | +# [USER] is returned for the shape 0x5C,a2,0x5C,a4 where a2>0x7F, a4 is not one |
| 281 | +# of _ % 0 t r n Z, and the byte pair a2,a4 passes iswellformed(). |
| 282 | +CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256)) |
| 283 | + RETURNS VARBINARY(256) |
| 284 | +BEGIN |
| 285 | + DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1); |
| 286 | + DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1); |
| 287 | + DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1); |
| 288 | + DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1); |
| 289 | + DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4); |
| 290 | + RETURN CASE |
| 291 | + WHEN (a1=0x5C) AND |
| 292 | + (a3=0x5C) AND |
| 293 | + (a2>0x7F) AND |
| 294 | + (a4 NOT IN ('_','%','0','t','r','n','Z')) AND |
| 295 | + iswellformed(a2a4) THEN '[USER]' |
| 296 | + ELSE '' |
| 297 | + END; |
| 298 | +END// |
| 299 | + |
| 300 | +DELIMITER ;// |
| 301 | + |
| 302 | + |
| 303 | +CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10)); |
| 304 | +CALL p2(); |
| 305 | +# Avoid "Invalid XXX character string" warnings |
| 306 | +# We mark malformed strings in the output anyway |
| 307 | +--disable_warnings |
| 308 | +# All records marked with '[BAD<hex>]' mean that the string was unescaped |
| 309 | +# in an unexpected way, which means there is a bug in UNESCAPE() above. |
| 310 | +SELECT HEX(a),HEX(b), |
| 311 | + CONCAT(unescape_type(a,b), |
| 312 | + wellformedness(a,b), |
| 313 | + mysql_real_escape_string_generated(a), |
| 314 | + IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment |
| 315 | +FROM t1 ORDER BY LENGTH(a),a; |
| 316 | +--enable_warnings |
| 317 | +DROP TABLE t1; |
| 318 | +DROP PROCEDURE p1; |
| 319 | +DROP PROCEDURE p2; |
| 320 | +DROP FUNCTION unescape; |
| 321 | +DROP FUNCTION unescape_type; |
| 322 | +DROP FUNCTION wellformedness; |
| 323 | +DROP FUNCTION mysql_real_escape_string_generated; |
| 324 | +DROP FUNCTION iswellformed; |
| 325 | +DROP TABLE allbytes; |
| 326 | + |
| 327 | +--echo # End of ctype_backslash.inc |
0 commit comments