Skip to content

Commit

Permalink
URL: Fix URLSearchParams to further avoid decoding URIError
Browse files Browse the repository at this point in the history
Ref JakeChampion#1173.
Ref JakeChampion#4.

Co-authored-by: David Chan <david@troi.org>
  • Loading branch information
Krinkle and divec committed Mar 15, 2022
1 parent 04ad871 commit d80a5ce
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 36 deletions.
69 changes: 54 additions & 15 deletions polyfills/URL/polyfill.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,62 @@
return output.replace(/%20/g, '+');
}

// NOTE: URL API accepts inputs like `?x=%`, `?x=%a`, and `?x=%2sf`
// as literals, whereas legacy decodeURIComponent would throw
// URIError (as specified by ECMAScript).
//
// https://url.spec.whatwg.org/#percent-decode
// Lazily-built RegExp shared by every percent_decode() call (see below).
var cachedDecodePattern;

/**
 * Percent-decode a string without ever throwing URIError.
 *
 * Escapes that form a valid UTF-8 encoded character are decoded, escapes that
 * are well-formed (`%XX`) but not valid UTF-8 at that position become U+FFFD,
 * and anything that merely looks like an escape (`%`, `%a`, `%2s`) is kept
 * as a literal.
 *
 * @param {string} bytes Input that may contain percent-escapes.
 * @returns {string} The decoded string.
 */
function percent_decode(bytes) {
	// This can't simply use decodeURIComponent (part of ECMAScript) as that's limited to
	// decoding to valid UTF-8 only. It throws URIError for literals that look like percent
	// encoding (e.g. `x=%`, `x=%a`, and `x=a%2sf`) and for non-UTF8 binary data that was
	// percent encoded and cannot be turned back into binary within a JavaScript string.
	//
	// The spec deals with this as follows:
	// * Read input as UTF-8 encoded bytes. This needs low-level access or a modern
	//   Web API, like TextDecoder. Old browsers don't have that, and it'd be a large
	//   dependency to add to this polyfill.
	// * For each percentage sign followed by two hex, blindly decode the byte in binary
	//   form. This would require TextEncoder to not corrupt multi-byte chars.
	// * Replace any bytes that would be invalid under UTF-8 with U+FFFD.
	//
	// Instead we:
	// * Use the fact that UTF-8 is designed to make validation easy in binary.
	//   You don't have to decode first. There are only a handful of valid prefixes and
	//   ranges, per RFC 3629. <https://datatracker.ietf.org/doc/html/rfc3629#section-4>
	// * Safely create multi-byte chars with decodeURIComponent, by only passing it
	//   valid and full characters (e.g. "%F0" separately from "%F0%9F%92%A9" throws).
	//   Anything else is kept as literal or replaced with U+FFFD, as per the URL spec.
	//
	// The pattern must be exactly as strict as decodeURIComponent: sequences that have
	// the right *shape* but are not valid UTF-8 (overlong forms such as "%C0%80" or
	// "%E0%80%80", UTF-16 surrogates such as "%ED%A0%80", and anything above U+10FFFF
	// such as "%F4%90%80%80" or a "%F5".."%F7" lead byte) would otherwise be handed to
	// decodeURIComponent and throw URIError.
	//
	// NOTE(review): a truncated but otherwise valid prefix (e.g. "%E2%82") produces one
	// U+FFFD per byte here; a spec-compliant UTF-8 decoder may emit fewer replacement
	// characters for the same input. Confirm whether callers care about the count.

	if (!cachedDecodePattern) {
		// In a UTF-8 multibyte sequence, non-initial bytes are always between %80 and %BF
		var uContinuation = '%[89AB][0-9A-F]';

		// The length of a UTF-8 sequence is specified by the first byte
		//
		// One-byte sequences: 0xxxxxxx
		// So the byte is between %00 and %7F
		var u1Bytes = '%[0-7][0-9A-F]';

		// Two-byte sequences: 110xxxxx 10xxxxxx
		// The first byte is between %C2 and %DF (%C0 and %C1 could only produce
		// overlong encodings of one-byte characters, so they are never valid).
		var u2Bytes = '%(?:C[2-9A-F]|D[0-9A-F])' + uContinuation;

		// Three-byte sequences: 1110xxxx 10xxxxxx 10xxxxxx
		// The first byte is between %E0 and %EF, with two restrictions on the
		// second byte: after %E0 it must be %A0-%BF (else overlong), and after
		// %ED it must be %80-%9F (else a UTF-16 surrogate, U+D800-U+DFFF).
		var u3Bytes = '(?:' + [
			'%E0%[AB][0-9A-F]',
			'%E[1-9A-C]' + uContinuation,
			'%ED%[89][0-9A-F]',
			'%E[EF]' + uContinuation
		].join('|') + ')' + uContinuation;

		// Four-byte sequences: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// The first byte is between %F0 and %F4, with two restrictions on the
		// second byte: after %F0 it must be %90-%BF (else overlong), and after
		// %F4 it must be %80-%8F (else above U+10FFFF).
		var u4Bytes = '(?:' + [
			'%F0%[9AB][0-9A-F]',
			'%F[1-3]' + uContinuation,
			'%F4%8[0-9A-F]'
		].join('|') + ')' + uContinuation + uContinuation;

		var anyByte = '%[0-9A-F][0-9A-F]';

		// Match some consecutive percent-escaped bytes. More precisely, match
		// 1-4 bytes that validly encode one character in UTF-8 (captured), or
		// 1 byte that would be invalid in UTF-8 in this location (not captured).
		// Longest forms come first so a lead byte is always tried together with
		// its continuation bytes before falling back to the single-byte case.
		cachedDecodePattern = new RegExp(
			'(' + u4Bytes + '|' + u3Bytes + '|' + u2Bytes + '|' + u1Bytes + ')|' + anyByte,
			'gi'
		);
	}

	return bytes.replace(cachedDecodePattern, function (match, valid) {
		// `valid` is only set for a complete, valid UTF-8 character. A truthiness
		// check (rather than `!== undefined`) also copes with engines that report
		// a non-participating capture group as an empty string.
		return valid ? decodeURIComponent(valid) : '\uFFFD';
	});
}

Expand Down
62 changes: 41 additions & 21 deletions polyfills/URL/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,47 @@ describe('WPT tests', function () {
proclaim["throws"](function() { new URLSearchParams([[1,2,3]]); });
});

// Examples from wpt/url/urlencoded-parser.any.js
/* eslint-disable quote-props */
[
// Cases from WPT: urlencoded-parser
// https://github.com/web-platform-tests/wpt/blob/5f5ec4cff4/url/urlencoded-parser.any.js
{ "input": "test", "output": [["test", ""]] },
{ "input": "\uFEFFtest=\uFEFF", "output": [["\uFEFFtest", "\uFEFF"]] },
{ "input": "%EF%BB%BFtest=%EF%BB%BF", "output": [["\uFEFFtest", "\uFEFF"]] },
{ "input": "%FE%FF", "output": [["\uFFFD\uFFFD", ""]] },
{ "input": "%FF%FE", "output": [["\uFFFD\uFFFD", ""]] },
{ "input": "†&†=x", "output": [["†", ""], ["†", "x"]] },
{ "input": "%C2", "output": [["\uFFFD", ""]] },
{ "input": "%C2x", "output": [["\uFFFDx", ""]] },
{ "input": "_charset_=windows-1252&test=%C2x", "output": [["_charset_", "windows-1252"], ["test", "\uFFFDx"]] },
{ "input": '', "output": [] },
{ "input": 'a', "output": [['a', '']] },
{ "input": 'a=b', "output": [['a', 'b']] },
{ "input": 'a=', "output": [['a', '']] },
{ "input": '=b', "output": [['', 'b']] },
{ "input": '&', "output": [] },
{ "input": '&a', "output": [['a', '']] },
{ "input": 'a&', "output": [['a', '']] },
{ "input": 'a&a', "output": [['a', ''], ['a', '']] },
{ "input": 'a&b&c', "output": [['a', ''], ['b', ''], ['c', '']] },
{ "input": 'a=b&c=d', "output": [['a', 'b'], ['c', 'd']] },
{ "input": 'a=b&c=d&', "output": [['a', 'b'], ['c', 'd']] },
{ "input": '&&&a=b&&&&c=d&', "output": [['a', 'b'], ['c', 'd']] },
{ "input": 'a=a&a=b&a=c', "output": [['a', 'a'], ['a', 'b'], ['a', 'c']] },
{ "input": 'a==a', "output": [['a', '=a']] },
{ "input": 'a=a+b+c+d', "output": [['a', 'a b c d']] },
{ "input": '%=a', "output": [['%', 'a']] },
{ "input": '%a=a', "output": [['%a', 'a']] },
{ "input": '%a_=a', "output": [['%a_', 'a']] },
{ "input": '%61=a', "output": [['a', 'a']] },
{ "input": '%61+%4d%4D=', "output": [['a MM', '']] },
{ "input": "id=0&value=%", "output": [['id', '0'], ['value', '%']] },
{ "input": "b=%2sf%2a", "output": [['b', '%2sf*']]},
{ "input": "b=%2%2af%2a", "output": [['b', '%2*f*']]},
{ "input": "b=%%2a", "output": [['b', '%*']]},

// Cases from WPT: urlencoded-sort
// https://github.com/web-platform-tests/wpt/blob/5f5ec4cff4/url/urlsearchparams-sort.any.js
{
input: "z=b&a=b&z=a&a=a",
output: [["a", "b"], ["a", "a"], ["z", "b"], ["z", "a"]]
Expand All @@ -604,26 +643,6 @@ describe('WPT tests', function () {
input: "\uFFFD=x&\uFFFC&\uFFFD=a",
output: [["\uFFFC", ""], ["\uFFFD", "x"], ["\uFFFD", "a"]]
},
{
input: '%a=a',
output: [['%a', 'a']]
},
{
input: "id=0&value=%",
output: [['id', '0'], ['value', '%']]
},
{
input: "b=%2sf%2a",
output: [['b', '%2sf*']]
},
{
input: "b=%2%2af%2a",
output: [['b', '%2*f*']]
},
{
input: "b=%%2a",
output: [['b', '%*']]
},
{
input: "ffi&🌈", // 🌈 > code point, but < code unit because two code units
output: [["🌈", ""], ["ffi", ""]]
Expand All @@ -649,6 +668,7 @@ describe('WPT tests', function () {
input: "a🌈&a💩",
output: [["a🌈", ""], ["a💩", ""]]
}
/* eslint-enable */
].forEach(function(val) {
it( "parses and sorts: " + val.input, function() {
var params = new URLSearchParams(val.input);
Expand Down

0 comments on commit d80a5ce

Please sign in to comment.