JakeChampion · JakeChampion · Mar 16, 2022 · Mar 15, 2022 · JakeChampion · Mar 16, 2022
diff --git a/polyfills/URL/polyfill.js b/polyfills/URL/polyfill.js
@@ -60,23 +60,62 @@
 			return output.replace(/%20/g, '+');
 		}
 
-		// NOTE: URL API accepts inputs like `?x=%`, `?x=%a`, and `?x=%2sf`
-		// as literals, whereas legacy decodeURIComponent would throw
-		// URIError (as specified by ECMAScript).
-		//
 		// https://url.spec.whatwg.org/#percent-decode
+		var cachedDecodePattern;
 		function percent_decode(bytes) {
-			// NOTE:
-			// * Only decode pairs of exactly two bytes.
-			// * Only decode bytes in range 0-9, A-F, a-f.
-			// * Decode as many pairs at the same time as possible.
-			//   This is because we're not actually operating on internal bytes,
-			//   but on a valid UTF string, and the string must remain valid at
-			//   all times, and decodeURIComponent will throw when attempting to
-			//   decode a byte that represents only part of a codepoint, for example
-			//   "%7F" separately from "%7F%C3%BF".
-			return bytes.replace(/((%[0-9A-Fa-f]{2})*)/g, function (_, m) {
-				return decodeURIComponent(m);
+			// Percent-decode a string, turning escaped UTF-8 byte sequences into characters.
+			// This can't simply use decodeURIComponent (part of ECMAScript) as that's limited to
+			// decoding to valid UTF-8 only. It throws URIError for literals that look like percent
+			// encoding (e.g. `x=%`, `x=%a`, and `x=a%2sf`) and for non-UTF8 binary data that was
+			// percent encoded and cannot be turned back into binary within a JavaScript string.
+			//
+			// The spec deals with this as follows:
+			// * Read input as UTF-8 encoded bytes. This needs low-level access or a modern
+			//   Web API, like TextDecoder. Old browsers don't have that, and it'd be a large
+			//   dependency to add to this polyfill.
+			// * For each percentage sign followed by two hex, blindly decode the byte in binary
+			//   form. This would require TextEncoder to not corrupt multi-byte chars.
+			// * Replace any bytes that would be invalid under UTF-8 with U+FFFD.
+			//
+			// Instead we:
+			// * Use the fact that UTF-8 is designed to make validation easy in binary.
+			//   You don't have to decode first. There are only a handful of valid prefixes and
+			//   ranges, per RFC 3629. <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
+			// * Safely create multi-byte chars with decodeURIComponent, by only passing it
+			//   valid and full characters (e.g. "%F0" separately from "%F0%9F%92%A9" throws).
+			//   Anything else is kept as literal or replaced with U+FFFD, as per the URL spec.
+
+			if (!cachedDecodePattern) {
+				// In a UTF-8 multibyte sequence, non-initial bytes are always between %80 and %BF
+				var uContinuation = '%[89AB][0-9A-F]';
+
+				// The lead byte fixes the sequence length. Only the well-formed ranges from RFC 3629
+				// section 4 are matched: overlong forms, surrogates (U+D800-U+DFFF) and values above
+				// U+10FFFF must be excluded because decodeURIComponent throws URIError for them.
+				// One byte (0xxxxxxx): %00-%7F
+				var u1Bytes = '%[0-7][0-9A-F]';
+				// Two bytes (110xxxxx 10xxxxxx): lead %C2-%DF (%C0 and %C1 would be overlong)
+				var u2Bytes = '%(?:C[2-9A-F]|D[0-9A-F])' + uContinuation;
+				// Three bytes (1110xxxx 10xxxxxx 10xxxxxx): lead %E0-%EF, where %E0 needs a second
+				// byte of %A0-%BF (else overlong) and %ED one of %80-%9F (else a surrogate)
+				var u3Bytes = '(?:%E0%[AB][0-9A-F]|%E[1-9A-CEF]' + uContinuation + '|%ED%[89][0-9A-F])' + uContinuation;
+				// Four bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx): lead %F0-%F4, where %F0 needs a
+				// second byte of %90-%BF (else overlong) and %F4 one of %80-%8F (else > U+10FFFF)
+				var u4Bytes = '(?:%F0%[9AB][0-9A-F]|%F[1-3]' + uContinuation + '|%F4%8[0-9A-F])' + uContinuation + uContinuation;
+
+				var anyByte = '%[0-9A-F][0-9A-F]';
+
+				// Match some consecutive percent-escaped bytes. More precisely, match
+				// 1-4 bytes that validly encode one character in UTF-8, or 1 byte that
+				// would be invalid in UTF-8 in this location.
+				cachedDecodePattern = new RegExp(
+					'(' + u4Bytes + ')|(' + u3Bytes + ')|(' + u2Bytes + ')|(' + u1Bytes + ')|(' + anyByte + ')',
+					'gi'
+				);
+			}
+
+			return bytes.replace(cachedDecodePattern, function (match, u4, u3, u2, u1, uBad) {
+				return (uBad !== undefined) ? '\uFFFD' : decodeURIComponent(match);
 			});
 		}
 

diff --git a/polyfills/URL/tests.js b/polyfills/URL/tests.js
@@ -594,8 +594,47 @@ describe('WPT tests', function () {
 		proclaim["throws"](function() { new URLSearchParams([[1,2,3]]); });
 	});
 
-	// Examples from wpt/url/urlencoded-parser.any.js
+	/* eslint-disable quote-props */
 	[
+		// Cases from WPT: urlencoded-parser
+		// https://github.com/web-platform-tests/wpt/blob/5f5ec4cff4/url/urlencoded-parser.any.js
+		{ "input": "test", "output": [["test", ""]] },
+		{ "input": "\uFEFFtest=\uFEFF", "output": [["\uFEFFtest", "\uFEFF"]] },
+		{ "input": "%EF%BB%BFtest=%EF%BB%BF", "output": [["\uFEFFtest", "\uFEFF"]] },
+		{ "input": "%FE%FF", "output": [["\uFFFD\uFFFD", ""]] },
+		{ "input": "%FF%FE", "output": [["\uFFFD\uFFFD", ""]] },
+		{ "input": "†&†=x", "output": [["†", ""], ["†", "x"]] },
+		{ "input": "%C2", "output": [["\uFFFD", ""]] },
+		{ "input": "%C2x", "output": [["\uFFFDx", ""]] },
+		{ "input": "_charset_=windows-1252&test=%C2x", "output": [["_charset_", "windows-1252"], ["test", "\uFFFDx"]] },
+		{ "input": '', "output": [] },
+		{ "input": 'a', "output": [['a', '']] },
+		{ "input": 'a=b', "output": [['a', 'b']] },
+		{ "input": 'a=', "output": [['a', '']] },
+		{ "input": '=b', "output": [['', 'b']] },
+		{ "input": '&', "output": [] },
+		{ "input": '&a', "output": [['a', '']] },
+		{ "input": 'a&', "output": [['a', '']] },
+		{ "input": 'a&a', "output": [['a', ''], ['a', '']] },
+		{ "input": 'a&b&c', "output": [['a', ''], ['b', ''], ['c', '']] },
+		{ "input": 'a=b&c=d', "output": [['a', 'b'], ['c', 'd']] },
+		{ "input": 'a=b&c=d&', "output": [['a', 'b'], ['c', 'd']] },
+		{ "input": '&&&a=b&&&&c=d&', "output": [['a', 'b'], ['c', 'd']] },
+		{ "input": 'a=a&a=b&a=c', "output": [['a', 'a'], ['a', 'b'], ['a', 'c']] },
+		{ "input": 'a==a', "output": [['a', '=a']] },
+		{ "input": 'a=a+b+c+d', "output": [['a', 'a b c d']] },
+		{ "input": '%=a', "output": [['%', 'a']] },
+		{ "input": '%a=a', "output": [['%a', 'a']] },
+		{ "input": '%a_=a', "output": [['%a_', 'a']] },
+		{ "input": '%61=a', "output": [['a', 'a']] },
+		{ "input": '%61+%4d%4D=', "output": [['a MM', '']] },
+		{ "input": "id=0&value=%", "output": [['id', '0'], ['value', '%']] },
+		{ "input": "b=%2sf%2a", "output": [['b', '%2sf*']]},
+		{ "input": "b=%2%2af%2a", "output": [['b', '%2*f*']]},
+		{ "input": "b=%%2a", "output": [['b', '%*']]},
+
+		// Cases from WPT: urlsearchparams-sort
+		// https://github.com/web-platform-tests/wpt/blob/5f5ec4cff4/url/urlsearchparams-sort.any.js
 		{
 			input: "z=b&a=b&z=a&a=a",
 			output: [["a", "b"], ["a", "a"], ["z", "b"], ["z", "a"]]
@@ -604,26 +643,6 @@ describe('WPT tests', function () {
 			input: "\uFFFD=x&\uFFFC&\uFFFD=a",
 			output: [["\uFFFC", ""], ["\uFFFD", "x"], ["\uFFFD", "a"]]
 		},
-		{
-			input: '%a=a',
-			output: [['%a', 'a']]
-		},
-		{
-			input: "id=0&value=%",
-			output: [['id', '0'], ['value', '%']]
-		},
-		{
-			input: "b=%2sf%2a",
-			output: [['b', '%2sf*']]
-		},
-		{
-			input: "b=%2%2af%2a",
-			output: [['b', '%2*f*']]
-		},
-		{
-			input: "b=%%2a",
-			output: [['b', '%*']]
-		},
 		{
 			input: "ﬃ&🌈", // 🌈 > code point, but < code unit because two code units
 			output: [["🌈", ""], ["ﬃ", ""]]
@@ -649,6 +668,7 @@ describe('WPT tests', function () {
 			input: "a🌈&a💩",
 			output: [["a🌈", ""], ["a💩", ""]]
 		}
+		/* eslint-enable */
 	].forEach(function(val) {
 		it( "parses and sorts: " + val.input, function() {
 			var params = new URLSearchParams(val.input);