Skip to content

Commit eed4dd3

Browse files
aplefull authored and alimpfard committed
LibRegex: Add support for string literals in character classes
1 parent a49c39d commit eed4dd3

File tree

4 files changed

+230
-25
lines changed

4 files changed

+230
-25
lines changed

Libraries/LibJS/Tests/builtins/RegExp/RegExp.js

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,69 @@ test("Unicode properties of strings", () => {
226226
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
227227
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
228228
});
229+
230+
testExtendedCharacterClass({
231+
regExp: /^[\d--\q{0|2|4|9\uFE0F\u20E3}]+$/v,
232+
expression: "[\d--\q{0|2|4|9\uFE0F\u20E3}]",
233+
matchStrings: ["1", "9"],
234+
nonMatchStrings: ["0", "9\uFE0F\u20E3", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
235+
});
236+
237+
testExtendedCharacterClass({
238+
regExp: /^[\d&&\q{0|2|4|9\uFE0F\u20E3}]+$/v,
239+
expression: "[\d&&\q{0|2|4|9\uFE0F\u20E3}]",
240+
matchStrings: ["0", "2", "4"],
241+
nonMatchStrings: ["1", "9\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
242+
});
243+
244+
testExtendedCharacterClass({
245+
regExp: /^[\d\q{0|2|4|9\uFE0F\u20E3}]+$/v,
246+
expression: "[\d\q{0|2|4|9\uFE0F\u20E3}]",
247+
matchStrings: ["0", "9\uFE0F\u20E3"],
248+
nonMatchStrings: ["6\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
249+
});
250+
251+
testExtendedCharacterClass({
252+
regExp: /^[\p{Emoji_Keycap_Sequence}--\q{0|2|4|9\uFE0F\u20E3}]+$/v,
253+
expression: "[\p{Emoji_Keycap_Sequence}--\q{0|2|4|9\uFE0F\u20E3}]",
254+
matchStrings: ["#\uFE0F\u20E3", "8\uFE0F\u20E3"],
255+
nonMatchStrings: ["7", "9\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
256+
});
257+
258+
testExtendedCharacterClass({
259+
regExp: /^[\p{Emoji_Keycap_Sequence}\q{0|2|4|9\uFE0F\u20E3}]+$/v,
260+
expression: "[\p{Emoji_Keycap_Sequence}\q{0|2|4|9\uFE0F\u20E3}]",
261+
matchStrings: ["#\uFE0F\u20E3", "0", "9\uFE0F\u20E3"],
262+
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
263+
});
264+
265+
testExtendedCharacterClass({
266+
regExp: /^[\q{0|2|4|9\uFE0F\u20E3}--\q{0|2|4|9\uFE0F\u20E3}]+$/v,
267+
expression: "[\q{0|2|4|9\uFE0F\u20E3}--\q{0|2|4|9\uFE0F\u20E3}]",
268+
matchStrings: [],
269+
nonMatchStrings: ["0", "9\uFE0F\u20E3", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
270+
});
271+
272+
testExtendedCharacterClass({
273+
regExp: /^[\q{0|2|4|9\uFE0F\u20E3}&&\q{0|2|4|9\uFE0F\u20E3}]+$/v,
274+
expression: "[\q{0|2|4|9\uFE0F\u20E3}&&\q{0|2|4|9\uFE0F\u20E3}]",
275+
matchStrings: ["0", "2", "4", "9\uFE0F\u20E3"],
276+
nonMatchStrings: ["6\uFE0F\u20E3", "7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
277+
});
278+
279+
testExtendedCharacterClass({
280+
regExp: /^[\q{0|2|4|9\uFE0F\u20E3}\q{0|2|4|9\uFE0F\u20E3}]+$/v,
281+
expression: "[\q{0|2|4|9\uFE0F\u20E3}\q{0|2|4|9\uFE0F\u20E3}]",
282+
matchStrings: ["0", "2", "4", "9\uFE0F\u20E3"],
283+
nonMatchStrings: ["6\uFE0F\u20E3", "7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
284+
});
285+
286+
testExtendedCharacterClass({
287+
regExp: /^[\q{0|2|4|9\uFE0F\u20E3}&&\p{Emoji_Keycap_Sequence}]+$/v,
288+
expression: "[\q{0|2|4|9\uFE0F\u20E3}&&\p{Emoji_Keycap_Sequence}]",
289+
matchStrings: ["9\uFE0F\u20E3"],
290+
nonMatchStrings: ["0", "2", "4", "6\uFE0F\u20E3", "7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
291+
});
229292
});
230293

231294
test("Unicode matching with u and v flags", () => {
@@ -261,3 +324,53 @@ test("Unicode matching with u and v flags", () => {
261324
expect(result).toEqual(test.expected);
262325
}
263326
});
327+
328+
test("RegExp string literal", () => {
329+
[
330+
{ pattern: /[\q{abc}]/v, match: "abc", expected: ["abc"] },
331+
{ pattern: /[\q{abc}]/v, match: "a", expected: null },
332+
{ pattern: /[\q{a|b}]/v, match: "b", expected: ["b"] },
333+
{ pattern: /[\q{a\\b}]/v, match: "a\\b", expected: ["a\\b"] },
334+
{ pattern: /[\q{}]/v, match: "", expected: [""] },
335+
{ pattern: /[\q{😀|😁|😂}]/v, match: "😁", expected: ["😁"] },
336+
{ pattern: /[\q{1|1\uFE0F\u20E3}]/v, match: "1️⃣", expected: ["1️⃣"] },
337+
{ pattern: /[\q{1}]/v, match: "1️⃣", expected: ["1"] },
338+
{ pattern: /[\d&&\q{2}]/v, match: "123", expected: ["2"] },
339+
{ pattern: /[^\q{a|b}]/v, match: "abc", expected: ["c"] },
340+
{ pattern: /[\q{\n}]/v, match: "\n", expected: ["\n"] },
341+
{ pattern: /[\q{\b}]/v, match: "\b", expected: ["\b"] },
342+
{ pattern: /[\q{\0}]/v, match: "\0", expected: ["\0"] },
343+
{ pattern: /[\q{\|}]/v, match: "|", expected: ["|"] },
344+
{ pattern: /[\q{\x41}]/v, match: "A", expected: ["A"] },
345+
{
346+
pattern: /[\q{\uD83D\uDC68\u200d\uD83D\uDC69\u200d\uD83D\uDC66\u200d\uD83D\uDC66}]/v,
347+
match: "👨‍👩‍👦‍👦",
348+
expected: ["👨‍👩‍👦‍👦"],
349+
},
350+
{ pattern: /[\q{\u{1F600}}]/v, match: "😀", expected: ["😀"] },
351+
{ pattern: /[\q{\cZ}]/v, match: "\x1A", expected: ["\x1A"] },
352+
{ pattern: /[\q{ }]/v, match: " ", expected: [" "] },
353+
{ pattern: /[[\d+]--[\q{1}]]/gv, match: "12", expected: ["2"] },
354+
{ pattern: /[[\d]&&[\q{1}]]/gv, match: "21", expected: ["1"] },
355+
{ pattern: /[\d\q{a}]/gv, match: "a1", expected: ["a", "1"] },
356+
].forEach(test => {
357+
const result = test.match.match(test.pattern);
358+
expect(result).toEqual(test.expected);
359+
});
360+
361+
[
362+
"[\\q{(a)}]",
363+
"[\\q{[a]}]",
364+
"[\\q{{a}}]",
365+
"[^\\q{bad}]",
366+
"[\\q{a-b}]",
367+
"[^\\q{a|bc}]",
368+
"[^\\q{\\b+}]",
369+
"[\\q{\\d}]",
370+
"[\\q{\\w}]",
371+
"[\\q{\\q}]",
372+
"[^\\q{\\(\\)}]",
373+
].forEach(pattern => {
374+
expect(() => new RegExp(pattern, "v")).toThrow(SyntaxError);
375+
});
376+
});

Libraries/LibRegex/RegexByteCode.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
486486
bool active { false };
487487
bool is_conjunction { false };
488488
bool is_subtraction { false };
489+
bool is_and_operation { false };
489490
bool fail { false };
490491
bool inverse_matched { false };
491492
size_t subtraction_operand_index { 0 };
@@ -774,6 +775,11 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
774775
auto const* current = &trie;
775776
size_t current_code_unit_offset = state.string_position_in_code_units;
776777

778+
if (current->has_metadata() && current->metadata_value()) {
779+
matched = true;
780+
longest_match_length = 0;
781+
}
782+
777783
while (true) {
778784
u32 value;
779785

@@ -855,6 +861,8 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
855861
}
856862

857863
if (matched) {
864+
if (longest_match_length == 0)
865+
had_zero_length_match = true;
858866
if (current_inversion_state()) {
859867
inverse_matched = true;
860868
} else {
@@ -872,6 +880,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
872880
disjunction_states.append({
873881
.active = true,
874882
.is_conjunction = current_inversion_state(),
883+
.is_and_operation = true,
875884
.fail = current_inversion_state(),
876885
.inverse_matched = current_inversion_state(),
877886
.initial_position = state.string_position,
@@ -933,6 +942,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
933942
if (!has_single_argument && new_disjunction_state.active) {
934943
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
935944

945+
if (!failed && new_disjunction_state.is_and_operation
946+
&& new_disjunction_state.last_accepted_position.has_value()
947+
&& new_disjunction_state.last_accepted_position.value() != state.string_position) {
948+
949+
failed = true;
950+
}
951+
936952
if (!failed) {
937953
new_disjunction_state.last_accepted_position = state.string_position;
938954
new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units;

Libraries/LibRegex/RegexParser.cpp

Lines changed: 93 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ static constexpr StringView identity_escape_characters(bool unicode, bool browse
3434
return "^$\\.*+?()[]{}|"sv;
3535
}
3636

37+
static bool has_multiple_code_points(String const& str)
38+
{
39+
Utf8View utf8_view { str.bytes_as_string_view() };
40+
auto it = utf8_view.begin();
41+
if (it == utf8_view.end())
42+
return false;
43+
return ++it != utf8_view.end();
44+
}
45+
3746
ALWAYS_INLINE bool Parser::set_error(Error error)
3847
{
3948
if (m_parser_state.error == Error::NoError) {
@@ -1855,7 +1864,16 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
18551864
compares.empend(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
18561865
}
18571866

1858-
match_length_minimum += 1;
1867+
bool has_empty_string_set = any_of(compares, [this](auto const& compare) {
1868+
if (compare.type != CharacterCompareType::StringSet)
1869+
return false;
1870+
1871+
auto const& trie = m_parser_state.bytecode.string_set_table().get_u8_trie(compare.value);
1872+
return trie.has_metadata() && trie.metadata_value();
1873+
});
1874+
1875+
if (!has_empty_string_set)
1876+
match_length_minimum += 1;
18591877
stack.insert_bytecode_compare_values(move(compares));
18601878
return true;
18611879
}
@@ -2299,6 +2317,15 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
22992317
return {};
23002318
}
23012319

2320+
if (match(TokenType::EscapeSequence)) {
2321+
auto escape_value = m_parser_state.current_token.value();
2322+
consume();
2323+
2324+
if (escape_value[0] == '\\' && escape_value.length() == 2) {
2325+
return escape_value[1];
2326+
}
2327+
}
2328+
23022329
auto start_position = tell();
23032330
ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } };
23042331

@@ -2359,6 +2386,62 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
23592386
{
23602387
auto start_position = tell();
23612388

2389+
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
2390+
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
2391+
// ClassString :: [empty] | NonEmptyClassString
2392+
// NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt]
2393+
if (try_skip("\\q"sv)) {
2394+
if (!match(TokenType::LeftCurly)) {
2395+
back(2);
2396+
return false;
2397+
}
2398+
consume();
2399+
2400+
Vector<String> strings;
2401+
StringBuilder current_string;
2402+
2403+
while (!match(TokenType::RightCurly)) {
2404+
if (done()) {
2405+
set_error(Error::MismatchingBrace);
2406+
return false;
2407+
}
2408+
2409+
if (match(TokenType::Pipe)) {
2410+
consume();
2411+
strings.append(MUST(current_string.to_string()));
2412+
current_string.clear();
2413+
continue;
2414+
}
2415+
2416+
auto character = parse_class_set_character();
2417+
if (!character.has_value()) {
2418+
if (has_error())
2419+
return false;
2420+
set_error(Error::InvalidCharacterClass);
2421+
return false;
2422+
}
2423+
2424+
current_string.append_code_point(character.value());
2425+
}
2426+
2427+
strings.append(MUST(current_string.to_string()));
2428+
consume(TokenType::RightCurly, Error::MismatchingBrace);
2429+
2430+
bool is_negated = any_of(compares, [](auto const& compare) {
2431+
return compare.type == CharacterCompareType::Inverse;
2432+
});
2433+
2434+
if (is_negated && any_of(strings, has_multiple_code_points)) {
2435+
set_error(Error::NegatedCharacterClassStrings);
2436+
return false;
2437+
}
2438+
2439+
auto string_set_index = m_parser_state.bytecode.string_set_table().set(strings);
2440+
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
2441+
2442+
return true;
2443+
}
2444+
23622445
// ClassSetOperand :: ClassSetCharacter | ClassStringDisjunction | NestedClass
23632446
if (auto character = parse_class_set_character(); character.has_value()) {
23642447
compares.append({ CharacterCompareType::Char, character.value() });
@@ -2414,15 +2497,6 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
24142497
if (has_error())
24152498
return false;
24162499

2417-
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
2418-
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
2419-
// ClassString :: [empty] | NonEmptyClassString
2420-
// NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt]
2421-
if (try_skip("\\q{"sv)) {
2422-
// FIXME: Implement this :P
2423-
return set_error(Error::InvalidCharacterClass);
2424-
}
2425-
24262500
back(tell() - start_position + 1);
24272501
return false;
24282502
}
@@ -2491,14 +2565,17 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
24912565
return;
24922566
}
24932567

2494-
for (auto const& compare : compares) {
2495-
if (compare.type == CharacterCompareType::Inverse) {
2496-
set_error(Error::NegatedCharacterClassStrings);
2497-
return;
2498-
}
2568+
auto strings = Unicode::get_property_strings(property);
2569+
2570+
bool is_negated = any_of(compares, [](auto const& compare) {
2571+
return compare.type == CharacterCompareType::Inverse;
2572+
});
2573+
2574+
if (is_negated && any_of(strings, has_multiple_code_points)) {
2575+
set_error(Error::NegatedCharacterClassStrings);
2576+
return;
24992577
}
25002578

2501-
auto strings = Unicode::get_property_strings(property);
25022579
if (!strings.is_empty()) {
25032580
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
25042581
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });

Tests/LibWeb/Text/expected/wpt-import/html/semantics/forms/constraints/form-validation-validity-patternMismatch.txt

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@ Harness status: OK
22

33
Found 85 tests
44

5-
78 Pass
6-
7 Fail
5+
85 Pass
76
Pass [INPUT in TEXT status] The pattern attribute is not set
87
Pass [INPUT in TEXT status] The value attibute is empty string
98
Pass [INPUT in TEXT status] The value attribute matches the pattern attribute
@@ -15,7 +14,7 @@ Pass [INPUT in TEXT status] Invalid `v` regular expression gets ignored
1514
Pass [INPUT in TEXT status] The pattern attribute tries to escape a group
1615
Pass [INPUT in TEXT status] The pattern attribute uses Unicode features
1716
Pass [INPUT in TEXT status] The value attribute matches JavaScript-specific regular expression
18-
Fail [INPUT in TEXT status] The value attribute mismatches JavaScript-specific regular expression
17+
Pass [INPUT in TEXT status] The value attribute mismatches JavaScript-specific regular expression
1918
Pass [INPUT in SEARCH status] The pattern attribute is not set
2019
Pass [INPUT in SEARCH status] The value attibute is empty string
2120
Pass [INPUT in SEARCH status] The value attribute matches the pattern attribute
@@ -27,7 +26,7 @@ Pass [INPUT in SEARCH status] Invalid `v` regular expression gets ignored
2726
Pass [INPUT in SEARCH status] The pattern attribute tries to escape a group
2827
Pass [INPUT in SEARCH status] The pattern attribute uses Unicode features
2928
Pass [INPUT in SEARCH status] The value attribute matches JavaScript-specific regular expression
30-
Fail [INPUT in SEARCH status] The value attribute mismatches JavaScript-specific regular expression
29+
Pass [INPUT in SEARCH status] The value attribute mismatches JavaScript-specific regular expression
3130
Pass [INPUT in TEL status] The pattern attribute is not set
3231
Pass [INPUT in TEL status] The value attibute is empty string
3332
Pass [INPUT in TEL status] The value attribute matches the pattern attribute
@@ -39,7 +38,7 @@ Pass [INPUT in TEL status] Invalid `v` regular expression gets ignored
3938
Pass [INPUT in TEL status] The pattern attribute tries to escape a group
4039
Pass [INPUT in TEL status] The pattern attribute uses Unicode features
4140
Pass [INPUT in TEL status] The value attribute matches JavaScript-specific regular expression
42-
Fail [INPUT in TEL status] The value attribute mismatches JavaScript-specific regular expression
41+
Pass [INPUT in TEL status] The value attribute mismatches JavaScript-specific regular expression
4342
Pass [INPUT in URL status] The pattern attribute is not set
4443
Pass [INPUT in URL status] The value attibute is empty string
4544
Pass [INPUT in URL status] The value attribute matches the pattern attribute
@@ -51,7 +50,7 @@ Pass [INPUT in URL status] Invalid `v` regular expression gets ignored
5150
Pass [INPUT in URL status] The pattern attribute tries to escape a group
5251
Pass [INPUT in URL status] The pattern attribute uses Unicode features
5352
Pass [INPUT in URL status] The value attribute matches JavaScript-specific regular expression
54-
Fail [INPUT in URL status] The value attribute mismatches JavaScript-specific regular expression
53+
Pass [INPUT in URL status] The value attribute mismatches JavaScript-specific regular expression
5554
Pass [INPUT in EMAIL status] The pattern attribute is not set
5655
Pass [INPUT in EMAIL status] The value attibute is empty string
5756
Pass [INPUT in EMAIL status] The value attribute matches the pattern attribute
@@ -63,7 +62,7 @@ Pass [INPUT in EMAIL status] Invalid `v` regular expression gets ignored
6362
Pass [INPUT in EMAIL status] The pattern attribute tries to escape a group
6463
Pass [INPUT in EMAIL status] The pattern attribute uses Unicode features
6564
Pass [INPUT in EMAIL status] The value attribute matches JavaScript-specific regular expression
66-
Fail [INPUT in EMAIL status] The value attribute mismatches JavaScript-specific regular expression
65+
Pass [INPUT in EMAIL status] The value attribute mismatches JavaScript-specific regular expression
6766
Pass [INPUT in PASSWORD status] The pattern attribute is not set
6867
Pass [INPUT in PASSWORD status] The value attibute is empty string
6968
Pass [INPUT in PASSWORD status] The value attribute matches the pattern attribute
@@ -75,7 +74,7 @@ Pass [INPUT in PASSWORD status] Invalid `v` regular expression gets ignored
7574
Pass [INPUT in PASSWORD status] The pattern attribute tries to escape a group
7675
Pass [INPUT in PASSWORD status] The pattern attribute uses Unicode features
7776
Pass [INPUT in PASSWORD status] The value attribute matches JavaScript-specific regular expression
78-
Fail [INPUT in PASSWORD status] The value attribute mismatches JavaScript-specific regular expression
77+
Pass [INPUT in PASSWORD status] The value attribute mismatches JavaScript-specific regular expression
7978
Pass [INPUT in EMAIL status] The pattern attribute is not set, if multiple is present
8079
Pass [INPUT in EMAIL status] The value attibute is empty string, if multiple is present
8180
Pass [INPUT in EMAIL status] The value attribute matches the pattern attribute, if multiple is present
@@ -87,5 +86,5 @@ Pass [INPUT in EMAIL status] Invalid `v` regular expression gets ignored, if mul
8786
Pass [INPUT in EMAIL status] The pattern attribute tries to escape a group, if multiple is present
8887
Pass [INPUT in EMAIL status] The pattern attribute uses Unicode features, if multiple is present
8988
Pass [INPUT in EMAIL status] The value attribute matches JavaScript-specific regular expression, if multiple is present
90-
Fail [INPUT in EMAIL status] The value attribute mismatches JavaScript-specific regular expression, if multiple is present
89+
Pass [INPUT in EMAIL status] The value attribute mismatches JavaScript-specific regular expression, if multiple is present
9190
Pass [INPUT in EMAIL status] Commas should be stripped from regex input, if multiple is present

0 commit comments

Comments
 (0)