Skip to content

Commit 57ef949

Browse files
committed
LibRegex: Account for nested 'or' compare ops
Closes #6647.
1 parent 8c9c2ee commit 57ef949

File tree

2 files changed

+59
-46
lines changed

2 files changed

+59
-46
lines changed

Libraries/LibRegex/RegexOptimizer.cpp

Lines changed: 56 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -430,9 +430,18 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
430430
temporary_inverse = false;
431431
reset_temporary_inverse = false;
432432
inverse = false;
433-
auto in_or = false; // We're in an OR block, so we should wait for the EndAndOr to decide if we would match.
434-
auto matched_in_or = false;
435-
auto inverse_matched_in_or = false;
433+
struct DisjunctionState {
434+
bool in_or = false; // We're in an OR block, so we should wait for the EndAndOr to decide if we would match.
435+
bool matched_in_or = false;
436+
bool inverse_matched_in_or = false;
437+
};
438+
439+
Vector<DisjunctionState, 2> disjunction_stack;
440+
disjunction_stack.empend();
441+
442+
auto in_or = [&] -> bool& { return disjunction_stack.last().in_or; };
443+
auto matched_in_or = [&] -> bool& { return disjunction_stack.last().matched_in_or; };
444+
auto inverse_matched_in_or = [&] -> bool& { return disjunction_stack.last().inverse_matched_in_or; };
436445

437446
for (auto const& pair : rhs) {
438447
if (reset_temporary_inverse) {
@@ -452,7 +461,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
452461
dbgln(" {}", character_class_name(char_class));
453462
for (auto& char_class : lhs_negated_char_classes)
454463
dbgln(" ^{}", character_class_name(char_class));
455-
dbgln("}}, in or: {}, matched in or: {}, inverse matched in or: {}", in_or, matched_in_or, inverse_matched_in_or);
464+
dbgln("}}, in or: {}, matched in or: {}, inverse matched in or: {}", in_or(), matched_in_or(), inverse_matched_in_or());
456465
}
457466

458467
switch (pair.type) {
@@ -465,20 +474,20 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
465474
break;
466475
case CharacterCompareType::AnyChar:
467476
// Special case: if not inverted, AnyChar is always in the range.
468-
if (!in_or && !current_lhs_inversion_state())
477+
if (!in_or() && !current_lhs_inversion_state())
469478
return true;
470-
if (in_or) {
471-
matched_in_or = true;
472-
inverse_matched_in_or = false;
479+
if (in_or()) {
480+
matched_in_or() = true;
481+
inverse_matched_in_or() = false;
473482
}
474483
break;
475484
case CharacterCompareType::Char: {
476485
auto matched = range_contains(pair.value);
477-
if (!in_or && (current_lhs_inversion_state() ^ matched))
486+
if (!in_or() && (current_lhs_inversion_state() ^ matched))
478487
return true;
479-
if (in_or) {
480-
matched_in_or |= matched;
481-
inverse_matched_in_or |= !matched;
488+
if (in_or()) {
489+
matched_in_or() |= matched;
490+
inverse_matched_in_or() |= !matched;
482491
}
483492
break;
484493
}
@@ -488,23 +497,23 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
488497
return true;
489498
case CharacterCompareType::CharClass: {
490499
auto contains = char_class_contains(static_cast<CharClass>(pair.value));
491-
if (!in_or && (current_lhs_inversion_state() ^ contains))
500+
if (!in_or() && (current_lhs_inversion_state() ^ contains))
492501
return true;
493-
if (in_or) {
494-
matched_in_or |= contains;
495-
inverse_matched_in_or |= !contains;
502+
if (in_or()) {
503+
matched_in_or() |= contains;
504+
inverse_matched_in_or() |= !contains;
496505
}
497506
break;
498507
}
499508
case CharacterCompareType::CharRange: {
500509
auto range = CharRange(pair.value);
501510
auto contains = range_contains(range);
502-
if (!in_or && (contains ^ current_lhs_inversion_state()))
511+
if (!in_or() && (contains ^ current_lhs_inversion_state()))
503512
return true;
504513

505-
if (in_or) {
506-
matched_in_or |= contains;
507-
inverse_matched_in_or |= !contains;
514+
if (in_or()) {
515+
matched_in_or() |= contains;
516+
inverse_matched_in_or() |= !contains;
508517
}
509518

510519
break;
@@ -525,16 +534,16 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
525534
return true;
526535
if (has_any_unicode_property && !lhs_unicode_properties.is_empty() && !lhs_negated_unicode_properties.is_empty()) {
527536
auto contains = lhs_unicode_properties.contains(static_cast<Unicode::Property>(pair.value));
528-
if (!in_or && (current_lhs_inversion_state() ^ contains))
537+
if (!in_or() && (current_lhs_inversion_state() ^ contains))
529538
return true;
530539

531540
auto inverse_contains = lhs_negated_unicode_properties.contains(static_cast<Unicode::Property>(pair.value));
532-
if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
541+
if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains))
533542
return true;
534543

535-
if (in_or) {
536-
matched_in_or |= contains;
537-
inverse_matched_in_or |= inverse_contains;
544+
if (in_or()) {
545+
matched_in_or() |= contains;
546+
inverse_matched_in_or() |= inverse_contains;
538547
}
539548
}
540549
break;
@@ -543,14 +552,14 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
543552
return true;
544553
if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) {
545554
auto contains = lhs_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value));
546-
if (!in_or && (current_lhs_inversion_state() ^ contains))
555+
if (!in_or() && (current_lhs_inversion_state() ^ contains))
547556
return true;
548557
auto inverse_contains = lhs_negated_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value));
549-
if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
558+
if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains))
550559
return true;
551-
if (in_or) {
552-
matched_in_or |= contains;
553-
inverse_matched_in_or |= inverse_contains;
560+
if (in_or()) {
561+
matched_in_or() |= contains;
562+
inverse_matched_in_or() |= inverse_contains;
554563
}
555564
}
556565
break;
@@ -559,14 +568,14 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
559568
return true;
560569
if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) {
561570
auto contains = lhs_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value));
562-
if (!in_or && (current_lhs_inversion_state() ^ contains))
571+
if (!in_or() && (current_lhs_inversion_state() ^ contains))
563572
return true;
564573
auto inverse_contains = lhs_negated_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value));
565-
if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
574+
if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains))
566575
return true;
567-
if (in_or) {
568-
matched_in_or |= contains;
569-
inverse_matched_in_or |= inverse_contains;
576+
if (in_or()) {
577+
matched_in_or() |= contains;
578+
inverse_matched_in_or() |= inverse_contains;
570579
}
571580
}
572581
break;
@@ -575,33 +584,34 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
575584
return true;
576585
if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) {
577586
auto contains = lhs_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value));
578-
if (!in_or && (current_lhs_inversion_state() ^ contains))
587+
if (!in_or() && (current_lhs_inversion_state() ^ contains))
579588
return true;
580589
auto inverse_contains = lhs_negated_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value));
581-
if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
590+
if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains))
582591
return true;
583-
if (in_or) {
584-
matched_in_or |= contains;
585-
inverse_matched_in_or |= inverse_contains;
592+
if (in_or()) {
593+
matched_in_or() |= contains;
594+
inverse_matched_in_or() |= inverse_contains;
586595
}
587596
}
588597
break;
589598
case CharacterCompareType::Or:
590-
in_or = true;
599+
disjunction_stack.empend(true);
591600
break;
592-
case CharacterCompareType::EndAndOr:
601+
case CharacterCompareType::EndAndOr: {
593602
// FIXME: Handle And when we support it below.
594-
VERIFY(in_or);
595-
in_or = false;
603+
VERIFY(in_or());
604+
auto state = disjunction_stack.take_last();
596605
if (current_lhs_inversion_state()) {
597-
if (!inverse_matched_in_or)
606+
if (!state.inverse_matched_in_or)
598607
return true;
599608
} else {
600-
if (matched_in_or)
609+
if (state.matched_in_or)
601610
return true;
602611
}
603612

604613
break;
614+
}
605615
case CharacterCompareType::And:
606616
// FIXME: These are too difficult to handle, so bail out.
607617
return true;

Tests/LibRegex/TestRegex.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,9 @@ TEST_CASE(ECMA262_match)
738738
{ "((?<x>a)|(?<x>b))"sv, "aa"sv, false },
739739
// Insensitive charclasses should accept upper/lowercase in pattern (lookup table should still be ordered if insensitive lookup is used), ladybird#5399.
740740
{ "[aBc]"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
741+
// Optimizer bug: nested 'or' compare ops caused a crash, ladybird#6647.
742+
{ "([[[]]])*0"sv, ""sv, false, ECMAScriptFlags::UnicodeSets },
743+
{ "(([[[]]]{2,})\\s)*"sv, ""sv, true, (ECMAScriptFlags::UnicodeSets | ECMAScriptFlags::Global).value() },
741744
};
742745

743746
for (auto& test : tests) {

0 commit comments

Comments
 (0)