Skip to content

Commit

Permalink
refactor(parser): reduce work parsing regexps (oxc-project#1999)
Browse files Browse the repository at this point in the history
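oxc-project#1926 introduced a small performance regression: when parsing a
regexp, some work was repeated, because the parser re-scanned the token text
to split out the pattern and flags that the lexer had already identified.
The lexer now returns the pattern end offset and the flags directly.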
  • Loading branch information
overlookmotel authored and IWANABETHATGUY committed May 29, 2024
1 parent 43285d2 commit 1af92cf
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 29 deletions.
7 changes: 5 additions & 2 deletions crates/oxc_parser/src/cursor.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Code related to navigating `Token`s from the lexer

use oxc_ast::ast::RegExpFlags;
use oxc_diagnostics::Result;
use oxc_span::Span;

Expand Down Expand Up @@ -200,8 +201,10 @@ impl<'a> Parser<'a> {
}

/// Tell lexer to read a regex
pub(crate) fn read_regex(&mut self) {
self.token = self.lexer.next_regex(self.cur_kind());
pub(crate) fn read_regex(&mut self) -> (u32, RegExpFlags) {
let (token, pattern_end, flags) = self.lexer.next_regex(self.cur_kind());
self.token = token;
(pattern_end, flags)
}

/// Tell lexer to read a template substitution tail
Expand Down
21 changes: 4 additions & 17 deletions crates/oxc_parser/src/js/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ impl<'a> Parser<'a> {
}
Kind::LParen => self.parse_parenthesized_expression(span),
Kind::Slash | Kind::SlashEq => {
self.read_regex();
let literal = self.parse_literal_regexp();
Ok(self.ast.literal_regexp_expression(literal))
}
Expand Down Expand Up @@ -320,22 +319,10 @@ impl<'a> Parser<'a> {
pub(crate) fn parse_literal_regexp(&mut self) -> RegExpLiteral {
let span = self.start_span();

// split out the flag part of `/regex/flag` by looking for `/` from the end
let regex_src = self.cur_src();
let mut flags = RegExpFlags::empty();

let mut split_index = None;
for (i, c) in regex_src.char_indices().rev() {
if let Ok(flag) = RegExpFlags::try_from(c) {
flags |= flag;
} else {
split_index.replace(i);
break;
}
}

// `/` are omitted from the pattern
let pattern = split_index.map_or(regex_src, |i| regex_src.get(1..i).unwrap_or(""));
// split out pattern
let (pattern_end, flags) = self.read_regex();
let pattern_start = self.cur_token().start + 1; // +1 to exclude `/`
let pattern = &self.source_text[pattern_start as usize..pattern_end as usize];

self.bump_any();
self.ast.reg_exp_literal(self.end_span(span), pattern, flags)
Expand Down
24 changes: 14 additions & 10 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,16 +192,17 @@ impl<'a> Lexer<'a> {
/// where a `RegularExpressionLiteral` is permitted
/// Which means the parser needs to re-tokenize on `PrimaryExpression`,
/// `RegularExpressionLiteral` only appears on the right-hand side of `PrimaryExpression`
pub fn next_regex(&mut self, kind: Kind) -> Token {
pub fn next_regex(&mut self, kind: Kind) -> (Token, u32, RegExpFlags) {
self.current.token.start = self.offset()
- match kind {
Kind::Slash => 1,
Kind::SlashEq => 2,
_ => unreachable!(),
};
let kind = self.read_regex();
let (pattern_end, flags) = self.read_regex();
self.lookahead.clear();
self.finish_next(kind)
let token = self.finish_next(Kind::RegExp);
(token, pattern_end, flags)
}

pub fn next_right_angle(&mut self) -> Token {
Expand Down Expand Up @@ -828,18 +829,20 @@ impl<'a> Lexer<'a> {
}

/// 12.9.5 Regular Expression Literals
fn read_regex(&mut self) -> Kind {
fn read_regex(&mut self) -> (u32, RegExpFlags) {
let mut in_escape = false;
let mut in_character_class = false;
loop {
match self.current.chars.next() {
None => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
return Kind::Undetermined;
return (self.offset(), RegExpFlags::empty());
}
Some(c) if is_line_terminator(c) => {
self.error(diagnostics::UnterminatedRegExp(self.unterminated_range()));
return Kind::Undetermined;
#[allow(clippy::cast_possible_truncation)]
let pattern_end = self.offset() - c.len_utf8() as u32;
return (pattern_end, RegExpFlags::empty());
}
Some(c) => {
if in_escape {
Expand All @@ -857,28 +860,29 @@ impl<'a> Lexer<'a> {
}
}

let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty();

while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
self.current.chars.next();
if !ch.is_ascii_lowercase() {
self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
return Kind::Undetermined;
break;
}
let flag = if let Ok(flag) = RegExpFlags::try_from(ch) {
flag
} else {
self.error(diagnostics::RegExpFlag(ch, self.current_offset()));
return Kind::Undetermined;
break;
};
if flags.contains(flag) {
self.error(diagnostics::RegExpFlagTwice(ch, self.current_offset()));
return Kind::Undetermined;
break;
}
flags |= flag;
}

Kind::RegExp
(pattern_end, flags)
}

/// 12.8.6 Template Literal Lexical Components
Expand Down

0 comments on commit 1af92cf

Please sign in to comment.