Skip to content

Commit 7c7a035

Browse files
committed
LibJS: Cache a Utf16View for the full code string in SourceCode
This avoids doing the ASCII/UTF-16 flag check every time we access character data from the lexer.
1 parent fa44fd5 commit 7c7a035

File tree

4 files changed

+40
-36
lines changed

4 files changed

+40
-36
lines changed

Libraries/LibJS/Lexer.cpp

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,8 @@ static consteval AK::Array<TokenType, 256> make_single_char_tokens_array()
225225

226226
static constexpr auto s_single_char_tokens = make_single_char_tokens_array();
227227

228-
Lexer::Lexer(NonnullRefPtr<SourceCode const> source, size_t line_number, size_t line_column)
229-
: m_source(move(source))
228+
Lexer::Lexer(NonnullRefPtr<SourceCode const> source_code, size_t line_number, size_t line_column)
229+
: m_source_code(move(source_code))
230230
, m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0)
231231
, m_line_number(line_number)
232232
, m_line_column(line_column)
@@ -280,16 +280,16 @@ Lexer::Lexer(NonnullRefPtr<SourceCode const> source, size_t line_number, size_t
280280
void Lexer::consume()
281281
{
282282
auto did_reach_eof = [this] {
283-
if (m_position < m_source->code().length_in_code_units())
283+
if (m_position < source().length_in_code_units())
284284
return false;
285285
m_eof = true;
286286
m_current_code_unit = '\0';
287-
m_position = m_source->code().length_in_code_units() + 1;
287+
m_position = source().length_in_code_units() + 1;
288288
m_line_column++;
289289
return true;
290290
};
291291

292-
if (m_position > m_source->code().length_in_code_units())
292+
if (m_position > source().length_in_code_units())
293293
return;
294294

295295
if (did_reach_eof())
@@ -315,7 +315,7 @@ void Lexer::consume()
315315
// and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
316316
// The sequence <CR><LF> is commonly used as a line terminator.
317317
// It should be considered a single SourceCharacter for the purpose of reporting line numbers.
318-
auto second_char_of_crlf = m_position > 1 && m_source->code().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
318+
auto second_char_of_crlf = m_position > 1 && source().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
319319

320320
if (!second_char_of_crlf) {
321321
m_line_number++;
@@ -325,8 +325,8 @@ void Lexer::consume()
325325
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
326326
}
327327
} else {
328-
if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source->code().length_in_code_units()) {
329-
if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source->code().code_unit_at(m_position))) {
328+
if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < source().length_in_code_units()) {
329+
if (AK::UnicodeUtils::is_utf16_low_surrogate(source().code_unit_at(m_position))) {
330330
++m_position;
331331

332332
if (did_reach_eof())
@@ -337,7 +337,7 @@ void Lexer::consume()
337337
++m_line_column;
338338
}
339339

340-
m_current_code_unit = m_source->code().code_unit_at(m_position++);
340+
m_current_code_unit = source().code_unit_at(m_position++);
341341
}
342342

343343
bool Lexer::consume_decimal_number()
@@ -412,40 +412,40 @@ bool Lexer::consume_binary_number()
412412
template<typename Callback>
413413
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
414414
{
415-
if (m_position >= m_source->code().length_in_code_units())
415+
if (m_position >= source().length_in_code_units())
416416
return false;
417417
return m_current_code_unit == '_'
418-
&& callback(m_source->code().code_unit_at(m_position));
418+
&& callback(source().code_unit_at(m_position));
419419
}
420420

421421
bool Lexer::match(char16_t a, char16_t b) const
422422
{
423-
if (m_position >= m_source->code().length_in_code_units())
423+
if (m_position >= source().length_in_code_units())
424424
return false;
425425

426426
return m_current_code_unit == a
427-
&& m_source->code().code_unit_at(m_position) == b;
427+
&& source().code_unit_at(m_position) == b;
428428
}
429429

430430
bool Lexer::match(char16_t a, char16_t b, char16_t c) const
431431
{
432-
if (m_position + 1 >= m_source->code().length_in_code_units())
432+
if (m_position + 1 >= source().length_in_code_units())
433433
return false;
434434

435435
return m_current_code_unit == a
436-
&& m_source->code().code_unit_at(m_position) == b
437-
&& m_source->code().code_unit_at(m_position + 1) == c;
436+
&& source().code_unit_at(m_position) == b
437+
&& source().code_unit_at(m_position + 1) == c;
438438
}
439439

440440
bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const
441441
{
442-
if (m_position + 2 >= m_source->code().length_in_code_units())
442+
if (m_position + 2 >= source().length_in_code_units())
443443
return false;
444444

445445
return m_current_code_unit == a
446-
&& m_source->code().code_unit_at(m_position) == b
447-
&& m_source->code().code_unit_at(m_position + 1) == c
448-
&& m_source->code().code_unit_at(m_position + 2) == d;
446+
&& source().code_unit_at(m_position) == b
447+
&& source().code_unit_at(m_position + 1) == c
448+
&& source().code_unit_at(m_position + 2) == d;
449449
}
450450

451451
bool Lexer::is_eof() const
@@ -469,7 +469,7 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const
469469
if (m_position == 0)
470470
return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
471471

472-
auto substring = m_source->code().substring_view(m_position - 1);
472+
auto substring = source().substring_view(m_position - 1);
473473
if (substring.is_empty())
474474
return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
475475

@@ -591,7 +591,7 @@ bool Lexer::is_block_comment_end() const
591591

592592
bool Lexer::is_numeric_literal_start() const
593593
{
594-
return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position)));
594+
return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < source().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position)));
595595
}
596596

597597
bool Lexer::slash_means_division() const
@@ -837,7 +837,7 @@ Token const& Lexer::next()
837837
while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) {
838838
if (m_current_code_unit == '\\') {
839839
consume();
840-
if (m_current_code_unit == '\r' && m_position < m_source->code().length_in_code_units() && m_source->code().code_unit_at(m_position) == '\n') {
840+
if (m_current_code_unit == '\r' && m_position < source().length_in_code_units() && source().code_unit_at(m_position) == '\n') {
841841
consume();
842842
}
843843
}
@@ -872,8 +872,8 @@ Token const& Lexer::next()
872872
consume();
873873
}
874874

875-
if (!found_token && m_position + 1 < m_source->code().length_in_code_units()) {
876-
auto three_chars_view = m_source->code().substring_view(m_position - 1, 3);
875+
if (!found_token && m_position + 1 < source().length_in_code_units()) {
876+
auto three_chars_view = source().substring_view(m_position - 1, 3);
877877
if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) {
878878
found_token = true;
879879
token_type = type;
@@ -883,11 +883,11 @@ Token const& Lexer::next()
883883
}
884884
}
885885

886-
if (!found_token && m_position < m_source->code().length_in_code_units()) {
887-
auto two_chars_view = m_source->code().substring_view(m_position - 1, 2);
886+
if (!found_token && m_position < source().length_in_code_units()) {
887+
auto two_chars_view = source().substring_view(m_position - 1, 2);
888888
if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) {
889889
// OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
890-
if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position + 1)))) {
890+
if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < source().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position + 1)))) {
891891
found_token = true;
892892
token_type = type;
893893
consume();
@@ -921,8 +921,8 @@ Token const& Lexer::next()
921921
m_current_token = Token(
922922
token_type,
923923
token_message,
924-
m_source->code().substring_view(trivia_start - 1, value_start - trivia_start),
925-
m_source->code().substring_view(value_start - 1, m_position - value_start),
924+
source().substring_view(trivia_start - 1, value_start - trivia_start),
925+
source().substring_view(value_start - 1, m_position - value_start),
926926
value_start_line_number,
927927
value_start_column_number,
928928
value_start - 1);
@@ -952,7 +952,7 @@ Token const& Lexer::force_slash_as_regex()
952952
size_t value_start = m_position - 1;
953953

954954
if (has_equals) {
955-
VERIFY(m_source->code().code_unit_at(value_start - 1) == '=');
955+
VERIFY(source().code_unit_at(value_start - 1) == '=');
956956
--value_start;
957957
--m_position;
958958
m_current_code_unit = '=';
@@ -964,7 +964,7 @@ Token const& Lexer::force_slash_as_regex()
964964
token_type,
965965
Token::Message::None,
966966
m_current_token.trivia(),
967-
m_source->code().substring_view(value_start - 1, m_position - value_start),
967+
source().substring_view(value_start - 1, m_position - value_start),
968968
m_current_token.line_number(),
969969
m_current_token.line_column(),
970970
value_start - 1);

Libraries/LibJS/Lexer.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@ class JS_API Lexer {
2525

2626
[[nodiscard]] Token const& current_token() const { return m_current_token; }
2727

28-
SourceCode const& source_code() const { return m_source; }
29-
Utf16String const& source() const { return m_source->code(); }
30-
String const& filename() const { return m_source->filename(); }
28+
SourceCode const& source_code() const { return m_source_code; }
29+
Utf16View const& source() const { return m_source_code->code_view(); }
30+
Utf16String const& source_string() const { return m_source_code->code(); }
31+
String const& filename() const { return m_source_code->filename(); }
3132

3233
void disallow_html_comments() { m_allow_html_comments = false; }
3334

@@ -60,7 +61,7 @@ class JS_API Lexer {
6061

6162
TokenType consume_regex_literal();
6263

63-
NonnullRefPtr<SourceCode const> m_source;
64+
NonnullRefPtr<SourceCode const> m_source_code;
6465
size_t m_position { 0 };
6566
Token m_current_token;
6667
char16_t m_current_code_unit { 0 };

Libraries/LibJS/SourceCode.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ NonnullRefPtr<SourceCode const> SourceCode::create(String filename, Utf16String
2020
SourceCode::SourceCode(String filename, Utf16String code)
2121
: m_filename(move(filename))
2222
, m_code(move(code))
23+
, m_code_view(m_code.utf16_view())
2324
{
2425
}
2526

Libraries/LibJS/SourceCode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class JS_API SourceCode : public RefCounted<SourceCode> {
2121

2222
String const& filename() const { return m_filename; }
2323
Utf16String const& code() const { return m_code; }
24+
Utf16View const& code_view() const { return m_code_view; }
2425

2526
SourceRange range_from_offsets(u32 start_offset, u32 end_offset) const;
2627

@@ -29,6 +30,7 @@ class JS_API SourceCode : public RefCounted<SourceCode> {
2930

3031
String m_filename;
3132
Utf16String m_code;
33+
Utf16View m_code_view;
3234

3335
// For fast mapping of offsets to line/column numbers, we build a list of
3436
// starting points (with byte offsets into the source string) and which

0 commit comments

Comments
 (0)