Skip to content

Commit 0dacc94

Browse files
committed
LibJS: Have JS::Lexer take a JS::SourceCode as input
This moves the responsibility of setting up a SourceCode object to the users of JS::Lexer. This means Lexer and Parser are free to use string views into the SourceCode internally while working. It also means Lexer no longer has to handle anything other than UTF-16 (or ASCII) input, since callers now convert their source text to UTF-16 before constructing the SourceCode; as a result, the unit test for parsing various invalid UTF-8 sequences is deleted here.
1 parent 9ca25e5 commit 0dacc94

File tree

16 files changed

+59
-227
lines changed

16 files changed

+59
-227
lines changed

Libraries/LibJS/Lexer.cpp

Lines changed: 33 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ static constexpr TokenType parse_three_char_token(Utf16View const& view)
192192
}
193193
}
194194

195-
static consteval Array<TokenType, 256> make_single_char_tokens_array()
195+
static consteval AK::Array<TokenType, 256> make_single_char_tokens_array()
196196
{
197-
Array<TokenType, 256> array;
197+
AK::Array<TokenType, 256> array;
198198
array.fill(TokenType::Invalid);
199199
array['&'] = TokenType::Ampersand;
200200
array['*'] = TokenType::Asterisk;
@@ -225,33 +225,9 @@ static consteval Array<TokenType, 256> make_single_char_tokens_array()
225225

226226
static constexpr auto s_single_char_tokens = make_single_char_tokens_array();
227227

228-
static Utf16String create_utf16_string_from_possibly_invalid_utf8_string(StringView source)
229-
{
230-
Utf8View utf8_source { source };
231-
if (utf8_source.validate()) [[likely]]
232-
return Utf16String::from_utf8_without_validation(source);
233-
234-
StringBuilder builder(StringBuilder::Mode::UTF16);
235-
236-
for (auto code_point : utf8_source) {
237-
builder.append_code_point(code_point);
238-
if (code_point == AK::UnicodeUtils::REPLACEMENT_CODE_POINT)
239-
break;
240-
}
241-
242-
return builder.to_utf16_string();
243-
}
244-
245-
Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
246-
: Lexer(create_utf16_string_from_possibly_invalid_utf8_string(source), filename, line_number, line_column)
247-
{
248-
// FIXME: Remove this API once all callers are ported to UTF-16.
249-
}
250-
251-
Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t line_column)
228+
Lexer::Lexer(NonnullRefPtr<SourceCode const> source, size_t line_number, size_t line_column)
252229
: m_source(move(source))
253230
, m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0)
254-
, m_filename(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors())
255231
, m_line_number(line_number)
256232
, m_line_column(line_column)
257233
{
@@ -304,16 +280,16 @@ Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t
304280
void Lexer::consume()
305281
{
306282
auto did_reach_eof = [this] {
307-
if (m_position < m_source.length_in_code_units())
283+
if (m_position < m_source->code().length_in_code_units())
308284
return false;
309285
m_eof = true;
310286
m_current_code_unit = '\0';
311-
m_position = m_source.length_in_code_units() + 1;
287+
m_position = m_source->code().length_in_code_units() + 1;
312288
m_line_column++;
313289
return true;
314290
};
315291

316-
if (m_position > m_source.length_in_code_units())
292+
if (m_position > m_source->code().length_in_code_units())
317293
return;
318294

319295
if (did_reach_eof())
@@ -339,7 +315,7 @@ void Lexer::consume()
339315
// and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
340316
// The sequence <CR><LF> is commonly used as a line terminator.
341317
// It should be considered a single SourceCharacter for the purpose of reporting line numbers.
342-
auto second_char_of_crlf = m_position > 1 && m_source.code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
318+
auto second_char_of_crlf = m_position > 1 && m_source->code().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
343319

344320
if (!second_char_of_crlf) {
345321
m_line_number++;
@@ -349,8 +325,8 @@ void Lexer::consume()
349325
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
350326
}
351327
} else {
352-
if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source.length_in_code_units()) {
353-
if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source.code_unit_at(m_position))) {
328+
if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source->code().length_in_code_units()) {
329+
if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source->code().code_unit_at(m_position))) {
354330
++m_position;
355331

356332
if (did_reach_eof())
@@ -361,7 +337,7 @@ void Lexer::consume()
361337
++m_line_column;
362338
}
363339

364-
m_current_code_unit = m_source.code_unit_at(m_position++);
340+
m_current_code_unit = m_source->code().code_unit_at(m_position++);
365341
}
366342

367343
bool Lexer::consume_decimal_number()
@@ -436,40 +412,40 @@ bool Lexer::consume_binary_number()
436412
template<typename Callback>
437413
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
438414
{
439-
if (m_position >= m_source.length_in_code_units())
415+
if (m_position >= m_source->code().length_in_code_units())
440416
return false;
441417
return m_current_code_unit == '_'
442-
&& callback(m_source.code_unit_at(m_position));
418+
&& callback(m_source->code().code_unit_at(m_position));
443419
}
444420

445421
bool Lexer::match(char16_t a, char16_t b) const
446422
{
447-
if (m_position >= m_source.length_in_code_units())
423+
if (m_position >= m_source->code().length_in_code_units())
448424
return false;
449425

450426
return m_current_code_unit == a
451-
&& m_source.code_unit_at(m_position) == b;
427+
&& m_source->code().code_unit_at(m_position) == b;
452428
}
453429

454430
bool Lexer::match(char16_t a, char16_t b, char16_t c) const
455431
{
456-
if (m_position + 1 >= m_source.length_in_code_units())
432+
if (m_position + 1 >= m_source->code().length_in_code_units())
457433
return false;
458434

459435
return m_current_code_unit == a
460-
&& m_source.code_unit_at(m_position) == b
461-
&& m_source.code_unit_at(m_position + 1) == c;
436+
&& m_source->code().code_unit_at(m_position) == b
437+
&& m_source->code().code_unit_at(m_position + 1) == c;
462438
}
463439

464440
bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const
465441
{
466-
if (m_position + 2 >= m_source.length_in_code_units())
442+
if (m_position + 2 >= m_source->code().length_in_code_units())
467443
return false;
468444

469445
return m_current_code_unit == a
470-
&& m_source.code_unit_at(m_position) == b
471-
&& m_source.code_unit_at(m_position + 1) == c
472-
&& m_source.code_unit_at(m_position + 2) == d;
446+
&& m_source->code().code_unit_at(m_position) == b
447+
&& m_source->code().code_unit_at(m_position + 1) == c
448+
&& m_source->code().code_unit_at(m_position + 2) == d;
473449
}
474450

475451
bool Lexer::is_eof() const
@@ -493,7 +469,7 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const
493469
if (m_position == 0)
494470
return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
495471

496-
auto substring = m_source.substring_view(m_position - 1);
472+
auto substring = m_source->code().substring_view(m_position - 1);
497473
if (substring.is_empty())
498474
return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
499475

@@ -615,7 +591,7 @@ bool Lexer::is_block_comment_end() const
615591

616592
bool Lexer::is_numeric_literal_start() const
617593
{
618-
return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position)));
594+
return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position)));
619595
}
620596

621597
bool Lexer::slash_means_division() const
@@ -861,7 +837,7 @@ Token const& Lexer::next()
861837
while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) {
862838
if (m_current_code_unit == '\\') {
863839
consume();
864-
if (m_current_code_unit == '\r' && m_position < m_source.length_in_code_units() && m_source.code_unit_at(m_position) == '\n') {
840+
if (m_current_code_unit == '\r' && m_position < m_source->code().length_in_code_units() && m_source->code().code_unit_at(m_position) == '\n') {
865841
consume();
866842
}
867843
}
@@ -896,8 +872,8 @@ Token const& Lexer::next()
896872
consume();
897873
}
898874

899-
if (!found_token && m_position + 1 < m_source.length_in_code_units()) {
900-
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
875+
if (!found_token && m_position + 1 < m_source->code().length_in_code_units()) {
876+
auto three_chars_view = m_source->code().substring_view(m_position - 1, 3);
901877
if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) {
902878
found_token = true;
903879
token_type = type;
@@ -907,11 +883,11 @@ Token const& Lexer::next()
907883
}
908884
}
909885

910-
if (!found_token && m_position < m_source.length_in_code_units()) {
911-
auto two_chars_view = m_source.substring_view(m_position - 1, 2);
886+
if (!found_token && m_position < m_source->code().length_in_code_units()) {
887+
auto two_chars_view = m_source->code().substring_view(m_position - 1, 2);
912888
if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) {
913889
// OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
914-
if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position + 1)))) {
890+
if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position + 1)))) {
915891
found_token = true;
916892
token_type = type;
917893
consume();
@@ -945,8 +921,8 @@ Token const& Lexer::next()
945921
m_current_token = Token(
946922
token_type,
947923
token_message,
948-
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
949-
m_source.substring_view(value_start - 1, m_position - value_start),
924+
m_source->code().substring_view(trivia_start - 1, value_start - trivia_start),
925+
m_source->code().substring_view(value_start - 1, m_position - value_start),
950926
value_start_line_number,
951927
value_start_column_number,
952928
value_start - 1);
@@ -976,7 +952,7 @@ Token const& Lexer::force_slash_as_regex()
976952
size_t value_start = m_position - 1;
977953

978954
if (has_equals) {
979-
VERIFY(m_source.code_unit_at(value_start - 1) == '=');
955+
VERIFY(m_source->code().code_unit_at(value_start - 1) == '=');
980956
--value_start;
981957
--m_position;
982958
m_current_code_unit = '=';
@@ -988,7 +964,7 @@ Token const& Lexer::force_slash_as_regex()
988964
token_type,
989965
Token::Message::None,
990966
m_current_token.trivia(),
991-
m_source.substring_view(value_start - 1, m_position - value_start),
967+
m_source->code().substring_view(value_start - 1, m_position - value_start),
992968
m_current_token.line_number(),
993969
m_current_token.line_column(),
994970
value_start - 1);

Libraries/LibJS/Lexer.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,33 @@
11
/*
22
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
3+
* Copyright (c) 2020-2025, Andreas Kling <andreas@ladybird.org>
34
*
45
* SPDX-License-Identifier: BSD-2-Clause
56
*/
67

78
#pragma once
89

910
#include <AK/HashMap.h>
10-
#include <AK/StringView.h>
1111
#include <AK/Utf16String.h>
1212
#include <LibJS/Export.h>
13+
#include <LibJS/SourceCode.h>
1314
#include <LibJS/Token.h>
1415

1516
namespace JS {
1617

1718
class JS_API Lexer {
1819
public:
19-
explicit Lexer(StringView source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
20-
explicit Lexer(Utf16String source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
20+
explicit Lexer(NonnullRefPtr<SourceCode const>, size_t line_number = 1, size_t line_column = 0);
2121

2222
// These both advance the lexer and return a reference to the current token.
2323
Token const& next();
2424
Token const& force_slash_as_regex();
2525

2626
[[nodiscard]] Token const& current_token() const { return m_current_token; }
2727

28-
Utf16String const& source() const { return m_source; }
29-
String const& filename() const { return m_filename; }
28+
SourceCode const& source_code() const { return m_source; }
29+
Utf16String const& source() const { return m_source->code(); }
30+
String const& filename() const { return m_source->filename(); }
3031

3132
void disallow_html_comments() { m_allow_html_comments = false; }
3233

@@ -59,15 +60,14 @@ class JS_API Lexer {
5960

6061
TokenType consume_regex_literal();
6162

62-
Utf16String m_source;
63+
NonnullRefPtr<SourceCode const> m_source;
6364
size_t m_position { 0 };
6465
Token m_current_token;
6566
char16_t m_current_code_unit { 0 };
6667
bool m_eof { false };
6768
bool m_regex_is_in_character_class { false };
6869
bool m_allow_html_comments { true };
6970

70-
String m_filename;
7171
size_t m_line_number { 1 };
7272
size_t m_line_column { 0 };
7373

Libraries/LibJS/Parser.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,7 @@ Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
688688
}
689689

690690
Parser::Parser(Lexer lexer, Program::Type program_type, Optional<EvalInitialState> initial_state_for_eval)
691-
: m_source_code(SourceCode::create(lexer.filename(), lexer.source()))
691+
: m_source_code(lexer.source_code())
692692
, m_state(move(lexer), program_type)
693693
, m_program_type(program_type)
694694
{
@@ -2596,7 +2596,7 @@ RefPtr<BindingPattern const> Parser::synthesize_binding_pattern(Expression const
25962596
auto source_end_offset = expression.source_range().end.offset;
25972597
auto source = m_state.lexer.source().substring_view(source_start_offset, source_end_offset - source_start_offset);
25982598

2599-
Lexer lexer { Utf16String::from_utf16(source), m_state.lexer.filename(), expression.source_range().start.line, expression.source_range().start.column };
2599+
Lexer lexer(SourceCode::create(m_state.lexer.filename(), Utf16String::from_utf16(source)), expression.source_range().start.line, expression.source_range().start.column);
26002600
Parser parser { lexer };
26012601

26022602
parser.m_state.current_scope_pusher = m_state.current_scope_pusher;
@@ -5233,7 +5233,7 @@ Parser Parser::parse_function_body_from_string(ByteString const& body_string, u1
52335233
{
52345234
RefPtr<FunctionBody const> function_body;
52355235

5236-
auto body_parser = Parser { Lexer { body_string } };
5236+
auto body_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(body_string))));
52375237
{
52385238
// Set up some parser state to accept things like return await, and yield in the plain function body.
52395239
body_parser.m_state.in_function_context = true;

Libraries/LibJS/Runtime/AbstractOperations.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ ThrowCompletionOr<Value> perform_eval(VM& vm, Value x, CallerMode strict_caller,
623623
.in_class_field_initializer = in_class_field_initializer,
624624
};
625625

626-
Parser parser { Lexer { code_string->utf8_string_view() }, Program::Type::Script, move(initial_state) };
626+
Parser parser(Lexer(SourceCode::create({}, code_string->utf16_string())), Program::Type::Script, move(initial_state));
627627
auto program = parser.parse_program(strict_caller == CallerMode::Strict);
628628

629629
// b. If script is a List of errors, throw a SyntaxError exception.

Libraries/LibJS/Runtime/FunctionConstructor.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create
156156

157157
// 17. Let parameters be ParseText(P, parameterSym).
158158
i32 function_length = 0;
159-
auto parameters_parser = Parser { Lexer { parameters_string } };
159+
auto parameters_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(parameters_string))));
160160
auto parameters = parameters_parser.parse_formal_parameters(function_length, parse_options);
161161

162162
// 18. If parameters is a List of errors, throw a SyntaxError exception.
@@ -179,7 +179,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create
179179
// 22. NOTE: If this step is reached, sourceText must have the syntax of exprSym (although the reverse implication does not hold). The purpose of the next two steps is to enforce any Early Error rules which apply to exprSym directly.
180180

181181
// 23. Let expr be ParseText(sourceText, exprSym).
182-
auto source_parser = Parser { Lexer { source_text } };
182+
auto source_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(source_text))));
183183
// This doesn't need any parse_options, it determines those & the function type based on the tokens that were found.
184184
auto expr = source_parser.parse_function_node<FunctionExpression>();
185185

Libraries/LibJS/Runtime/ShadowRealm.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ ThrowCompletionOr<Value> perform_shadow_realm_eval(VM& vm, Value source, Realm&
123123
// 2. Perform the following substeps in an implementation-defined order, possibly interleaving parsing and error detection:
124124

125125
// a. Let script be ParseText(StringToCodePoints(sourceText), Script).
126-
auto parser = Parser(Lexer(source_text->utf8_string_view()), Program::Type::Script, Parser::EvalInitialState {});
126+
auto parser = Parser(Lexer(SourceCode::create({}, source_text->utf16_string())), Program::Type::Script, Parser::EvalInitialState {});
127127
auto program = parser.parse_program();
128128

129129
// b. If script is a List of errors, throw a SyntaxError exception.

Libraries/LibJS/Script.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ GC_DEFINE_ALLOCATOR(Script);
1818
Result<GC::Ref<Script>, Vector<ParserError>> Script::parse(StringView source_text, Realm& realm, StringView filename, HostDefined* host_defined, size_t line_number_offset)
1919
{
2020
// 1. Let script be ParseText(sourceText, Script).
21-
auto parser = Parser(Lexer(source_text, filename, line_number_offset));
21+
auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text)), line_number_offset));
2222
auto script = parser.parse_program();
2323

2424
// 2. If script is a List of errors, return body.

Libraries/LibJS/SourceTextModule.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ void SourceTextModule::visit_edges(Cell::Visitor& visitor)
132132
Result<GC::Ref<SourceTextModule>, Vector<ParserError>> SourceTextModule::parse(StringView source_text, Realm& realm, StringView filename, Script::HostDefined* host_defined)
133133
{
134134
// 1. Let body be ParseText(sourceText, Module).
135-
auto parser = Parser(Lexer(source_text, filename), Program::Type::Module);
135+
auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text))), Program::Type::Module);
136136
auto body = parser.parse_program();
137137

138138
// 2. If body is a List of errors, return body.

Libraries/LibJS/SyntaxHighlighter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
5252
{
5353
auto text = m_client->get_text();
5454

55-
Lexer lexer(text);
55+
Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text)));
5656

5757
Vector<Syntax::TextDocumentSpan> spans;
5858
Vector<Syntax::TextDocumentFoldingRegion> folding_regions;

Libraries/LibWeb/DOM/EventTarget.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ WebIDL::CallbackType* EventTarget::get_current_value_of_event_handler(FlyString
444444

445445
auto source_text = builder.to_byte_string();
446446

447-
auto parser = JS::Parser(JS::Lexer(source_text));
447+
auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text))));
448448

449449
// FIXME: This should only be parsing the `body` instead of `source_text` and therefore use `JS::FunctionBody` instead of `JS::FunctionExpression`.
450450
// However, JS::ECMAScriptFunctionObject::create wants parameters and length and JS::FunctionBody does not inherit JS::FunctionNode.

0 commit comments

Comments
 (0)