Skip to content

Commit 5632a52

Browse files
aplefull authored and alimpfard committed
LibRegex: Properly track code units in u-v modes
Previously, both string_position and view_index used code unit offsets regardless of mode. Now, in Unicode mode (the u and v flags), these variables track code point positions, while string_position_in_code_units is properly updated to reflect code unit offsets.
1 parent fb25863 commit 5632a52

File tree

2 files changed

+51
-3
lines changed

2 files changed

+51
-3
lines changed

Libraries/LibJS/Tests/builtins/RegExp/RegExp.js

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,3 +156,37 @@ test("Unicode properties of strings", () => {
156156
expect(re.test(str)).toBeFalse();
157157
}
158158
});
159+
160+
test("Unicode matching with u and v flags", () => {
161+
const text = "𠮷a𠮷b𠮷";
162+
const complexText = "a\u{20BB7}b\u{10FFFF}c";
163+
164+
const cases = [
165+
{ pattern: /𠮷/, match: text, expected: ["𠮷"] },
166+
{ pattern: /𠮷/u, match: text, expected: ["𠮷"] },
167+
{ pattern: /𠮷/v, match: text, expected: ["𠮷"] },
168+
{ pattern: /\p{Script=Han}/u, match: text, expected: ["𠮷"] },
169+
{ pattern: /\p{Script=Han}/v, match: text, expected: ["𠮷"] },
170+
{ pattern: /./u, match: text, expected: ["𠮷"] },
171+
{ pattern: /./v, match: text, expected: ["𠮷"] },
172+
{ pattern: /\p{ASCII}/u, match: text, expected: ["a"] },
173+
{ pattern: /\p{ASCII}/v, match: text, expected: ["a"] },
174+
{ pattern: /x/u, match: text, expected: null },
175+
{ pattern: /x/v, match: text, expected: null },
176+
{ pattern: /\p{Script=Han}(.)/gu, match: text, expected: ["𠮷a", "𠮷b"] },
177+
{ pattern: /\p{Script=Han}(.)/gv, match: text, expected: ["𠮷a", "𠮷b"] },
178+
{ pattern: /\P{ASCII}/u, match: complexText, expected: ["\u{20BB7}"] },
179+
{ pattern: /\P{ASCII}/v, match: complexText, expected: ["\u{20BB7}"] },
180+
{ pattern: /\P{ASCII}/gu, match: complexText, expected: ["\u{20BB7}", "\u{10FFFF}"] },
181+
{ pattern: /\P{ASCII}/gv, match: complexText, expected: ["\u{20BB7}", "\u{10FFFF}"] },
182+
{ pattern: /./gu, match: text, expected: ["𠮷", "a", "𠮷", "b", "𠮷"] },
183+
{ pattern: /./gv, match: text, expected: ["𠮷", "a", "𠮷", "b", "𠮷"] },
184+
{ pattern: /(?:)/gu, match: text, expected: ["", "", "", "", "", ""] },
185+
{ pattern: /(?:)/gv, match: text, expected: ["", "", "", "", "", ""] },
186+
];
187+
188+
for (const test of cases) {
189+
const result = test.match.match(test.pattern);
190+
expect(result).toEqual(test.expected);
191+
}
192+
});

Libraries/LibRegex/RegexMatcher.cpp

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -237,10 +237,17 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
237237
input.view = view;
238238
dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
239239

240-
auto view_length = view.length_in_code_units();
240+
auto view_length = view.length();
241241
size_t view_index = m_pattern->start_offset;
242242
state.string_position = view_index;
243-
state.string_position_in_code_units = view_index;
243+
if (view.unicode()) {
244+
if (view_index < view_length)
245+
state.string_position_in_code_units = view.code_unit_offset_of(view_index);
246+
else
247+
state.string_position_in_code_units = view.length_in_code_units();
248+
} else {
249+
state.string_position_in_code_units = view_index;
250+
}
244251
bool succeeded = false;
245252

246253
if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
@@ -303,7 +310,14 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
303310
input.match_index = match_count;
304311

305312
state.string_position = view_index;
306-
state.string_position_in_code_units = view_index;
313+
if (input.view.unicode()) {
314+
if (view_index < view_length)
315+
state.string_position_in_code_units = input.view.code_unit_offset_of(view_index);
316+
else
317+
state.string_position_in_code_units = input.view.length_in_code_units();
318+
} else {
319+
state.string_position_in_code_units = view_index;
320+
}
307321
state.instruction_position = 0;
308322
state.repetition_marks.clear();
309323

0 commit comments

Comments
 (0)