Skip to content

Commit 04920d0

Browse files
yynyawesomekling
authored andcommitted
AK: Use simdutf when appending UTF-16 to StringBuilder
Adds a fast path for valid UTF-16 using `simdutf`, and falls back to the slow path for unmatched surrogates.
1 parent ff6020c commit 04920d0

File tree

4 files changed

+122
-19
lines changed

4 files changed

+122
-19
lines changed

AK/ByteBuffer.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,17 @@ class ByteBuffer {
202202
MUST(try_ensure_capacity(new_capacity));
203203
}
204204

205+
void set_size(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
206+
{
207+
ASSERT(new_size <= capacity());
208+
209+
if (zero_fill_new_elements == ZeroFillNewElements::Yes) {
210+
__builtin_memset(data() + m_size, 0, new_size - m_size);
211+
}
212+
213+
m_size = new_size;
214+
}
215+
205216
ErrorOr<void> try_resize(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
206217
{
207218
if (new_size <= m_size) {
@@ -210,11 +221,8 @@ class ByteBuffer {
210221
}
211222
TRY(try_ensure_capacity(new_size));
212223

213-
if (zero_fill_new_elements == ZeroFillNewElements::Yes) {
214-
__builtin_memset(data() + m_size, 0, new_size - m_size);
215-
}
224+
set_size(new_size, zero_fill_new_elements);
216225

217-
m_size = new_size;
218226
return {};
219227
}
220228

AK/StringBuilder.cpp

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
#include <AK/Utf16View.h>
1818
#include <AK/Utf32View.h>
1919

20+
#include <simdutf.h>
21+
2022
namespace AK {
2123

2224
static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
@@ -224,24 +226,69 @@ void StringBuilder::append_code_point(u32 code_point)
224226

225227
// Appends `utf16_view` to the builder, transcoding it to UTF-8.
// Well-formed runs are converted in bulk by simdutf. Each unmatched surrogate code unit is written
// by hand as a 3-byte sequence, after which the bulk conversion resumes on the rest of the view.
// Returns an error only if reserving buffer space through will_append() fails.
ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
{
    if (utf16_view.is_empty())
        return {};

    auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span());

    // Possibly over-allocate a little to ensure we don't have to allocate later.
    // Everything below writes directly into this reserved space and only bumps the size afterwards.
    TRY(will_append(maximum_utf8_length));

    Utf16View remaining_view = utf16_view;
    for (;;) {
        // Recomputed every iteration, because the buffer size grows at the end of each slow-path round.
        auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());

        // Fast path.
        auto result = [&]() {
            switch (remaining_view.endianness()) {
            case Endianness::Host:
                return simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            case Endianness::Big:
                return simdutf::convert_utf16be_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            case Endianness::Little:
                return simdutf::convert_utf16le_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            }
            VERIFY_NOT_REACHED();
        }();
        if (result.error == simdutf::SUCCESS) {
            // On success, `count` is used as the number of UTF-8 bytes produced.
            auto bytes_just_written = result.count;
            m_buffer.set_size(m_buffer.size() + bytes_just_written);
            break;
        }

        // Slow path. Found unmatched surrogate code unit.
        // On failure, `count` is used as the index of the offending code unit.
        auto first_invalid_code_unit = result.count;
        ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units());

        // Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves.
        auto bytes_just_written = [&]() {
            switch (remaining_view.endianness()) {
            case Endianness::Host:
                return simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit);
            case Endianness::Big:
                return simdutf::utf8_length_from_utf16be(remaining_view.char_data(), first_invalid_code_unit);
            case Endianness::Little:
                return simdutf::utf8_length_from_utf16le(remaining_view.char_data(), first_invalid_code_unit);
            }
            VERIFY_NOT_REACHED();
        }();

        do {
            auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++);

            // Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes.
            ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF);
            // The three bytes written below must fit in the reserved space; filling it exactly is fine.
            ASSERT(m_buffer.size() + bytes_just_written + 3 <= m_buffer.capacity());
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
            // Keep consuming directly following low surrogates. The next code unit is read through the
            // same accessor as above so that both reads of a position go through one code path.
        } while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.code_unit_at(first_invalid_code_unit)));

        // Code unit might no longer be invalid, retry on the remaining data.
        m_buffer.set_size(m_buffer.size() + bytes_just_written);
        remaining_view = remaining_view.substring_view(first_invalid_code_unit);
    }

    return {};
}
247294

AK/UnicodeUtils.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,50 @@ template<FallibleFunction<char> Callback>
7878
return -1;
7979
}
8080

81+
/**
82+
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
83+
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+100000 and U+10FFFF.
84+
*/
85+
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
86+
{
87+
// # UTF-8 code point -> no. UTF-8 bytes needed
88+
// U+0000 - U+007F => 1 UTF-8 bytes
89+
// U+0080 - U+07FF => 2 UTF-8 bytes
90+
// U+0800 - U+FFFF => 3 UTF-8 bytes
91+
// U+010000 - U+10FFFF => 4 UTF-8 bytes
92+
93+
// # UTF-16 code unit -> no. UTF-8 bytes needed
94+
// 0x0000 - 0x007f [U+000000 - U+00007F] = 1 UTF-8 bytes
95+
// 0x0080 - 0x07ff [U+000080 - U+0007FF] = 2 UTF-8 bytes
96+
// 0x0800 - 0xd7ff [U+000800 - U+00FFFF] = 3 UTF-8 bytes
97+
// 0xd800 - 0xdbff [U+010000 - U+10FFFF] = 4 UTF-8 bytes to encode valid UTF-16 code units,
98+
// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
99+
// 0xdc00 - 0xdfff [U+010000 - U+10FFFF] = 0 UTF-8 bytes to encode valid UTF-16 code units (because it is already accounted for in 0xdc00 - 0xdfff),
100+
// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
101+
// 0xe000 - 0xffff [U+00E000 - U+00FFFF] = 3 UTF-8 bytes
102+
103+
// # UTF-16 code unit -> actual length added.
104+
// 0x0000 - 0x007f = 1
105+
// 0x0080 - 0x07ff = 2
106+
// 0x0800 - 0xd7ff = 3
107+
// 0xd800 - 0xdbff = 3
108+
// ^ If the next code unit is 0xdc00 - 0xdfff, they will combined sum to 6, which is greater than the 4 required.
109+
// Otherwise, 3 bytes are needed to encode U+D800 - U+DBFF.
110+
// 0xdc00 - 0xdfff = 3
111+
// ^ If the previous code unit was, 0xd800 - 0xdbff, this will ensure that the combined sum is greater than 4.
112+
// Otherwise, 3 bytes are needed to encode U+DC00 - U+DFFF.
113+
// 0xe000 - 0xffff = 3
114+
115+
size_t maximum_utf8_length = 0;
116+
117+
// NOTE: This loop is designed to be easy to vectorize.
118+
for (auto code_unit : code_units) {
119+
maximum_utf8_length += 1;
120+
maximum_utf8_length += code_unit > 0x007f;
121+
maximum_utf8_length += code_unit > 0x07ff;
122+
}
123+
124+
return maximum_utf8_length;
125+
}
126+
81127
}

AK/Utf16View.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ class Utf16View {
109109
// Raw pointer to the stored code units, exactly as returned by m_code_units.data().
u16 const* data() const
{
    return m_code_units.data();
}
110110
// Same pointer as data(), reinterpreted as `char16_t const*`.
char16_t const* char_data() const
{
    auto const* code_units = data();
    return reinterpret_cast<char16_t const*>(code_units);
}
111111

112+
// Read-only view of the code units held in m_code_units.
ReadonlySpan<u16> span() const
{
    return m_code_units;
}
113+
112114
u16 code_unit_at(size_t index) const;
113115
u32 code_point_at(size_t index) const;
114116

0 commit comments

Comments
 (0)