|
17 | 17 | #include <AK/Utf16View.h>
|
18 | 18 | #include <AK/Utf32View.h>
|
19 | 19 |
|
| 20 | +#include <simdutf.h> |
| 21 | + |
20 | 22 | namespace AK {
|
21 | 23 |
|
22 | 24 | static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
|
@@ -224,24 +226,69 @@ void StringBuilder::append_code_point(u32 code_point)
|
224 | 226 |
|
225 | 227 | ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
226 | 228 | {
|
227 |
| - // NOTE: This may under-allocate in the presence of surrogate pairs. |
228 |
| - // That's okay, appending will still grow the buffer as needed. |
229 |
| - TRY(will_append(utf16_view.length_in_code_units())); |
230 |
| - |
231 |
| - for (size_t i = 0; i < utf16_view.length_in_code_units();) { |
232 |
| - // OPTIMIZATION: Fast path for ASCII characters. |
233 |
| - auto code_unit = utf16_view.data()[i]; |
234 |
| - if (code_unit <= 0x7f) { |
235 |
| - append(static_cast<char>(code_unit)); |
236 |
| - ++i; |
237 |
| - continue; |
238 |
| - } |
| 229 | + if (utf16_view.is_empty()) |
| 230 | + return {}; |
239 | 231 |
|
240 |
| - auto code_point = utf16_view.code_point_at(i); |
241 |
| - TRY(try_append_code_point(code_point)); |
| 232 | + auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span()); |
| 233 | + |
| 234 | + // Possibly over-allocate a little to ensure we don't have to allocate later. |
| 235 | + TRY(will_append(maximum_utf8_length)); |
| 236 | + |
| 237 | + Utf16View remaining_view = utf16_view; |
| 238 | + for (;;) { |
| 239 | + auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer()); |
| 240 | + |
| 241 | + // Fast path. |
| 242 | + auto result = [&]() { |
| 243 | + switch (remaining_view.endianness()) { |
| 244 | + case Endianness::Host: |
| 245 | + return simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer); |
| 246 | + case Endianness::Big: |
| 247 | + return simdutf::convert_utf16be_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer); |
| 248 | + case Endianness::Little: |
| 249 | + return simdutf::convert_utf16le_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer); |
| 250 | + } |
| 251 | + VERIFY_NOT_REACHED(); |
| 252 | + }(); |
| 253 | + if (result.error == simdutf::SUCCESS) { |
| 254 | + auto bytes_just_written = result.count; |
| 255 | + m_buffer.set_size(m_buffer.size() + bytes_just_written); |
| 256 | + break; |
| 257 | + } |
242 | 258 |
|
243 |
| - i += (code_point > 0xffff ? 2 : 1); |
| 259 | + // Slow path. Found unmatched surrogate code unit. |
| 260 | + auto first_invalid_code_unit = result.count; |
| 261 | + ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units()); |
| 262 | + |
| 263 | + // Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves. |
| 264 | + auto bytes_just_written = [&]() { |
| 265 | + switch (remaining_view.endianness()) { |
| 266 | + case Endianness::Host: |
| 267 | + return simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit); |
| 268 | + case Endianness::Big: |
| 269 | + return simdutf::utf8_length_from_utf16be(remaining_view.char_data(), first_invalid_code_unit); |
| 270 | + case Endianness::Little: |
| 271 | + return simdutf::utf8_length_from_utf16le(remaining_view.char_data(), first_invalid_code_unit); |
| 272 | + } |
| 273 | + VERIFY_NOT_REACHED(); |
| 274 | + }(); |
| 275 | + |
| 276 | + do { |
| 277 | + auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++); |
| 278 | + |
| 279 | + // Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes. |
| 280 | + ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF); |
| 281 | + ASSERT(m_buffer.size() + bytes_just_written + 3 < m_buffer.capacity()); |
| 282 | + uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0); |
| 283 | + uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80); |
| 284 | + uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80); |
| 285 | + } while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.data()[first_invalid_code_unit])); |
| 286 | + |
| 287 | + // Code unit might no longer be invalid, retry on the remaining data. |
| 288 | + m_buffer.set_size(m_buffer.size() + bytes_just_written); |
| 289 | + remaining_view = remaining_view.substring_view(first_invalid_code_unit); |
244 | 290 | }
|
| 291 | + |
245 | 292 | return {};
|
246 | 293 | }
|
247 | 294 |
|
|
0 commit comments