Skip to content

Commit f53389b

Browse files
trflynn89 authored and gmta committed
AK: Add a couple of Utf16String factories
* Utf16String::from_utf8_with_replacement_character
* Utf16String::from_code_point
1 parent b4435bd commit f53389b

File tree

3 files changed

+87
-0
lines changed

3 files changed

+87
-0
lines changed

AK/Utf16String.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,28 @@ namespace AK {
1313

1414
// Compile-time check that Detail::ShortString occupies exactly as many bytes as a Detail::Utf16StringData pointer.
// NOTE(review): presumably so the inline short-string form and the heap pointer can share the same storage — confirm in Utf16StringBase.
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
1515

16+
// Builds a Utf16String from UTF-8 input that is not trusted to be well-formed.
//
// With WithBOMHandling::Yes, a leading UTF-8 byte order mark (EF BB BF) is dropped before conversion.
// Input that passes strict validation (lonely surrogates rejected) is converted directly. Anything else
// is rebuilt one code point at a time, with every surrogate code point swapped for
// UnicodeUtils::REPLACEMENT_CODE_POINT.
// NOTE(review): other malformed bytes are presumably surfaced as replacement characters by the Utf8View
// iterator itself; this function only handles surrogates explicitly.
Utf16String Utf16String::from_utf8_with_replacement_character(StringView utf8_string, WithBOMHandling with_bom_handling)
{
    if (with_bom_handling == WithBOMHandling::Yes) {
        auto input_bytes = utf8_string.bytes();
        if (input_bytes.starts_with({ { 0xEF, 0xBB, 0xBF } }))
            utf8_string = utf8_string.substring_view(3);
    }

    Utf8View code_points { utf8_string };

    // Fast path: the input is already clean, so no per-code-point rewriting is needed.
    if (code_points.validate(AllowLonelySurrogates::No))
        return Utf16String::from_utf8_without_validation(utf8_string);

    StringBuilder utf16_builder(StringBuilder::Mode::UTF16);

    for (auto code_point : code_points) {
        if (!is_unicode_surrogate(code_point)) {
            utf16_builder.append_code_point(code_point);
            continue;
        }

        // Surrogates are not allowed to appear in UTF-8; substitute the replacement character.
        utf16_builder.append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);
    }

    return utf16_builder.to_utf16_string_without_validation();
}
37+
1638
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
1739
{
1840
if (utf8_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii()) {

AK/Utf16String.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ class [[nodiscard]] Utf16String : public Detail::Utf16StringBase {
4545
return from_utf8_without_validation(utf8_string);
4646
}
4747

48+
// Selects whether from_utf8_with_replacement_character() strips a leading UTF-8 byte order mark (EF BB BF).
enum class WithBOMHandling {
49+
No, // The input is converted as-is; a leading BOM is kept as part of the string.
50+
Yes, // A leading BOM, if present, is removed before conversion.
51+
};
52+
// Converts possibly ill-formed UTF-8 into a Utf16String, substituting the Unicode replacement character
// instead of failing. BOM handling defaults to WithBOMHandling::Yes.
static Utf16String from_utf8_with_replacement_character(StringView, WithBOMHandling = WithBOMHandling::Yes);
53+
4854
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
4955
{
5056
if (!Utf8View { utf8_string }.validate())
@@ -81,6 +87,18 @@ class [[nodiscard]] Utf16String : public Detail::Utf16StringBase {
8187
requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
8288
static Utf16String from_utf16_without_validation(T&&) = delete;
8389

90+
// Creates a Utf16String holding exactly one code point.
//
// The code point is encoded through UnicodeUtils::code_point_to_utf16(), which reports each resulting
// code unit to a callback; the units are collected in a two-element stack buffer and then handed to
// from_utf16_without_validation().
ALWAYS_INLINE static Utf16String from_code_point(u32 code_point)
{
    // Room for up to two UTF-16 code units; only the first `used` entries are ever read.
    Array<char16_t, 2> buffer;
    size_t used = 0;

    auto store_code_unit = [&buffer, &used](auto code_unit) {
        buffer[used] = code_unit;
        ++used;
    };

    // The encoder's return value is intentionally ignored; the callback keeps its own count.
    (void)UnicodeUtils::code_point_to_utf16(code_point, store_code_unit);

    return from_utf16_without_validation({ buffer.data(), used });
}
101+
84102
template<typename... Parameters>
85103
ALWAYS_INLINE static Utf16String formatted(CheckedFormatString<Parameters...>&& format, Parameters const&... parameters)
86104
{

Tests/AK/TestUtf16String.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,27 @@ TEST_CASE(from_utf8)
9696
}
9797
}
9898

99+
TEST_CASE(from_utf8_with_replacement_character)
{
    // Ill-formed tail: F4 8F BF is followed by C0, which is not a continuation byte.
    // The expectation is four U+FFFD, one for each of the four offending bytes.
    {
        auto string = Utf16String::from_utf8_with_replacement_character("long string \xf4\x8f\xbf\xc0"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(string, u"long string \ufffd\ufffd\ufffd\ufffd"sv);
    }

    // Well-formed ASCII passes through unchanged.
    {
        auto string = Utf16String::from_utf8_with_replacement_character("A valid string!"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(string, "A valid string!"sv);
    }

    // Empty input yields an empty string.
    {
        auto string = Utf16String::from_utf8_with_replacement_character(""sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(string, ""sv);
    }

    // A leading BOM is removed when BOM handling is requested.
    {
        auto string = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, Utf16String::WithBOMHandling::Yes);
        EXPECT_EQ(string, "WHF!"sv);
    }

    // Without BOM handling, the BOM is preserved as U+FEFF.
    {
        auto string = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(string, u"\ufeffWHF!"sv);
    }

    // A UTF-8-encoded surrogate (U+D800) becomes a single U+FFFD; uses the default BOM handling.
    {
        auto string = Utf16String::from_utf8_with_replacement_character("\xED\xA0\x80WHF!"sv);
        EXPECT_EQ(string, u"\ufffdWHF!"sv);
    }
}
119+
99120
TEST_CASE(from_utf16)
100121
{
101122
{
@@ -235,6 +256,32 @@ TEST_CASE(from_utf32)
235256
}
236257
}
237258

259+
TEST_CASE(from_code_point)
{
    // Every code point below the first supplementary plane must fit in a single code unit
    // whose value equals the code point itself.
    for (u32 bmp_code_point = 0; bmp_code_point < AK::UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT; ++bmp_code_point) {
        auto string = Utf16String::from_code_point(bmp_code_point);

        EXPECT_EQ(string.length_in_code_units(), 1uz);
        EXPECT_EQ(string.length_in_code_points(), 1uz);
        EXPECT_EQ(string.code_point_at(0), bmp_code_point);
        EXPECT_EQ(string.code_unit_at(0), bmp_code_point);
    }

    // The first 10'000 supplementary code points must each occupy two code units, matching
    // what UnicodeUtils::code_point_to_utf16() emits for the same code point.
    for (u32 supplementary_code_point = AK::UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;
        supplementary_code_point < AK::UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT + 10'000;
        ++supplementary_code_point) {
        auto string = Utf16String::from_code_point(supplementary_code_point);

        EXPECT_EQ(string.length_in_code_units(), 2uz);
        EXPECT_EQ(string.length_in_code_points(), 1uz);
        EXPECT_EQ(string.code_point_at(0), supplementary_code_point);

        size_t unit_index = 0;
        auto check_code_unit = [&](auto code_unit) {
            EXPECT_EQ(string.code_unit_at(unit_index++), code_unit);
        };
        (void)AK::UnicodeUtils::code_point_to_utf16(supplementary_code_point, check_code_unit);

        EXPECT_EQ(unit_index, 2uz);
    }
}
284+
238285
TEST_CASE(formatted)
239286
{
240287
{

0 commit comments

Comments
 (0)