Skip to content
This repository has been archived by the owner on Aug 10, 2021. It is now read-only.

Commit

Permalink
Fix utf8 conversion (#1121)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilmat192 authored and olonho committed Dec 12, 2017
1 parent 8fa8e2e commit cba7319
Show file tree
Hide file tree
Showing 21 changed files with 709 additions and 43 deletions.
Expand Up @@ -18,9 +18,9 @@ package kotlinx.cinterop

import konan.internal.Intrinsic

internal fun decodeFromUtf8(bytes: ByteArray): String = kotlin.text.fromUtf8Array(bytes, 0, bytes.size)
/** Decodes the whole of [bytes] as UTF-8 into a [String] by delegating to `ByteArray.stringFromUtf8()`. */
internal fun decodeFromUtf8(bytes: ByteArray): String = bytes.stringFromUtf8()

fun encodeToUtf8(str: String): ByteArray = kotlin.text.toUtf8Array(str, 0, str.length)
/** Encodes the whole of [str] into a UTF-8 [ByteArray] by delegating to `String.toUtf8()`. */
fun encodeToUtf8(str: String): ByteArray = str.toUtf8()

@Intrinsic
external fun bitsToFloat(bits: Int): Float
Expand Down
7 changes: 7 additions & 0 deletions backend.native/tests/build.gradle
Expand Up @@ -1520,6 +1520,13 @@ task chars0(type: RunKonanTest) {
expectedExitStatus = 0
}


// UTF-8 conversion test built from runtime/text/utf8.kt.
// goldValue is presumably the exact output the test must print (TODO confirm against
// RunKonanTest); it contains \uFFFD entries as well as surrogate pairs.
// Marked as an expected failure on wasm32 because the test uses exceptions.
task utf8(type: RunKonanTest) {
expectedFail = (project.testTarget == 'wasm32') // Uses exceptions.
goldValue = "Hello\nПривет\n\uD800\uDC00\n\n\uFFFD\uFFFD\n\uFFFD12\n\uFFFD12\n12\uFFFD\n\uD83D\uDE25\n"
source = "runtime/text/utf8.kt"
}

task catch1(type: RunKonanTest) {
expectedFail = (project.testTarget == 'wasm32') // Uses exceptions.
goldValue = "Before\nCaught Throwable\nDone\n"
Expand Down
369 changes: 369 additions & 0 deletions backend.native/tests/runtime/text/utf8.kt

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion runtime/src/main/cpp/Console.cpp
Expand Up @@ -30,7 +30,8 @@ void Kotlin_io_Console_print(KString message) {
// TODO: system stdout must be aware about UTF-8.
const KChar* utf16 = CharArrayAddressOfElementAt(message, 0);
KStdString utf8;
utf8::unchecked::utf16to8(utf16, utf16 + message->count_, back_inserter(utf8));
// Replace incorrect sequences with a default codepoint (see utf8::with_replacement::default_replacement)
utf8::with_replacement::utf16to8(utf16, utf16 + message->count_, back_inserter(utf8));
konan::consoleWriteUtf8(utf8.c_str(), utf8.size());
}

Expand Down
2 changes: 2 additions & 0 deletions runtime/src/main/cpp/Exceptions.h
Expand Up @@ -48,6 +48,8 @@ void ThrowNumberFormatException();
void ThrowOutOfMemoryError();
// Throws not implemented error.
void ThrowNotImplementedError();
// Throws illegal character conversion exception (used in UTF8/UTF16 conversions).
void ThrowIllegalCharacterConversionException();
// Prints out the message of a Throwable.
void PrintThrowable(KRef);

Expand Down
86 changes: 65 additions & 21 deletions runtime/src/main/cpp/KString.cpp
Expand Up @@ -37,16 +37,55 @@

namespace {

OBJ_GETTER(utf8ToUtf16, const char* rawString, size_t rawStringLength) {
uint32_t charCount = utf8::unchecked::distance(rawString, rawString + rawStringLength);
ArrayHeader* result = AllocArrayInstance(
theStringTypeInfo, charCount, OBJ_RESULT)->array();
typedef std::back_insert_iterator<KStdString> KStdStringInserter;
typedef KChar* utf8to16(const char*, const char*, KChar*);
typedef KStdStringInserter utf16to8(const KChar*,const KChar*, KStdStringInserter);

// Converts the UTF-16 range [start, end) to UTF-8, appending the bytes through `result`,
// and returns the advanced inserter.
// With exceptions enabled, TRY_CATCH runs the checked utf8::utf16to8 and, if it throws,
// calls ThrowIllegalCharacterConversionException(). When KONAN_NO_EXCEPTIONS is set the
// macro expands to the second argument only, i.e. the unchecked conversion.
KStdStringInserter utf16toUtf8OrThrow(const KChar* start, const KChar* end, KStdStringInserter result) {
TRY_CATCH(result = utf8::utf16to8(start, end, result),
result = utf8::unchecked::utf16to8(start, end, result),
ThrowIllegalCharacterConversionException());
return result;
}

// Allocates a String array of `charCount` elements and runs `conversion` over the UTF-8
// bytes, writing the produced KChars starting at element 0 of the new array.
// `charCount` is taken as given: it has to cover everything `conversion` writes.
// NOTE(review): convertResult is not read after the conversion call.
template<utf8to16 conversion>
OBJ_GETTER(utf8ToUtf16Impl, const char* rawString, const char* end, uint32_t charCount) {
ArrayHeader* result = AllocArrayInstance(theStringTypeInfo, charCount, OBJ_RESULT)->array();
KChar* rawResult = CharArrayAddressOfElementAt(result, 0);
auto convertResult =
utf8::unchecked::utf8to16(rawString, rawString + rawStringLength, rawResult);
auto convertResult = conversion(rawString, end, rawResult);
RETURN_OBJ(result->obj());
}

// Encodes `size` UTF-16 code units of `thiz`, starting at index `start`, into a freshly
// allocated byte array using `conversion`.
// Calls ThrowArrayIndexOutOfBoundsException() when [start, start + size) does not lie
// inside the string.
template<utf16to8 conversion>
OBJ_GETTER(utf16ToUtf8Impl, KString thiz, KInt start, KInt size) {
  RuntimeAssert(thiz->type_info() == theStringTypeInfo, "Must use String");
  // The range check is done in a wide signed type on purpose: if count_ is unsigned,
  // `count_ - start` wraps around for start > count_ and `size > count_ - start` would
  // wrongly accept the range. This form is correct for either signedness of count_.
  const long long length = static_cast<long long>(thiz->count_);
  if (start < 0 || size < 0 || static_cast<long long>(start) + size > length) {
    ThrowArrayIndexOutOfBoundsException();
  }
  const KChar* utf16 = CharArrayAddressOfElementAt(thiz, start);
  // The encoded size is only known after the conversion, so encode into a temporary
  // string first and copy the bytes into an exactly-sized array.
  KStdString utf8;
  conversion(utf16, utf16 + size, back_inserter(utf8));
  ArrayHeader* result = AllocArrayInstance(theByteArrayTypeInfo, utf8.size(), OBJ_RESULT)->array();
  ::memcpy(ByteArrayAddressOfElementAt(result, 0), utf8.c_str(), utf8.size());
  RETURN_OBJ(result->obj());
}

// Strict UTF-8 -> String conversion of the `rawStringLength` bytes at `rawString`.
// The UTF-16 length is computed first: with exceptions enabled the checked
// utf8::utf16_length is used and any exception it raises is turned into
// ThrowIllegalCharacterConversionException(); with KONAN_NO_EXCEPTIONS the unchecked
// length computation runs instead. The conversion itself is the unchecked one.
OBJ_GETTER(utf8ToUtf16OrThrow, const char* rawString, size_t rawStringLength) {
  const char* rawEnd = rawString + rawStringLength;
  uint32_t utf16Units;
  TRY_CATCH(
      utf16Units = utf8::utf16_length(rawString, rawEnd),
      utf16Units = utf8::unchecked::utf16_length(rawString, rawEnd),
      ThrowIllegalCharacterConversionException());
  RETURN_RESULT_OF(utf8ToUtf16Impl<utf8::unchecked::utf8to16>, rawString, rawEnd, utf16Units);
}

// UTF-8 -> String conversion of the `rawStringLength` bytes at `rawString` using the
// utf8::with_replacement routines for both the length computation and the conversion.
OBJ_GETTER(utf8ToUtf16, const char* rawString, size_t rawStringLength) {
  const char* rawEnd = rawString + rawStringLength;
  const uint32_t utf16Units = utf8::with_replacement::utf16_length(rawString, rawEnd);
  RETURN_RESULT_OF(utf8ToUtf16Impl<utf8::with_replacement::utf8to16>, rawString, rawEnd, utf16Units);
}


// Case conversion is derived work from Apache Harmony.
// Unicode 3.0.1 (same as Unicode 3.0.0)
enum CharacterClass {
Expand Down Expand Up @@ -731,32 +770,37 @@ KInt Kotlin_String_getStringLength(KString thiz) {
return thiz->count_;
}

OBJ_GETTER(Kotlin_String_fromUtf8Array, KConstRef thiz, KInt start, KInt size) {
// Returns a pointer to element `start` of the byte array `thiz`, viewed as chars.
// Calls ThrowArrayIndexOutOfBoundsException() when [start, start + size) does not lie
// inside the array. The returned pointer is not NUL-terminated by this function.
const char* byteArrayAsCString(KConstRef thiz, KInt start, KInt size) {
  const ArrayHeader* array = thiz->array();
  RuntimeAssert(array->type_info() == theByteArrayTypeInfo, "Must use a byte array");
  // The range check is done in a wide signed type on purpose: if count_ is unsigned,
  // `count_ - start` wraps around for start > count_ and `size > count_ - start` would
  // wrongly accept the range. This form is correct for either signedness of count_.
  const long long length = static_cast<long long>(array->count_);
  if (start < 0 || size < 0 || static_cast<long long>(start) + size > length) {
    ThrowArrayIndexOutOfBoundsException();
  }
  return reinterpret_cast<const char*>(ByteArrayAddressOfElementAt(array, start));
}

// Builds a String from `size` bytes of the byte array `thiz`, starting at `start`.
// byteArrayAsCString validates the range first; an empty range yields TheEmptyString,
// anything else goes through utf8ToUtf16OrThrow.
OBJ_GETTER(Kotlin_ByteArray_stringFromUtf8OrThrow, KConstRef thiz, KInt start, KInt size) {
const char* rawString = byteArrayAsCString(thiz, start, size);
if (size == 0) {
RETURN_RESULT_OF0(TheEmptyString);
}
const char* rawString =
reinterpret_cast<const char*>(ByteArrayAddressOfElementAt(array, start));
RETURN_RESULT_OF(utf8ToUtf16, rawString, size);
RETURN_RESULT_OF(utf8ToUtf16OrThrow, rawString, size);
}

OBJ_GETTER(Kotlin_String_toUtf8Array, KString thiz, KInt start, KInt size) {
RuntimeAssert(thiz->type_info() == theStringTypeInfo, "Must use String");
if (start < 0 || size < 0 || size > thiz->count_ - start) {
ThrowArrayIndexOutOfBoundsException();
OBJ_GETTER(Kotlin_ByteArray_stringFromUtf8, KConstRef thiz, KInt start, KInt size) {
const char* rawString = byteArrayAsCString(thiz, start, size);
if (size == 0) {
RETURN_RESULT_OF0(TheEmptyString);
}
const KChar* utf16 = CharArrayAddressOfElementAt(thiz, start);
KStdString utf8;
utf8::unchecked::utf16to8(utf16, utf16 + size, back_inserter(utf8));
ArrayHeader* result = AllocArrayInstance(
theByteArrayTypeInfo, utf8.size(), OBJ_RESULT)->array();
::memcpy(ByteArrayAddressOfElementAt(result, 0), utf8.c_str(), utf8.size());
RETURN_OBJ(result->obj());
RETURN_RESULT_OF(utf8ToUtf16, rawString, size);
}

// Encodes `size` UTF-16 code units of `thiz`, starting at `start`, into a UTF-8 byte array
// via utf16ToUtf8Impl instantiated with utf8::with_replacement::utf16to8.
OBJ_GETTER(Kotlin_String_toUtf8, KString thiz, KInt start, KInt size) {
RETURN_RESULT_OF(utf16ToUtf8Impl<utf8::with_replacement::utf16to8>, thiz, start, size);
}

// Encodes `size` UTF-16 code units of `thiz`, starting at `start`, into a UTF-8 byte array
// via utf16ToUtf8Impl instantiated with utf16toUtf8OrThrow (the throwing conversion).
OBJ_GETTER(Kotlin_String_toUtf8OrThrow, KString thiz, KInt start, KInt size) {
RETURN_RESULT_OF(utf16ToUtf8Impl<utf16toUtf8OrThrow>, thiz, start, size);
}

OBJ_GETTER(Kotlin_String_fromCharArray, KConstRef thiz, KInt start, KInt size) {
Expand Down
2 changes: 1 addition & 1 deletion runtime/src/main/cpp/KString.h
Expand Up @@ -27,7 +27,7 @@ extern "C" {
#endif

OBJ_GETTER(CreateStringFromCString, const char* cstring);
OBJ_GETTER(CreateStringFromUtf8, const char* utf8, uint32_t size);
OBJ_GETTER(CreateStringFromUtf8, const char* utf8, uint32_t lengthBytes);
char* CreateCStringFromString(KConstRef kstring);
void DisposeCString(char* cstring);

Expand Down
10 changes: 10 additions & 0 deletions runtime/src/main/cpp/Porting.h
Expand Up @@ -52,6 +52,16 @@ uint64_t getTimeMillis();
uint64_t getTimeMicros();
uint64_t getTimeNanos();

// TRY_CATCH(tryAction, actionWithoutExceptions, catchAction)
//   - exceptions enabled: runs tryAction; if it throws anything, runs catchAction.
//   - KONAN_NO_EXCEPTIONS: runs actionWithoutExceptions only.
// Both variants expand to a single `do { ... } while(0)` statement that needs a trailing
// semicolon at the call site, so the macro behaves the same in unbraced if/else bodies
// regardless of the build mode.
#if KONAN_NO_EXCEPTIONS
#define TRY_CATCH(tryAction, actionWithoutExceptions, catchAction) \
do { actionWithoutExceptions; } while(0)
#else
#define TRY_CATCH(tryAction, actionWithoutExceptions, catchAction) \
do { \
try { tryAction; } \
catch(...) { catchAction; } \
} while(0)
#endif

} // namespace konan

#endif // RUNTIME_PORTING_H
4 changes: 3 additions & 1 deletion runtime/src/main/cpp/dtoa/dblparse.cpp
Expand Up @@ -642,7 +642,9 @@ KDouble Konan_FloatingPointParser_parseDoubleImpl (KString s, KInt e)
{
const KChar* utf16 = CharArrayAddressOfElementAt(s, 0);
KStdString utf8;
utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8));
TRY_CATCH(utf8::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
/* Illegal UTF-16 string. */ ThrowNumberFormatException());
const char *str = utf8.c_str();
auto dbl = createDouble (str, e);

Expand Down
4 changes: 3 additions & 1 deletion runtime/src/main/cpp/dtoa/fltparse.cpp
Expand Up @@ -542,7 +542,9 @@ Konan_FloatingPointParser_parseFloatImpl(KString s, KInt e)
{
const KChar* utf16 = CharArrayAddressOfElementAt(s, 0);
KStdString utf8;
utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8));
TRY_CATCH(utf8::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
/* Illegal UTF-16 string. */ ThrowNumberFormatException());
const char *str = utf8.c_str();
auto flt = createFloat(str, e);

Expand Down
5 changes: 5 additions & 0 deletions runtime/src/main/cpp/utf8.h
Expand Up @@ -29,5 +29,10 @@ DEALINGS IN THE SOFTWARE.
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include "utf8/unchecked.h"
#include "utf8/with_replacement.h"

#if !KONAN_NO_EXCEPTIONS
#include "utf8/checked.h"
#endif

#endif // header guard
14 changes: 14 additions & 0 deletions runtime/src/main/cpp/utf8/checked.h
Expand Up @@ -193,6 +193,20 @@ namespace utf8
utf8::next(it, end);
}

/**
 * Returns the number of UTF-16 code units needed to hold the UTF-8 sequence [first, last).
 * A code point above 0xFFFF is counted as two units (a surrogate pair), any other as one.
 * Decoding goes through the checked utf8::next, so invalid input results in an exception.
 */
template<typename octet_iterator>
uint32_t utf16_length(octet_iterator first, octet_iterator last) {
    uint32_t units = 0;
    while (first < last) {
        const uint32_t codePoint = utf8::next(first, last);
        if (codePoint > 0xffff) {
            units += 2;
        } else {
            units += 1;
        }
    }
    return units;
}

template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
Expand Down
10 changes: 5 additions & 5 deletions runtime/src/main/cpp/utf8/core.h
Expand Up @@ -150,7 +150,7 @@ namespace internal

/// get_sequence_x functions decode utf-8 sequences of the length x
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_1(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
Expand All @@ -161,7 +161,7 @@ namespace internal
}

template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_2(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
Expand All @@ -176,7 +176,7 @@ namespace internal
}

template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_3(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
Expand All @@ -195,7 +195,7 @@ namespace internal
}

template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_4(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
Expand All @@ -220,7 +220,7 @@ namespace internal
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error validate_next(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
{
// Save the original value of it so we can go back in case of failure
// Of course, it does not make much sense with i.e. stream iterators
Expand Down
18 changes: 16 additions & 2 deletions runtime/src/main/cpp/utf8/unchecked.h
Expand Up @@ -116,6 +116,20 @@ namespace utf8
utf8::unchecked::next(it);
}

/**
 * Returns the number of UTF-16 code units needed to hold the UTF-8 sequence [first, last).
 * A code point above 0xFFFF is counted as two units (a surrogate pair), any other as one.
 * Decoding goes through utf8::unchecked::next; the input is not validated.
 */
template<typename octet_iterator>
uint32_t utf16_length(octet_iterator first, const octet_iterator last) {
    uint32_t units = 0;
    while (first < last) {
        const uint32_t codePoint = utf8::unchecked::next(first);
        if (codePoint > 0xffff) {
            units += 2;
        } else {
            units += 1;
        }
    }
    return units;
}

template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
Expand All @@ -127,7 +141,7 @@ namespace utf8
}

template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
octet_iterator utf16to8 (u16bit_iterator start, const u16bit_iterator end, octet_iterator result)
{
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
Expand All @@ -142,7 +156,7 @@ namespace utf8
}

template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
u16bit_iterator utf8to16 (octet_iterator start, const octet_iterator end, u16bit_iterator result)
{
while (start < end) {
uint32_t cp = utf8::unchecked::next(start);
Expand Down

0 comments on commit cba7319

Please sign in to comment.