Fix utf8 conversion (#1121)

JetBrains · Dec 12, 2017 · cba7319 · cba7319
1 parent 8fa8e2e
commit cba7319
Show file tree

Hide file tree

Showing 21 changed files with 709 additions and 43 deletions.
diff --git a/Interop/Runtime/src/native/kotlin/kotlinx/cinterop/NativeUtils.kt b/Interop/Runtime/src/native/kotlin/kotlinx/cinterop/NativeUtils.kt
@@ -18,9 +18,9 @@ package kotlinx.cinterop
 
 import konan.internal.Intrinsic
 
-internal fun decodeFromUtf8(bytes: ByteArray): String = kotlin.text.fromUtf8Array(bytes, 0, bytes.size)
+internal fun decodeFromUtf8(bytes: ByteArray): String = bytes.stringFromUtf8()
 
-fun encodeToUtf8(str: String): ByteArray = kotlin.text.toUtf8Array(str, 0, str.length)
+fun encodeToUtf8(str: String): ByteArray = str.toUtf8()
 
 @Intrinsic
 external fun bitsToFloat(bits: Int): Float

diff --git a/backend.native/tests/build.gradle b/backend.native/tests/build.gradle
@@ -1520,6 +1520,13 @@ task chars0(type: RunKonanTest) {
     expectedExitStatus = 0
 }
 
+
+task utf8(type: RunKonanTest) {
+    expectedFail = (project.testTarget == 'wasm32') // Uses exceptions.
+    goldValue = "Hello\nПривет\n\uD800\uDC00\n\n\uFFFD\uFFFD\n\uFFFD12\n\uFFFD12\n12\uFFFD\n\uD83D\uDE25\n"
+    source = "runtime/text/utf8.kt"
+}
+
 task catch1(type: RunKonanTest) {
     expectedFail = (project.testTarget == 'wasm32') // Uses exceptions.
     goldValue = "Before\nCaught Throwable\nDone\n"

diff --git a/backend.native/tests/runtime/text/utf8.kt b/backend.native/tests/runtime/text/utf8.kt
diff --git a/runtime/src/main/cpp/Console.cpp b/runtime/src/main/cpp/Console.cpp
@@ -30,7 +30,8 @@ void Kotlin_io_Console_print(KString message) {
   // TODO: system stdout must be aware about UTF-8.
   const KChar* utf16 = CharArrayAddressOfElementAt(message, 0);
   KStdString utf8;
-  utf8::unchecked::utf16to8(utf16, utf16 + message->count_, back_inserter(utf8));
+  // Replace incorrect sequences with a default codepoint (see utf8::with_replacement::default_replacement)
+  utf8::with_replacement::utf16to8(utf16, utf16 + message->count_, back_inserter(utf8));
   konan::consoleWriteUtf8(utf8.c_str(), utf8.size());
 }
 

diff --git a/runtime/src/main/cpp/Exceptions.h b/runtime/src/main/cpp/Exceptions.h
@@ -48,6 +48,8 @@ void ThrowNumberFormatException();
 void ThrowOutOfMemoryError();
 // Throws not implemented error.
 void ThrowNotImplementedError();
+// Throws illegal character conversion exception (used in UTF8/UTF16 conversions).
+void ThrowIllegalCharacterConversionException();
 // Prints out message of Throwable.
 void PrintThrowable(KRef);
 

diff --git a/runtime/src/main/cpp/KString.cpp b/runtime/src/main/cpp/KString.cpp
@@ -37,16 +37,55 @@
 
 namespace {
 
-OBJ_GETTER(utf8ToUtf16, const char* rawString, size_t rawStringLength) {
-  uint32_t charCount = utf8::unchecked::distance(rawString, rawString + rawStringLength);
-  ArrayHeader* result = AllocArrayInstance(
-    theStringTypeInfo, charCount, OBJ_RESULT)->array();
+typedef std::back_insert_iterator<KStdString> KStdStringInserter;
+typedef KChar* utf8to16(const char*, const char*, KChar*);
+typedef KStdStringInserter utf16to8(const KChar*,const KChar*, KStdStringInserter);
+
+KStdStringInserter utf16toUtf8OrThrow(const KChar* start, const KChar* end, KStdStringInserter result) {
+  TRY_CATCH(result = utf8::utf16to8(start, end, result),
+            result = utf8::unchecked::utf16to8(start, end, result),
+            ThrowIllegalCharacterConversionException());
+  return result;
+}
+
+template<utf8to16 conversion>
+OBJ_GETTER(utf8ToUtf16Impl, const char* rawString, const char* end, uint32_t charCount) {
+  ArrayHeader* result = AllocArrayInstance(theStringTypeInfo, charCount, OBJ_RESULT)->array();
   KChar* rawResult = CharArrayAddressOfElementAt(result, 0);
-  auto convertResult =
-      utf8::unchecked::utf8to16(rawString, rawString + rawStringLength, rawResult);
+  auto convertResult = conversion(rawString, end, rawResult);
+  RETURN_OBJ(result->obj());
+}
+
+template<utf16to8 conversion>
+OBJ_GETTER(utf16ToUtf8Impl, KString thiz, KInt start, KInt size) {
+  RuntimeAssert(thiz->type_info() == theStringTypeInfo, "Must use String");
+  if (start < 0 || size < 0 || size > thiz->count_ - start) {
+    ThrowArrayIndexOutOfBoundsException();
+  }
+  const KChar* utf16 = CharArrayAddressOfElementAt(thiz, start);
+  KStdString utf8;
+  conversion(utf16, utf16 + size, back_inserter(utf8));
+  ArrayHeader* result = AllocArrayInstance(theByteArrayTypeInfo, utf8.size(), OBJ_RESULT)->array();
+  ::memcpy(ByteArrayAddressOfElementAt(result, 0), utf8.c_str(), utf8.size());
   RETURN_OBJ(result->obj());
 }
 
+OBJ_GETTER(utf8ToUtf16OrThrow, const char* rawString, size_t rawStringLength) {
+  const char* end = rawString + rawStringLength;
+  uint32_t charCount;
+  TRY_CATCH(charCount = utf8::utf16_length(rawString, end),
+            charCount = utf8::unchecked::utf16_length(rawString, end),
+            ThrowIllegalCharacterConversionException());
+  RETURN_RESULT_OF(utf8ToUtf16Impl<utf8::unchecked::utf8to16>, rawString, end, charCount);
+}
+
+OBJ_GETTER(utf8ToUtf16, const char* rawString, size_t rawStringLength) {
+  const char* end = rawString + rawStringLength;
+  uint32_t charCount = utf8::with_replacement::utf16_length(rawString, end);
+  RETURN_RESULT_OF(utf8ToUtf16Impl<utf8::with_replacement::utf8to16>, rawString, end, charCount);
+}
+
+
 // Case conversion is derived work from Apache Harmony.
 // Unicode 3.0.1 (same as Unicode 3.0.0)
 enum CharacterClass {
@@ -731,32 +770,37 @@ KInt Kotlin_String_getStringLength(KString thiz) {
   return thiz->count_;
 }
 
-OBJ_GETTER(Kotlin_String_fromUtf8Array, KConstRef thiz, KInt start, KInt size) {
+const char* byteArrayAsCString(KConstRef thiz, KInt start, KInt size) {
   const ArrayHeader* array = thiz->array();
   RuntimeAssert(array->type_info() == theByteArrayTypeInfo, "Must use a byte array");
   if (start < 0 || size < 0 || size > array->count_ - start) {
     ThrowArrayIndexOutOfBoundsException();
   }
+  return reinterpret_cast<const char*>(ByteArrayAddressOfElementAt(array, start));
+}
+
+OBJ_GETTER(Kotlin_ByteArray_stringFromUtf8OrThrow, KConstRef thiz, KInt start, KInt size) {
+  const char* rawString = byteArrayAsCString(thiz, start, size);
   if (size == 0) {
     RETURN_RESULT_OF0(TheEmptyString);
   }
-  const char* rawString =
-    reinterpret_cast<const char*>(ByteArrayAddressOfElementAt(array, start));
-  RETURN_RESULT_OF(utf8ToUtf16, rawString, size);
+  RETURN_RESULT_OF(utf8ToUtf16OrThrow, rawString, size);
 }
 
-OBJ_GETTER(Kotlin_String_toUtf8Array, KString thiz, KInt start, KInt size) {
-  RuntimeAssert(thiz->type_info() == theStringTypeInfo, "Must use String");
-  if (start < 0 || size < 0 || size > thiz->count_ - start) {
-    ThrowArrayIndexOutOfBoundsException();
+OBJ_GETTER(Kotlin_ByteArray_stringFromUtf8, KConstRef thiz, KInt start, KInt size) {
+  const char* rawString = byteArrayAsCString(thiz, start, size);
+  if (size == 0) {
+    RETURN_RESULT_OF0(TheEmptyString);
   }
-  const KChar* utf16 = CharArrayAddressOfElementAt(thiz, start);
-  KStdString utf8;
-  utf8::unchecked::utf16to8(utf16, utf16 + size, back_inserter(utf8));
-  ArrayHeader* result = AllocArrayInstance(
-      theByteArrayTypeInfo, utf8.size(), OBJ_RESULT)->array();
-  ::memcpy(ByteArrayAddressOfElementAt(result, 0), utf8.c_str(), utf8.size());
-  RETURN_OBJ(result->obj());
+  RETURN_RESULT_OF(utf8ToUtf16, rawString, size);
+}
+
+OBJ_GETTER(Kotlin_String_toUtf8, KString thiz, KInt start, KInt size) {
+  RETURN_RESULT_OF(utf16ToUtf8Impl<utf8::with_replacement::utf16to8>, thiz, start, size);
+}
+
+OBJ_GETTER(Kotlin_String_toUtf8OrThrow, KString thiz, KInt start, KInt size) {
+  RETURN_RESULT_OF(utf16ToUtf8Impl<utf16toUtf8OrThrow>, thiz, start, size);
 }
 
 OBJ_GETTER(Kotlin_String_fromCharArray, KConstRef thiz, KInt start, KInt size) {

diff --git a/runtime/src/main/cpp/KString.h b/runtime/src/main/cpp/KString.h
@@ -27,7 +27,7 @@ extern "C" {
 #endif
 
 OBJ_GETTER(CreateStringFromCString, const char* cstring);
-OBJ_GETTER(CreateStringFromUtf8, const char* utf8, uint32_t size);
+OBJ_GETTER(CreateStringFromUtf8, const char* utf8, uint32_t lengthBytes);
 char* CreateCStringFromString(KConstRef kstring);
 void DisposeCString(char* cstring);
 

diff --git a/runtime/src/main/cpp/Porting.h b/runtime/src/main/cpp/Porting.h
@@ -52,6 +52,16 @@ uint64_t getTimeMillis();
 uint64_t getTimeMicros();
 uint64_t getTimeNanos();
 
+#if KONAN_NO_EXCEPTIONS
+#define TRY_CATCH(tryAction, actionWithoutExceptions, catchAction) actionWithoutExceptions;
+#else
+#define TRY_CATCH(tryAction, actionWithoutExceptions, catchAction) \
+do {                          \
+  try { tryAction; }          \
+  catch(...) { catchAction; } \
+} while(0)
+#endif
+
 }  // namespace konan
 
 #endif  // RUNTIME_PORTING_H
diff --git a/runtime/src/main/cpp/dtoa/dblparse.cpp b/runtime/src/main/cpp/dtoa/dblparse.cpp
@@ -642,7 +642,9 @@ KDouble Konan_FloatingPointParser_parseDoubleImpl (KString s, KInt e)
 {
   const KChar* utf16 = CharArrayAddressOfElementAt(s, 0);
   KStdString utf8;
-  utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8));
+  TRY_CATCH(utf8::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
+            utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
+            /* Illegal UTF-16 string. */ ThrowNumberFormatException());
   const char *str = utf8.c_str();
   auto dbl = createDouble (str, e);
 

diff --git a/runtime/src/main/cpp/dtoa/fltparse.cpp b/runtime/src/main/cpp/dtoa/fltparse.cpp
@@ -542,7 +542,9 @@ Konan_FloatingPointParser_parseFloatImpl(KString s, KInt e)
 {
   const KChar* utf16 = CharArrayAddressOfElementAt(s, 0);
   KStdString utf8;
-  utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8));
+  TRY_CATCH(utf8::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
+            utf8::unchecked::utf16to8(utf16, utf16 + s->count_, back_inserter(utf8)),
+            /* Illegal UTF-16 string. */ ThrowNumberFormatException());
   const char *str = utf8.c_str();
   auto flt = createFloat(str, e);
 

diff --git a/runtime/src/main/cpp/utf8.h b/runtime/src/main/cpp/utf8.h
@@ -29,5 +29,10 @@ DEALINGS IN THE SOFTWARE.
 #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
 
 #include "utf8/unchecked.h"
+#include "utf8/with_replacement.h"
+
+#if !KONAN_NO_EXCEPTIONS
+#include "utf8/checked.h"
+#endif
 
 #endif // header guard
diff --git a/runtime/src/main/cpp/utf8/checked.h b/runtime/src/main/cpp/utf8/checked.h
@@ -193,6 +193,20 @@ namespace utf8
             utf8::next(it, end);
     }
 
+    /**
+     * Calculates the number of UTF-16 code units needed to represent the UTF-8 sequence from first to last,
+     * counting two units (a surrogate pair) for each code point above 0xffff. Throws an exception if the input is invalid.
+     */
+    template<typename octet_iterator>
+    uint32_t utf16_length(octet_iterator first, octet_iterator last) {
+      uint32_t dist = 0;
+      while(first < last) {
+        uint32_t cp = utf8::next(first, last);
+        dist += (cp > 0xffff) ? 2 : 1;
+      }
+      return dist;
+    }
+
     template <typename octet_iterator>
     typename std::iterator_traits<octet_iterator>::difference_type
     distance (octet_iterator first, octet_iterator last)

diff --git a/runtime/src/main/cpp/utf8/core.h b/runtime/src/main/cpp/utf8/core.h
@@ -150,7 +150,7 @@ namespace internal
 
     /// get_sequence_x functions decode utf-8 sequences of the length x
     template <typename octet_iterator>
-    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    utf_error get_sequence_1(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
     {
         if (it == end)
             return NOT_ENOUGH_ROOM;
@@ -161,7 +161,7 @@ namespace internal
     }
 
     template <typename octet_iterator>
-    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    utf_error get_sequence_2(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
     {
         if (it == end) 
             return NOT_ENOUGH_ROOM;
@@ -176,7 +176,7 @@ namespace internal
     }
 
     template <typename octet_iterator>
-    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    utf_error get_sequence_3(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
     {
         if (it == end)
             return NOT_ENOUGH_ROOM;
@@ -195,7 +195,7 @@ namespace internal
     }
 
     template <typename octet_iterator>
-    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    utf_error get_sequence_4(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
     {
         if (it == end)
            return NOT_ENOUGH_ROOM;
@@ -220,7 +220,7 @@ namespace internal
     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
 
     template <typename octet_iterator>
-    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    utf_error validate_next(octet_iterator& it, const octet_iterator end, uint32_t& code_point)
     {
         // Save the original value of it so we can go back in case of failure
         // Of course, it does not make much sense with i.e. stream iterators

diff --git a/runtime/src/main/cpp/utf8/unchecked.h b/runtime/src/main/cpp/utf8/unchecked.h
@@ -116,6 +116,20 @@ namespace utf8
                 utf8::unchecked::next(it);
         }
 
+        /**
+         * Calculates the number of UTF-16 code units needed to represent the UTF-8 sequence from first to last,
+         * counting two units (a surrogate pair) for each code point above 0xffff. Doesn't validate the input.
+         */
+        template<typename octet_iterator>
+        uint32_t utf16_length(octet_iterator first, const octet_iterator last) {
+            uint32_t dist = 0;
+            while (first < last) {
+                uint32_t cp = utf8::unchecked::next(first);
+                dist += (cp > 0xffff) ? 2 : 1;
+            }
+            return dist;
+        }
+
         template <typename octet_iterator>
         typename std::iterator_traits<octet_iterator>::difference_type
         distance (octet_iterator first, octet_iterator last)
@@ -127,7 +141,7 @@ namespace utf8
         }
 
         template <typename u16bit_iterator, typename octet_iterator>
-        octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+        octet_iterator utf16to8 (u16bit_iterator start, const u16bit_iterator end, octet_iterator result)
         {       
             while (start != end) {
                 uint32_t cp = utf8::internal::mask16(*start++);
@@ -142,7 +156,7 @@ namespace utf8
         }
 
         template <typename u16bit_iterator, typename octet_iterator>
-        u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+        u16bit_iterator utf8to16 (octet_iterator start, const octet_iterator end, u16bit_iterator result)
         {
             while (start < end) {
                 uint32_t cp = utf8::unchecked::next(start);