Merge pull request #11551 from ScottPJones/spj/fixutf

Fix #10959 bugs with UTF-16 conversions
JuliaLang · Jul 1, 2015 · 9071f14 · quinnj · Jul 1, 2015 · tkelman
2 parents 1c4dae4 + 2ab9334
commit 9071f14
Show file tree

Hide file tree

Showing 4 changed files with 350 additions and 67 deletions.
diff --git a/base/utf16.jl b/base/utf16.jl
@@ -1,17 +1,42 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
-utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
-utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
-utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
+# Quickly copy and set trailing \0
+@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
+			      ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
+     S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
+end
+
+# Get rest of character ch from 3-byte UTF-8 sequence in dat
+@inline function get_utf8_3byte(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
+end
+# Get rest of character ch from 4-byte UTF-8 sequence in dat
+@inline function get_utf8_4byte(dat, pos, ch)
+    @inbounds return (((ch & 0x7) << 18)
+                        | (UInt32(dat[pos-2] & 0x3f) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+end
+
+# Output a character as a 4-byte UTF-8 sequence
+@inline function output_utf8_4byte!(buf, out, ch)
+    @inbounds begin
+        buf[out + 1] = 0xf0 | (ch >>> 18)
+        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
+        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
+        buf[out + 4] = 0x80 | (ch & 0x3f)
+    end
+end
+
+const empty_utf16 = UTF16String(UInt16[0])
 
 function length(s::UTF16String)
     d = s.data
     len = length(d) - 1
     len == 0 && return 0
     cnum = 0
     for i = 1:len
-        @inbounds cnum += !utf16_is_trail(d[i])
+        @inbounds cnum += !is_surrogate_trail(d[i])
     end
     cnum
 end
@@ -20,100 +45,220 @@ function endof(s::UTF16String)
     d = s.data
     i = length(d) - 1
     i == 0 && return i
-    utf16_is_surrogate(d[i]) ? i-1 : i
+    return is_surrogate_codeunit(d[i]) ? i-1 : i
 end
 
+get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
+
 function next(s::UTF16String, i::Int)
-    if !utf16_is_surrogate(s.data[i])
-        return Char(s.data[i]), i+1
-    elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
-        return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
-    end
-    throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
+    ch = s.data[i]
+    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
+    # check length, account for terminating \0
+    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
+    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
+    ct = s.data[i+1]
+    !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch))
+    Char(get_supplementary(ch, ct)), i+2
 end
 
 function reverseind(s::UTF16String, i::Integer)
     j = length(s.data) - i
-    return Base.utf16_is_trail(s.data[j]) ? j-1 : j
+    return is_surrogate_trail(s.data[j]) ? j-1 : j
 end
 
 lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
 
 function reverse(s::UTF16String)
-    d =s.data
+    d = s.data
     out = similar(d)
     out[end] = 0 # NULL termination
     n = length(d)
-    for i = 1:n-1
-        out[i] = d[n-i]
-        if Base.utf16_is_lead(out[i])
-            out[i],out[i-1] = out[i-1],out[i]
+    @inbounds for i = 1:n-1
+        ch = d[n-i]
+        if is_surrogate_lead(ch)
+            out[i],out[i-1] = out[i-1],ch
+        else
+            out[i] = ch
         end
     end
-    return UTF16String(out)
+    UTF16String(out)
 end
 
-# TODO: optimize this
-function encode16(s::AbstractString)
-    buf = UInt16[]
-    for ch in s
-        c = reinterpret(UInt32, ch)
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
+
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    i = 1
+    n = length(data) # this may include NULL termination; that's okay
+    @inbounds while i < n # check for unpaired surrogates
+        if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
+            i += 2
+        elseif is_surrogate_codeunit(data[i])
+            return false
+        else
+            i += 1
+        end
+    end
+    return i > n || !is_surrogate_codeunit(data[i])
+end
+
+"
+Converts an `AbstractString` to a `UTF16String`
+
+### Returns:
+*   `UTF16String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::AbstractString)
+    len, flags, num4byte = unsafe_checkstring(str)
+    buf = Vector{UInt16}(len+num4byte+1)
+    out = 0
+    @inbounds for ch in str
+        c = UInt32(ch)
         if c < 0x10000
-            push!(buf, UInt16(c))
-        elseif c <= 0x10ffff
-            push!(buf, UInt16(0xd7c0 + (c>>10)))
-            push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
+            buf[out += 1] = UInt16(c)
         else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
         end
     end
-    push!(buf, 0) # NULL termination
+    @inbounds buf[out + 1] = 0 # NULL termination
     UTF16String(buf)
 end
 
-utf16(x) = convert(UTF16String, x)
-convert(::Type{UTF16String}, s::UTF16String) = s
-convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
-convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
-convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
+"
+Converts a `UTF8String` to a `UTF16String`
 
-# TODO: optimize this
-convert(::Type{UTF8String}, s::UTF16String) =
-    sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
+### Returns:
+*   `UTF16String`
 
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
-unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
-    convert(Ptr{T}, pointer(s))
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf16
+    # Check that is correct UTF-8 encoding and get number of words needed
+    len, flags, num4byte = unsafe_checkstring(dat)
+    len += num4byte
+    buf = Vector{UInt16}(len+1)
+    @inbounds buf[len+1] = 0
+    # Optimize case where no characters > 0x7f
+    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            pos += 2
+            buf[out += 1] = get_utf8_3byte(dat, pos, ch)
+        # Handle range 0x10000-0x10ffff
+        else
+            pos += 3
+            ch = get_utf8_4byte(dat, pos, ch)
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
+        end
+    end
+    UTF16String(buf)
+end
 
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    while i < n # check for unpaired surrogates
-        if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
-            i += 2
-        elseif utf16_is_surrogate(data[i])
-            return false
+"
+Converts a `UTF16String` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat) >>> 1
+    # handle zero length string quickly
+    len <= 1 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+"
+Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
+
+### Input Arguments:
+* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
+* `len` length of output in bytes
+
+### Returns:
+* `UTF8String`
+"
+function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle 0x80-0x7ff
+        elseif ch < 0x800
+            buf[out += 1] = 0xc0 | (ch >>> 6)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
+        # Handle 0x10000-0x10ffff (if input is UInt32)
+        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
+            output_utf8_4byte!(buf, out, ch)
+            out += 4
+        # Handle surrogate pairs
+        elseif is_surrogate_codeunit(ch)
+            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
+            out += 4
+        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
         else
-            i += 1
+            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
+            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
         end
     end
-    return i > n || !utf16_is_surrogate(data[i])
+    UTF8String(buf)
 end
 
-function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
-    len = length(data)
-    d = Array(UInt16, len + 1)
-    d[end] = 0 # NULL terminate
-    UTF16String(copy!(d,1, data,1, len))
+function convert(::Type{UTF16String}, str::ASCIIString)
+    dat = str.data
+    @inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true)
 end
 
+convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
+convert(::Type{Array{UInt16}},  str::UTF16String) = str.data
+
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
+    convert(Ptr{T}, pointer(s))
+
 convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
     convert(T, reshape(data, length(data)))
 
 convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
     convert(T, reinterpret(UInt16, data))
 
+function convert(::Type{UTF16String}, dat::AbstractVector{UInt16})
+    len, flags, num4byte = unsafe_checkstring(dat)
+    @inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true)
+end
+
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
     isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
@@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     UTF16String(d)
 end
 
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+utf16(x) = convert(UTF16String, x)
 utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
 utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
 function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
@@ -154,7 +302,7 @@ function map(fun, str::UTF16String)
         end
         uc = reinterpret(UInt32, c2)
         if uc < 0x10000
-            if utf16_is_surrogate(UInt16(uc))
+            if is_surrogate_codeunit(UInt16(uc))
                 throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
             end
             push!(buf, UInt16(uc))

diff --git a/base/utf32.jl b/base/utf32.jl
@@ -1,5 +1,6 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
+# UTF-32 basic functions
 next(s::UTF32String, i::Int) = (s.data[i], i+1)
 endof(s::UTF32String) = length(s.data) - 1
 length(s::UTF32String) = length(s.data) - 1
@@ -40,8 +41,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
     convert(T, takebuf_string(s))
 end
 
-convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
-convert(::Type{Array{Char}}, s::UTF32String) = s.data
+convert(::Type{Vector{Char}}, str::UTF32String) = str.data
+convert(::Type{Array{Char}},  str::UTF32String) = str.data
 
 reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
 
@@ -60,19 +61,19 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     elseif data[1] == Char(0xfffe0000) # byte-swapped
         d = Array(Char, length(data))
         for i = 2:length(data)
-            d[i-1] = bswap(data[i])
+            @inbounds d[i-1] = bswap(data[i])
         end
     else
         d = Array(Char, length(data) + 1)
         copy!(d, 1, data, 1, length(data)) # assume native byte order
     end
-    d[end] = Char(0) # NULL terminate
+    d[end] = 0 # NULL terminate
     UTF32String(d)
 end
 
 function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
     for i=1:length(str)
-        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
+        @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
     end
     return true
 end
@@ -89,9 +90,9 @@ end
 function map(f, s::UTF32String)
     d = s.data
     out = similar(d)
-    out[end] = Char(0)
+    out[end] = 0
 
-    for i = 1:(length(d)-1)
+    @inbounds for i = 1:(length(d)-1)
         c2 = f(d[i])
         if !isa(c2, Char)
             throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))