Skip to content

Commit

Permalink
Fix #10959 bugs with UTF-16/UTF-32 conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 3, 2015
1 parent c85f1be commit 0b6619a
Show file tree
Hide file tree
Showing 8 changed files with 984 additions and 109 deletions.
4 changes: 4 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,15 @@ include("iterator.jl")
include("osutils.jl")

# strings & printing
include("utferror.jl")
include("utftype.jl")
include("utfcheck.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
include("utf16.jl")
include("utf32.jl")
include("utfconvert.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
Expand Down
95 changes: 29 additions & 66 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,12 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

immutable UTF16String <: AbstractString
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(ArgumentError("UTF16String data must be NULL-terminated"))
end
new(data)
end
end

utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)

function length(s::UTF16String)
d = s.data
len = length(d) - 1
len == 0 && return 0
cnum = 0
for i = 1:len
@inbounds cnum += !utf16_is_trail(d[i])
@inbounds cnum += !is_surrogate_trail(d[i])
end
cnum
end
Expand All @@ -30,92 +15,69 @@ function endof(s::UTF16String)
d = s.data
i = length(d) - 1
i == 0 && return i
utf16_is_surrogate(d[i]) ? i-1 : i
return is_surrogate_codeunit(d[i]) ? i-1 : i
end

get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)

function next(s::UTF16String, i::Int)
if !utf16_is_surrogate(s.data[i])
return Char(s.data[i]), i+1
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
end
throw(ArgumentError("invalid UTF-16 character index"))
ch = s.data[i]
!is_surrogate_codeunit(ch) && return (Char(ch), i+1)
# check length, account for terminating \0
i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
!is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
ct = s.data[i+1]
!is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch)
Char(get_supplementary(ch, ct)), i+2
end

function reverseind(s::UTF16String, i::Integer)
j = length(s.data) - i
return Base.utf16_is_trail(s.data[j]) ? j-1 : j
return is_surrogate_trail(s.data[j]) ? j-1 : j
end

lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator

function reverse(s::UTF16String)
d =s.data
d = s.data
out = similar(d)
out[end] = 0 # NULL termination
n = length(d)
for i = 1:n-1
out[i] = d[n-i]
if Base.utf16_is_lead(out[i])
out[i],out[i-1] = out[i-1],out[i]
end
end
return UTF16String(out)
end

# TODO: optimize this
function encode16(s::AbstractString)
buf = UInt16[]
for ch in s
c = reinterpret(UInt32, ch)
if c < 0x10000
push!(buf, UInt16(c))
elseif c <= 0x10ffff
push!(buf, UInt16(0xd7c0 + (c>>10)))
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
@inbounds for i = 1:n-1
ch = d[n-i]
if is_surrogate_lead(ch)
out[i],out[i-1] = out[i-1],ch
else
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
out[i] = ch
end
end
push!(buf, 0) # NULL termination
UTF16String(buf)
UTF16String(out)
end

utf16(x) = convert(UTF16String, x)
convert(::Type{UTF16String}, s::UTF16String) = s
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data

# TODO: optimize this
convert(::Type{UTF8String}, s::UTF16String) =
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)

sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))

function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
i = 1
n = length(data) # this may include NULL termination; that's okay
while i < n # check for unpaired surrogates
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
@inbounds while i < n # check for unpaired surrogates
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
i += 2
elseif utf16_is_surrogate(data[i])
elseif is_surrogate_codeunit(data[i])
return false
else
i += 1
end
end
return i > n || !utf16_is_surrogate(data[i])
return i > n || !is_surrogate_codeunit(data[i])
end

unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
UTF16String(copy!(d,1, data,1, len))
@inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
end

convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
Expand Down Expand Up @@ -146,6 +108,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
UTF16String(d)
end

utf16(x) = convert(UTF16String, x)
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
function utf16(p::Union(Ptr{UInt16}, Ptr{Int16}))
Expand Down
56 changes: 13 additions & 43 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -1,42 +1,17 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## UTF-32 in the native byte order, i.e. plain old character arrays ##

immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(a::Vector{Char})
if length(a) < 1 || a[end] != Char(0)
throw(ArgumentError("UTF32String data must be NULL-terminated"))
end
new(a)
end
end
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

# UTF-32 basic functions
next(s::UTF32String, i::Int) = (s.data[i], i+1)
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1

utf32(x) = convert(UTF32String, x)
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
convert(::Type{UTF32String}, s::UTF32String) = s
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))

function convert(::Type{UTF32String}, s::AbstractString)
a = Array(Char, length(s) + 1)
i = 0
for c in s
a[i += 1] = c
end
a[end] = Char(0) # NULL terminate
UTF32String(a)
end
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)

function convert(::Type{UTF32String}, data::AbstractVector{Char})
len = length(data)
d = Array(Char, len + 1)
d[end] = Char(0) # NULL terminate
UTF32String(copy!(d,1, data,1, len))
@inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1))
end

convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
Expand All @@ -54,12 +29,9 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
convert(T, takebuf_string(s))
end

convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
convert(::Type{Array{Char}}, s::UTF32String) = s.data

reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
convert(::Type{Vector{Char}}, str::UTF32String) = str.data
convert(::Type{Array{Char}}, str::UTF32String) = str.data

sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
convert(Ptr{T}, pointer(s))

Expand All @@ -73,26 +45,24 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
copy!(d,1, data, 2, length(data)-1)
elseif data[1] == Char(0xfffe0000) # byte-swapped
d = Array(Char, length(data))
for i = 2:length(data)
d[i-1] = bswap(data[i])
end
@inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end
else
d = Array(Char, length(data) + 1)
copy!(d, 1, data, 1, length(data)) # assume native byte order
end
d[end] = Char(0) # NULL terminate
d[end] = 0 # NULL terminate
UTF32String(d)
end

function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
for i=1:length(str)
@inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
@inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
end
return true
end
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)

utf32(x) = convert(UTF32String, x)

utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
Expand All @@ -105,9 +75,9 @@ end
function map(f, s::UTF32String)
d = s.data
out = similar(d)
out[end] = Char(0)
out[end] = 0

for i = 1:(length(d)-1)
@inbounds for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
Expand Down
Loading

0 comments on commit 0b6619a

Please sign in to comment.