Skip to content

Commit

Permalink
Update based on review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 3, 2015
1 parent 7c57f34 commit 58cc026
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 38 deletions.
2 changes: 1 addition & 1 deletion base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ include("osutils.jl")
include("utferror.jl")
include("utftype.jl")
include("utfcheck.jl")
include("utfconvert.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
include("utf16.jl")
include("utf32.jl")
include("utfconvert.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
Expand Down
6 changes: 0 additions & 6 deletions base/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,6 @@ end
# Convert pointer(s) to the requested pointer type Ptr{T}, where T is Int16 or UInt16.
function unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end

# Build a UTF16String from a vector of UInt16 code units.
# Throws ArgumentError when isvalid(UTF16String, data) is false.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    isvalid(UTF16String, data) || throw(ArgumentError("invalid UTF16 data"))
    len = length(data)
    @inbounds begin
        # one extra element holds the trailing 0
        buf = copy!(Vector{UInt16}(len+1), 1, data, 1, len)
        buf[len+1] = 0
        return UTF16String(buf)
    end
end

# Flatten an arbitrary UInt16 array to one dimension (length(data) elements)
# and convert the reshaped result.
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    return convert(T, reshape(data, length(data)))
end

Expand Down
9 changes: 3 additions & 6 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@ reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))

# Size in bytes of the stored data, less the size of one Char
# (presumably the trailing 0 element the conversions store -- confirm).
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end

# Build a UTF32String from a vector of Chars by copying it into a new
# vector that has one extra element set to 0 at the end.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    len = length(data)
    @inbounds begin
        buf = copy!(Vector{Char}(len+1), 1, data, 1, len)
        buf[len+1] = 0
        return UTF32String(buf)
    end
end

# Accept Int32/UInt32 data by reinterpreting it as Char and converting that.
function convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T})
    chars = reinterpret(Char, data)
    return convert(UTF32String, chars)
end

Expand Down Expand Up @@ -45,7 +40,9 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
copy!(d,1, data, 2, length(data)-1)
elseif data[1] == Char(0xfffe0000) # byte-swapped
d = Array(Char, length(data))
@inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end
for i = 2:length(data)
@inbounds d[i-1] = bswap(data[i])
end
else
d = Array(Char, length(data) + 1)
copy!(d, 1, data, 1, length(data)) # assume native byte order
Expand Down
54 changes: 30 additions & 24 deletions base/utfconvert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,25 @@
# Functions to convert to different UTF encodings

# Quickly copy the data into a new vector and set the trailing \0
@inline function fast_utf_copy(T::Type{UInt16}, len, dat)
    @inbounds begin
        # `len` is the size of the new buffer; its last element becomes the 0
        buf = copy!(Vector{T}(len), dat)
        buf[len] = 0
        return UTF16String(buf)
    end
end
@inline function fast_utf_copy(T::Type{Char}, len, dat)
@inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len))
@inline function fast_utf_copy{S <: Union(UTF16String, UTF32String), T <: Union(UInt16, Char)}(::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    # Copy `dat` into a new Vector{T}, store a 0 in its last element, wrap it in S.
    #   flag == false: `len` is the size of the new buffer and all of `dat` is copied
    #   flag == true:  `len` elements of `dat` are copied into a buffer of len+1 elements
    @inbounds begin
        if flag
            buf = copy!(Vector{T}(len+1), 1, dat, 1, len)
            buf[len+1] = 0
        else
            buf = copy!(Vector{T}(len), dat)
            buf[len] = 0
        end
        return S(buf)
    end
end

# Get rest of character ch from 3-byte UTF-8 sequence in dat
@inline function get_utf8_3(dat, pos, ch)
@inline function get_utf8_3byte(dat, pos, ch)
    # `ch` is the lead byte; `pos` indexes the last byte of the sequence
    @inbounds begin
        top    = (ch & 0xf) << 12                  # low 4 bits of the lead byte
        middle = UInt32(dat[pos-1] & 0x3f) << 6    # 6 bits from the first trailing byte
        bottom = dat[pos] & 0x3f                   # 6 bits from the last trailing byte
        return top | middle | bottom
    end
end

# Get rest of character ch from 4-byte UTF-8 sequence in dat
@inline function get_utf8_4(dat, pos, ch)
@inline function get_utf8_4byte(dat, pos, ch)
    # `ch` is the lead byte; `pos` indexes the last byte of the sequence
    @inbounds begin
        b18 = (ch & 0x7) << 18                   # low 3 bits of the lead byte
        b12 = UInt32(dat[pos-2] & 0x3f) << 12    # first trailing byte
        b6  = UInt32(dat[pos-1] & 0x3f) << 6     # second trailing byte
        b0  = dat[pos] & 0x3f                    # last trailing byte
        return b18 | b12 | b6 | b0
    end
end

# Output a character as a 4-byte UTF-8 sequence
@inline function output_utf8_4(buf, out, ch)
@inline function output_utf8_4byte!(buf, out, ch)
@inbounds begin
buf[out + 1] = 0xf0 | (ch >>> 18)
buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
Expand Down Expand Up @@ -117,11 +114,11 @@ function convert(::Type{UTF16String}, str::UTF8String)
# Handle range 0x800-0xffff
elseif ch < 0xf0
pos += 2
buf[out += 1] = get_utf8_3(dat, pos, ch)
buf[out += 1] = get_utf8_3byte(dat, pos, ch)
# Handle range 0x10000-0x10ffff
else
pos += 3
ch = get_utf8_4(dat, pos, ch)
ch = get_utf8_4byte(dat, pos, ch)
# output surrogate pair
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
Expand Down Expand Up @@ -241,12 +238,12 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
buf[out += 1] = 0xc0 | (ch >>> 6)
buf[out += 1] = 0x80 | (ch & 0x3f)
# Handle 0x10000-0x10ffff (if input is UInt32)
elseif T == UInt32 && ch > 0xffff
output_utf8_4(buf, out, ch)
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
output_utf8_4byte!(buf, out, ch)
out += 4
# Handle surrogate pairs
elseif is_surrogate_codeunit(ch)
output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1]))
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
out += 4
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
else
Expand Down Expand Up @@ -277,7 +274,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
len, flags = check_string_utf8(dat)
# Optimize case where no characters > 0x7f
totlen = len+1
flags == 0 && return fast_utf_copy(Char, totlen, dat)
flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
# has multi-byte UTF-8 sequences
buf = Vector{Char}(totlen)
@inbounds buf[totlen] = 0 # NULL termination
Expand All @@ -295,7 +292,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
# Handle range 0x800-0xffff
elseif ch < 0xf0
pos += 2
ch = get_utf8_3(dat, pos, ch)
ch = get_utf8_3byte(dat, pos, ch)
# Handle surrogate pairs (should have been encoded in 4 bytes)
if is_surrogate_lead(ch)
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
Expand All @@ -309,7 +306,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
# Handle range 0x10000-0x10ffff
else
pos += 3
buf[out += 1] = get_utf8_4(dat, pos, ch)
buf[out += 1] = get_utf8_4byte(dat, pos, ch)
end
end
UTF32String(buf)
Expand Down Expand Up @@ -367,7 +364,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
len, flags, num4byte = check_string_utf32(dat, len>>>2)
len += num4byte + 1
# optimized path, no surrogates
num4byte == 0 && return fast_utf_copy(UInt16, len, dat)
num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
return encode_to_utf16(dat, len)
end

Expand Down Expand Up @@ -423,22 +420,31 @@ end

# Convert a vector of Chars to UTF-8 by viewing it as UInt32 data and
# converting that.
function convert(::Type{UTF8String}, dat::Vector{Char})
    return convert(UTF8String, reinterpret(UInt32, dat))
end

# A UTF16String converts to itself unchanged.
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end
# A vector of Chars is viewed as UInt32 data and converted from that.
function convert(::Type{UTF16String}, dat::Vector{Char})
    return convert(UTF16String, reinterpret(UInt32, dat))
end

# Convert an ASCIIString to a UTF16String by copying its bytes with fast_utf_copy.
# length(dat)+1 is passed as the buffer length so there is room for the trailing 0.
# NOTE(review): the two fast_utf_copy calls below look like the before/after
# lines of a diff (3-argument vs. 4-argument form) -- confirm only one is intended.
function convert(::Type{UTF16String}, str::ASCIIString)
dat = str.data
fast_utf_copy(UInt16, length(dat)+1, dat)
fast_utf_copy(UTF16String, UInt16, length(dat)+1, dat)
end

function convert(::Type{UTF32String}, str::ASCIIString)
dat = str.data
fast_utf_copy(Char, length(dat)+1, dat)
# Build a UTF16String from a vector of UInt16 code units.
# Throws ArgumentError when isvalid(UTF16String, data) is false.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    if !isvalid(UTF16String, data)
        throw(ArgumentError("invalid UTF16 data"))
    end
    # `true`: length(data) elements are copied into a buffer one element longer
    return fast_utf_copy(UTF16String, UInt16, length(data), data, true)
end

# A UTF16String converts to itself unchanged.
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end
# A vector of Chars is viewed as UInt32 data and converted from that.
function convert(::Type{UTF16String}, dat::Vector{Char})
    return convert(UTF16String, reinterpret(UInt32, dat))
end

# Both methods return the string's own `data` field; no copy is made here.
function convert(::Type{Vector{UInt16}}, str::UTF16String)
    return str.data
end
function convert(::Type{Array{UInt16}}, str::UTF16String)
    return str.data
end

# A UTF32String converts to itself unchanged.
function convert(::Type{UTF32String}, str::UTF32String)
    return str
end

# A single Char becomes a two-element vector: the Char followed by Char(0).
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Convert an ASCIIString to a UTF32String by copying its bytes into Chars.
# The buffer length is one more than the byte count, leaving room for the trailing 0.
function convert(::Type{UTF32String}, str::ASCIIString)
    bytes = str.data
    total = length(bytes)+1
    return fast_utf_copy(UTF32String, Char, total, bytes)
end

# Copy a vector of Chars into a new UTF32String. Passing `true` makes
# fast_utf_copy copy length(dat) elements into a buffer one element longer.
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
    return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end


2 changes: 1 addition & 1 deletion base/utferror.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ const errMsgs = [
@throws never returns, always throws ArgumentError
""" ->
=#
function utf_errfunc(errcode::Integer, charpos, invchar)
@noinline function utf_errfunc(errcode::Integer, charpos, invchar)
if errcode < 1 || errcode > UTF_ERR_MAX
throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
end
Expand Down

0 comments on commit 58cc026

Please sign in to comment.