Skip to content

Commit

Permalink
Changes to error handling and check_string
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jun 3, 2015
1 parent 83bb673 commit 4592c7b
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 60 deletions.
4 changes: 2 additions & 2 deletions base/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =

function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
isodd(length(bytes)) && utf_errfunc(UTF_ERR_ODD_BYTES_16,length(bytes),0)
data = reinterpret(UInt16, bytes)
# check for byte-order mark (BOM):
if data[1] == 0xfeff # native byte order
Expand All @@ -98,7 +98,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
copy!(d,1, data,1, length(data)) # assume native byte order
end
d[end] = 0 # NULL terminate
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, d) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
UTF16String(d)
end

Expand Down
6 changes: 2 additions & 4 deletions base/utf32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =

function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF32String(Char[0])
length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
length(bytes) & 3 != 0 && utf_errfunc(UTF_ERR_ODD_BYTES_32, length(bytes))
data = reinterpret(Char, bytes)
# check for byte-order mark (BOM):
if data[1] == Char(0x0000feff) # native byte order
Expand Down Expand Up @@ -76,9 +76,7 @@ function map(f, s::UTF32String)

@inbounds for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
end
!isa(c2, Char) && utf_errfunc(UTF_ERR_MAP_CHAR,0,0)
out[i] = (c2::Char)
end
UTF32String(out)
Expand Down
14 changes: 4 additions & 10 deletions base/utfcheck.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,10 @@ end
@throws ArgumentError
""" ->
=#
function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
function check_string(dat::Vector{UInt8}, len::Int = sizeof(dat), pos::Int = 0 ; options::Integer=0)
local byt::UInt8, ch::UInt32, surr::UInt32
flags::UInt = 0
totalchar = num2byte = num3byte = num4byte = 0
pos = 0
len = sizeof(dat)
@inbounds while pos < len
ch = dat[pos += 1]
totalchar += 1
Expand Down Expand Up @@ -133,11 +131,10 @@ end
@throws ArgumentError
""" ->
=#
function check_string_utf16(dat::Vector{UInt16}, len::Int)
function check_string(dat::Vector{UInt16}, len::Int = sizeof(dat)>>>1, pos::Int = 0 ; options::Integer = 0)
local ch::UInt32
flags::UInt = 0
totalchar = num2byte = num3byte = num4byte = 0
pos = 0
@inbounds while pos < len
ch = dat[pos += 1]
totalchar += 1
Expand Down Expand Up @@ -177,11 +174,10 @@ end
@throws ArgumentError
""" ->
=#
function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
function check_string(dat::Vector{UInt32}, len::Int = sizeof(dat)>>>2, pos::Int = 0; options::Integer=0)
local ch::UInt32
flags::UInt = 0
totalchar = num2byte = num3byte = num4byte = 0
pos = 0
@inbounds while pos < len
ch = dat[pos += 1]
totalchar += 1
Expand Down Expand Up @@ -215,12 +211,10 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
return totalchar, flags, num4byte, num3byte, num2byte
end

function check_string_abs(str::AbstractString, options::Integer=0)
function check_string(str::AbstractString, len::Int = endof(str), pos::Int = start(str) ; options::Integer=0)
local ch::UInt32
flags::UInt = 0
totalchar = num2byte = num3byte = num4byte = 0
pos = start(str)
len = endof(str)
@inbounds while pos < len
ch, pos = next(str, pos)
totalchar += 1
Expand Down
24 changes: 12 additions & 12 deletions base/utfconvert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ end
"""
=#
function convert(::Type{UTF16String}, str::AbstractString)
len, flags, num4byte = check_string_abs(str)
len, flags, num4byte = check_string(str)
buf = Vector{UInt16}(len+num4byte+1)
out = 0
@inbounds for ch in str
Expand Down Expand Up @@ -71,7 +71,7 @@ end
"""
=#
function convert(::Type{UTF32String}, str::AbstractString)
len, flags = check_string_abs(str)
len, flags = check_string(str)
buf = Vector{Char}(len+1)
out = 0
@inbounds for ch in str ; buf[out += 1] = ch ; end
Expand All @@ -95,7 +95,7 @@ function convert(::Type{UTF16String}, str::UTF8String)
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf16
# Check that is correct UTF-8 encoding and get number of words needed
len, flags, num4byte = check_string_utf8(dat)
len, flags, num4byte = check_string(dat)
len += num4byte
buf = Vector{UInt16}(len+1)
@inbounds buf[len+1] = 0
Expand Down Expand Up @@ -143,7 +143,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt16})
# handle zero length string quickly
len == 0 && return UTF8String("")
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand All @@ -165,7 +165,7 @@ function convert(::Type{UTF8String}, str::UTF16String)
# handle zero length string quickly
len <= 1 && return UTF8String("")
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand All @@ -186,7 +186,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt32})
# handle zero length string quickly
len == 0 && return UTF8String("")
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2)
len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand All @@ -208,7 +208,7 @@ function convert(::Type{UTF8String}, str::UTF32String)
# handle zero length string quickly
len <= 1 && return UTF8String("")
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1)
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end
Expand Down Expand Up @@ -271,7 +271,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf32
# Validate UTF-8 encoding, and get number of words to create
len, flags = check_string_utf8(dat)
len, flags = check_string(dat)
# Optimize case where no characters > 0x7f
totlen = len+1
flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
Expand Down Expand Up @@ -329,7 +329,7 @@ function convert(::Type{UTF32String}, str::UTF16String)
# handle zero length string quickly (account for trailing \0)
len <= 2 && return empty_utf32
# get number of words to create
len, flags, num4byte = check_string_utf16(dat, len>>>1)
len, flags, num4byte = check_string(dat, len>>>1)
# No surrogate pairs, do optimized copy
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
local ch::UInt32
Expand Down Expand Up @@ -361,7 +361,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
# handle zero length string quickly
len <= 4 && return empty_utf16
# get number of words to allocate
len, flags, num4byte = check_string_utf32(dat, len>>>2)
len, flags, num4byte = check_string(dat, len>>>2)
len += num4byte + 1
# optimized path, no surrogates
num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
Expand All @@ -385,7 +385,7 @@ function convert(::Type{UTF16String}, str::UTF32String)
# handle zero length string quickly
len <= 4 && return empty_utf16
# get number of words to allocate
len, flags, num4byte = check_string_utf32(dat, len>>>2)
len, flags, num4byte = check_string(dat, len>>>2)
# optimized path, no surrogates
num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
return encode_to_utf16(dat, len + num4byte)
Expand Down Expand Up @@ -429,7 +429,7 @@ function convert(::Type{UTF16String}, str::ASCIIString)
end

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, data) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
fast_utf_copy(UTF16String, UInt16, length(data), data, true)
end

Expand Down
51 changes: 19 additions & 32 deletions base/utferror.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,46 +6,33 @@
""" ->
=#

const UTF_ERR_SHORT = 1
const UTF_ERR_CONT = 2
const UTF_ERR_LONG = 3
const UTF_ERR_NOT_LEAD = 4
const UTF_ERR_NOT_TRAIL = 5
const UTF_ERR_NOT_SURROGATE = 6
const UTF_ERR_MISSING_SURROGATE = 7
const UTF_ERR_INVALID = 8
const UTF_ERR_SURROGATE = 9
const UTF_ERR_NULL_16_TERMINATE = 10
const UTF_ERR_NULL_32_TERMINATE = 11
const UTF_ERR_MAX = 11
# Message templates for Unicode validation/conversion errors.
# Each template is passed to `utf_errfunc`, which substitutes the placeholders:
#   <<1>>  position (index) of the offending byte or character, or a byte count
#   <<2>>  the offending byte or character, rendered in hexadecimal
# Templates without placeholders are emitted unchanged.

# Errors detected while checking UTF-8 byte sequences
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"

# Errors related to surrogate code units and out-of-range characters
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"

# Errors about the shape of raw UTF-16 / UTF-32 input data
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"

# Error for map over a string when the mapped function does not return a Char
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"

const errMsgs = [
"invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)",
"invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
"invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
"not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)",
"not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)",
"not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>",
"missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)",
"invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
"surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
"UTF16String data must be NULL-terminated",
"UTF32String data must be NULL-terminated"
]
#=
@doc """
@brief Throws ArgumentError with information about the specific error, location, and character
@param[in] errcode Error code for Unicode error (one of UTF_ERR_*)
@param[in] message One of UTF_ERR_ messages
@param[in] charpos Index of invalid byte or character
@param[in] invchar Invalid byte or character
@throws never returns, always throws ArgumentError
@throws ArgumentError
""" ->
=#
@noinline function utf_errfunc(errcode::Integer, charpos, invchar)
if errcode < 1 || errcode > UTF_ERR_MAX
throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
end
throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar))))
# Throw an ArgumentError built from one of the UTF_ERR_* message templates.
#
# message - template string; occurrences of "<<1>>" are replaced with `charpos`
#           and occurrences of "<<2>>" with the hexadecimal form of `invchar`
# charpos - index of the invalid byte or character (or a byte count, for the
#           templates that report one); defaults to 0
# invchar - the invalid byte or character; defaults to 0
#
# The defaults let callers that have no position/character to report (or only a
# position) call this with one or two arguments.
# Never returns normally: always throws ArgumentError.
@noinline function utf_errfunc(message, charpos = 0, invchar = 0)
    # Fill in the position placeholder first, then the hexadecimal value placeholder.
    text = replace(message, "<<1>>", string(charpos))
    text = replace(text, "<<2>>", hex(invchar))
    throw(ArgumentError(text))
end

0 comments on commit 4592c7b

Please sign in to comment.