Changes to error handling and check_string

JuliaLang · Jun 3, 2015 · 4592c7b · 4592c7b
1 parent 83bb673
commit 4592c7b
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 60 deletions.
diff --git a/base/utf16.jl b/base/utf16.jl
@@ -82,7 +82,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
 
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
+    isodd(length(bytes)) && utf_errfunc(UTF_ERR_ODD_BYTES_16,length(bytes),0)
     data = reinterpret(UInt16, bytes)
     # check for byte-order mark (BOM):
     if data[1] == 0xfeff        # native byte order
@@ -98,7 +98,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
         copy!(d,1, data,1, length(data)) # assume native byte order
     end
     d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, d) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
     UTF16String(d)
 end
 

diff --git a/base/utf32.jl b/base/utf32.jl
@@ -32,7 +32,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
 
 function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF32String(Char[0])
-    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
+    length(bytes) & 3 != 0 && utf_errfunc(UTF_ERR_ODD_BYTES_32, length(bytes))
     data = reinterpret(Char, bytes)
     # check for byte-order mark (BOM):
     if data[1] == Char(0x0000feff) # native byte order
@@ -76,9 +76,7 @@ function map(f, s::UTF32String)
 
     @inbounds for i = 1:(length(d)-1)
         c2 = f(d[i])
-        if !isa(c2, Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
-        end
+        !isa(c2, Char) && utf_errfunc(UTF_ERR_MAP_CHAR,0,0)
         out[i] = (c2::Char)
     end
     UTF32String(out)

diff --git a/base/utfcheck.jl b/base/utfcheck.jl
@@ -38,12 +38,10 @@ end
 @throws     ArgumentError
 """ ->
 =#
-function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
+function check_string(dat::Vector{UInt8}, len::Int = sizeof(dat), pos::Int = 0 ; options::Integer=0)
     local byt::UInt8, ch::UInt32, surr::UInt32
     flags::UInt = 0
     totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
-    len = sizeof(dat)
     @inbounds while pos < len
         ch = dat[pos += 1]
         totalchar += 1
@@ -133,11 +131,10 @@ end
 @throws     ArgumentError
 """ ->
 =#
-function check_string_utf16(dat::Vector{UInt16}, len::Int)
+function check_string(dat::Vector{UInt16}, len::Int = sizeof(dat)>>>1, pos::Int = 0 ; options::Integer = 0)
     local ch::UInt32
     flags::UInt = 0
     totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
     @inbounds while pos < len
         ch = dat[pos += 1]
         totalchar += 1
@@ -177,11 +174,10 @@ end
 @throws     ArgumentError
 """ ->
 =#
-function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
+function check_string(dat::Vector{UInt32}, len::Int = sizeof(dat)>>>2, pos::Int = 0; options::Integer=0)
     local ch::UInt32
     flags::UInt = 0
     totalchar = num2byte = num3byte = num4byte = 0
-    pos = 0
     @inbounds while pos < len
         ch = dat[pos += 1]
         totalchar += 1
@@ -215,12 +211,10 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
     return totalchar, flags, num4byte, num3byte, num2byte
 end
 
-function check_string_abs(str::AbstractString, options::Integer=0)
+function check_string(str::AbstractString, len::Int = endof(str), pos::Int = start(str) ; options::Integer=0)
     local ch::UInt32
     flags::UInt = 0
     totalchar = num2byte = num3byte = num4byte = 0
-    pos = start(str)
-    len = endof(str)
     @inbounds while pos < len
         ch, pos = next(str, pos)
         totalchar += 1

diff --git a/base/utfconvert.jl b/base/utfconvert.jl
@@ -42,7 +42,7 @@ end
 """
 =#
 function convert(::Type{UTF16String}, str::AbstractString)
-    len, flags, num4byte = check_string_abs(str)
+    len, flags, num4byte = check_string(str)
     buf = Vector{UInt16}(len+num4byte+1)
     out = 0
     @inbounds for ch in str
@@ -71,7 +71,7 @@ end
 """
 =#
 function convert(::Type{UTF32String}, str::AbstractString)
-    len, flags = check_string_abs(str)
+    len, flags = check_string(str)
     buf = Vector{Char}(len+1)
     out = 0
     @inbounds for ch in str ; buf[out += 1] = ch ; end
@@ -95,7 +95,7 @@ function convert(::Type{UTF16String}, str::UTF8String)
     # handle zero length string quickly
     sizeof(dat) == 0 && return empty_utf16
     # Check that is correct UTF-8 encoding and get number of words needed
-    len, flags, num4byte = check_string_utf8(dat)
+    len, flags, num4byte = check_string(dat)
     len += num4byte
     buf = Vector{UInt16}(len+1)
     @inbounds buf[len+1] = 0
@@ -143,7 +143,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt16})
     # handle zero length string quickly
     len == 0 && return UTF8String("")
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
     return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -165,7 +165,7 @@ function convert(::Type{UTF8String}, str::UTF16String)
     # handle zero length string quickly
     len <= 1 && return UTF8String("")
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
     return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -186,7 +186,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt32})
     # handle zero length string quickly
     len == 0 && return UTF8String("")
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
     return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -208,7 +208,7 @@ function convert(::Type{UTF8String},  str::UTF32String)
     # handle zero length string quickly
     len <= 1 && return UTF8String("")
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
     return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -271,7 +271,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
     # handle zero length string quickly
     sizeof(dat) == 0 && return empty_utf32
     # Validate UTF-8 encoding, and get number of words to create
-    len, flags = check_string_utf8(dat)
+    len, flags = check_string(dat)
     # Optimize case where no characters > 0x7f
     totlen = len+1
     flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
@@ -329,7 +329,7 @@ function convert(::Type{UTF32String}, str::UTF16String)
     # handle zero length string quickly (account for trailing \0)
     len <= 2 && return empty_utf32
     # get number of words to create
-    len, flags, num4byte = check_string_utf16(dat, len>>>1)
+    len, flags, num4byte = check_string(dat, len>>>1)
     # No surrogate pairs, do optimized copy
     (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
     local ch::UInt32
@@ -361,7 +361,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
     # handle zero length string quickly
     len <= 4 && return empty_utf16
     # get number of words to allocate
-    len, flags, num4byte = check_string_utf32(dat, len>>>2)
+    len, flags, num4byte = check_string(dat, len>>>2)
     len += num4byte + 1
     # optimized path, no surrogates
     num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
@@ -385,7 +385,7 @@ function convert(::Type{UTF16String}, str::UTF32String)
     # handle zero length string quickly
     len <= 4 && return empty_utf16
     # get number of words to allocate
-    len, flags, num4byte = check_string_utf32(dat, len>>>2)
+    len, flags, num4byte = check_string(dat, len>>>2)
     # optimized path, no surrogates
     num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
     return encode_to_utf16(dat, len + num4byte)
@@ -429,7 +429,7 @@ function convert(::Type{UTF16String}, str::ASCIIString)
 end
 
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, data) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
     fast_utf_copy(UTF16String, UInt16, length(data), data, true)
 end
 

diff --git a/base/utferror.jl b/base/utferror.jl
@@ -6,46 +6,33 @@
 """ ->
 =#
 
-const UTF_ERR_SHORT = 1
-const UTF_ERR_CONT  = 2
-const UTF_ERR_LONG  = 3
-const UTF_ERR_NOT_LEAD = 4
-const UTF_ERR_NOT_TRAIL = 5
-const UTF_ERR_NOT_SURROGATE = 6
-const UTF_ERR_MISSING_SURROGATE = 7
-const UTF_ERR_INVALID = 8
-const UTF_ERR_SURROGATE = 9
-const UTF_ERR_NULL_16_TERMINATE = 10
-const UTF_ERR_NULL_32_TERMINATE = 11
-const UTF_ERR_MAX = 11
+const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)" # <<1>> and <<2>> are placeholders substituted by utf_errfunc
+const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
+const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
+const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
+const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
+const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
+const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
+const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
+const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
+const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
+const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
 
-const errMsgs = [
-    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)",
-    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
-    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
-    "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)",
-    "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)",
-    "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>",
-    "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)",
-    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
-    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
-    "UTF16String data must be NULL-terminated",
-    "UTF32String data must be NULL-terminated"
-]
 #=
 @doc """
 @brief      Throws ArgumentError with information about the specific error, location, and character
 
-@param[in]  errcode Error code for Unicode error (one of UTF_ERR_*)
+@param[in]  message One of UTF_ERR_ messages
 @param[in]  charpos Index of invalid byte or character
 @param[in]  invchar Invalid byte or character
 
-@throws never returns, always throws ArgumentError
+@throws     ArgumentError
 """ ->
 =#
-@noinline function utf_errfunc(errcode::Integer, charpos, invchar)
-    if errcode < 1 || errcode > UTF_ERR_MAX
-        throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
-    end
-    throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar))))
+@noinline function utf_errfunc(message, charpos = 0, invchar = 0) # charpos/invchar default to 0 so callers may omit them, e.g. for messages with no <<2>> placeholder
+    throw(ArgumentError(replace(replace(message,"<<1>>",string(charpos)),"<<2>>",hex(invchar)))) # "<<1>>" becomes the position, "<<2>>" the hex form of the offending value
+end