Add Parsers.parse(Number, source) for number type detection + parsing

JuliaData · Nov 8, 2022 · 5911408 · 5911408
1 parent 5bdff9c
commit 5911408
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 35 deletions.
diff --git a/src/Parsers.jl b/src/Parsers.jl
@@ -305,14 +305,14 @@ const SourceType = Union{AbstractVector{UInt8}, AbstractString, IO}
 # Keyword-argument entry point: packs `kw...` into an `Options` and forwards to the
 # positional `xparse` method. `pos` defaults to 1; `len` defaults to 0 for an `IO`
 # source and to `sizeof(source)` otherwise.
 xparse(::Type{T}, source::SourceType; pos::Integer=1, len::Integer=source isa IO ? 0 : sizeof(source), kw...) where {T} =
     xparse(T, source, pos, len, Options(; kw...))
 
 # Builds the full parsing pipeline from `options` and runs it on `(T, source, pos, len, S)`.
 # Layers, outermost first: emptysentinel, delimiter, whitespace, quoted, whitespace,
 # sentinel, typeparser; the composition is wrapped in `Result`.
 # The `+` signature drops the `T <: SupportedTypes` bound carried by the `-` signature.
-@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
+@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
     Result(emptysentinel(options)(delimiter(options)(whitespace(options)(
         quoted(options)(whitespace(options)(sentinel(options)(typeparser(options)
     )))))))(T, source, pos, len, S)
 
 function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
     buf = source isa AbstractString ? codeunits(source) : source
-    if supportedtype(T)
+    if supportedtype(T) || T === Number
         return _xparse(T, buf, pos, len, options, S)
     else
         # generic fallback calls Base.tryparse
@@ -334,12 +334,12 @@ function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Typ
 end
 
 # condensed version of xparse that doesn't worry about quoting or delimiters; called from Parsers.parse/Parsers.tryparse
 # the pipeline here is only `Result(whitespace(...)(typeparser(opts)))`, run on `(T, source, pos, len, S)`
 # the `+` signature drops the `T <: SupportedTypes` bound carried by the `-` signature
-@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
+@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
     Result(whitespace(false, false, false, true)(typeparser(opts)))(T, source, pos, len, S)
 
 @inline function xparse2(::Type{T}, source::SourceType, pos, len, options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
     buf = source isa AbstractString ? codeunits(source) : source
-    if supportedtype(T)
+    if supportedtype(T) || T === Number
         return _xparse2(T, buf, pos, len, options, S)
     else
         # generic fallback calls Base.tryparse
@@ -386,7 +386,7 @@ include("dates.jl")
 
 # Module initializer: empties each scratch vector and resizes it to
 # `Threads.nthreads()` entries (presumably one slot per thread for
 # `access_threaded` — confirm). The commit renames `BIGFLOAT` to `BIGFLOATS`.
 function __init__()
     resize!(empty!(BIGINT), Threads.nthreads())
-    resize!(empty!(BIGFLOAT), Threads.nthreads())
+    resize!(empty!(BIGFLOATS), Threads.nthreads())
     return
 end
 

diff --git a/src/floats.jl b/src/floats.jl
@@ -1,6 +1,7 @@
 using Base.MPFR, Base.GMP, Base.GMP.MPZ
 
 # widen a 64-bit integer to the 128-bit type of the same signedness
 _widen(x::UInt64) = UInt128(x)
+_widen(x::Int64) = Int128(x)
 
 const BIGINT = BigInt[]
 
@@ -17,10 +18,10 @@ function access_threaded(f, v::Vector)
 end
 @noinline _length_assert() =  @assert false "0 < tid <= v"
 
 # Widen a 128-bit integer to a `BigInt` obtained from the `BIGINT` scratch vector via
 # `access_threaded`. The 16 bytes of `v` are loaded with GMP's `mpz_import`
 # (count = 1 word, order = 1, size = 16 bytes, endian = 0, nails = 0).
 # NOTE(review): `mpz_import` takes no sign from the data, so with `T === Int128` this
 # presumably relies on `v` being non-negative — confirm against callers.
-function _widen(v::UInt128)
+function _widen(v::T) where {T <: Union{Int128, UInt128}}
     x = access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), BIGINT)
     ccall((:__gmpz_import, :libgmp), Int32,
-        (Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{UInt128}),
+        (Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{T}),
         x, 1, 1, 16, 0, 0, v)
     return x
 end
@@ -43,12 +44,44 @@ function _muladd(ten, digits::BigInt, b)
     return digits
 end
 
+# Tag for the float width detected while parsing
+@enum FloatType FLOAT16 FLOAT32 FLOAT64 BIGFLOAT
+
+# Resolve the concrete float type to produce: an explicitly requested
+# `AbstractFloat` type always wins; for any other `T` the detected `FT` tag decides.
+float_type(::Type{T}, FT::FloatType) where {T <: AbstractFloat} = T
+function float_type(T, FT::FloatType)
+    if FT === FLOAT16
+        return Float16
+    elseif FT === FLOAT32
+        return Float32
+    elseif FT === FLOAT64
+        return Float64
+    else
+        return BigFloat
+    end
+end
+
 # for non SupportedFloat Reals, parse as Float64, then convert
 # `pos`, `code` and `pl` come back from the Float64 parser untouched; only the value
 # is converted, via the constructor call `T(x)`
 @inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: Real}
     pos, code, pl, x = typeparser(Float64, source, pos, len, b, code, pl, options)
     return pos, code, pl, T(x)
 end
 
+# Parse a `BigFloat` by handing the raw bytes to MPFR's `mpfr_strtofr`.
+#
+# For an `AbstractVector{UInt8}` source the buffer is read in place, starting at `pos`.
+# For any other source (an `IO`), the field is first located with the `String`
+# typeparser, its bytes are copied into a temporary buffer, and the stream is put back
+# at the position it had after that scan.
+#
+# Returns `(pos, code, PosLen, value)`: `pos` is advanced by the number of bytes MPFR
+# consumed; `code` gets `INVALID` when nothing was consumed and `OK` otherwise.
+function typeparser(::Type{BigFloat}, source, pos, len, b, code, pl, options)
+    base = 0 # 0 lets MPFR pick the base from the input's prefix
+    rounding = Base.MPFR.ROUNDING_MODE[]
+    z = BigFloat(precision=Base.MPFR.DEFAULT_PRECISION[])
+    if source isa AbstractVector{UInt8}
+        str = source
+        # the field starts at `pos`, which is not necessarily the first byte of the buffer
+        strpos = pos
+    else
+        _, _, _pl, _ = typeparser(String, source, pos, len, b, code, pl, options)
+        _pos = position(source)
+        vpos, vlen = _pl.pos, _pl.len
+        fastseek!(source, vpos - 1)
+        str = Base.StringVector(vlen)
+        readbytes!(source, str, vlen)
+        fastseek!(source, _pos) # reset IO to earlier position
+        # the temporary buffer holds only the field, so parsing starts at its first byte
+        strpos = 1
+    end
+    # NOTE(review): mpfr_strtofr reads a NUL-terminated C string and is not told about
+    # `len`; presumably `source` is backed by a String (which is NUL-terminated) — confirm.
+    GC.@preserve str begin
+        ptr = pointer(str, strpos)
+        endptr = Ref{Ptr{UInt8}}()
+        ccall((:mpfr_strtofr, :libmpfr), Int32, (Ref{BigFloat}, Ptr{UInt8}, Ref{Ptr{UInt8}}, Int32, Base.MPFR.MPFRRoundingMode), z, ptr, endptr, base, rounding)
+        # endptr == ptr means MPFR consumed no characters, i.e. no number was found
+        code |= endptr[] == ptr ? INVALID : OK
+        pos += Int(endptr[] - ptr)
+        return pos, code, PosLen(pl.pos, pos - pl.pos), z
+    end
+end
+
 @inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: SupportedFloats}
     # keep track of starting pos in case of invalid, we can rewind to start of parsing
     startpos = pos
@@ -89,7 +122,7 @@ end
                     if eof(source, pos, len)
                         code |= EOF
                     end
-                    code |= OK
+                    code |= OK | SPECIAL_VALUE
                     @goto done
                 end
             end
@@ -111,7 +144,7 @@ end
                 b = peekbyte(source, pos)
                 if b == UInt8('f') || b == UInt8('F')
                     x = ifelse(neg, T(-Inf), T(Inf))
-                    code |= OK
+                    code |= OK | SPECIAL_VALUE
                     pos += 1
                     incr!(source)
                     if eof(source, pos, len)
@@ -267,12 +300,13 @@ end
 # same as above; if digits overflows, we want a non-inlined version to call with a wider type
 # note that we never expect `frac` to overflow, since it just keeps track of the number of digits
 # we parse post-decimal point
 # the `+` signature drops the `T <: SupportedFloats` bound carried by the `-` signature
-@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType} =
+@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType} =
     parsefrac(T, source, pos, len, b, code, options, digits, neg, startpos, frac)
 
-@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType}
+@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType}
     x = zero(T)
     parsedanyfrac = false
+    FT = FLOAT64
     # check if `b` is a digit
     if b - UInt8('0') < 0x0a
         b -= UInt8('0')
@@ -285,7 +319,7 @@ end
             frac += UInt64(1)
             if eof(source, pos, len)
                 # input is simple non-scientific-notation floating number, like "1.1"
-                x = scale(T, digits, -signed(frac), neg)
+                x = scale(T, FT, digits, -signed(frac), neg)
                 code |= OK | EOF
                 @goto done
             end
@@ -299,6 +333,9 @@ end
     end
     # check for exponent notation
     if b == UInt8('e') || b == UInt8('E') || b == UInt8('f') || b == UInt8('F')
+        if b == UInt8('f') || b == UInt8('F')
+            FT = FLOAT32
+        end
         pos += 1
         incr!(source)
         if eof(source, pos, len)
@@ -327,11 +364,11 @@ end
 
         # at this point, we've parsed X and Y in "X.YeZ", but not Z in a scientific notation exponent number
         # we start our exponent number at UInt64(0)
-        return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp)
+        return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp, FT)
     else
         # if no scientific notation, we're done, so scale digits + frac and return
         if parsedanyfrac
-            x = scale(T, digits, -signed(frac), neg)
+            x = scale(T, FT, digits, -signed(frac), neg)
         else
             x = ifelse(neg, -T(digits), T(digits))
         end
@@ -344,10 +381,10 @@ end
 
 # same no-inline story, but this time for the exponent number; it is probably even rarer for the exponent number to overflow
 # than the pre/post decimal digits, but we account for it all the same (a lot of float parsers don't account for this)
 # the `+` signature adds the `FT` argument, which is passed through to `parseexp` unchanged,
 # and drops the `T <: SupportedFloats` bound
-@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType} =
-    parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp)
+@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType} =
+    parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp, FT)
 
-@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType}
+@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType}
     x = zero(T)
     # note that `b` has already had `b - UInt8('0')` applied to it for parseexp
     while true
@@ -356,19 +393,19 @@ end
         incr!(source)
         if eof(source, pos, len)
             # we finished parsing input like "1.1e1"
-            x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
+            x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
             code |= OK | EOF
             @goto done
         end
         b = peekbyte(source, pos) - UInt8('0')
         # if we encounter a non-digit, that must mean we're done
         if b > 0x09
-            x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
+            x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
             code |= OK
             @goto done
         end
         if overflows(ExpType) && exp > overflowval(ExpType)
-            return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp)
+            return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp, FT)
         end
     end
 @label done
@@ -395,7 +432,14 @@ pow10(::Type{Float32}, e) = (@inbounds v = F32_SHORT_POWERS[e+1]; return v)
 # table lookup at index `e + 1`, without bounds checking (`@inbounds`);
 # note that the BigFloat method reads the Float64 table
 pow10(::Type{Float64}, e) = (@inbounds v = F64_SHORT_POWERS[e+1]; return v)
 pow10(::Type{BigFloat}, e) = (@inbounds v = F64_SHORT_POWERS[e+1]; return v)
 
-function scale(::Type{T}, v, exp, neg) where {T}
+# `unsigned` that also accepts a `BigInt`, which is returned unchanged
+_unsigned(x) = unsigned(x)
+_unsigned(x::BigInt) = x
+
+# Pick the concrete float type with `float_type(T, FT)`, then delegate to `__scale`
+# with the digits passed through `_unsigned`.
+scale(::Type{T}, FT::FloatType, v, exp, neg) where {T} =
+    __scale(float_type(T, FT), _unsigned(v), exp, neg)
+
+function __scale(::Type{T}, v, exp, neg) where {T}
     ms = maxsig(T)
     cl = ceillog5(T)
     if v < ms
@@ -409,7 +453,7 @@ function scale(::Type{T}, v, exp, neg) where {T}
         end
     end
     v == 0 && return zero(T)
-    if exp > 308
+    if exp > 308 && T != BigFloat
         return T(neg ? -Inf : Inf)
     elseif exp < -326
         # https://github.com/JuliaData/Parsers.jl/issues/83
@@ -485,9 +529,9 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
     if exp == 23
         # special-case concluded from https://github.com/JuliaLang/julia/issues/38509
         x = v * V(1e23)
-    elseif exp >= 0
+    elseif 0 <= exp < 290
         x = v * exp10(exp)
-    elseif exp < -308 || v > maxsig(T)
+    elseif exp < -308 || exp > 308 || v > maxsig(T)
         # if v is too large, we lose precision by just doing
         # v / exp10(-exp) since that only promotes to Float64
         # so detect and re-route to this branch where we widen v
@@ -500,15 +544,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
 end
 
 # reciprocals of 10^e for e = 309:327
 const BIGEXP10 = [1 / exp10(BigInt(e)) for e = 309:327]
-const BIGFLOAT = BigFloat[]
-if VERSION > v"1.5"
-const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=64)) for i = 1:308]
-else
-const BIGFLOATEXP10 = [exp10(BigFloat(i)) for i = 1:308]
-end
+const BIGFLOATS = BigFloat[] # scratch vector handed to `access_threaded`; sized in `__init__`
+const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=256)) for i = 1:308] # 10^i for i = 1:308, built from 256-bit-precision inputs
 
 function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
-    x = access_threaded(BigFloat, BIGFLOAT)
+    x = access_threaded(BigFloat, BIGFLOATS)
 
     ccall((:mpfr_set_z, :libmpfr), Int32,
         (Ref{BigFloat}, Ref{BigInt}, Int32),
@@ -531,7 +571,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
             x, x, y, MPFR.ROUNDING_MODE[])
     else
         # v * exp10(V(exp))
-        y = BIGFLOATEXP10[exp]
+        if exp <= 308
+            y = BIGFLOATEXP10[exp]
+        else
+            y = exp10(BigFloat(exp; precision=256))
+        end
         ccall((:mpfr_mul, :libmpfr), Int32,
             (Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Int32),
             x, x, y, MPFR.ROUNDING_MODE[])

diff --git a/src/ints.jl b/src/ints.jl
@@ -98,6 +98,7 @@ end
 
 @inline function typeparser(::Type{Number}, source, pos, len, b, code, pl, opts)
     startpos = pos
+    startcode = code
     # begin parsing
     neg = b == UInt8('-')
     if neg || b == UInt8('+')
@@ -112,7 +113,11 @@ end
     # parse rest of number
     digits = Int64(0)
     x, code, pos = parsedigits(Number, source, pos, len, b, code, opts, digits, neg, startpos)
-    return x, pos
+    if (x === Inf || x === -Inf) && !specialvalue(code)
+        # by default, parsedigits only has up to Float64 precision; if we overflow
+        # let's try BigFloat
+        return typeparser(BigFloat, source, startpos, len, b, startcode, pl, opts)
+    end
 
 @label done
     return pos, code, PosLen(pl.pos, pos - pl.pos), x

diff --git a/src/utils.jl b/src/utils.jl
@@ -61,6 +61,7 @@ const DELIMITED            = 0b0000000000001000 % ReturnCode
 const NEWLINE              = 0b0000000000010000 % ReturnCode
 const EOF                  = 0b0000000000100000 % ReturnCode
 const ESCAPED_STRING       = 0b0000001000000000 % ReturnCode
+const SPECIAL_VALUE        = 0b0000010000000000 % ReturnCode # set alongside OK by the float parser, e.g. when the input text itself spells an infinity
 
 # invalid flags
 const INVALID_QUOTED_FIELD = 0b1000000001000000 % ReturnCode
@@ -78,6 +79,7 @@ quoted(x::ReturnCode) = (x & QUOTED) == QUOTED
 delimited(x::ReturnCode) = (x & DELIMITED) == DELIMITED
 newline(x::ReturnCode) = (x & NEWLINE) == NEWLINE
 escapedstring(x::ReturnCode) = (x & ESCAPED_STRING) == ESCAPED_STRING
+specialvalue(x::ReturnCode) = (x & SPECIAL_VALUE) == SPECIAL_VALUE # true when the SPECIAL_VALUE bit is set in `x`
 invalidquotedfield(x::ReturnCode) = (x & INVALID_QUOTED_FIELD) == INVALID_QUOTED_FIELD
 invaliddelimiter(x::ReturnCode) = (x & INVALID_DELIMITER) == INVALID_DELIMITER
 overflow(x::ReturnCode) = (x & OVERFLOW) == OVERFLOW

diff --git a/test/floats.jl b/test/floats.jl
@@ -405,6 +405,30 @@ end
     @test res.val ≈ 100_000_000.99
 end
 
+@testset "BigFloats" begin
+    # every case checks agreement with Base.parse(BigFloat, ...); the inputs sit around
+    # the largest finite Float64 (1.7976931348623157e308)
+    @test Parsers.parse(BigFloat, "1.7976931348623157e308") == Base.parse(BigFloat, "1.7976931348623157e308")
+    @test Parsers.parse(BigFloat, "-1.7976931348623157e308") == Base.parse(BigFloat, "-1.7976931348623157e308")
+    # just past the largest finite Float64 - too large for Float64
+    @test Parsers.parse(BigFloat, "1.7976931348623159e308") == Base.parse(BigFloat, "1.7976931348623159e308")
+    @test Parsers.parse(BigFloat, "-1.7976931348623159e308") == Base.parse(BigFloat, "-1.7976931348623159e308")
+    # the Float64 rounding border is ...158079
+    # borderline - still okay for Float64
+    @test Parsers.parse(BigFloat, "1.7976931348623158e308") == Base.parse(BigFloat, "1.7976931348623158e308")
+    @test Parsers.parse(BigFloat, "-1.7976931348623158e308") == Base.parse(BigFloat, "-1.7976931348623158e308")
+    # borderline - too large for Float64
+    @test Parsers.parse(BigFloat, "1.797693134862315808e308") == Base.parse(BigFloat, "1.797693134862315808e308")
+    @test Parsers.parse(BigFloat, "-1.797693134862315808e308") == Base.parse(BigFloat, "-1.797693134862315808e308")
+
+    # around the Float64 overflow threshold (1e308 still fits in Float64; 2e308 and 1e309 do not)
+    @test Parsers.parse(BigFloat, "1e308") == Base.parse(BigFloat, "1e308")
+    @test Parsers.parse(BigFloat, "2e308") == Base.parse(BigFloat, "2e308")
+    @test Parsers.parse(BigFloat, "1e309") == Base.parse(BigFloat, "1e309")
+
+    # way too large for Float64
+    @test Parsers.parse(BigFloat, "1e310") == Base.parse(BigFloat, "1e310")
+    @test Parsers.parse(BigFloat, "-1e310") == Base.parse(BigFloat, "-1e310")
+end
+
 # https://github.com/JuliaData/CSV.jl/issues/916
 @test  Parsers.parse(Float64, "0.44311945001372019574271437679879349172") === 0.4431194500137202
 @test Parsers.parse(BigFloat, "0.44311945001372019574271437679879349172") == BigFloat("0.44311945001372019574271437679879349172")

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -682,8 +682,16 @@ end
 
 @testset "parse(Number, x)" begin
     # the literal on each right-hand side carries the number type the parser should detect
 
-    @test parse(Number, "1") === 1
-
+    @test Parsers.parse(Number, "1") === 1
+    @test Parsers.parse(Number, "1.0") === 1.0
+    @test Parsers.parse(Number, "1.0f0") === 1.0f0
+    @test Parsers.parse(Number, "1.0e0") === 1.0e0
+    # Int128 literal: one past typemax(Int64)
+    @test Parsers.parse(Number, "9223372036854775808") === 9223372036854775808
+    # BigInt: one past typemax(Int128)
+    @test Parsers.parse(Number, "170141183460469231731687303715884105728") == 170141183460469231731687303715884105728
+    # BigFloat promotion: 1e310 overflows Float64
+    @test Parsers.parse(Number, "1e310") == Base.parse(BigFloat, "1e310")
 end
 
 end # @testset "Parsers"