Skip to content

Commit

Permalink
Add Parsers.parse(Number, source) for number type detection + parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
quinnj committed Nov 8, 2022
1 parent 5bdff9c commit 5911408
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 35 deletions.
10 changes: 5 additions & 5 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,14 @@ const SourceType = Union{AbstractVector{UInt8}, AbstractString, IO}
# Keyword-argument entry point: gathers the parsing keywords into an `Options`
# object and forwards to the positional `xparse` method.
# `pos` defaults to 1; `len` defaults to 0 for `IO` sources and to
# `sizeof(source)` for any other source.
function xparse(::Type{T}, source::SourceType;
        pos::Integer=1,
        len::Integer=(source isa IO ? 0 : sizeof(source)),
        kw...) where {T}
    opts = Options(; kw...)
    return xparse(T, source, pos, len, opts)
end

# Builds the full delimited-field parser by nesting the parsing layers and calls it
# with `(T, source, pos, len, S)`. From innermost to outermost the layers are:
# typeparser, sentinel, whitespace, quoted, whitespace, delimiter, emptysentinel, Result.
# `S` defaults to `PosLen` when `T <: AbstractString` and to `T` otherwise.
# NOTE(review): the two signature lines below are the removed and added sides of the
# diff; the second one drops the `T <: SupportedTypes` bound, which lets `xparse`
# forward `T === Number` to this method.
@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
Result(emptysentinel(options)(delimiter(options)(whitespace(options)(
quoted(options)(whitespace(options)(sentinel(options)(typeparser(options)
)))))))(T, source, pos, len, S)

function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
buf = source isa AbstractString ? codeunits(source) : source
if supportedtype(T)
if supportedtype(T) || T === Number
return _xparse(T, buf, pos, len, options, S)
else
# generic fallback calls Base.tryparse
Expand All @@ -334,12 +334,12 @@ function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Typ
end

# Condensed version of xparse that doesn't worry about quoting or delimiters; called from
# Parsers.parse/Parsers.tryparse. Only a `whitespace(false, false, false, true)` layer
# wraps the type parser before the result is wrapped in `Result` and invoked with
# `(T, source, pos, len, S)`. `S` defaults to `PosLen` for string types, else to `T`.
# NOTE(review): the two signature lines below are the removed and added sides of the
# diff; the second one drops the `T <: SupportedTypes` bound.
@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
Result(whitespace(false, false, false, true)(typeparser(opts)))(T, source, pos, len, S)

@inline function xparse2(::Type{T}, source::SourceType, pos, len, options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
buf = source isa AbstractString ? codeunits(source) : source
if supportedtype(T)
if supportedtype(T) || T === Number
return _xparse2(T, buf, pos, len, options, S)
else
# generic fallback calls Base.tryparse
Expand Down Expand Up @@ -386,7 +386,7 @@ include("dates.jl")

# Module initializer: empties the scratch vectors and resizes them to
# `Threads.nthreads()` entries.
# NOTE(review): presumably one slot per thread, filled on demand through
# `access_threaded` — confirm. The `BIGFLOAT` / `BIGFLOATS` lines are the removed and
# added sides of the diff (the vector is renamed to `BIGFLOATS` by this commit).
function __init__()
resize!(empty!(BIGINT), Threads.nthreads())
resize!(empty!(BIGFLOAT), Threads.nthreads())
resize!(empty!(BIGFLOATS), Threads.nthreads())
return
end

Expand Down
98 changes: 71 additions & 27 deletions src/floats.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Base.MPFR, Base.GMP, Base.GMP.MPZ

# Promote a 64-bit integer to the 128-bit integer type of the same signedness.
function _widen(x::UInt64)
    return convert(UInt128, x)
end

function _widen(x::Int64)
    return convert(Int128, x)
end

const BIGINT = BigInt[]

Expand All @@ -17,10 +18,10 @@ function access_threaded(f, v::Vector)
end
# Out-of-line failure path: always raises an AssertionError carrying the message below.
# Marked `@noinline`, presumably so the throwing code stays out of the caller's body.
@noinline function _length_assert()
    @assert false "0 < tid <= v"
end

# Widen a 128-bit integer into a BigInt taken from the `BIGINT` scratch vector via
# `access_threaded` (created with `nbits=256` on Julia versions newer than 1.5).
# The 16 bytes of `v` are copied into the BigInt with GMP's `mpz_import`.
# NOTE(review): the returned BigInt comes from the shared scratch vector, so it is
# presumably overwritten by the next call on the same thread — confirm before storing it.
# NOTE(review): `mpz_import` reads an unsigned magnitude; a negative `Int128` would be
# imported as its raw bit pattern — presumably callers only pass non-negative values.
# NOTE(review): the paired signature and ccall-type lines are the removed and added
# sides of the diff.
function _widen(v::UInt128)
function _widen(v::T) where {T <: Union{Int128, UInt128}}
x = access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), BIGINT)
# mpz_import(rop, count, order, size, endian, nails, op):
# one word of 16 bytes, native endianness, no nail bits
ccall((:__gmpz_import, :libgmp), Int32,
(Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{UInt128}),
(Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{T}),
x, 1, 1, 16, 0, 0, v)
return x
end
Expand All @@ -43,12 +44,44 @@ function _muladd(ten, digits::BigInt, b)
return digits
end

# Tag recording which float type a parsed literal should be materialized as.
@enum FloatType FLOAT16 FLOAT32 FLOAT64 BIGFLOAT

# When the requested type is already an AbstractFloat subtype, it wins and the
# tag is ignored.
function float_type(::Type{T}, ::FloatType) where {T <: AbstractFloat}
    return T
end

# For any other requested type, the tag decides the concrete float type.
function float_type(T, FT::FloatType)
    FT === FLOAT16 && return Float16
    FT === FLOAT32 && return Float32
    FT === FLOAT64 && return Float64
    return BigFloat
end

# Fallback for Real types without a dedicated float parser: parse the value as a
# Float64 first, then convert the parsed value to `T`. Position, return code and
# PosLen are passed through from the Float64 parse unchanged.
@inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: Real}
    newpos, newcode, newpl, val = typeparser(Float64, source, pos, len, b, code, pl, options)
    return newpos, newcode, newpl, T(val)
end

# Parse a `BigFloat` by handing the raw bytes to MPFR's `mpfr_strtofr`.
#
# `source` is either a byte vector or an `IO`; `pos` is the position of the first
# byte of the value, `code` the return code accumulated so far, and `pl` the PosLen
# whose `pos` marks the start of the field.
# Returns `(pos, code, PosLen, value)`: `pos` is advanced by the number of bytes MPFR
# consumed, and `code` gains `INVALID` when nothing was consumed and `OK` otherwise.
#
# NOTE(review): `mpfr_strtofr` reads a NUL-terminated C string and does not look at
# `len`; this assumes the memory behind a byte-vector `source` is NUL-terminated
# (presumably true for String-backed data) — confirm for arbitrary byte vectors.
function typeparser(::Type{BigFloat}, source, pos, len, b, code, pl, options)
    base = 0 # 0 lets MPFR pick the base from the input's prefix
    rounding = Base.MPFR.ROUNDING_MODE[]
    z = BigFloat(precision=Base.MPFR.DEFAULT_PRECISION[])
    if source isa AbstractVector{UInt8}
        str = source
        # the value starts at index `pos`, which is not necessarily the first byte
        # of the buffer, so parsing must begin there
        strpos = pos
    else
        # locate the value's bytes in the IO, copy them into a temporary buffer,
        # then restore the IO position
        _, _, _pl, _ = typeparser(String, source, pos, len, b, code, pl, options)
        _pos = position(source)
        vpos, vlen = _pl.pos, _pl.len
        fastseek!(source, vpos - 1)
        str = Base.StringVector(vlen)
        readbytes!(source, str, vlen)
        fastseek!(source, _pos) # reset IO to earlier position
        # the temporary buffer holds only the value, so parsing starts at its first byte
        strpos = 1
    end
    GC.@preserve str begin
        # elements are single bytes, so the byte offset is simply `strpos - 1`
        ptr = pointer(str) + (strpos - 1)
        endptr = Ref{Ptr{UInt8}}()
        ccall((:mpfr_strtofr, :libmpfr), Int32, (Ref{BigFloat}, Ptr{UInt8}, Ref{Ptr{UInt8}}, Int32, Base.MPFR.MPFRRoundingMode), z, ptr, endptr, base, rounding)
        # MPFR leaves `endptr` at the start when no number could be parsed
        code |= endptr[] == ptr ? INVALID : OK
        pos += Int(endptr[] - ptr)
        return pos, code, PosLen(pl.pos, pos - pl.pos), z
    end
end

@inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: SupportedFloats}
# keep track of starting pos in case of invalid, we can rewind to start of parsing
startpos = pos
Expand Down Expand Up @@ -89,7 +122,7 @@ end
if eof(source, pos, len)
code |= EOF
end
code |= OK
code |= OK | SPECIAL_VALUE
@goto done
end
end
Expand All @@ -111,7 +144,7 @@ end
b = peekbyte(source, pos)
if b == UInt8('f') || b == UInt8('F')
x = ifelse(neg, T(-Inf), T(Inf))
code |= OK
code |= OK | SPECIAL_VALUE
pos += 1
incr!(source)
if eof(source, pos, len)
Expand Down Expand Up @@ -267,12 +300,13 @@ end
# Non-inlined forwarder to `parsefrac`.
# Same as above; if `digits` overflows, we want a non-inlined version to call with a wider type.
# Note that we never expect `frac` to overflow, since it just keeps track of the number of
# digits we parse after the decimal point.
# NOTE(review): the two signature lines below are the removed and added sides of the
# diff; the second one drops the `T <: SupportedFloats` bound.
@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType} =
@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType} =
parsefrac(T, source, pos, len, b, code, options, digits, neg, startpos, frac)

@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType}
@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType}
x = zero(T)
parsedanyfrac = false
FT = FLOAT64
# check if `b` is a digit
if b - UInt8('0') < 0x0a
b -= UInt8('0')
Expand All @@ -285,7 +319,7 @@ end
frac += UInt64(1)
if eof(source, pos, len)
# input is simple non-scientific-notation floating number, like "1.1"
x = scale(T, digits, -signed(frac), neg)
x = scale(T, FT, digits, -signed(frac), neg)
code |= OK | EOF
@goto done
end
Expand All @@ -299,6 +333,9 @@ end
end
# check for exponent notation
if b == UInt8('e') || b == UInt8('E') || b == UInt8('f') || b == UInt8('F')
if b == UInt8('f') || b == UInt8('F')
FT = FLOAT32
end
pos += 1
incr!(source)
if eof(source, pos, len)
Expand Down Expand Up @@ -327,11 +364,11 @@ end

# at this point, we've parsed X and Y in "X.YeZ", but not Z in a scientific notation exponent number
# we start our exponent number at UInt64(0)
return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp)
return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp, FT)
else
# if no scientific notation, we're done, so scale digits + frac and return
if parsedanyfrac
x = scale(T, digits, -signed(frac), neg)
x = scale(T, FT, digits, -signed(frac), neg)
else
x = ifelse(neg, -T(digits), T(digits))
end
Expand All @@ -344,10 +381,10 @@ end

# Non-inlined forwarder to `parseexp`.
# Same no-inline story, but this time for the exponent number; it is probably even rarer to
# overflow the exponent number than the pre/post decimal digits, but we account for it all the
# same (a lot of float parsers don't account for this). `parseexp` calls this with
# `_widen(exp)` once `exp` exceeds `overflowval(ExpType)`.
# NOTE(review): the two definitions below are the removed and added sides of the diff;
# the second one drops the `T <: SupportedFloats` bound and threads the `FT` float-type
# tag through to `parseexp`.
@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType} =
parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp)
@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType} =
parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp, FT)

@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType}
@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType}
x = zero(T)
# note that `b` has already had `b - UInt8('0')` applied to it for parseexp
while true
Expand All @@ -356,19 +393,19 @@ end
incr!(source)
if eof(source, pos, len)
# we finished parsing input like "1.1e1"
x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
code |= OK | EOF
@goto done
end
b = peekbyte(source, pos) - UInt8('0')
# if we encounter a non-digit, that must mean we're done
if b > 0x09
x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
code |= OK
@goto done
end
if overflows(ExpType) && exp > overflowval(ExpType)
return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp)
return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp, FT)
end
end
@label done
Expand All @@ -395,7 +432,14 @@ pow10(::Type{Float32}, e) = (@inbounds v = F32_SHORT_POWERS[e+1]; return v)
# Look up 10^e in the precomputed Float64 table (entry `e + 1` holds the power for `e`).
# The lookup skips bounds checking, so `e` must be a valid table offset.
function pow10(::Type{Float64}, e)
    @inbounds p = F64_SHORT_POWERS[e + 1]
    return p
end

# BigFloat reuses the Float64 table.
function pow10(::Type{BigFloat}, e)
    @inbounds p = F64_SHORT_POWERS[e + 1]
    return p
end

function scale(::Type{T}, v, exp, neg) where {T}
# BigInt is handed back untouched (the very same object).
function _unsigned(x::BigInt)
    return x
end

# Every other value goes through Base's `unsigned`.
function _unsigned(x)
    return unsigned(x)
end

# Entry point for scaling parsed digits by a power of ten.
# Resolves the concrete float type from the requested type `T` and the tag `FT`,
# converts the digits with `_unsigned`, and delegates the computation to `__scale`.
function scale(::Type{T}, FT::FloatType, v, exp, neg) where {T}
    F = float_type(T, FT)
    magnitude = _unsigned(v)
    return __scale(F, magnitude, exp, neg)
end

function __scale(::Type{T}, v, exp, neg) where {T}
ms = maxsig(T)
cl = ceillog5(T)
if v < ms
Expand All @@ -409,7 +453,7 @@ function scale(::Type{T}, v, exp, neg) where {T}
end
end
v == 0 && return zero(T)
if exp > 308
if exp > 308 && T != BigFloat
return T(neg ? -Inf : Inf)
elseif exp < -326
# https://github.com/JuliaData/Parsers.jl/issues/83
Expand Down Expand Up @@ -485,9 +529,9 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
if exp == 23
# special-case concluded from https://github.com/JuliaLang/julia/issues/38509
x = v * V(1e23)
elseif exp >= 0
elseif 0 <= exp < 290
x = v * exp10(exp)
elseif exp < -308 || v > maxsig(T)
elseif exp < -308 || exp > 308 || v > maxsig(T)
# if v is too large, we lose precision by just doing
# v / exp10(-exp) since that only promotes to Float64
# so detect and re-route to this branch where we widen v
Expand All @@ -500,15 +544,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
end

const BIGEXP10 = [1 / exp10(BigInt(e)) for e = 309:327]
const BIGFLOAT = BigFloat[]
if VERSION > v"1.5"
const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=64)) for i = 1:308]
else
const BIGFLOATEXP10 = [exp10(BigFloat(i)) for i = 1:308]
end
const BIGFLOATS = BigFloat[]
const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=256)) for i = 1:308]

function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
x = access_threaded(BigFloat, BIGFLOAT)
x = access_threaded(BigFloat, BIGFLOATS)

ccall((:mpfr_set_z, :libmpfr), Int32,
(Ref{BigFloat}, Ref{BigInt}, Int32),
Expand All @@ -531,7 +571,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
x, x, y, MPFR.ROUNDING_MODE[])
else
# v * exp10(V(exp))
y = BIGFLOATEXP10[exp]
if exp <= 308
y = BIGFLOATEXP10[exp]
else
y = exp10(BigFloat(exp; precision=256))
end
ccall((:mpfr_mul, :libmpfr), Int32,
(Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Int32),
x, x, y, MPFR.ROUNDING_MODE[])
Expand Down
7 changes: 6 additions & 1 deletion src/ints.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ end

@inline function typeparser(::Type{Number}, source, pos, len, b, code, pl, opts)
startpos = pos
startcode = code
# begin parsing
neg = b == UInt8('-')
if neg || b == UInt8('+')
Expand All @@ -112,7 +113,11 @@ end
# parse rest of number
digits = Int64(0)
x, code, pos = parsedigits(Number, source, pos, len, b, code, opts, digits, neg, startpos)
return x, pos
if (x === Inf || x === -Inf) && !specialvalue(code)
# by default, parsedigits only has up to Float64 precision; if we overflow
# let's try BigFloat
return typeparser(BigFloat, source, startpos, len, b, startcode, pl, opts)
end

@label done
return pos, code, PosLen(pl.pos, pos - pl.pos), x
Expand Down
2 changes: 2 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ const DELIMITED = 0b0000000000001000 % ReturnCode
# Status flags OR-ed into a ReturnCode; each of these occupies its own single bit.
const NEWLINE = 0b0000000000010000 % ReturnCode
# set by the parsers when the end of the input is reached
const EOF = 0b0000000000100000 % ReturnCode
const ESCAPED_STRING = 0b0000001000000000 % ReturnCode
# set by the float parser together with OK when the input spelled out a special value
# such as Inf; lets callers tell a literal Inf apart from an overflowed result
const SPECIAL_VALUE = 0b0000010000000000 % ReturnCode

# invalid flags
const INVALID_QUOTED_FIELD = 0b1000000001000000 % ReturnCode
Expand All @@ -78,6 +79,7 @@ quoted(x::ReturnCode) = (x & QUOTED) == QUOTED
# Flag predicates: each returns `true` only when every bit of the named flag is set
# in `x`. The comparison is against the full flag because some flags span several
# bits (e.g. INVALID_QUOTED_FIELD), so a non-zero intersection alone is not enough.
function delimited(x::ReturnCode)
    return (x & DELIMITED) == DELIMITED
end

function newline(x::ReturnCode)
    return (x & NEWLINE) == NEWLINE
end

function escapedstring(x::ReturnCode)
    return (x & ESCAPED_STRING) == ESCAPED_STRING
end

function specialvalue(x::ReturnCode)
    return (x & SPECIAL_VALUE) == SPECIAL_VALUE
end

function invalidquotedfield(x::ReturnCode)
    return (x & INVALID_QUOTED_FIELD) == INVALID_QUOTED_FIELD
end

function invaliddelimiter(x::ReturnCode)
    return (x & INVALID_DELIMITER) == INVALID_DELIMITER
end

function overflow(x::ReturnCode)
    return (x & OVERFLOW) == OVERFLOW
end
Expand Down
24 changes: 24 additions & 0 deletions test/floats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,30 @@ end
@test res.val 100_000_000.99
end

@testset "BigFloats" begin
    # every input must parse to exactly what Base.parse(BigFloat, ...) produces
    inputs = (
        "1.7976931348623157e308",
        "-1.7976931348623157e308",
        # next float64 - too large
        "1.7976931348623159e308",
        "-1.7976931348623159e308",
        # the border is ...158079
        # borderline - okay
        "1.7976931348623158e308",
        "-1.7976931348623158e308",
        # borderline - too large
        "1.797693134862315808e308",
        "-1.797693134862315808e308",
        # a little too large
        "1e308",
        "2e308",
        "1e309",
        # way too large
        "1e310",
        "-1e310",
    )
    for str in inputs
        @test Parsers.parse(BigFloat, str) == Base.parse(BigFloat, str)
    end
end

# https://github.com/JuliaData/CSV.jl/issues/916
@test Parsers.parse(Float64, "0.44311945001372019574271437679879349172") === 0.4431194500137202
@test Parsers.parse(BigFloat, "0.44311945001372019574271437679879349172") == BigFloat("0.44311945001372019574271437679879349172")
Expand Down
12 changes: 10 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -682,8 +682,16 @@ end

# Type detection: `Parsers.parse(Number, str)` must return the same value and type as
# the corresponding Julia literal.
@testset "parse(Number, x)" begin

@test parse(Number, "1") === 1

# plain integer and float literals
@test Parsers.parse(Number, "1") === 1
@test Parsers.parse(Number, "1.0") === 1.0
# an `f` exponent marker gives a Float32, an `e` marker a Float64
@test Parsers.parse(Number, "1.0f0") === 1.0f0
@test Parsers.parse(Number, "1.0e0") === 1.0e0
# Int128 literal
@test Parsers.parse(Number, "9223372036854775808") === 9223372036854775808
# BigInt
@test Parsers.parse(Number, "170141183460469231731687303715884105728") == 170141183460469231731687303715884105728
# BigFloat promotion
@test Parsers.parse(Number, "1e310") == Base.parse(BigFloat, "1e310")
end

end # @testset "Parsers"

0 comments on commit 5911408

Please sign in to comment.