fix parser (#522)

* fix parser * hups * use BigInt when parsing rational expressions to avoid failure on 32 bits systems * take care of a very nasty corner case * Update src/parsing.jl Co-authored-by: Benoît Richard <kolaru@hotmail.com> * better docs and exceptions handling * add tests for Float32 and bigfloat Co-authored-by: Luca Ferranti <lucaferranti@users.noreply.github.com> Co-authored-by: Benoît Richard <kolaru@hotmail.com>
JuliaIntervals · May 25, 2022 · 1b84a19 · 1b84a19
1 parent f385b7f
commit 1b84a19
Show file tree

Hide file tree

Showing 11 changed files with 357 additions and 318 deletions.
diff --git a/Project.toml b/Project.toml
@@ -5,7 +5,6 @@ version = "0.20.5"
 
 [deps]
 CRlibm = "96374032-68de-5a5b-8d9e-752f78720389"
-CombinedParsers = "5ae71ed2-6f8a-4ed1-b94f-e14e8158f19e"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 FastRounding = "fa42c844-2597-5d31-933b-ebd51ab2693f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/src/IntervalArithmetic.jl b/src/IntervalArithmetic.jl
@@ -8,7 +8,6 @@ import CRlibm
 import FastRounding
 import RoundingEmulator
 
-using CombinedParsers
 using LinearAlgebra
 using Markdown
 using StaticArrays

diff --git a/src/decorations/intervals.jl b/src/decorations/intervals.jl
@@ -78,7 +78,7 @@ macro decorated(ex...)
             lo = :($x.lo)
             hi = :($x.hi)
         else
-            lo, hi = ex 
+            lo, hi = ex
         end
 
         return :(DecoratedInterval($lo, $hi))

diff --git a/src/parsing.jl b/src/parsing.jl
@@ -1,156 +1,241 @@
-# Functions to parse strings to intervals
-
 """
     parse(DecoratedInterval, s::AbstractString)
 
-Parse a string of the form `"[a, b]_dec"` as a `DecoratedInterval`
-with decoration `dec`.
+Parse a string of the form `"[a, b]_dec"` as a `DecoratedInterval` with decoration `dec`.
+If the decoration is not specified, it is computed based on the parsed interval.
+If the input is an invalid string, a warning is printed and [NaI] is returned. The parser is
+case insensitive.
+
+### Examples
+
+```jldoctest
+julia> @format true
+Display parameters:
+- format: standard
+- decorations: true
+- significant figures: 6
+
+julia> parse(DecoratedInterval{Float64}, "[1, 2]")
+[1, 2]_com
+
+julia> parse(DecoratedInterval{Float64}, "[1, 2]_def")
+[1, 2]_def
+
+julia> parse(DecoratedInterval{Float64}, "foobar")
+┌ Warning: invalid input, returning [NaI]
+└ @ IntervalArithmetic ~/.julia/dev/IntervalArithmetic/src/parsing.jl:44
+[NaN, NaN]_ill
+```
 """
 function parse(::Type{DecoratedInterval{T}}, s::AbstractString) where T
-    if '_' ∉ s
-        x = parse(Interval{T}, s)
-        return DecoratedInterval(x.lo, x.hi)
-    end
+    s = lowercase(strip(s))
+    s == "[nai]" && return nai(T)
+    try
+        if '_' ∉ s
+            ival, _ = _parse(Interval{T}, s)
+            return DecoratedInterval{T}(ival)
+        end
+
+        decorations = Dict(
+            "ill" => ill,
+            "trv" => trv,
+            "def" => def,
+            "dac" => dac,
+            "com" => com)
 
-    decorations = Dict(
-        "ill" => ill,
-        "trv" => trv,
-        "def" => def,
-        "dac" => dac,
-        "com" => com)
+        interval_string, dec = split(s, "_")
 
-    interval_string, dec = split(s, "_")
-    interval = parse(Interval{T}, interval_string)
-    return DecoratedInterval(interval, decorations[lowercase(dec)])
+        ival, isnotcom = _parse(Interval{T}, interval_string)
+        dec_calc = decoration(ival)
+
+        haskey(decorations, dec) || throw(ArgumentError("invalid decoration $dec"))
+        dec_given = decorations[dec]
+
+        #=
+            If I try to give a decoration that is too high, e.g. [1, Inf]_com, then it
+            should error and return [NaI]. Exception to this is if the interval would be com
+            but becomes dac because of finite precision, e.g. "[1e403]_com" when parsed to
+            Interval{Float64} is allowed to become [prevfloat(Inf), Inf]_dac without erroring.
+            The isnotcom flag returned by _parse is used to track if the interval was originally
+            smaller than com or became dac because of overflow.
+        =#
+        dec_given > dec_calc && isnotcom && throw(ArgumentError("invalid decoration $dec for $ival"))
+
+        return DecoratedInterval{T}(ival, min(dec_given, dec_calc))
+    catch e
+        if e isa ArgumentError
+            @warn "invalid input, returning [NaI]"
+            return nai(T)
+        else
+            rethrow(e)
+        end
+    end
 end
 
 """
     parse(Interval, s::AbstractString)
 
 Parse a string as an `Interval`, according to the grammar specified
-in Section 9.7 of the IEEE Std 1788-2015, with some extensions.
+in Section 9.7 of the IEEE Std 1788-2015.
 
 The created interval is tight around the value described by the string,
-including for number that have no exact float representation like "0.1".
-
-Roughly speaking, the valid forms are
-    - `[ 1.33 ]` or simply `1.33` : The interval containing only `1.33``.
-    - `[ 1.44, 2.78 ]` : The interval `[1.44, 2.78]`.
-    - `7.88 ± 0.03`: The interval `[7.85, 7.91]`.j
-    - `6.42?2` : The interval `6.42 ± 0.02`. The number after `?` represent
-        the uncertainty in the last digit.
-        Physicists would write it as `6.42(2)`.
-        Strangely enough, according to the standard, the default
-        value is `0.5` (e.g. `2.3? == 2.3 ± 0.05`).
-        The direction of the uncertainty can be given by adding 'u' or 'd' at
-        the end for the error going only up or down respectively (e.g.
-        `4.5?5u == [4.5, 5]`).
+including for numbers that have no exact float representation like "0.1". If the input is
+an invalid string, a warning is printed and an empty interval is returned. The parser is
+case insensitive.
+
+### Allowed format
+
+Here are some examples of allowed formats, for more details see sections 9.7 and 12.11 of
+the standard
+
+- `[ 1.33 ]` or simply `1.33` : The interval containing only `1.33`.
+- `[ 1.44, 2.78 ]` : The interval `[1.44, 2.78]`.
+- `[empty]` : the empty interval
+- `[entire]` or `[,]`: the interval `[-∞, ∞]`
+- `[3,]`: The interval `[3, ∞]`
+- `6.42?2` : The interval `6.42 ± 0.02`. The number after `?` represents the uncertainty in
+  the last digit. The default value is `0.5` (e.g. `2.3? == 2.3 ± 0.05`). The direction of
+  the uncertainty can be given by adding 'u' or 'd' at the end for the error going only up
+  or down respectively (e.g. `4.5?5u == [4.5, 5]`).
+- `6.42?2e2` : The interval `(6.42 ± 0.02)⋅10² == 642 ± 2`
+- `3??u` : the interval `[3, ∞]`
+- `3??d` : the interval `[-∞, 3]`
+- `3??` : the interval `[-∞, ∞]`
+
+### Examples
+
+```julia
+julia> parse(Interval{Float64}, "[1, 2]")
+[1, 2]
+
+julia> parse(Interval{Float64}, "[1,]")
+[1, ∞]
+
+julia> parse(Interval{Float64}, "[,]")
+[-∞, ∞]
+
+julia> parse(Interval{Float64}, "6.42?2e2")
+[640, 644]
+
+julia> parse(Interval{Float64}, "foobar")
+┌ Warning: invalid input, empty interval returned
+└ @ IntervalArithmetic ~/.julia/dev/IntervalArithmetic/src/parsing.jl:68
+∅
+```
 """
 function parse(::Type{Interval{T}}, s::AbstractString) where T
-    p = interval_parser(Interval{T})
+    s = lowercase(strip(s))
     try
-        return parse(p, s)
+        ival, _ = _parse(Interval{T}, s)
+        return ival
     catch e
-        # We avoid CombinedParsers stacktraces because they are hard to read.
-        if !(e isa ArgumentError)
-            rethrow()
+        if e isa ArgumentError
+            @warn "invalid input, empty interval returned"
+            return emptyinterval(T)
+        else
+            rethrow(e)
         end
     end
-
-    throw(ArgumentError("string \"$s\" can not be parsed to an interval."))
 end
 
 """
-Same as `parse(T, s, rounding_mode)`, but also accept string representing rational numbers.
-"""
-function extended_parse(T, s, rounding_mode)
-    if '/' in s
-        num, denum = parse.(Int, split(s, '/'))
-        return T(num//denum, rounding_mode)
-    end
+    _parse(::Type{Interval{T}}, s::AbstractString) where T
 
-    return parse(T, s, rounding_mode)
-end
+Tries to parse the string `s` to an interval of type `Interval{T}` and throws an
+`ArgumentError` if an invalid string is given.
 
-"""
-    interval_parser(::Type{Interval})
+### Output
 
-Return a parser that processes a string according to Section 9.7 of
-the IEEE Std 1788-2015 and return an interval of the given type.
-
-It is an extension of the specification in the standard, more lenient
-on what is allowed.
+- the parsed interval
+- a flag `isnotcom`, which is set to true if the input interval is not `com` and to false
+  otherwise. This is used to distinguish the case when an interval is supposed to be
+  unbounded (e.g. input `"[3, infinity]"`) or becomes unbounded because of overflow
+  (e.g. the input `"[3, 1e400]"`, which is parsed to `[3, ∞]` when using `Float64`).
 """
-function interval_parser(::Type{Interval{T}}) where T
-    # Exclude a minimal number of Char and let Julia parse the number
-    # when needed
-    any_number = Lazy(Repeat1(CharNotIn("[]?ud")))
-    integer = Lazy(Repeat1(CharIn("0123456789")))
-
-    point_interval = Either(
-        Sequence(seq -> seq[2], "[", trim(any_number), "]"),
-        trim(any_number)
-    ) do x
-            lo = extended_parse(T, join(x), RoundDown)
-            hi = extended_parse(T, join(x), RoundUp)
-        return checked_interval(lo, hi)
-    end
-    infsup_interval = Sequence(
-            "[",
-            trim(any_number),
-            ",",
-            trim(any_number),
-            "]") do seq
-        lo = extended_parse(T, join(seq[2]), RoundDown)
-        hi = extended_parse(T, join(seq[4]), RoundUp)
-        return checked_interval(lo, hi)
-    end
-    uncert_interval = Sequence(
-        any_number,
-        "?",
-        Optional(integer, default=missing),  # The radius
-        Optional(CharIn("ud"), default='?'),
-        Optional(Sequence("e", any_number), default=("e", "0"))
-    ) do seq
-        x = join(seq[1])
-        radius = ismissing(seq[3]) ? 1//2 : Rational(parse(Int, join(seq[3])))
-        dir = seq[4]
-        exponent = parse(Int, join(seq[5][2]))
-
-        # TODO The ulp calculation is wrong when x has an exponent
-        # e.g. 1.33e6?1
-        # This is in principle not allowed by the standard but I see no reason
-        # to disallow it
-        # Remove the decimal point, scaling accordingly
-        if '.' in x
-            core, decimals = split(x, '.')
-            exponent -= length(decimals)
-            x = join([core, decimals])
+function _parse(::Type{Interval{T}}, s::AbstractString) where T
+    isnotcom = occursin("inf", s)
+    if startswith(s, '[') && endswith(s, ']') # parse as interval
+        s = strip(s[2:end-1])
+        if ',' in s # infsupinterval
+            lo, hi = strip.(split(s, ','))
+            isempty(lo) && (lo = "-inf"; isnotcom = true)
+            isempty(hi) && (hi = "inf"; isnotcom = true)
+            lo = parse_num(T, lo, RoundDown)
+            hi = parse_num(T, hi, RoundUp)
+        else # point interval
+
+            (s == "empty" || isempty(s)) && return emptyinterval(T), true # emptyinterval
+            s == "entire" && return entireinterval(T), true # entireinterval
+            lo = parse_num(T, s, RoundDown)
+            hi = parse_num(T, s, RoundUp)
         end
-
-        rhi = dir == 'd' ? Rational(0) : radius
-        rlo = dir == 'u' ? Rational(0) : radius
-        scale = exponent >= 0 ? 10^exponent : 1//10^-exponent
-        x = parse(Int, x)
-        lo = T((x - rlo)*scale, RoundDown)
-        hi = T((x + rhi)*scale, RoundUp)
-        return Interval(lo, hi)
+    elseif '?' in s # uncertainty interval
+        if occursin("??", s) # unbounded interval
+            isnotcom = true
+            m, _ = split(s, "??")
+            if 'u' in s # interval in the form [m, Inf]
+                lo = parse(T, m, RoundDown)
+                hi = T(Inf)
+            elseif 'd' in s # interval in the form [-Inf, m]
+                lo = T(-Inf)
+                hi = parse(T, m, RoundUp)
+            else
+                return entireinterval(T), true
+            end
+        else
+            m , vde = split(s, '?')
+
+            # ulp computation
+            if '.' in m # ulp is last decimal position
+                ulp = 1/(big(10.0)^length(split(m, '.')[2]))
+            else # no decimal, hence ulp is unit
+                ulp = big(1.0)
+            end
+            m = parse(BigFloat, m)
+
+            if 'e' in vde
+                vd, e = split(vde, 'e')
+                e = big(10.0) ^ parse(Int, e)
+            else
+                vd = vde
+                e = big(1.0)
+            end
+
+            if 'u' in vd || 'd' in vd
+                d = last(vd)
+                v = vd[1:end-1]
+            else
+                d = 'b' # both directions
+                v = vd
+            end
+
+            v = isempty(v) ? 1//2 : parse(BigInt, v)
+            if d == 'd'
+                lo = T((m - v * ulp) * e, RoundDown)
+                hi = T(m * e, RoundUp)
+            elseif d == 'u'
+                lo = T(m * e, RoundDown)
+                hi = T((m + v * ulp) * e, RoundUp)
+            else
+                lo = T((m - v * ulp) * e, RoundDown)
+                hi = T((m + v * ulp) * e, RoundUp)
+            end
+        end
+    else # number
+        lo = parse_num(T, s, RoundDown)
+        hi = parse_num(T, s, RoundUp)
     end
+    is_valid_interval(lo, hi) && return Interval(lo, hi), isnotcom
+    throw(ArgumentError("input $s can not be parsed as an interval."))
+end
 
-    # Not in the standard
-    pm_interval = Sequence(
-        any_number,
-        trim("±"),
-        any_number
-    ) do seq
-        a = parse(T, join(seq[1]))
-        b = parse(T, join(seq[3]))
-        return a ± b
+"""
+Same as `parse(T, s, rounding_mode)`, but also accepts strings representing rational numbers.
+"""
+function parse_num(T, s, rounding_mode)
+    if '/' in s
+        num, denum = parse.(BigInt, split(s, '/'))
+        return T(num//denum, rounding_mode)
     end
-
-    return Sequence(
-        first,
-        Either(pm_interval, uncert_interval, infsup_interval, point_interval),
-        AtEnd()
-    )
-end
+    return T(parse(BigFloat, s), rounding_mode)
+end