Skip to content

Commit

Permalink
Fix handling of -0.0 in histograms (#768)
Browse files Browse the repository at this point in the history
`searchsortedfirst` and `searchsortedlast` use `isless` for comparisons
and therefore consider `-0.0` to be different from `0.0`. This means
that these two values do not end up in the same bin when an edge is 0.
This does not make much sense statistically, but even worse is that
when an extreme edge is 0, `-0.0` is not counted at all.

Fix this by replacing `-0.0` with `0.0` before the search.
  • Loading branch information
nalimilan committed Mar 26, 2022
1 parent a1b02d8 commit 26947bc
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
28 changes: 25 additions & 3 deletions src/hist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,14 @@ mutable struct Histogram{T<:Real,N,E} <: AbstractHistogram{T,N,E}
closed == :right || closed == :left || error("closed must :left or :right")
isdensity && !(T <: AbstractFloat) && error("Density histogram must have float-type weights")
_edges_nbins(edges) == size(weights) || error("Histogram edge vectors must be 1 longer than corresponding weight dimensions")
# We do not handle -0.0 in ranges correctly in `binindex` for performance
# Constructing ranges starting or ending with -0.0 is very hard,
# and ranges containing -0.0 elsewhere virtually impossible,
# but check this just in case as it is cheap
foreach(edges) do e
e isa AbstractRange && any(isequal(-0.0), e) &&
throw(ArgumentError("ranges containing -0.0 not allowed in edges"))
end
new{T,N,E}(edges,weights,closed,isdensity)
end
end
Expand Down Expand Up @@ -226,11 +234,25 @@ binindex(h::AbstractHistogram{T,1}, x::Real) where {T} = binindex(h, (x,))[1]
binindex(h::Histogram{T,N}, xs::NTuple{N,Real}) where {T,N} =
map((edge, x) -> _edge_binindex(edge, h.closed, x), h.edges, xs)

_normalize_zero(x::AbstractFloat) = isequal(x, -0.0) ? zero(x) : x
_normalize_zero(x::Any) = x

# Always treat -0.0 like 0.0
@inline function _edge_binindex(edge::AbstractVector, closed::Symbol, x::Real)
if closed == :right
searchsortedfirst(edge, x) - 1
if closed === :right
return searchsortedfirst(edge, _normalize_zero(x), by=_normalize_zero) - 1
else
return searchsortedlast(edge, _normalize_zero(x), by=_normalize_zero)
end
end
# Passing by=_normalize_zero for ranges would have a large performance hit
# as it would force using the AbstractVector fallback
# This is not worth it given that it is very difficult to construct a range containing -0.0
@inline function _edge_binindex(edge::AbstractRange, closed::Symbol, x::Real)
if closed === :right
return searchsortedfirst(edge, _normalize_zero(x)) - 1
else
searchsortedlast(edge, x)
return searchsortedlast(edge, _normalize_zero(x))
end
end

Expand Down
40 changes: 40 additions & 0 deletions test/hist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,44 @@ end
@test StatsBase.midpoints(range(0, stop = 1, length = 5)) == 0.125:0.25:0.875
end

@testset "histogram with -0.0" begin
@test fit(Histogram, [-0.0, 1.0]) == fit(Histogram, [0.0, 1.0])
@test fit(Histogram, [-0.0, 1.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], closed=:right)
@test fit(Histogram, [-0.0, -1.0]) == fit(Histogram, [0.0, -1.0])
@test fit(Histogram, [-0.0, -1.0], closed=:right) ==
fit(Histogram, [0.0, -1.0], closed=:right)

@test fit(Histogram, [-0.0, 1.0], [-0.0, 0.5]) ==
fit(Histogram, [0.0, 1.0], [0.0, 0.5]) ==
fit(Histogram, [-0.0, 1.0], [0.0, 0.5]) ==
fit(Histogram, [0.0, 1.0], [-0.0, 0.5]) ==
fit(Histogram, [0.0, 1.0], 0.0:0.5:0.5) ==
fit(Histogram, [-0.0, 1.0], 0.0:0.5:0.5)
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0]) ==
fit(Histogram, [0.0, 1.0], [-0.5, -0.0]) ==
fit(Histogram, [-0.0, 1.0], [-0.5, 0.0]) ==
fit(Histogram, [0.0, 1.0], [-0.5, 0.0]) ==
fit(Histogram, [-0.0, 1.0], -0.5:0.5:0.0) ==
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0)
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], [-0.5, 0.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0, closed=:right)
@test fit(Histogram, [-0.0, 1.0], [-0.0, 0.5], closed=:right) ==
fit(Histogram, [0.0, 1.0], [0.0, 0.5], closed=:right) ==
fit(Histogram, [0.0, 1.0], [-0.0, 0.5], closed=:right) ==
fit(Histogram, [-0.0, 1.0], [0.0, 0.5], closed=:right) ==
fit(Histogram, [0.0, 1.0], 0.0:0.5:0.5, closed=:right) ==
fit(Histogram, [-0.0, 1.0], 0.0:0.5:0.5, closed=:right)
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], [-0.5, 0.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], [-0.5, -0.0], closed=:right) ==
fit(Histogram, [-0.0, 1.0], [-0.5, 0.0], closed=:right) ==
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0, closed=:right) ==
fit(Histogram, [-0.0, 1.0], -0.5:0.5:0.0, closed=:right)

@test_throws ArgumentError fit(Histogram, [-0.5], LinRange(-1.0, -0.0, 3))
@test_throws ArgumentError fit(Histogram, [-0.5], UnitRange(-0.0, 1.0))
end

end # @testset "StatsBase.Histogram"

0 comments on commit 26947bc

Please sign in to comment.