Skip to content

Commit

Permalink
extend ecdf to a weighted sample (#500)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnczito authored and nalimilan committed Aug 23, 2019
1 parent 416af55 commit fde6209
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 11 deletions.
33 changes: 22 additions & 11 deletions src/empirical.jl
Expand Up @@ -3,51 +3,62 @@

## Empirical CDF

struct ECDF{T <: AbstractVector{<:Real}}
struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}}
sorted_values::T
weights::W
end

function (ecdf::ECDF)(x::Real)
searchsortedlast(ecdf.sorted_values, x) / length(ecdf.sorted_values)
n = searchsortedlast(ecdf.sorted_values, x)
evenweights = isempty(ecdf.weights)
weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n))
partialsum / weightsum
end

function (ecdf::ECDF)(v::RealVector)
evenweights = isempty(ecdf.weights)
weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights)
ord = sortperm(v)
m = length(v)
r = similar(ecdf.sorted_values, m)
r0 = 0
r0 = zero(weightsum)
i = 1
n = length(ecdf.sorted_values)
for x in ecdf.sorted_values
for (j, x) in enumerate(ecdf.sorted_values)
while i <= m && x > v[ord[i]]
r[ord[i]] = r0
i += 1
end
r0 += 1
r0 += evenweights ? 1 : ecdf.weights[j]
if i > m
break
end
end
while i <= m
r[ord[i]] = n
r[ord[i]] = weightsum
i += 1
end
return r / n
return r / weightsum
end

"""
ecdf(X)
ecdf(X; weights::AbstractWeights)
Return an empirical cumulative distribution function (ECDF) based on a vector of samples
given in `X`.
given in `X`. Optionally providing `weights` returns a weighted ECDF.
Note: this function that returns a callable composite type, which can then be applied to
evaluate CDF values on other samples.
`extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which
function is inside the interval ``(0,1)``; the function is defined for the whole real line.
"""
ecdf(X::RealVector{T}) where T<:Real = ECDF(sort(X))
function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[]))
isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," *
"got $(length(X)) and $(length(weights))"))
ord = sortperm(X)
ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord]))
end

minimum(ecdf::ECDF) = first(ecdf.sorted_values)

Expand Down
38 changes: 38 additions & 0 deletions test/empirical.jl
Expand Up @@ -13,3 +13,41 @@ using Test
@test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)]
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5)
end

@testset "Weighted ECDF" begin
x = randn(10000000)
w1 = rand(10000000)
w2 = weights(w1)
fnecdf = ecdf(x, weights=w1)
fnecdfalt = ecdf(x, weights=w2)
@test fnecdf.sorted_values == fnecdfalt.sorted_values
@test fnecdf.weights == fnecdfalt.weights
@test fnecdf.weights != w1 # check that w wasn't accidently modified in place
@test fnecdfalt.weights != w2
y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96]
@test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3)
@test isapprox(fnecdf(1.96), 0.975, atol=1e-3)
@test fnecdf(y) map(fnecdf, y)
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x)
fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1]))
@test fnecdf(0.75) == 0.25
@test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0)
@test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10)))
# Check frequency weights
v = randn(100)
r = rand(1:100, 100)
vv = vcat(fill.(v, r)...) # repeat elements of v according to r
fw = fweights(r)
frecdf1 = ecdf(v, weights=fw)
frecdf2 = ecdf(vv)
@test frecdf1(y) frecdf2(y)
# Check probability weights
a = randn(100)
b = rand(100)
= abs(10randn()) * b
bw1 = pweights(b)
bw2 = pweights(b̃)
precdf1 = ecdf(a, weights=bw1)
precdf2 = ecdf(a, weights=bw2)
@test precdf1(y) precdf2(y)
end

0 comments on commit fde6209

Please sign in to comment.