Add DiscreteNonParametric distribution (#634)
* Add Generic discrete distribution

* Convert modes to explicit loop

* Add uniqueness check for values in Generic constructor

* Convert Generic's support to an AbstractVector

* Convert Categorical to a type alias of Generic

* Add Generic mgf and cf

* Add Generic fit and sufficient statistic methods

* Easy 0.7 fixes

* Updates for Random in 0.7 and tests passing

* Fix 0.7 constructor dispatch

* Fix weighted suffstats and minor changes

* Rename Generic to DiscreteNonParametric

* Add documentation for DiscreteNonParametric

* Generalize probability vector type for DiscreteNonParametric and Categorical, fix #743

* Generalize DiscreteNonParametric/Categorical samplers

* Further specialize support of Categorical

* Partially support test_distr with DiscreteNonParametric
Gord Stephen authored and matbesancon committed Nov 4, 2018
1 parent ad271d2 commit cd0ae4e
Showing 16 changed files with 536 additions and 178 deletions.
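For orientation, the diff below introduces a `DiscreteNonParametric(xs, ps)` distribution defined by a support vector `xs` and a probability vector `ps`, and turns `Categorical` into a type alias for it. A minimal usage sketch, assuming the exported constructor mirrors the `DiscreteNonParametricSampler(xs, ps)` signature documented in the new sampler file; the commented values are hand-computed illustrations, not package output:

```julia
using Distributions

# A discrete distribution on an arbitrary real support.
d = DiscreteNonParametric([1.0, 2.5, 4.0], [0.2, 0.5, 0.3])

support(d)   # [1.0, 2.5, 4.0]
probs(d)     # [0.2, 0.5, 0.3]
pdf(d, 2.5)  # 0.5
mean(d)      # 0.2*1.0 + 0.5*2.5 + 0.3*4.0 = 2.65
rand(d, 5)   # five draws, produced via the alias-table sampler added below
```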
1 change: 1 addition & 0 deletions docs/src/univariate.md
@@ -144,6 +144,7 @@ BetaBinomial
Binomial
Categorical
DiscreteUniform
DiscreteNonParametric
Geometric
Hypergeometric
NegativeBinomial
1 change: 1 addition & 0 deletions src/Distributions.jl
@@ -86,6 +86,7 @@ export
FullNormal,
FullNormalCanon,
Gamma,
DiscreteNonParametric,
GeneralizedPareto,
GeneralizedExtremeValue,
Geometric,
6 changes: 4 additions & 2 deletions src/samplers.jl
@@ -1,14 +1,16 @@
# delegation of samplers

for fname in ["categorical.jl",
for fname in ["aliastable.jl",
"binomial.jl",
"poissonbinomial.jl",
"poisson.jl",
"exponential.jl",
"gamma.jl",
"multinomial.jl",
"vonmises.jl",
"vonmisesfisher.jl"]
"vonmisesfisher.jl",
"discretenonparametric.jl",
"categorical.jl"]

include(joinpath("samplers", fname))
end
25 changes: 25 additions & 0 deletions src/samplers/aliastable.jl
@@ -0,0 +1,25 @@
struct AliasTable{S} <: Sampleable{Univariate,Discrete}
    accept::Vector{Float64}
    alias::Vector{Int}
    isampler::S
end
ncategories(s::AliasTable) = length(s.accept)

function AliasTable(probs::AbstractVector{T}) where T<:Real
    n = length(probs)
    n > 0 || throw(ArgumentError("The input probability vector is empty."))
    accp = Vector{Float64}(undef, n)
    alias = Vector{Int}(undef, n)
    StatsBase.make_alias_table!(probs, 1.0, accp, alias)
    AliasTable(accp, alias, Random.RangeGenerator(1:n))
end

function rand(rng::AbstractRNG, s::AliasTable)
    i = rand(GLOBAL_RNG, s.isampler) % Int
    u = rand()
    @inbounds r = u < s.accept[i] ? i : s.alias[i]
    r
end
rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)

show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))
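The alias method pays its O(n) cost once in `StatsBase.make_alias_table!` so that every draw is O(1): pick a slot uniformly at random, accept it with probability `accept[i]`, otherwise return `alias[i]`. A hedged sketch of using the (unexported) type directly, as defined at this commit; the sampled values are illustrative:

```julia
using Distributions

at = Distributions.AliasTable([0.2, 0.5, 0.3])  # internal sampler type defined above
rand(at)                  # a category index in 1:3
[rand(at) for _ in 1:5]   # e.g. [2, 3, 2, 1, 2]
```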
44 changes: 10 additions & 34 deletions src/samplers/categorical.jl
@@ -1,13 +1,18 @@
#### naive sampling

struct CategoricalDirectSampler <: Sampleable{Univariate,Discrete}
prob::Vector{Float64}
struct CategoricalDirectSampler{T<:Real,Ts<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
prob::Ts

function CategoricalDirectSampler(p::Vector{Float64})
isempty(p) && error("p is empty.")
new(p)
function CategoricalDirectSampler{T,Ts}(p::Ts) where {
T<:Real,Ts<:AbstractVector{T}}
isempty(p) && throw(ArgumentError("p is empty."))
new{T,Ts}(p)
end
end

CategoricalDirectSampler(p::Ts) where {T<:Real,Ts<:AbstractVector{T}} =
CategoricalDirectSampler{T,Ts}(p)

ncategories(s::CategoricalDirectSampler) = length(s.prob)

function rand(s::CategoricalDirectSampler)
@@ -22,32 +27,3 @@ function rand(s::CategoricalDirectSampler)
return i
end


##### Alias Table #####

struct AliasTable{S} <: Sampleable{Univariate,Discrete}
accept::Vector{Float64}
alias::Vector{Int}
isampler::S
end
ncategories(s::AliasTable) = length(s.accept)

function AliasTable(probs::AbstractVector{T}) where T<:Real
n = length(probs)
n > 0 || error("The input probability vector is empty.")
accp = Vector{Float64}(undef, n)
alias = Vector{Int}(undef, n)
StatsBase.make_alias_table!(probs, 1.0, accp, alias)
AliasTable(accp, alias, Random.RangeGenerator(1:n))
end

function rand(rng::AbstractRNG, s::AliasTable)
i = rand(GLOBAL_RNG, s.isampler) % Int
u = rand()
@inbounds r = u < s.accept[i] ? i : s.alias[i]
r
end
rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)

show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))

19 changes: 19 additions & 0 deletions src/samplers/discretenonparametric.jl
@@ -0,0 +1,19 @@
"""
DiscreteNonParametricSampler(xs, ps)
Data structure for efficiently sampling from an arbitrary probability mass
function defined by support `xs` and probabilities `ps`.
"""
struct DiscreteNonParametricSampler{T<:Real, S<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
support::S
aliastable::AliasTable

DiscreteNonParametricSampler{T,S}(support::S, probs::AbstractVector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
new(support, AliasTable(probs))
end

DiscreteNonParametricSampler(support::S, probs::AbstractVector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
DiscreteNonParametricSampler{T,S}(support, probs)

rand(s::DiscreteNonParametricSampler) =
(@inbounds v = s.support[rand(s.aliastable)]; v)
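`DiscreteNonParametricSampler` composes the alias table with the support vector: the table draws an index, and the index is mapped back to the corresponding support value. A hedged sketch of constructing it directly (it would presumably be produced by `sampler` on a `DiscreteNonParametric` distribution, which is not shown in this diff):

```julia
using Distributions

sp = Distributions.DiscreteNonParametricSampler([10.0, 20.0, 30.0], [0.1, 0.6, 0.3])
rand(sp)   # 10.0, 20.0, or 30.0, with probabilities 0.1, 0.6, 0.3
```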
7 changes: 4 additions & 3 deletions src/testutils.jl
@@ -247,8 +247,9 @@ end
function get_evalsamples(d::DiscreteUnivariateDistribution, q::Float64)
# samples for testing evaluation functions (even spacing)

lv = (islowerbounded(d) ? minimum(d) : floor(Int,quantile(d, q/2)))::Int
hv = (isupperbounded(d) ? maximum(d) : ceil(Int,cquantile(d, q/2)))::Int
T = eltype(d)
lv = (islowerbounded(d) ? minimum(d) : floor(T,quantile(d, q/2)))::T
hv = (isupperbounded(d) ? maximum(d) : ceil(T,cquantile(d, q/2)))::T
@assert lv <= hv
return lv:hv
end
@@ -284,7 +285,7 @@ function test_support(d::UnivariateDistribution, vs::AbstractVector)
if isbounded(d)
if isa(d, DiscreteUnivariateDistribution)
s = support(d)
@test isa(s, UnitRange)
@test isa(s, AbstractUnitRange)
@test first(s) == minimum(d)
@test last(s) == maximum(d)
end
171 changes: 50 additions & 121 deletions src/univariate/discrete/categorical.jl
@@ -13,60 +13,52 @@ probs(d) # Get the probability vector, i.e. p
ncategories(d) # Get the number of categories, i.e. K
```
Here, `p` must be a real vector whose components are all nonnegative and sum to one.
**Note:** The input vector `p` is directly used as a field of the constructed distribution, without being copied.
`Categorical` is simply a type alias describing a special case of a
`DiscreteNonParametric` distribution, so non-specialized methods defined for
`DiscreteNonParametric` apply to `Categorical` as well.
External links:
* [Categorical distribution on Wikipedia](http://en.wikipedia.org/wiki/Categorical_distribution)
"""
struct Categorical{T<:Real} <: DiscreteUnivariateDistribution
K::Int
p::Vector{T}
Categorical{P,Ps} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps}

Categorical{T}(p::Vector{T}, ::NoArgCheck) where {T} = new{T}(length(p), p)
Categorical{P,Ps}(p::Ps, ::NoArgCheck) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(Base.OneTo(length(p)), p, NoArgCheck())

function Categorical{T}(p::Vector{T}) where T
@check_args(Categorical, isprobvec(p))
new{T}(length(p), p)
end
Categorical(p::Ps, ::NoArgCheck) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(p, NoArgCheck())

function Categorical{T}(k::Integer) where T
@check_args(Categorical, k >= 1)
new{T}(k, fill(1/k, k))
end
function Categorical{P,Ps}(p::Ps) where {P<:Real, Ps<:AbstractVector{P}}
@check_args(Categorical, isprobvec(p))
Categorical{P,Ps}(Base.OneTo(length(p)), p, NoArgCheck())
end

Categorical(p::Vector{T}, ::NoArgCheck) where {T<:Real} = Categorical{T}(p, NoArgCheck())
Categorical(p::Vector{T}) where {T<:Real} = Categorical{T}(p)
Categorical(k::Integer) = Categorical{Float64}(k)
Categorical(p::Ps) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(p)

@distr_support Categorical 1 d.K
function Categorical(k::Integer)
@check_args(Categorical, k >= 1)
Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k), NoArgCheck())
end

### Conversions

convert(::Type{Categorical{T}}, p::Vector{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(p))
convert(::Type{Categorical{T}}, d::Categorical{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(d.p))
convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where {
P<:Real,Ps<:AbstractVector{P}} = Categorical{P,Ps}(Ps(x))

### Parameters

ncategories(d::Categorical) = d.K
probs(d::Categorical) = d.p
params(d::Categorical) = (d.p,)
ncategories(d::Categorical) = support(d).stop
params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),)
@inline partype(d::Categorical{T}) where {T<:Real} = T


### Statistics

function categorical_mean(p::AbstractArray{T}) where T<:Real
k = length(p)
s = zero(T)
for i = 1:k
@inbounds s += p[i] * i
end
s
end

mean(d::Categorical) = categorical_mean(d.p)

function median(d::Categorical{T}) where T<:Real
function median(d::Categorical{T}) where {T<:Real}
k = ncategories(d)
p = probs(d)
cp = zero(T)
@@ -78,78 +70,6 @@ function median(d::Categorical{T}) where T<:Real
i
end

function var(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += abs2(i - m) * p[i]
end
s
end

function skewness(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^3 * p[i]
end
v = var(d)
s / (v * sqrt(v))
end

function kurtosis(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^4 * p[i]
end
s / abs2(var(d)) - 3
end

entropy(d::Categorical) = entropy(d.p)

function mgf(d::Categorical{T}, t::Real) where T<:Real
k = ncategories(d)
p = probs(d)
s = zero(T)
for i = 1:k
@inbounds s += p[i] * exp(t)
end
s
end

function cf(d::Categorical{T}, t::Real) where T<:Real
k = ncategories(d)
p = probs(d)
s = zero(T) + zero(T)*im
for i = 1:k
@inbounds s += p[i] * cis(t)
end
s
end

mode(d::Categorical) = argmax(probs(d))

function modes(d::Categorical)
K = ncategories(d)
p = probs(d)
maxp = maximum(p)
r = Vector{Int}()
for k = 1:K
@inbounds if p[k] == maxp
push!(r, k)
end
end
r
end


### Evaluation

function cdf(d::Categorical{T}, x::Int) where T<:Real
@@ -164,27 +84,38 @@ function cdf(d::Categorical{T}, x::Int) where T<:Real
return c
end

pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? d.p[x] : zero(T)
pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? probs(d)[x] : zero(T)

logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(d.p[x]) : -Inf
logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(probs(d)[x]) : -Inf

function quantile(d::Categorical, p::Float64)
    0 <= p <= 1 || throw(DomainError())
    k = ncategories(d)
    pv = probs(d)
    i = 1
    v = pv[1]
    while v < p && i < k
        i += 1
        @inbounds v += pv[i]
    end
    i
end

function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Real}
    vfirst = round(Int, first(rgn))
    vlast = round(Int, last(rgn))
    vl = max(vfirst, 1)
    vr = min(vlast, ncategories(d))
    p = probs(d)
    if vl > vfirst
        for i = 1:(vl - vfirst)
            r[i] = zero(T)
        end
    end
    fm1 = vfirst - 1
    for v = vl:vr
        r[v - fm1] = p[v]
    end
    if vr < vlast
        for i = (vr - vfirst + 2):length(rgn)
            r[i] = zero(T)
        end
    end
    return r
end
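As an aside on the new `_pdf!` method just above: it fills `r` with pdf values over the integer range `rgn`, writing zeros for positions outside `1:ncategories(d)`. A worked sketch of the intended behavior, with the result hand-checked against the code rather than taken from a REPL:

```julia
d = Categorical([0.2, 0.3, 0.5])
r = zeros(5)
Distributions._pdf!(r, d, 0:4)   # r == [0.0, 0.2, 0.3, 0.5, 0.0]
```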


# sampling

sampler(d::Categorical) = AliasTable(d.p)
sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} =
AliasTable(probs(d))


### sufficient statistics
@@ -249,5 +180,3 @@ fit_mle(::Type{Categorical}, x::AbstractArray{T}, w::AbstractArray{Float64}) whe

fit(::Type{Categorical}, data::CategoricalData) = fit_mle(Categorical, data)
fit(::Type{Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = fit_mle(Categorical, data, w)

==(c1::Categorical,c2::Categorical) = (c1.K == c2.K) && all(c1.p .== c2.p)
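The net effect of this file: `Categorical(p)` now constructs a `DiscreteNonParametric` whose support is `Base.OneTo(length(p))`, so the hand-written statistics above (mean, var, skewness, kurtosis, mgf, cf, modes, `==`, …) could be dropped in favor of the generic `DiscreteNonParametric` implementations. A hedged sketch of the resulting behavior:

```julia
using Distributions

c = Categorical([0.2, 0.3, 0.5])
c isa DiscreteNonParametric   # true: Categorical is a type alias now
support(c)                    # Base.OneTo(3)
ncategories(c)                # 3
probs(c)                      # [0.2, 0.3, 0.5]
rand(c)                       # 1, 2, or 3
```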