Add DiscreteNonParametric distribution (#634)
* Add Generic discrete distribution

* Convert modes to explicit loop

* Add uniqueness check for values in Generic constructor

* Convert Generic's support to an AbstractVector

* Convert Categorical to a type alias of Generic

* Add Generic mgf and cf

* Add Generic fit and sufficient statistic methods

* Easy 0.7 fixes

* Updates for Random in 0.7 and tests passing

* Fix 0.7 constructor dispatch

* Fix weighted suffstats and minor changes

* Rename Generic to DiscreteNonParametric

* Add documentation for DiscreteNonParametric

* Generalize probability vector type for DiscreteNonParametric and Categorical, fix #743

* Generalize DiscreteNonParametric/Categorical samplers

* Further specialize support of Categorical

* Partially support test_distr with DiscreteNonParametric
Gord Stephen authored and matbesancon committed Nov 4, 2018
1 parent ad271d2 commit cd0ae4e
Showing 16 changed files with 536 additions and 178 deletions.
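For orientation, the diff below introduces a `DiscreteNonParametric(xs, ps)` distribution defined by a support vector `xs` and a probability vector `ps`, and turns `Categorical` into a type alias for it. A minimal usage sketch, assuming the exported constructor mirrors the `DiscreteNonParametricSampler(xs, ps)` signature documented in the new sampler file; the commented values are hand-computed illustrations, not package output:

```julia
using Distributions

# A discrete distribution on an arbitrary real support.
d = DiscreteNonParametric([1.0, 2.5, 4.0], [0.2, 0.5, 0.3])

support(d)   # [1.0, 2.5, 4.0]
probs(d)     # [0.2, 0.5, 0.3]
pdf(d, 2.5)  # 0.5
mean(d)      # 0.2*1.0 + 0.5*2.5 + 0.3*4.0 = 2.65
rand(d, 5)   # five draws, produced via the alias-table sampler added below
```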
1 change: 1 addition & 0 deletions docs/src/univariate.md
@@ -144,6 +144,7 @@ BetaBinomial
Binomial
Categorical
DiscreteUniform
DiscreteNonParametric
Geometric
Hypergeometric
NegativeBinomial
1 change: 1 addition & 0 deletions src/Distributions.jl
@@ -86,6 +86,7 @@ export
FullNormal,
FullNormalCanon,
Gamma,
DiscreteNonParametric,
GeneralizedPareto,
GeneralizedExtremeValue,
Geometric,
6 changes: 4 additions & 2 deletions src/samplers.jl
@@ -1,14 +1,16 @@
# delegation of samplers

for fname in ["categorical.jl",
for fname in ["aliastable.jl",
"binomial.jl",
"poissonbinomial.jl",
"poisson.jl",
"exponential.jl",
"gamma.jl",
"multinomial.jl",
"vonmises.jl",
"vonmisesfisher.jl"]
"vonmisesfisher.jl",
"discretenonparametric.jl",
"categorical.jl"]

include(joinpath("samplers", fname))
end
25 changes: 25 additions & 0 deletions src/samplers/aliastable.jl
@@ -0,0 +1,25 @@
struct AliasTable{S} <: Sampleable{Univariate,Discrete}
    accept::Vector{Float64}
    alias::Vector{Int}
    isampler::S
end
ncategories(s::AliasTable) = length(s.accept)

function AliasTable(probs::AbstractVector{T}) where T<:Real
    n = length(probs)
    n > 0 || throw(ArgumentError("The input probability vector is empty."))
    accp = Vector{Float64}(undef, n)
    alias = Vector{Int}(undef, n)
    StatsBase.make_alias_table!(probs, 1.0, accp, alias)
    AliasTable(accp, alias, Random.RangeGenerator(1:n))
end

function rand(rng::AbstractRNG, s::AliasTable)
    i = rand(GLOBAL_RNG, s.isampler) % Int
    u = rand()
    @inbounds r = u < s.accept[i] ? i : s.alias[i]
    r
end
rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)

show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))
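The alias method pays its O(n) cost once in `StatsBase.make_alias_table!` so that every draw is O(1): pick a slot uniformly at random, accept it with probability `accept[i]`, otherwise return `alias[i]`. A hedged sketch of using the (unexported) type directly, as defined at this commit; the sampled values are illustrative:

```julia
using Distributions

at = Distributions.AliasTable([0.2, 0.5, 0.3])  # internal sampler type defined above
rand(at)                  # a category index in 1:3
[rand(at) for _ in 1:5]   # e.g. [2, 3, 2, 1, 2]
```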
44 changes: 10 additions & 34 deletions src/samplers/categorical.jl
@@ -1,13 +1,18 @@
#### naive sampling

struct CategoricalDirectSampler <: Sampleable{Univariate,Discrete}
prob::Vector{Float64}
struct CategoricalDirectSampler{T<:Real,Ts<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
prob::Ts

function CategoricalDirectSampler(p::Vector{Float64})
isempty(p) && error("p is empty.")
new(p)
function CategoricalDirectSampler{T,Ts}(p::Ts) where {
T<:Real,Ts<:AbstractVector{T}}
isempty(p) && throw(ArgumentError("p is empty."))
new{T,Ts}(p)
end
end

CategoricalDirectSampler(p::Ts) where {T<:Real,Ts<:AbstractVector{T}} =
CategoricalDirectSampler{T,Ts}(p)

ncategories(s::CategoricalDirectSampler) = length(s.prob)

function rand(s::CategoricalDirectSampler)
@@ -22,32 +27,3 @@ function rand(s::CategoricalDirectSampler)
return i
end


##### Alias Table #####

struct AliasTable{S} <: Sampleable{Univariate,Discrete}
accept::Vector{Float64}
alias::Vector{Int}
isampler::S
end
ncategories(s::AliasTable) = length(s.accept)

function AliasTable(probs::AbstractVector{T}) where T<:Real
n = length(probs)
n > 0 || error("The input probability vector is empty.")
accp = Vector{Float64}(undef, n)
alias = Vector{Int}(undef, n)
StatsBase.make_alias_table!(probs, 1.0, accp, alias)
AliasTable(accp, alias, Random.RangeGenerator(1:n))
end

function rand(rng::AbstractRNG, s::AliasTable)
i = rand(GLOBAL_RNG, s.isampler) % Int
u = rand()
@inbounds r = u < s.accept[i] ? i : s.alias[i]
r
end
rand(s::AliasTable) = rand(Random.GLOBAL_RNG, s)

show(io::IO, s::AliasTable) = @printf(io, "AliasTable with %d entries", ncategories(s))

19 changes: 19 additions & 0 deletions src/samplers/discretenonparametric.jl
@@ -0,0 +1,19 @@
"""
DiscreteNonParametricSampler(xs, ps)
Data structure for efficiently sampling from an arbitrary probability mass
function defined by support `xs` and probabilities `ps`.
"""
struct DiscreteNonParametricSampler{T<:Real, S<:AbstractVector{T}} <: Sampleable{Univariate,Discrete}
support::S
aliastable::AliasTable

DiscreteNonParametricSampler{T,S}(support::S, probs::AbstractVector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
new(support, AliasTable(probs))
end

DiscreteNonParametricSampler(support::S, probs::AbstractVector{<:Real}) where {T<:Real,S<:AbstractVector{T}} =
DiscreteNonParametricSampler{T,S}(support, probs)

rand(s::DiscreteNonParametricSampler) =
(@inbounds v = s.support[rand(s.aliastable)]; v)
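`DiscreteNonParametricSampler` composes the alias table with the support vector: the table draws an index, and the index is mapped back to the corresponding support value. A hedged sketch of constructing it directly (it would presumably be produced by `sampler` on a `DiscreteNonParametric` distribution, which is not shown in this diff):

```julia
using Distributions

sp = Distributions.DiscreteNonParametricSampler([10.0, 20.0, 30.0], [0.1, 0.6, 0.3])
rand(sp)   # 10.0, 20.0, or 30.0, with probabilities 0.1, 0.6, 0.3
```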
7 changes: 4 additions & 3 deletions src/testutils.jl
@@ -247,8 +247,9 @@ end
function get_evalsamples(d::DiscreteUnivariateDistribution, q::Float64)
# samples for testing evaluation functions (even spacing)

lv = (islowerbounded(d) ? minimum(d) : floor(Int,quantile(d, q/2)))::Int
hv = (isupperbounded(d) ? maximum(d) : ceil(Int,cquantile(d, q/2)))::Int
T = eltype(d)
lv = (islowerbounded(d) ? minimum(d) : floor(T,quantile(d, q/2)))::T
hv = (isupperbounded(d) ? maximum(d) : ceil(T,cquantile(d, q/2)))::T
@assert lv <= hv
return lv:hv
end
@@ -284,7 +285,7 @@ function test_support(d::UnivariateDistribution, vs::AbstractVector)
if isbounded(d)
if isa(d, DiscreteUnivariateDistribution)
s = support(d)
@test isa(s, UnitRange)
@test isa(s, AbstractUnitRange)
@test first(s) == minimum(d)
@test last(s) == maximum(d)
end
171 changes: 50 additions & 121 deletions src/univariate/discrete/categorical.jl
@@ -13,60 +13,52 @@ probs(d) # Get the probability vector, i.e. p
ncategories(d) # Get the number of categories, i.e. K
```
Here, `p` must be a real vector whose components are all nonnegative and sum to one.
**Note:** The input vector `p` is directly used as a field of the constructed distribution, without being copied.
`Categorical` is simply a type alias describing a special case of a
`DiscreteNonParametric` distribution, so non-specialized methods defined for
`DiscreteNonParametric` apply to `Categorical` as well.
External links:
* [Categorical distribution on Wikipedia](http://en.wikipedia.org/wiki/Categorical_distribution)
"""
struct Categorical{T<:Real} <: DiscreteUnivariateDistribution
K::Int
p::Vector{T}
Categorical{P,Ps} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps}

Categorical{T}(p::Vector{T}, ::NoArgCheck) where {T} = new{T}(length(p), p)
Categorical{P,Ps}(p::Ps, ::NoArgCheck) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(Base.OneTo(length(p)), p, NoArgCheck())

function Categorical{T}(p::Vector{T}) where T
@check_args(Categorical, isprobvec(p))
new{T}(length(p), p)
end
Categorical(p::Ps, ::NoArgCheck) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(p, NoArgCheck())

function Categorical{T}(k::Integer) where T
@check_args(Categorical, k >= 1)
new{T}(k, fill(1/k, k))
end
function Categorical{P,Ps}(p::Ps) where {P<:Real, Ps<:AbstractVector{P}}
@check_args(Categorical, isprobvec(p))
Categorical{P,Ps}(Base.OneTo(length(p)), p, NoArgCheck())
end

Categorical(p::Vector{T}, ::NoArgCheck) where {T<:Real} = Categorical{T}(p, NoArgCheck())
Categorical(p::Vector{T}) where {T<:Real} = Categorical{T}(p)
Categorical(k::Integer) = Categorical{Float64}(k)
Categorical(p::Ps) where {P<:Real, Ps<:AbstractVector{P}} =
Categorical{P,Ps}(p)

@distr_support Categorical 1 d.K
function Categorical(k::Integer)
@check_args(Categorical, k >= 1)
Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k), NoArgCheck())
end

### Conversions

convert(::Type{Categorical{T}}, p::Vector{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(p))
convert(::Type{Categorical{T}}, d::Categorical{S}) where {T<:Real, S<:Real} = Categorical(Vector{T}(d.p))
convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where {
P<:Real,Ps<:AbstractVector{P}} = Categorical{P,Ps}(Ps(x))

### Parameters

ncategories(d::Categorical) = d.K
probs(d::Categorical) = d.p
params(d::Categorical) = (d.p,)
ncategories(d::Categorical) = support(d).stop
params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),)
@inline partype(d::Categorical{T}) where {T<:Real} = T


### Statistics

function categorical_mean(p::AbstractArray{T}) where T<:Real
k = length(p)
s = zero(T)
for i = 1:k
@inbounds s += p[i] * i
end
s
end

mean(d::Categorical) = categorical_mean(d.p)

function median(d::Categorical{T}) where T<:Real
function median(d::Categorical{T}) where {T<:Real}
k = ncategories(d)
p = probs(d)
cp = zero(T)
@@ -78,78 +70,6 @@ function median(d::Categorical{T}) where T<:Real
i
end

function var(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += abs2(i - m) * p[i]
end
s
end

function skewness(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^3 * p[i]
end
v = var(d)
s / (v * sqrt(v))
end

function kurtosis(d::Categorical{T}) where T<:Real
k = ncategories(d)
p = probs(d)
m = categorical_mean(p)
s = zero(T)
for i = 1:k
@inbounds s += (i - m)^4 * p[i]
end
s / abs2(var(d)) - 3
end

entropy(d::Categorical) = entropy(d.p)

function mgf(d::Categorical{T}, t::Real) where T<:Real
k = ncategories(d)
p = probs(d)
s = zero(T)
for i = 1:k
@inbounds s += p[i] * exp(t)
end
s
end

function cf(d::Categorical{T}, t::Real) where T<:Real
k = ncategories(d)
p = probs(d)
s = zero(T) + zero(T)*im
for i = 1:k
@inbounds s += p[i] * cis(t)
end
s
end

mode(d::Categorical) = argmax(probs(d))

function modes(d::Categorical)
K = ncategories(d)
p = probs(d)
maxp = maximum(p)
r = Vector{Int}()
for k = 1:K
@inbounds if p[k] == maxp
push!(r, k)
end
end
r
end


### Evaluation

function cdf(d::Categorical{T}, x::Int) where T<:Real
@@ -164,27 +84,38 @@ function cdf(d::Categorical{T}, x::Int) where T<:Real
return c
end

pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? d.p[x] : zero(T)
pdf(d::Categorical{T}, x::Int) where {T<:Real} = insupport(d, x) ? probs(d)[x] : zero(T)

logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(d.p[x]) : -Inf
logpdf(d::Categorical, x::Int) = insupport(d, x) ? log(probs(d)[x]) : -Inf

function quantile(d::Categorical, p::Float64)
    0 <= p <= 1 || throw(DomainError())
    k = ncategories(d)
    pv = probs(d)
    i = 1
    v = pv[1]
    while v < p && i < k
        i += 1
        @inbounds v += pv[i]
    end
    i
end

function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Real}
    vfirst = round(Int, first(rgn))
    vlast = round(Int, last(rgn))
    vl = max(vfirst, 1)
    vr = min(vlast, ncategories(d))
    p = probs(d)
    if vl > vfirst
        for i = 1:(vl - vfirst)
            r[i] = zero(T)
        end
    end
    fm1 = vfirst - 1
    for v = vl:vr
        r[v - fm1] = p[v]
    end
    if vr < vlast
        for i = (vr - vfirst + 2):length(rgn)
            r[i] = zero(T)
        end
    end
    return r
end
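As an aside on the new `_pdf!` method just above: it fills `r` with pdf values over the integer range `rgn`, writing zeros for positions outside `1:ncategories(d)`. A worked sketch of the intended behavior, with the result hand-checked against the code rather than taken from a REPL:

```julia
d = Categorical([0.2, 0.3, 0.5])
r = zeros(5)
Distributions._pdf!(r, d, 0:4)   # r == [0.0, 0.2, 0.3, 0.5, 0.0]
```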


# sampling

sampler(d::Categorical) = AliasTable(d.p)
sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} =
AliasTable(probs(d))


### sufficient statistics
@@ -249,5 +180,3 @@ fit_mle(::Type{Categorical}, x::AbstractArray{T}, w::AbstractArray{Float64}) whe

fit(::Type{Categorical}, data::CategoricalData) = fit_mle(Categorical, data)
fit(::Type{Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = fit_mle(Categorical, data, w)

==(c1::Categorical,c2::Categorical) = (c1.K == c2.K) && all(c1.p .== c2.p)
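The net effect of this file: `Categorical(p)` now constructs a `DiscreteNonParametric` whose support is `Base.OneTo(length(p))`, so the hand-written statistics above (mean, var, skewness, kurtosis, mgf, cf, modes, `==`, …) could be dropped in favor of the generic `DiscreteNonParametric` implementations. A hedged sketch of the resulting behavior:

```julia
using Distributions

c = Categorical([0.2, 0.3, 0.5])
c isa DiscreteNonParametric   # true: Categorical is a type alias now
support(c)                    # Base.OneTo(3)
ncategories(c)                # 3
probs(c)                      # [0.2, 0.3, 0.5]
rand(c)                       # 1, 2, or 3
```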