From a059f39e236d1eb6c62a8eebc211907707985792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:54:45 +0100 Subject: [PATCH 01/17] add fit_mle(dt::D, arg) method --- src/genericfit.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/genericfit.jl b/src/genericfit.jl index 2b4b5e67e..7c79fdcda 100644 --- a/src/genericfit.jl +++ b/src/genericfit.jl @@ -1,6 +1,6 @@ # generic functions for distribution fitting -function suffstats(dt::Type{D}, xs...) where D<:Distribution +function suffstats(dt::Type{D}, xs...) where {D<:Distribution} argtypes = tuple(D, map(typeof, xs)...) error("suffstats is not implemented for $argtypes.") end @@ -30,5 +30,7 @@ fit_mle(dt::Type{D}, x::AbstractArray, w::AbstractArray) where {D<:UnivariateDis fit_mle(dt::Type{D}, x::AbstractMatrix) where {D<:MultivariateDistribution} = fit_mle(D, suffstats(D, x)) fit_mle(dt::Type{D}, x::AbstractMatrix, w::AbstractArray) where {D<:MultivariateDistribution} = fit_mle(D, suffstats(D, x, w)) +fit_mle(dt::D, args...) where {D<:Distribution} = fit_mle(typeof(dt), args...) + fit(dt::Type{D}, x) where {D<:Distribution} = fit_mle(D, x) fit(dt::Type{D}, args...) where {D<:Distribution} = fit_mle(D, args...) From bb4bf69a838959cc106b2798cb7b21ad3169d00e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:55:09 +0100 Subject: [PATCH 02/17] add fit_mle for product --- src/multivariate/product.jl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 3bb0f0d9e..083ebfce3 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -27,15 +27,17 @@ function Product(v::V) where {S<:ValueSupport,T<:UnivariateDistribution{S},V<:Ab "`Product(v)` is deprecated, please use `product_distribution(v)`", :Product, ) - return Product{S, T, V}(v) + return Product{S,T,V}(v) end length(d::Product) = length(d.v) function Base.eltype(::Type{<:Product{S,T}}) where {S<:ValueSupport, - T<:UnivariateDistribution{S}} + T<:UnivariateDistribution{S}} return eltype(T) end +params(g::Product) = params.(g.v) + _rand!(rng::AbstractRNG, d::Product, x::AbstractVector{<:Real}) = map!(Base.Fix1(rand, rng), x, d.v) function _logpdf(d::Product, x::AbstractVector{<:Real}) @@ -60,3 +62,17 @@ maximum(d::Product) = map(maximum, d.v) function product_distribution(dists::V) where {S<:ValueSupport,T<:UnivariateDistribution{S},V<:AbstractVector{T}} return Product{S,T,V}(dists) end + +#### Fitting + +""" + fit_mle(g::Product, x::AbstractMatrix) + fit_mle(g::Product, x::AbstractMatrix, γ::AbstractVector) + +The `fit_mle` for a multivariate Product distributions `g` is the `product_distribution` of `fit_mle` of each components of `g`. +""" +function fit_mle(g::Product, x::AbstractMatrix, args...) + d = size(x, 1) + length(g) == d || throw(DimensionMismatch("The dimensions of g and x are inconsistent.")) + return product_distribution([fit_mle(g.v[s], y, args...) for (s, y) in enumerate(eachrow(x))]) +end From 168efa26612fba44f417511270c2c6487639b8dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:55:36 +0100 Subject: [PATCH 03/17] add weighted fit_mle for Laplace --- src/univariate/continuous/laplace.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/univariate/continuous/laplace.jl b/src/univariate/continuous/laplace.jl index 2a7bf04a4..caeaa1d29 100644 --- a/src/univariate/continuous/laplace.jl +++ b/src/univariate/continuous/laplace.jl @@ -136,3 +136,15 @@ function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}) xc .= abs.(x .- m) return Laplace(m, mean(xc)) end + +function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}, w::AbstractArray{<:Real}) + n = length(x) + if n != length(w) + throw(DimensionMismatch("Inconsistent array lengths.")) + end + xc = similar(x) + copyto!(xc, x) + m = median(xc, weights(w)) # https://github.com/JuliaStats/StatsBase.jl/blob/0b64a9c8a16355da16469d0fe5016e0fdf099af5/src/weights.jl#L788 + xc .= abs.(x .- m) + return Laplace(m, mean(xc, weights(w))) +end \ No newline at end of file From 2fdb32214bc75f118451d9807c2abad37469e8de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:55:45 +0100 Subject: [PATCH 04/17] add fit_mle for Dirac --- src/univariate/discrete/dirac.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/univariate/discrete/dirac.jl b/src/univariate/discrete/dirac.jl index 94d082b0f..49645078d 100644 --- a/src/univariate/discrete/dirac.jl +++ b/src/univariate/discrete/dirac.jl @@ -50,9 +50,21 @@ logccdf(d::Dirac, x::Real) = x < d.value ? 0.0 : isnan(x) ? NaN : -Inf quantile(d::Dirac{T}, p::Real) where {T} = 0 <= p <= 1 ? d.value : T(NaN) mgf(d::Dirac, t) = exp(t * d.value) -cgf(d::Dirac, t) = t*d.value +cgf(d::Dirac, t) = t * d.value cf(d::Dirac, t) = cis(t * d.value) #### Sampling rand(rng::AbstractRNG, d::Dirac) = d.value + +#### Fitting + +fit_mle(::Type{<:Dirac}, x::AbstractArray{T}) where {T<:Real} = length(unique(x)) == 1 ? Dirac(first(x)) : Dirac(NaN) + +function fit_mle(::Type{<:Dirac}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Real} + n = length(x) + if n != length(w) + throw(DimensionMismatch("Inconsistent array lengths.")) + end + return length(unique(x[findall(!iszero, w)])) == 1 ? Dirac(first(x)) : Dirac(NaN) +end \ No newline at end of file From c198e7f7f7640b25f0e47c59ff2efb4029d32727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 17:01:19 +0100 Subject: [PATCH 05/17] add lin to EM pkg to fit mixture --- docs/src/mixture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/mixture.md b/docs/src/mixture.md index e6b24b103..2ae7ab904 100644 --- a/docs/src/mixture.md +++ b/docs/src/mixture.md @@ -106,4 +106,4 @@ rand!(::AbstractMixtureModel, ::AbstractArray) ## Estimation There are several methods for the estimation of mixture models from data, and this problem remains an open research topic. -This package does not provide facilities for estimating mixture models. One can resort to other packages, *e.g.* [*GaussianMixtures.jl*](https://github.com/davidavdav/GaussianMixtures.jl), for this purpose. +This package does not provide facilities for estimating mixture models. One can resort to other packages, *e.g.* [*GaussianMixtures.jl*](https://github.com/davidavdav/GaussianMixtures.jl) or [*ExpectationMaximization.jl*](https://github.com/dmetivie/ExpectationMaximization.jl), for this purpose. From 81fcffecb258f11000cc3f8fe852191aa08ef141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 17:02:02 +0100 Subject: [PATCH 06/17] add docs for new fit_mle --- docs/src/fit.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/src/fit.md b/docs/src/fit.md index 24ddbad5c..5148d0768 100644 --- a/docs/src/fit.md +++ b/docs/src/fit.md @@ -42,6 +42,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Binomial`](@ref) - [`Categorical`](@ref) - [`DiscreteUniform`](@ref) +- [`Dirac`](@ref) - [`Exponential`](@ref) - [`LogNormal`](@ref) - [`Normal`](@ref) @@ -60,6 +61,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Multinomial`](@ref) - [`MvNormal`](@ref) - [`Dirichlet`](@ref) +- [`Product`](@ref) For most of these distributions, the usage is as described above. For a few special distributions that require additional information for estimation, we have to use a modified interface: @@ -74,6 +76,15 @@ fit_mle(Categorical, x) # equivalent to fit_mle(Categorical, max(x), x) fit_mle(Categorical, x, w) ``` +It is also possible to directly input a distribution `fit_mle(D::Distribution, x[, w])`. This form avoids the extra arguments: +```julia +fit_mle(Binomial(n, 0.1), x) # equivalent to fit_mle(Binomial, ntrials(Binomial(n, 0.1)), x), here the parameter 0.1 is not used +fit_mle(Categorical(p), x) # equivalent to fit_mle(Categorical, ncategories(Categorical(p)), x), here the only the length of p is used not its values +fit_mle(product_distribution([Exponential(0.5), Normal(11.3, 3.2)]), x) # equivalent to product_distribution([fit_mle(Exponential, x[1,:]), fit_mle(Normal, x[2, :])]). Again values are not used. +``` +Note that for standard distributions, the values of the distribution parameters `D` are not used in `fit_mle` only the “structure” of `D` is passed into `fit_mle`. +However, for complex Maximum Likelihood estimation, one can imagine using `D` as an initial guess. + ## Sufficient Statistics For many distributions, the estimation can be based on (sum of) sufficient statistics computed from a dataset. To simplify implementation, for such distributions, we implement `suffstats` method instead of `fit_mle` directly: From 12722ca88f397374272e6d285c3db75276c22e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 17:02:45 +0100 Subject: [PATCH 07/17] add tests and fit_mle --- src/univariate/discrete/binomial.jl | 1 + src/univariate/discrete/categorical.jl | 37 +++---- test/fit.jl | 128 +++++++++++++++++-------- 3 files changed, 107 insertions(+), 59 deletions(-) diff --git a/src/univariate/discrete/binomial.jl b/src/univariate/discrete/binomial.jl index f4102cbb8..3aa24545c 100644 --- a/src/univariate/discrete/binomial.jl +++ b/src/univariate/discrete/binomial.jl @@ -196,6 +196,7 @@ suffstats(::Type{T}, data::BinomData, w::AbstractArray{<:Real}) where {T<:Binomi fit_mle(::Type{T}, ss::BinomialStats) where {T<:Binomial} = T(ss.n, ss.ns / (ss.ne * ss.n)) +fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Binomial} = fit_mle(T, suffstats(T, ntrials(d), x)) fit_mle(::Type{T}, n::Integer, x::AbstractArray{<:Integer}) where {T<:Binomial}= fit_mle(T, suffstats(T, n, x)) fit_mle(::Type{T}, n::Integer, x::AbstractArray{<:Integer}, w::AbstractArray{<:Real}) where {T<:Binomial} = fit_mle(T, suffstats(T, n, x, w)) fit_mle(::Type{T}, data::BinomData) where {T<:Binomial} = fit_mle(T, suffstats(T, data)) diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl index 1dc08960c..d5270db3c 100644 --- a/src/univariate/discrete/categorical.jl +++ b/src/univariate/discrete/categorical.jl @@ -26,7 +26,7 @@ External links: """ const Categorical{P<:Real,Ps<:AbstractVector{P}} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps} -function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real, Ps<:AbstractVector{P}} +function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real,Ps<:AbstractVector{P}} @check_args Categorical (p, isprobvec(p), "vector p is not a probability vector") return Categorical{P,Ps}(Base.OneTo(length(p)), p; check_args=check_args) end @@ -36,7 +36,7 @@ Categorical(p::AbstractVector{P}; check_args::Bool=true) where {P<:Real} = function Categorical(k::Integer; check_args::Bool=true) @check_args Categorical (k, k >= 1, "at least one category is required") - return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k); check_args=false) + return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1 / k, k); check_args=false) end Categorical(probabilities::Real...; check_args::Bool=true) = Categorical([probabilities...]; check_args=check_args) @@ -49,7 +49,7 @@ convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where { ### Parameters ncategories(d::Categorical) = support(d).stop -params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),) +params(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = (probs(d),) partype(::Categorical{T}) where {T<:Real} = T ### Statistics @@ -59,7 +59,7 @@ function median(d::Categorical{T}) where {T<:Real} p = probs(d) cp = zero(T) i = 0 - while cp < 1/2 && i <= k + while cp < 1 / 2 && i <= k i += 1 @inbounds cp += p[i] end @@ -87,16 +87,16 @@ function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Re vr = min(vlast, ncategories(d)) p = probs(d) if vl > vfirst - for i = 1:(vl - vfirst) + for i = 1:(vl-vfirst) r[i] = zero(T) end end fm1 = vfirst - 1 for v = vl:vr - r[v - fm1] = p[v] + r[v-fm1] = p[v] end if vr < vlast - for i = (vr - vfirst + 2):length(rgn) + for i = (vr-vfirst+2):length(rgn) r[i] = zero(T) end end @@ -107,7 +107,7 @@ end # sampling sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = - AliasTable(probs(d)) + AliasTable(probs(d)) ### sufficient statistics @@ -116,20 +116,20 @@ struct CategoricalStats <: SufficientStats h::Vector{Float64} end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where T<:Integer - for i = 1 : length(x) +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where {T<:Integer} + for i = 1:length(x) @inbounds xi = x[i] - h[xi] += 1. # cannot use @inbounds, as no guarantee that x[i] is in bound + h[xi] += 1.0 # cannot use @inbounds, as no guarantee that x[i] is in bound end h end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} n = length(x) if n != length(w) throw(DimensionMismatch("Inconsistent array lengths.")) end - for i = 1 : n + for i = 1:n @inbounds xi = x[i] @inbounds wi = w[i] h[xi] += wi # cannot use @inbounds, as no guarantee that x[i] is in bound @@ -137,15 +137,15 @@ function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::Abs h end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where T<:Integer +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where {T<:Integer} CategoricalStats(add_categorical_counts!(zeros(k), x)) end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} CategoricalStats(add_categorical_counts!(zeros(k), x, w)) end -const CategoricalData = Tuple{Int, AbstractArray} +const CategoricalData = Tuple{Int,AbstractArray} suffstats(::Type{<:Categorical}, data::CategoricalData) = suffstats(Categorical, data...) suffstats(::Type{<:Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = suffstats(Categorical, data..., w) @@ -156,11 +156,12 @@ function fit_mle(::Type{<:Categorical}, ss::CategoricalStats) Categorical(normalize!(ss.h, 1)) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where T<:Integer +fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Categorical} = fit_mle(T, ncategories(d), x) +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where {T<:Integer} Categorical(normalize!(add_categorical_counts!(zeros(k), x), 1), check_args=false) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} Categorical(normalize!(add_categorical_counts!(zeros(k), x, w), 1), check_args=false) end diff --git a/test/fit.jl b/test/fit.jl index 4483dd1ec..968369c79 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -14,7 +14,7 @@ N = 10^5 rng = MersenneTwister(123) -const funcs = ([rand,rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) +const funcs = ([rand, rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) @testset "Testing fit for DiscreteUniform" begin for func in funcs @@ -40,28 +40,28 @@ end for x in (z, OffsetArray(z, -n0 ÷ 2)), w in (v, OffsetArray(v, -n0 ÷ 2)) ss = @inferred suffstats(D, x) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 == n0 - count(t->t != 0, z) - @test ss.cnt1 == count(t->t != 0, z) + @test ss.cnt0 == n0 - count(t -> t != 0, z) + @test ss.cnt1 == count(t -> t != 0, z) ss = @inferred suffstats(D, x, w) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 ≈ sum(v[z .== 0]) - @test ss.cnt1 ≈ sum(v[z .== 1]) + @test ss.cnt0 ≈ sum(v[z.==0]) + @test ss.cnt1 ≈ sum(v[z.==1]) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ count(t->t != 0, z) / n0 + @test mean(d) ≈ count(t -> t != 0, z) / n0 d = @inferred fit(D, x, w) @test d isa D - @test mean(d) ≈ sum(v[z .== 1]) / sum(v) + @test mean(d) ≈ sum(v[z.==1]) / sum(v) end z = rand(rng..., Bernoulli(0.7), N) for x in (z, OffsetArray(z, -N ÷ 2)) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ 0.7 atol=0.01 + @test mean(d) ≈ 0.7 atol = 0.01 end end end @@ -114,7 +114,11 @@ end d = @inferred fit(D, 100, x) @test d isa D @test ntrials(d) == 100 - @test succprob(d) ≈ 0.3 atol=0.01 + @test succprob(d) ≈ 0.3 atol = 0.01 + d2 = @inferred fit_mle(D(100, 0.5), x) + @test d2 isa D + @test ntrials(d2) == 100 + @test succprob(d2) ≈ 0.3 atol = 0.01 end end end @@ -128,7 +132,7 @@ end w = func[1](n0) ss = suffstats(Categorical, (3, x)) - h = Float64[count(v->v == i, x) for i = 1 : 3] + h = Float64[count(v -> v == i, x) for i = 1:3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -141,8 +145,12 @@ end @test isa(d2, Categorical) @test probs(d2) == probs(d) + d3 = fit_mle(Categorical(p), x) + @test isa(d3, Categorical) + @test probs(d3) == probs(d) + ss = suffstats(Categorical, (3, x), w) - h = Float64[sum(w[x .== i]) for i = 1 : 3] + h = Float64[sum(w[x.==i]) for i = 1:3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -165,6 +173,20 @@ end @test fit(Cauchy{Float64}, collect(-4.0:4.0)) === Cauchy(0.0, 2.0) end +@testset "Testing fit for Delta" begin + for func in funcs, dist in (Dirac, Dirac{Float64}) + x = func[2](dist(1.3), n0) + w = func[1](n0) + d = fit(dist, x) + @test isa(d, dist) + @test isapprox(d.value, 1.3) + + d = fit(dist, x, w) + @test isa(d, dist) + @test isapprox(d.value, 1.3) + end +end + @testset "Testing fit for Exponential" begin for func in funcs, dist in (Exponential, Exponential{Float64}) w = func[1](n0) @@ -204,27 +226,27 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ sum(x) - @test ss.m ≈ mean(x) - @test ss.s2 ≈ sum((x .- ss.m).^2) + @test ss.s ≈ sum(x) + @test ss.m ≈ mean(x) + @test ss.s2 ≈ sum((x .- ss.m) .^ 2) @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ dot(x, w) - @test ss.m ≈ dot(x, w) / sum(w) - @test ss.s2 ≈ dot((x .- ss.m).^2, w) + @test ss.s ≈ dot(x, w) + @test ss.m ≈ dot(x, w) / sum(w) + @test ss.s2 ≈ dot((x .- ss.m) .^ 2, w) @test ss.tw ≈ sum(w) d = fit(dist, x) @test isa(d, dist) @test d.μ ≈ mean(x) - @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) + @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) d = fit(dist, x, w) @test isa(d, dist) @test d.μ ≈ dot(x, w) / sum(w) - @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) d = fit(dist, func[2](dist(μ, σ), N)) @test isa(d, dist) @@ -252,18 +274,18 @@ end ss = suffstats(NormalKnownMu(μ), x, w) @test isa(ss, Distributions.NormalKnownMuStats) @test ss.μ == μ - @test ss.s2 ≈ dot((x .- μ).^2, w) + @test ss.s2 ≈ dot((x .- μ) .^ 2, w) @test ss.tw ≈ sum(w) d = fit_mle(Normal, x; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) + @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) d = fit_mle(Normal, x, w; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) ss = suffstats(NormalKnownSigma(σ), x) @@ -306,8 +328,8 @@ end d = fit(D, x) @test d isa D @test 1.2 <= minimum(d) <= maximum(d) <= 5.8 - @test minimum(d) ≈ 1.2 atol=0.02 - @test maximum(d) ≈ 5.8 atol=0.02 + @test minimum(d) ≈ 1.2 atol = 0.02 + @test maximum(d) ≈ 5.8 atol = 0.02 end end end @@ -319,15 +341,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.slogx ≈ sum(log.(x)) - @test ss.tw ≈ n0 + @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.slogx ≈ dot(log.(x), w) - @test ss.tw ≈ sum(w) + @test ss.tw ≈ sum(w) d = fit(dist, func[2](dist(3.9, 2.1), N)) @test isa(d, dist) @@ -353,11 +375,11 @@ end d = fit(dist, x) @test isa(d, dist) - @test succprob(d) ≈ inv(1. + mean(x)) + @test succprob(d) ≈ inv(1.0 + mean(x)) d = fit(dist, x, w) @test isa(d, dist) - @test succprob(d) ≈ inv(1. + dot(x, w) / sum(w)) + @test succprob(d) ≈ inv(1.0 + dot(x, w) / sum(w)) d = fit(dist, func[2](dist(0.3), N)) @test isa(d, dist) @@ -367,21 +389,29 @@ end @testset "Testing fit for Laplace" begin for func in funcs, dist in (Laplace, Laplace{Float64}) - d = fit(dist, func[2](dist(5.0, 3.0), N + 1)) + x = func[2](dist(5.0, 3.0), N) + w = func[1](N) + + d = fit(dist, x) @test isa(d, dist) @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d) , 3.0, atol=0.03) + @test isapprox(scale(d), 3.0, atol=0.03) + + d = fit(dist, x, w) + @test isa(d, dist) + @test isapprox(location(d), 5.0, atol=0.02) + @test isapprox(scale(d), 3.0, atol=0.03) end end @testset "Testing fit for Pareto" begin for func in funcs, dist in (Pareto, Pareto{Float64}) - x = func[2](dist(3., 7.), N) + x = func[2](dist(3.0, 7.0), N) d = fit(dist, x) @test isa(d, dist) - @test isapprox(shape(d), 3., atol=0.1) - @test isapprox(scale(d), 7., atol=0.1) + @test isapprox(shape(d), 3.0, atol=0.1) + @test isapprox(scale(d), 7.0, atol=0.1) end end @@ -414,6 +444,22 @@ end end end +@testset "Testing fit_mle for Product" begin + for func in funcs + dist = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) + x = rand(dist, N) + w = func[1](N) + + d = fit_mle(dist, x) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + + d = fit_mle(dist, x, w) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + end +end + @testset "Testing fit for InverseGaussian" begin for func in funcs, dist in (InverseGaussian, InverseGaussian{Float64}) x = rand(dist(3.9, 2.1), n0) @@ -421,15 +467,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.sinvx ≈ sum(1 ./ x) - @test ss.sw ≈ n0 + @test ss.sw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.sinvx ≈ dot(1 ./ x, w) - @test ss.sw ≈ sum(w) + @test ss.sw ≈ sum(w) d = fit(dist, rand(dist(3.9, 2.1), N)) @test isa(d, dist) @@ -460,8 +506,8 @@ end for func in funcs, dist in (Weibull, Weibull{Float64}) d = fit(dist, func[2](dist(8.1, 4.3), N)) @test isa(d, dist) - @test isapprox(d.α, 8.1, atol = 0.1) - @test isapprox(d.θ, 4.3, atol = 0.1) + @test isapprox(d.α, 8.1, atol=0.1) + @test isapprox(d.θ, 4.3, atol=0.1) end end From a96582e3cb10dfcdf84eb9f5c7c97221b3ea4e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 20:34:31 +0100 Subject: [PATCH 08/17] add docs for new fit_mle + build docs --- docs/Project.toml | 1 + docs/src/fit.md | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index 3d6a9ee4e..fbb4973b6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,5 @@ [deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" diff --git a/docs/src/fit.md b/docs/src/fit.md index 24ddbad5c..1e52baaa3 100644 --- a/docs/src/fit.md +++ b/docs/src/fit.md @@ -42,6 +42,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Binomial`](@ref) - [`Categorical`](@ref) - [`DiscreteUniform`](@ref) +- [`Dirac`](@ref) - [`Exponential`](@ref) - [`LogNormal`](@ref) - [`Normal`](@ref) @@ -60,6 +61,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Multinomial`](@ref) - [`MvNormal`](@ref) - [`Dirichlet`](@ref) +- [`Product`](@ref) For most of these distributions, the usage is as described above. For a few special distributions that require additional information for estimation, we have to use a modified interface: @@ -74,6 +76,21 @@ fit_mle(Categorical, x) # equivalent to fit_mle(Categorical, max(x), x) fit_mle(Categorical, x, w) ``` +It is also possible to directly input a distribution `fit_mle(d::Distribution, x[, w])`. This form avoids the extra arguments: +```julia +fit_mle(Binomial(n, 0.1), x) +# equivalent to fit_mle(Binomial, ntrials(Binomial(n, 0.1)), x), here the parameter 0.1 is not used + +fit_mle(Categorical(p), x) +# equivalent to fit_mle(Categorical, ncategories(Categorical(p)), x), here the only the length of p is used not its values + +d = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) +fit_mle(d, x) +# equivalent to product_distribution([fit_mle(Exponential, x[1,:]), fit_mle(Normal, x[2, :])]). Again parameters of d are not used. +``` +Note that for standard distributions, the values of the distribution parameters `d` are not used in `fit_mle` only the “structure” of `d` is passed into `fit_mle`. +However, for complex Maximum Likelihood estimation requiring optimization, e.g., EM algorithm, one could use `D` as an initial guess. + ## Sufficient Statistics For many distributions, the estimation can be based on (sum of) sufficient statistics computed from a dataset. To simplify implementation, for such distributions, we implement `suffstats` method instead of `fit_mle` directly: From c310bc36dbe21cc871ff6863a9b650cb222fe375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 20:35:33 +0100 Subject: [PATCH 09/17] tests new features + fit_mle binom and categorical --- src/univariate/discrete/binomial.jl | 1 + src/univariate/discrete/categorical.jl | 37 +++---- test/fit.jl | 128 +++++++++++++++++-------- 3 files changed, 107 insertions(+), 59 deletions(-) diff --git a/src/univariate/discrete/binomial.jl b/src/univariate/discrete/binomial.jl index f4102cbb8..3aa24545c 100644 --- a/src/univariate/discrete/binomial.jl +++ b/src/univariate/discrete/binomial.jl @@ -196,6 +196,7 @@ suffstats(::Type{T}, data::BinomData, w::AbstractArray{<:Real}) where {T<:Binomi fit_mle(::Type{T}, ss::BinomialStats) where {T<:Binomial} = T(ss.n, ss.ns / (ss.ne * ss.n)) +fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Binomial} = fit_mle(T, suffstats(T, ntrials(d), x)) fit_mle(::Type{T}, n::Integer, x::AbstractArray{<:Integer}) where {T<:Binomial}= fit_mle(T, suffstats(T, n, x)) fit_mle(::Type{T}, n::Integer, x::AbstractArray{<:Integer}, w::AbstractArray{<:Real}) where {T<:Binomial} = fit_mle(T, suffstats(T, n, x, w)) fit_mle(::Type{T}, data::BinomData) where {T<:Binomial} = fit_mle(T, suffstats(T, data)) diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl index 1dc08960c..d5270db3c 100644 --- a/src/univariate/discrete/categorical.jl +++ b/src/univariate/discrete/categorical.jl @@ -26,7 +26,7 @@ External links: """ const Categorical{P<:Real,Ps<:AbstractVector{P}} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps} -function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real, Ps<:AbstractVector{P}} +function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real,Ps<:AbstractVector{P}} @check_args Categorical (p, isprobvec(p), "vector p is not a probability vector") return Categorical{P,Ps}(Base.OneTo(length(p)), p; check_args=check_args) end @@ -36,7 +36,7 @@ Categorical(p::AbstractVector{P}; check_args::Bool=true) where {P<:Real} = function Categorical(k::Integer; check_args::Bool=true) @check_args Categorical (k, k >= 1, "at least one category is required") - return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k); check_args=false) + return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1 / k, k); check_args=false) end Categorical(probabilities::Real...; check_args::Bool=true) = Categorical([probabilities...]; check_args=check_args) @@ -49,7 +49,7 @@ convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where { ### Parameters ncategories(d::Categorical) = support(d).stop -params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),) +params(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = (probs(d),) partype(::Categorical{T}) where {T<:Real} = T ### Statistics @@ -59,7 +59,7 @@ function median(d::Categorical{T}) where {T<:Real} p = probs(d) cp = zero(T) i = 0 - while cp < 1/2 && i <= k + while cp < 1 / 2 && i <= k i += 1 @inbounds cp += p[i] end @@ -87,16 +87,16 @@ function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Re vr = min(vlast, ncategories(d)) p = probs(d) if vl > vfirst - for i = 1:(vl - vfirst) + for i = 1:(vl-vfirst) r[i] = zero(T) end end fm1 = vfirst - 1 for v = vl:vr - r[v - fm1] = p[v] + r[v-fm1] = p[v] end if vr < vlast - for i = (vr - vfirst + 2):length(rgn) + for i = (vr-vfirst+2):length(rgn) r[i] = zero(T) end end @@ -107,7 +107,7 @@ end # sampling sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = - AliasTable(probs(d)) + AliasTable(probs(d)) ### sufficient statistics @@ -116,20 +116,20 @@ struct CategoricalStats <: SufficientStats h::Vector{Float64} end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where T<:Integer - for i = 1 : length(x) +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where {T<:Integer} + for i = 1:length(x) @inbounds xi = x[i] - h[xi] += 1. # cannot use @inbounds, as no guarantee that x[i] is in bound + h[xi] += 1.0 # cannot use @inbounds, as no guarantee that x[i] is in bound end h end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} n = length(x) if n != length(w) throw(DimensionMismatch("Inconsistent array lengths.")) end - for i = 1 : n + for i = 1:n @inbounds xi = x[i] @inbounds wi = w[i] h[xi] += wi # cannot use @inbounds, as no guarantee that x[i] is in bound @@ -137,15 +137,15 @@ function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::Abs h end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where T<:Integer +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where {T<:Integer} CategoricalStats(add_categorical_counts!(zeros(k), x)) end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} CategoricalStats(add_categorical_counts!(zeros(k), x, w)) end -const CategoricalData = Tuple{Int, AbstractArray} +const CategoricalData = Tuple{Int,AbstractArray} suffstats(::Type{<:Categorical}, data::CategoricalData) = suffstats(Categorical, data...) suffstats(::Type{<:Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = suffstats(Categorical, data..., w) @@ -156,11 +156,12 @@ function fit_mle(::Type{<:Categorical}, ss::CategoricalStats) Categorical(normalize!(ss.h, 1)) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where T<:Integer +fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Categorical} = fit_mle(T, ncategories(d), x) +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where {T<:Integer} Categorical(normalize!(add_categorical_counts!(zeros(k), x), 1), check_args=false) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} Categorical(normalize!(add_categorical_counts!(zeros(k), x, w), 1), check_args=false) end diff --git a/test/fit.jl b/test/fit.jl index 4483dd1ec..968369c79 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -14,7 +14,7 @@ N = 10^5 rng = MersenneTwister(123) -const funcs = ([rand,rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) +const funcs = ([rand, rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) @testset "Testing fit for DiscreteUniform" begin for func in funcs @@ -40,28 +40,28 @@ end for x in (z, OffsetArray(z, -n0 ÷ 2)), w in (v, OffsetArray(v, -n0 ÷ 2)) ss = @inferred suffstats(D, x) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 == n0 - count(t->t != 0, z) - @test ss.cnt1 == count(t->t != 0, z) + @test ss.cnt0 == n0 - count(t -> t != 0, z) + @test ss.cnt1 == count(t -> t != 0, z) ss = @inferred suffstats(D, x, w) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 ≈ sum(v[z .== 0]) - @test ss.cnt1 ≈ sum(v[z .== 1]) + @test ss.cnt0 ≈ sum(v[z.==0]) + @test ss.cnt1 ≈ sum(v[z.==1]) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ count(t->t != 0, z) / n0 + @test mean(d) ≈ count(t -> t != 0, z) / n0 d = @inferred fit(D, x, w) @test d isa D - @test mean(d) ≈ sum(v[z .== 1]) / sum(v) + @test mean(d) ≈ sum(v[z.==1]) / sum(v) end z = rand(rng..., Bernoulli(0.7), N) for x in (z, OffsetArray(z, -N ÷ 2)) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ 0.7 atol=0.01 + @test mean(d) ≈ 0.7 atol = 0.01 end end end @@ -114,7 +114,11 @@ end d = @inferred fit(D, 100, x) @test d isa D @test ntrials(d) == 100 - @test succprob(d) ≈ 0.3 atol=0.01 + @test succprob(d) ≈ 0.3 atol = 0.01 + d2 = @inferred fit_mle(D(100, 0.5), x) + @test d2 isa D + @test ntrials(d2) == 100 + @test succprob(d2) ≈ 0.3 atol = 0.01 end end end @@ -128,7 +132,7 @@ end w = func[1](n0) ss = suffstats(Categorical, (3, x)) - h = Float64[count(v->v == i, x) for i = 1 : 3] + h = Float64[count(v -> v == i, x) for i = 1:3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -141,8 +145,12 @@ end @test isa(d2, Categorical) @test probs(d2) == probs(d) + d3 = fit_mle(Categorical(p), x) + @test isa(d3, Categorical) + @test probs(d3) == probs(d) + ss = suffstats(Categorical, (3, x), w) - h = Float64[sum(w[x .== i]) for i = 1 : 3] + h = Float64[sum(w[x.==i]) for i = 1:3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -165,6 +173,20 @@ end @test fit(Cauchy{Float64}, collect(-4.0:4.0)) === Cauchy(0.0, 2.0) end +@testset "Testing fit for Delta" begin + for func in funcs, dist in (Dirac, Dirac{Float64}) + x = func[2](dist(1.3), n0) + w = func[1](n0) + d = fit(dist, x) + @test isa(d, dist) + @test isapprox(d.value, 1.3) + + d = fit(dist, x, w) + @test isa(d, dist) + @test isapprox(d.value, 1.3) + end +end + @testset "Testing fit for Exponential" begin for func in funcs, dist in (Exponential, Exponential{Float64}) w = func[1](n0) @@ -204,27 +226,27 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ sum(x) - @test ss.m ≈ mean(x) - @test ss.s2 ≈ sum((x .- ss.m).^2) + @test ss.s ≈ sum(x) + @test ss.m ≈ mean(x) + @test ss.s2 ≈ sum((x .- ss.m) .^ 2) @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ dot(x, w) - @test ss.m ≈ dot(x, w) / sum(w) - @test ss.s2 ≈ dot((x .- ss.m).^2, w) + @test ss.s ≈ dot(x, w) + @test ss.m ≈ dot(x, w) / sum(w) + @test ss.s2 ≈ dot((x .- ss.m) .^ 2, w) @test ss.tw ≈ sum(w) d = fit(dist, x) @test isa(d, dist) @test d.μ ≈ mean(x) - @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) + @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) d = fit(dist, x, w) @test isa(d, dist) @test d.μ ≈ dot(x, w) / sum(w) - @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) d = fit(dist, func[2](dist(μ, σ), N)) @test isa(d, dist) @@ -252,18 +274,18 @@ end ss = suffstats(NormalKnownMu(μ), x, w) @test isa(ss, Distributions.NormalKnownMuStats) @test ss.μ == μ - @test ss.s2 ≈ dot((x .- μ).^2, w) + @test ss.s2 ≈ dot((x .- μ) .^ 2, w) @test ss.tw ≈ sum(w) d = fit_mle(Normal, x; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) + @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) d = fit_mle(Normal, x, w; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) ss = suffstats(NormalKnownSigma(σ), x) @@ -306,8 +328,8 @@ end d = fit(D, x) @test d isa D @test 1.2 <= minimum(d) <= maximum(d) <= 5.8 - @test minimum(d) ≈ 1.2 atol=0.02 - @test maximum(d) ≈ 5.8 atol=0.02 + @test minimum(d) ≈ 1.2 atol = 0.02 + @test maximum(d) ≈ 5.8 atol = 0.02 end end end @@ -319,15 +341,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.slogx ≈ sum(log.(x)) - @test ss.tw ≈ n0 + @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.slogx ≈ dot(log.(x), w) - @test ss.tw ≈ sum(w) + @test ss.tw ≈ sum(w) d = fit(dist, func[2](dist(3.9, 2.1), N)) @test isa(d, dist) @@ -353,11 +375,11 @@ end d = fit(dist, x) @test isa(d, dist) - @test succprob(d) ≈ inv(1. + mean(x)) + @test succprob(d) ≈ inv(1.0 + mean(x)) d = fit(dist, x, w) @test isa(d, dist) - @test succprob(d) ≈ inv(1. + dot(x, w) / sum(w)) + @test succprob(d) ≈ inv(1.0 + dot(x, w) / sum(w)) d = fit(dist, func[2](dist(0.3), N)) @test isa(d, dist) @@ -367,21 +389,29 @@ end @testset "Testing fit for Laplace" begin for func in funcs, dist in (Laplace, Laplace{Float64}) - d = fit(dist, func[2](dist(5.0, 3.0), N + 1)) + x = func[2](dist(5.0, 3.0), N) + w = func[1](N) + + d = fit(dist, x) @test isa(d, dist) @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d) , 3.0, atol=0.03) + @test isapprox(scale(d), 3.0, atol=0.03) + + d = fit(dist, x, w) + @test isa(d, dist) + @test isapprox(location(d), 5.0, atol=0.02) + @test isapprox(scale(d), 3.0, atol=0.03) end end @testset "Testing fit for Pareto" begin for func in funcs, dist in (Pareto, Pareto{Float64}) - x = func[2](dist(3., 7.), N) + x = func[2](dist(3.0, 7.0), N) d = fit(dist, x) @test isa(d, dist) - @test isapprox(shape(d), 3., atol=0.1) - @test isapprox(scale(d), 7., atol=0.1) + @test isapprox(shape(d), 3.0, atol=0.1) + @test isapprox(scale(d), 7.0, atol=0.1) end end @@ -414,6 +444,22 @@ end end end +@testset "Testing fit_mle for Product" begin + for func in funcs + dist = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) + x = rand(dist, N) + w = func[1](N) + + d = fit_mle(dist, x) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + + d = fit_mle(dist, x, w) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + end +end + @testset "Testing fit for InverseGaussian" begin for func in funcs, dist in (InverseGaussian, InverseGaussian{Float64}) x = rand(dist(3.9, 2.1), n0) @@ -421,15 +467,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.sinvx ≈ sum(1 ./ x) - @test ss.sw ≈ n0 + @test ss.sw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.sinvx ≈ dot(1 ./ x, w) - @test ss.sw ≈ sum(w) + @test ss.sw ≈ sum(w) d = fit(dist, rand(dist(3.9, 2.1), N)) @test isa(d, dist) @@ -460,8 +506,8 @@ end for func in funcs, dist in (Weibull, Weibull{Float64}) d = fit(dist, func[2](dist(8.1, 4.3), N)) @test isa(d, dist) - @test isapprox(d.α, 8.1, atol = 0.1) - @test isapprox(d.θ, 4.3, atol = 0.1) + @test isapprox(d.α, 8.1, atol=0.1) + @test isapprox(d.θ, 4.3, atol=0.1) end end From f933504c44877de69e2f22b98c8924361c71a540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Tue, 31 Jan 2023 21:13:54 +0100 Subject: [PATCH 10/17] doc ref Product --- docs/src/fit.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/fit.md b/docs/src/fit.md index de0756977..1e52baaa3 100644 --- a/docs/src/fit.md +++ b/docs/src/fit.md @@ -61,6 +61,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Multinomial`](@ref) - [`MvNormal`](@ref) - [`Dirichlet`](@ref) +- [`Product`](@ref) For most of these distributions, the usage is as described above. For a few special distributions that require additional information for estimation, we have to use a modified interface: From fe9c3c4b612f2d784251922e19a59b320963cf6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Wed, 1 Feb 2023 10:59:43 +0100 Subject: [PATCH 11/17] unformat --- src/genericfit.jl | 2 +- src/multivariate/product.jl | 4 +- src/univariate/continuous/laplace.jl | 2 +- src/univariate/discrete/categorical.jl | 36 +++---- src/univariate/discrete/dirac.jl | 2 +- test/fit.jl | 128 ++++++++----------------- 6 files changed, 64 insertions(+), 110 deletions(-) diff --git a/src/genericfit.jl b/src/genericfit.jl index 7c79fdcda..c84af46b1 100644 --- a/src/genericfit.jl +++ b/src/genericfit.jl @@ -1,6 +1,6 @@ # generic functions for distribution fitting -function suffstats(dt::Type{D}, xs...) where {D<:Distribution} +function suffstats(dt::Type{D}, xs...) where D<:Distribution argtypes = tuple(D, map(typeof, xs)...) error("suffstats is not implemented for $argtypes.") end diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 083ebfce3..5e6b264f3 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -27,12 +27,12 @@ function Product(v::V) where {S<:ValueSupport,T<:UnivariateDistribution{S},V<:Ab "`Product(v)` is deprecated, please use `product_distribution(v)`", :Product, ) - return Product{S,T,V}(v) + return Product{S, T ,V}(v) end length(d::Product) = length(d.v) function Base.eltype(::Type{<:Product{S,T}}) where {S<:ValueSupport, - T<:UnivariateDistribution{S}} + T<:UnivariateDistribution{S}} return eltype(T) end diff --git a/src/univariate/continuous/laplace.jl b/src/univariate/continuous/laplace.jl index caeaa1d29..245b29f20 100644 --- a/src/univariate/continuous/laplace.jl +++ b/src/univariate/continuous/laplace.jl @@ -147,4 +147,4 @@ function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}, w::AbstractArray{< m = median(xc, weights(w)) # https://github.com/JuliaStats/StatsBase.jl/blob/0b64a9c8a16355da16469d0fe5016e0fdf099af5/src/weights.jl#L788 xc .= abs.(x .- m) return Laplace(m, mean(xc, weights(w))) -end \ No newline at end of file +end diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl index d5270db3c..a49810d45 100644 --- a/src/univariate/discrete/categorical.jl +++ b/src/univariate/discrete/categorical.jl @@ -26,7 +26,7 @@ External links: """ const Categorical{P<:Real,Ps<:AbstractVector{P}} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps} -function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real,Ps<:AbstractVector{P}} +function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real, Ps<:AbstractVector{P}} @check_args Categorical (p, isprobvec(p), "vector p is not a probability vector") return Categorical{P,Ps}(Base.OneTo(length(p)), p; check_args=check_args) end @@ -36,7 +36,7 @@ Categorical(p::AbstractVector{P}; check_args::Bool=true) where {P<:Real} = function Categorical(k::Integer; check_args::Bool=true) @check_args Categorical (k, k >= 1, "at least one category is required") - return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1 / k, k); check_args=false) + return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k); check_args=false) end Categorical(probabilities::Real...; check_args::Bool=true) = Categorical([probabilities...]; check_args=check_args) @@ -49,7 +49,7 @@ convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where { ### Parameters ncategories(d::Categorical) = support(d).stop -params(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = (probs(d),) +params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),) partype(::Categorical{T}) where {T<:Real} = T ### Statistics @@ -59,7 +59,7 @@ function median(d::Categorical{T}) where {T<:Real} p = probs(d) cp = zero(T) i = 0 - while cp < 1 / 2 && i <= k + while cp < 1/2 && i <= k i += 1 @inbounds cp += p[i] end @@ -87,16 +87,16 @@ function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Re vr = min(vlast, ncategories(d)) p = probs(d) if vl > vfirst - for i = 1:(vl-vfirst) + for i = 1:(vl - vfirst) r[i] = zero(T) end end fm1 = vfirst - 1 for v = vl:vr - r[v-fm1] = p[v] + r[v - fm1] = p[v] end if vr < vlast - for i = (vr-vfirst+2):length(rgn) + for i = (vr - vfirst + 2):length(rgn) r[i] = zero(T) end end @@ -107,7 +107,7 @@ end # sampling sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = - AliasTable(probs(d)) + AliasTable(probs(d)) ### sufficient statistics @@ -116,20 +116,20 @@ struct CategoricalStats <: SufficientStats h::Vector{Float64} end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where {T<:Integer} - for i = 1:length(x) +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where T<:Integer + for i = 1 : length(x) @inbounds xi = x[i] - h[xi] += 1.0 # cannot use @inbounds, as no guarantee that x[i] is in bound + h[xi] += 1. # cannot use @inbounds, as no guarantee that x[i] is in bound end h end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer n = length(x) if n != length(w) throw(DimensionMismatch("Inconsistent array lengths.")) end - for i = 1:n + for i = 1 : n @inbounds xi = x[i] @inbounds wi = w[i] h[xi] += wi # cannot use @inbounds, as no guarantee that x[i] is in bound @@ -137,15 +137,15 @@ function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::Abs h end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where {T<:Integer} +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where T<:Integer CategoricalStats(add_categorical_counts!(zeros(k), x)) end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer CategoricalStats(add_categorical_counts!(zeros(k), x, w)) end -const CategoricalData = Tuple{Int,AbstractArray} +const CategoricalData = Tuple{Int, AbstractArray} suffstats(::Type{<:Categorical}, data::CategoricalData) = suffstats(Categorical, data...) suffstats(::Type{<:Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = suffstats(Categorical, data..., w) @@ -157,11 +157,11 @@ function fit_mle(::Type{<:Categorical}, ss::CategoricalStats) end fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Categorical} = fit_mle(T, ncategories(d), x) -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where {T<:Integer} +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where T<:Integer Categorical(normalize!(add_categorical_counts!(zeros(k), x), 1), check_args=false) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer Categorical(normalize!(add_categorical_counts!(zeros(k), x, w), 1), check_args=false) end diff --git a/src/univariate/discrete/dirac.jl b/src/univariate/discrete/dirac.jl index 49645078d..adf385740 100644 --- a/src/univariate/discrete/dirac.jl +++ b/src/univariate/discrete/dirac.jl @@ -50,7 +50,7 @@ logccdf(d::Dirac, x::Real) = x < d.value ? 0.0 : isnan(x) ? NaN : -Inf quantile(d::Dirac{T}, p::Real) where {T} = 0 <= p <= 1 ? d.value : T(NaN) mgf(d::Dirac, t) = exp(t * d.value) -cgf(d::Dirac, t) = t * d.value +cgf(d::Dirac, t) = t*d.value cf(d::Dirac, t) = cis(t * d.value) #### Sampling diff --git a/test/fit.jl b/test/fit.jl index 968369c79..4483dd1ec 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -14,7 +14,7 @@ N = 10^5 rng = MersenneTwister(123) -const funcs = ([rand, rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) +const funcs = ([rand,rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) @testset "Testing fit for DiscreteUniform" begin for func in funcs @@ -40,28 +40,28 @@ end for x in (z, OffsetArray(z, -n0 ÷ 2)), w in (v, OffsetArray(v, -n0 ÷ 2)) ss = @inferred suffstats(D, x) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 == n0 - count(t -> t != 0, z) - @test ss.cnt1 == count(t -> t != 0, z) + @test ss.cnt0 == n0 - count(t->t != 0, z) + @test ss.cnt1 == count(t->t != 0, z) ss = @inferred suffstats(D, x, w) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 ≈ sum(v[z.==0]) - @test ss.cnt1 ≈ sum(v[z.==1]) + @test ss.cnt0 ≈ sum(v[z .== 0]) + @test ss.cnt1 ≈ sum(v[z .== 1]) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ count(t -> t != 0, z) / n0 + @test mean(d) ≈ count(t->t != 0, z) / n0 d = @inferred fit(D, x, w) @test d isa D - @test mean(d) ≈ sum(v[z.==1]) / sum(v) + @test mean(d) ≈ sum(v[z .== 1]) / sum(v) end z = rand(rng..., Bernoulli(0.7), N) for x in (z, OffsetArray(z, -N ÷ 2)) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ 0.7 atol = 0.01 + @test mean(d) ≈ 0.7 atol=0.01 end end end @@ -114,11 +114,7 @@ end d = @inferred fit(D, 100, x) @test d isa D @test ntrials(d) == 100 - @test succprob(d) ≈ 0.3 atol = 0.01 - d2 = @inferred fit_mle(D(100, 0.5), x) - @test d2 isa D - @test ntrials(d2) == 100 - @test succprob(d2) ≈ 0.3 atol = 0.01 + @test succprob(d) ≈ 0.3 atol=0.01 end end end @@ -132,7 +128,7 @@ end w = func[1](n0) ss = suffstats(Categorical, (3, x)) - h = Float64[count(v -> v == i, x) for i = 1:3] + h = Float64[count(v->v == i, x) for i = 1 : 3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -145,12 +141,8 @@ end @test isa(d2, Categorical) @test probs(d2) == probs(d) - d3 = fit_mle(Categorical(p), x) - @test isa(d3, Categorical) - @test probs(d3) == probs(d) - ss = suffstats(Categorical, (3, x), w) - h = Float64[sum(w[x.==i]) for i = 1:3] + h = Float64[sum(w[x .== i]) for i = 1 : 3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -173,20 +165,6 @@ end @test fit(Cauchy{Float64}, collect(-4.0:4.0)) === Cauchy(0.0, 2.0) end -@testset "Testing fit for Delta" begin - for func in funcs, dist in (Dirac, Dirac{Float64}) - x = func[2](dist(1.3), n0) - w = func[1](n0) - d = fit(dist, x) - @test isa(d, dist) - @test isapprox(d.value, 1.3) - - d = fit(dist, x, w) - @test isa(d, dist) - @test isapprox(d.value, 1.3) - end -end - @testset "Testing fit for Exponential" begin for func in funcs, dist in (Exponential, Exponential{Float64}) w = func[1](n0) @@ -226,27 +204,27 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ sum(x) - @test ss.m ≈ mean(x) - @test ss.s2 ≈ sum((x .- ss.m) .^ 2) + @test ss.s ≈ sum(x) + @test ss.m ≈ mean(x) + @test ss.s2 ≈ sum((x .- ss.m).^2) @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ dot(x, w) - @test ss.m ≈ dot(x, w) / sum(w) - @test ss.s2 ≈ dot((x .- ss.m) .^ 2, w) + @test ss.s ≈ dot(x, w) + @test ss.m ≈ dot(x, w) / sum(w) + @test ss.s2 ≈ dot((x .- ss.m).^2, w) @test ss.tw ≈ sum(w) d = fit(dist, x) @test isa(d, dist) @test d.μ ≈ mean(x) - @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) + @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) d = fit(dist, x, w) @test isa(d, dist) @test d.μ ≈ dot(x, w) / sum(w) - @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) d = fit(dist, func[2](dist(μ, σ), N)) @test isa(d, dist) @@ -274,18 +252,18 @@ end ss = suffstats(NormalKnownMu(μ), x, w) @test isa(ss, Distributions.NormalKnownMuStats) @test ss.μ == μ - @test ss.s2 ≈ dot((x .- μ) .^ 2, w) + @test ss.s2 ≈ dot((x .- μ).^2, w) @test ss.tw ≈ sum(w) d = fit_mle(Normal, x; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) + @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) d = fit_mle(Normal, x, w; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) ss = suffstats(NormalKnownSigma(σ), x) @@ -328,8 +306,8 @@ end d = fit(D, x) @test d isa D @test 1.2 <= minimum(d) <= maximum(d) <= 5.8 - @test minimum(d) ≈ 1.2 atol = 0.02 - @test maximum(d) ≈ 5.8 atol = 0.02 + @test minimum(d) ≈ 1.2 atol=0.02 + @test maximum(d) ≈ 5.8 atol=0.02 end end end @@ -341,15 +319,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.slogx ≈ sum(log.(x)) - @test ss.tw ≈ n0 + @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.slogx ≈ dot(log.(x), w) - @test ss.tw ≈ sum(w) + @test ss.tw ≈ sum(w) d = fit(dist, func[2](dist(3.9, 2.1), N)) @test isa(d, dist) @@ -375,11 +353,11 @@ end d = fit(dist, x) @test isa(d, dist) - @test succprob(d) ≈ inv(1.0 + mean(x)) + @test succprob(d) ≈ inv(1. + mean(x)) d = fit(dist, x, w) @test isa(d, dist) - @test succprob(d) ≈ inv(1.0 + dot(x, w) / sum(w)) + @test succprob(d) ≈ inv(1. + dot(x, w) / sum(w)) d = fit(dist, func[2](dist(0.3), N)) @test isa(d, dist) @@ -389,29 +367,21 @@ end @testset "Testing fit for Laplace" begin for func in funcs, dist in (Laplace, Laplace{Float64}) - x = func[2](dist(5.0, 3.0), N) - w = func[1](N) - - d = fit(dist, x) + d = fit(dist, func[2](dist(5.0, 3.0), N + 1)) @test isa(d, dist) @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d), 3.0, atol=0.03) - - d = fit(dist, x, w) - @test isa(d, dist) - @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d), 3.0, atol=0.03) + @test isapprox(scale(d) , 3.0, atol=0.03) end end @testset "Testing fit for Pareto" begin for func in funcs, dist in (Pareto, Pareto{Float64}) - x = func[2](dist(3.0, 7.0), N) + x = func[2](dist(3., 7.), N) d = fit(dist, x) @test isa(d, dist) - @test isapprox(shape(d), 3.0, atol=0.1) - @test isapprox(scale(d), 7.0, atol=0.1) + @test isapprox(shape(d), 3., atol=0.1) + @test isapprox(scale(d), 7., atol=0.1) end end @@ -444,22 +414,6 @@ end end end -@testset "Testing fit_mle for Product" begin - for func in funcs - dist = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) - x = rand(dist, N) - w = func[1](N) - - d = fit_mle(dist, x) - @test isa(d, typeof(dist)) - @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) - - d = fit_mle(dist, x, w) - @test isa(d, typeof(dist)) - @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) - end -end - @testset "Testing fit for InverseGaussian" begin for func in funcs, dist in (InverseGaussian, InverseGaussian{Float64}) x = rand(dist(3.9, 2.1), n0) @@ -467,15 +421,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.sinvx ≈ sum(1 ./ x) - @test ss.sw ≈ n0 + @test ss.sw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.sinvx ≈ dot(1 ./ x, w) - @test ss.sw ≈ sum(w) + @test ss.sw ≈ sum(w) d = fit(dist, rand(dist(3.9, 2.1), N)) @test isa(d, dist) @@ -506,8 +460,8 @@ end for func in funcs, dist in (Weibull, Weibull{Float64}) d = fit(dist, func[2](dist(8.1, 4.3), N)) @test isa(d, dist) - @test isapprox(d.α, 8.1, atol=0.1) - @test isapprox(d.θ, 4.3, atol=0.1) + @test isapprox(d.α, 8.1, atol = 0.1) + @test isapprox(d.θ, 4.3, atol = 0.1) end end From 07433f28d7a837acc21c0432cc5b1a7f508bb64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Wed, 1 Feb 2023 11:00:41 +0100 Subject: [PATCH 12/17] unformat --- src/genericfit.jl | 2 +- src/multivariate/product.jl | 4 +- src/univariate/continuous/laplace.jl | 2 +- src/univariate/discrete/categorical.jl | 36 +++---- src/univariate/discrete/dirac.jl | 2 +- test/fit.jl | 128 ++++++++----------------- 6 files changed, 64 insertions(+), 110 deletions(-) diff --git a/src/genericfit.jl b/src/genericfit.jl index 7c79fdcda..c84af46b1 100644 --- a/src/genericfit.jl +++ b/src/genericfit.jl @@ -1,6 +1,6 @@ # generic functions for distribution fitting -function suffstats(dt::Type{D}, xs...) where {D<:Distribution} +function suffstats(dt::Type{D}, xs...) where D<:Distribution argtypes = tuple(D, map(typeof, xs)...) error("suffstats is not implemented for $argtypes.") end diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 083ebfce3..5e6b264f3 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -27,12 +27,12 @@ function Product(v::V) where {S<:ValueSupport,T<:UnivariateDistribution{S},V<:Ab "`Product(v)` is deprecated, please use `product_distribution(v)`", :Product, ) - return Product{S,T,V}(v) + return Product{S, T ,V}(v) end length(d::Product) = length(d.v) function Base.eltype(::Type{<:Product{S,T}}) where {S<:ValueSupport, - T<:UnivariateDistribution{S}} + T<:UnivariateDistribution{S}} return eltype(T) end diff --git a/src/univariate/continuous/laplace.jl b/src/univariate/continuous/laplace.jl index caeaa1d29..245b29f20 100644 --- a/src/univariate/continuous/laplace.jl +++ b/src/univariate/continuous/laplace.jl @@ -147,4 +147,4 @@ function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}, w::AbstractArray{< m = median(xc, weights(w)) # https://github.com/JuliaStats/StatsBase.jl/blob/0b64a9c8a16355da16469d0fe5016e0fdf099af5/src/weights.jl#L788 xc .= abs.(x .- m) return Laplace(m, mean(xc, weights(w))) -end \ No newline at end of file +end diff --git a/src/univariate/discrete/categorical.jl b/src/univariate/discrete/categorical.jl index d5270db3c..a49810d45 100644 --- a/src/univariate/discrete/categorical.jl +++ b/src/univariate/discrete/categorical.jl @@ -26,7 +26,7 @@ External links: """ const Categorical{P<:Real,Ps<:AbstractVector{P}} = DiscreteNonParametric{Int,P,Base.OneTo{Int},Ps} -function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real,Ps<:AbstractVector{P}} +function Categorical{P,Ps}(p::Ps; check_args::Bool=true) where {P<:Real, Ps<:AbstractVector{P}} @check_args Categorical (p, isprobvec(p), "vector p is not a probability vector") return Categorical{P,Ps}(Base.OneTo(length(p)), p; check_args=check_args) end @@ -36,7 +36,7 @@ Categorical(p::AbstractVector{P}; check_args::Bool=true) where {P<:Real} = function Categorical(k::Integer; check_args::Bool=true) @check_args Categorical (k, k >= 1, "at least one category is required") - return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1 / k, k); check_args=false) + return Categorical{Float64,Vector{Float64}}(Base.OneTo(k), fill(1/k, k); check_args=false) end Categorical(probabilities::Real...; check_args::Bool=true) = Categorical([probabilities...]; check_args=check_args) @@ -49,7 +49,7 @@ convert(::Type{Categorical{P,Ps}}, x::AbstractVector{<:Real}) where { ### Parameters ncategories(d::Categorical) = support(d).stop -params(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = (probs(d),) +params(d::Categorical{P,Ps}) where {P<:Real, Ps<:AbstractVector{P}} = (probs(d),) partype(::Categorical{T}) where {T<:Real} = T ### Statistics @@ -59,7 +59,7 @@ function median(d::Categorical{T}) where {T<:Real} p = probs(d) cp = zero(T) i = 0 - while cp < 1 / 2 && i <= k + while cp < 1/2 && i <= k i += 1 @inbounds cp += p[i] end @@ -87,16 +87,16 @@ function _pdf!(r::AbstractArray, d::Categorical{T}, rgn::UnitRange) where {T<:Re vr = min(vlast, ncategories(d)) p = probs(d) if vl > vfirst - for i = 1:(vl-vfirst) + for i = 1:(vl - vfirst) r[i] = zero(T) end end fm1 = vfirst - 1 for v = vl:vr - r[v-fm1] = p[v] + r[v - fm1] = p[v] end if vr < vlast - for i = (vr-vfirst+2):length(rgn) + for i = (vr - vfirst + 2):length(rgn) r[i] = zero(T) end end @@ -107,7 +107,7 @@ end # sampling sampler(d::Categorical{P,Ps}) where {P<:Real,Ps<:AbstractVector{P}} = - AliasTable(probs(d)) + AliasTable(probs(d)) ### sufficient statistics @@ -116,20 +116,20 @@ struct CategoricalStats <: SufficientStats h::Vector{Float64} end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where {T<:Integer} - for i = 1:length(x) +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}) where T<:Integer + for i = 1 : length(x) @inbounds xi = x[i] - h[xi] += 1.0 # cannot use @inbounds, as no guarantee that x[i] is in bound + h[xi] += 1. # cannot use @inbounds, as no guarantee that x[i] is in bound end h end -function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer n = length(x) if n != length(w) throw(DimensionMismatch("Inconsistent array lengths.")) end - for i = 1:n + for i = 1 : n @inbounds xi = x[i] @inbounds wi = w[i] h[xi] += wi # cannot use @inbounds, as no guarantee that x[i] is in bound @@ -137,15 +137,15 @@ function add_categorical_counts!(h::Vector{Float64}, x::AbstractArray{T}, w::Abs h end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where {T<:Integer} +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}) where T<:Integer CategoricalStats(add_categorical_counts!(zeros(k), x)) end -function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function suffstats(::Type{<:Categorical}, k::Int, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer CategoricalStats(add_categorical_counts!(zeros(k), x, w)) end -const CategoricalData = Tuple{Int,AbstractArray} +const CategoricalData = Tuple{Int, AbstractArray} suffstats(::Type{<:Categorical}, data::CategoricalData) = suffstats(Categorical, data...) suffstats(::Type{<:Categorical}, data::CategoricalData, w::AbstractArray{Float64}) = suffstats(Categorical, data..., w) @@ -157,11 +157,11 @@ function fit_mle(::Type{<:Categorical}, ss::CategoricalStats) end fit_mle(d::T, x::AbstractArray{<:Integer}) where {T<:Categorical} = fit_mle(T, ncategories(d), x) -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where {T<:Integer} +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}) where T<:Integer Categorical(normalize!(add_categorical_counts!(zeros(k), x), 1), check_args=false) end -function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Integer} +function fit_mle(::Type{<:Categorical}, k::Integer, x::AbstractArray{T}, w::AbstractArray{Float64}) where T<:Integer Categorical(normalize!(add_categorical_counts!(zeros(k), x, w), 1), check_args=false) end diff --git a/src/univariate/discrete/dirac.jl b/src/univariate/discrete/dirac.jl index 49645078d..adf385740 100644 --- a/src/univariate/discrete/dirac.jl +++ b/src/univariate/discrete/dirac.jl @@ -50,7 +50,7 @@ logccdf(d::Dirac, x::Real) = x < d.value ? 0.0 : isnan(x) ? NaN : -Inf quantile(d::Dirac{T}, p::Real) where {T} = 0 <= p <= 1 ? d.value : T(NaN) mgf(d::Dirac, t) = exp(t * d.value) -cgf(d::Dirac, t) = t * d.value +cgf(d::Dirac, t) = t*d.value cf(d::Dirac, t) = cis(t * d.value) #### Sampling diff --git a/test/fit.jl b/test/fit.jl index 968369c79..4483dd1ec 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -14,7 +14,7 @@ N = 10^5 rng = MersenneTwister(123) -const funcs = ([rand, rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) +const funcs = ([rand,rand], [dist -> rand(rng, dist), (dist, n) -> rand(rng, dist, n)]) @testset "Testing fit for DiscreteUniform" begin for func in funcs @@ -40,28 +40,28 @@ end for x in (z, OffsetArray(z, -n0 ÷ 2)), w in (v, OffsetArray(v, -n0 ÷ 2)) ss = @inferred suffstats(D, x) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 == n0 - count(t -> t != 0, z) - @test ss.cnt1 == count(t -> t != 0, z) + @test ss.cnt0 == n0 - count(t->t != 0, z) + @test ss.cnt1 == count(t->t != 0, z) ss = @inferred suffstats(D, x, w) @test ss isa Distributions.BernoulliStats - @test ss.cnt0 ≈ sum(v[z.==0]) - @test ss.cnt1 ≈ sum(v[z.==1]) + @test ss.cnt0 ≈ sum(v[z .== 0]) + @test ss.cnt1 ≈ sum(v[z .== 1]) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ count(t -> t != 0, z) / n0 + @test mean(d) ≈ count(t->t != 0, z) / n0 d = @inferred fit(D, x, w) @test d isa D - @test mean(d) ≈ sum(v[z.==1]) / sum(v) + @test mean(d) ≈ sum(v[z .== 1]) / sum(v) end z = rand(rng..., Bernoulli(0.7), N) for x in (z, OffsetArray(z, -N ÷ 2)) d = @inferred fit(D, x) @test d isa D - @test mean(d) ≈ 0.7 atol = 0.01 + @test mean(d) ≈ 0.7 atol=0.01 end end end @@ -114,11 +114,7 @@ end d = @inferred fit(D, 100, x) @test d isa D @test ntrials(d) == 100 - @test succprob(d) ≈ 0.3 atol = 0.01 - d2 = @inferred fit_mle(D(100, 0.5), x) - @test d2 isa D - @test ntrials(d2) == 100 - @test succprob(d2) ≈ 0.3 atol = 0.01 + @test succprob(d) ≈ 0.3 atol=0.01 end end end @@ -132,7 +128,7 @@ end w = func[1](n0) ss = suffstats(Categorical, (3, x)) - h = Float64[count(v -> v == i, x) for i = 1:3] + h = Float64[count(v->v == i, x) for i = 1 : 3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -145,12 +141,8 @@ end @test isa(d2, Categorical) @test probs(d2) == probs(d) - d3 = fit_mle(Categorical(p), x) - @test isa(d3, Categorical) - @test probs(d3) == probs(d) - ss = suffstats(Categorical, (3, x), w) - h = Float64[sum(w[x.==i]) for i = 1:3] + h = Float64[sum(w[x .== i]) for i = 1 : 3] @test isa(ss, Distributions.CategoricalStats) @test ss.h ≈ h @@ -173,20 +165,6 @@ end @test fit(Cauchy{Float64}, collect(-4.0:4.0)) === Cauchy(0.0, 2.0) end -@testset "Testing fit for Delta" begin - for func in funcs, dist in (Dirac, Dirac{Float64}) - x = func[2](dist(1.3), n0) - w = func[1](n0) - d = fit(dist, x) - @test isa(d, dist) - @test isapprox(d.value, 1.3) - - d = fit(dist, x, w) - @test isa(d, dist) - @test isapprox(d.value, 1.3) - end -end - @testset "Testing fit for Exponential" begin for func in funcs, dist in (Exponential, Exponential{Float64}) w = func[1](n0) @@ -226,27 +204,27 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ sum(x) - @test ss.m ≈ mean(x) - @test ss.s2 ≈ sum((x .- ss.m) .^ 2) + @test ss.s ≈ sum(x) + @test ss.m ≈ mean(x) + @test ss.s2 ≈ sum((x .- ss.m).^2) @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.NormalStats) - @test ss.s ≈ dot(x, w) - @test ss.m ≈ dot(x, w) / sum(w) - @test ss.s2 ≈ dot((x .- ss.m) .^ 2, w) + @test ss.s ≈ dot(x, w) + @test ss.m ≈ dot(x, w) / sum(w) + @test ss.s2 ≈ dot((x .- ss.m).^2, w) @test ss.tw ≈ sum(w) d = fit(dist, x) @test isa(d, dist) @test d.μ ≈ mean(x) - @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) + @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) d = fit(dist, x, w) @test isa(d, dist) @test d.μ ≈ dot(x, w) / sum(w) - @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) d = fit(dist, func[2](dist(μ, σ), N)) @test isa(d, dist) @@ -274,18 +252,18 @@ end ss = suffstats(NormalKnownMu(μ), x, w) @test isa(ss, Distributions.NormalKnownMuStats) @test ss.μ == μ - @test ss.s2 ≈ dot((x .- μ) .^ 2, w) + @test ss.s2 ≈ dot((x .- μ).^2, w) @test ss.tw ≈ sum(w) d = fit_mle(Normal, x; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(mean((x .- d.μ) .^ 2)) + @test d.σ ≈ sqrt(mean((x .- d.μ).^2)) d = fit_mle(Normal, x, w; mu=μ) @test isa(d, Normal) @test d.μ == μ - @test d.σ ≈ sqrt(dot((x .- d.μ) .^ 2, w) / sum(w)) + @test d.σ ≈ sqrt(dot((x .- d.μ).^2, w) / sum(w)) ss = suffstats(NormalKnownSigma(σ), x) @@ -328,8 +306,8 @@ end d = fit(D, x) @test d isa D @test 1.2 <= minimum(d) <= maximum(d) <= 5.8 - @test minimum(d) ≈ 1.2 atol = 0.02 - @test maximum(d) ≈ 5.8 atol = 0.02 + @test minimum(d) ≈ 1.2 atol=0.02 + @test maximum(d) ≈ 5.8 atol=0.02 end end end @@ -341,15 +319,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.slogx ≈ sum(log.(x)) - @test ss.tw ≈ n0 + @test ss.tw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.GammaStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.slogx ≈ dot(log.(x), w) - @test ss.tw ≈ sum(w) + @test ss.tw ≈ sum(w) d = fit(dist, func[2](dist(3.9, 2.1), N)) @test isa(d, dist) @@ -375,11 +353,11 @@ end d = fit(dist, x) @test isa(d, dist) - @test succprob(d) ≈ inv(1.0 + mean(x)) + @test succprob(d) ≈ inv(1. + mean(x)) d = fit(dist, x, w) @test isa(d, dist) - @test succprob(d) ≈ inv(1.0 + dot(x, w) / sum(w)) + @test succprob(d) ≈ inv(1. + dot(x, w) / sum(w)) d = fit(dist, func[2](dist(0.3), N)) @test isa(d, dist) @@ -389,29 +367,21 @@ end @testset "Testing fit for Laplace" begin for func in funcs, dist in (Laplace, Laplace{Float64}) - x = func[2](dist(5.0, 3.0), N) - w = func[1](N) - - d = fit(dist, x) + d = fit(dist, func[2](dist(5.0, 3.0), N + 1)) @test isa(d, dist) @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d), 3.0, atol=0.03) - - d = fit(dist, x, w) - @test isa(d, dist) - @test isapprox(location(d), 5.0, atol=0.02) - @test isapprox(scale(d), 3.0, atol=0.03) + @test isapprox(scale(d) , 3.0, atol=0.03) end end @testset "Testing fit for Pareto" begin for func in funcs, dist in (Pareto, Pareto{Float64}) - x = func[2](dist(3.0, 7.0), N) + x = func[2](dist(3., 7.), N) d = fit(dist, x) @test isa(d, dist) - @test isapprox(shape(d), 3.0, atol=0.1) - @test isapprox(scale(d), 7.0, atol=0.1) + @test isapprox(shape(d), 3., atol=0.1) + @test isapprox(scale(d), 7., atol=0.1) end end @@ -444,22 +414,6 @@ end end end -@testset "Testing fit_mle for Product" begin - for func in funcs - dist = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) - x = rand(dist, N) - w = func[1](N) - - d = fit_mle(dist, x) - @test isa(d, typeof(dist)) - @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) - - d = fit_mle(dist, x, w) - @test isa(d, typeof(dist)) - @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) - end -end - @testset "Testing fit for InverseGaussian" begin for func in funcs, dist in (InverseGaussian, InverseGaussian{Float64}) x = rand(dist(3.9, 2.1), n0) @@ -467,15 +421,15 @@ end ss = suffstats(dist, x) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ sum(x) + @test ss.sx ≈ sum(x) @test ss.sinvx ≈ sum(1 ./ x) - @test ss.sw ≈ n0 + @test ss.sw ≈ n0 ss = suffstats(dist, x, w) @test isa(ss, Distributions.InverseGaussianStats) - @test ss.sx ≈ dot(x, w) + @test ss.sx ≈ dot(x, w) @test ss.sinvx ≈ dot(1 ./ x, w) - @test ss.sw ≈ sum(w) + @test ss.sw ≈ sum(w) d = fit(dist, rand(dist(3.9, 2.1), N)) @test isa(d, dist) @@ -506,8 +460,8 @@ end for func in funcs, dist in (Weibull, Weibull{Float64}) d = fit(dist, func[2](dist(8.1, 4.3), N)) @test isa(d, dist) - @test isapprox(d.α, 8.1, atol=0.1) - @test isapprox(d.θ, 4.3, atol=0.1) + @test isapprox(d.α, 8.1, atol = 0.1) + @test isapprox(d.θ, 4.3, atol = 0.1) end end From dd4629da8ce8eab8dca0686425594aca31b1cf66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:08:08 +0100 Subject: [PATCH 13/17] mv Product to ProductDistr rm Laplace, Dirac --- docs/src/fit.md | 5 +++-- src/multivariate/product.jl | 13 ------------- src/product.jl | 19 +++++++++++++++++++ src/univariate/continuous/laplace.jl | 12 ------------ src/univariate/discrete/dirac.jl | 12 ------------ 5 files changed, 22 insertions(+), 39 deletions(-) diff --git a/docs/src/fit.md b/docs/src/fit.md index 1e52baaa3..3c1903c22 100644 --- a/docs/src/fit.md +++ b/docs/src/fit.md @@ -42,7 +42,6 @@ The `fit_mle` method has been implemented for the following distributions: - [`Binomial`](@ref) - [`Categorical`](@ref) - [`DiscreteUniform`](@ref) -- [`Dirac`](@ref) - [`Exponential`](@ref) - [`LogNormal`](@ref) - [`Normal`](@ref) @@ -61,7 +60,7 @@ The `fit_mle` method has been implemented for the following distributions: - [`Multinomial`](@ref) - [`MvNormal`](@ref) - [`Dirichlet`](@ref) -- [`Product`](@ref) +- [`ProductDistribution`](@ref) For most of these distributions, the usage is as described above. For a few special distributions that require additional information for estimation, we have to use a modified interface: @@ -77,6 +76,7 @@ fit_mle(Categorical, x, w) ``` It is also possible to directly input a distribution `fit_mle(d::Distribution, x[, w])`. This form avoids the extra arguments: + ```julia fit_mle(Binomial(n, 0.1), x) # equivalent to fit_mle(Binomial, ntrials(Binomial(n, 0.1)), x), here the parameter 0.1 is not used @@ -88,6 +88,7 @@ d = product_distribution([Exponential(0.5), Normal(11.3, 3.2)]) fit_mle(d, x) # equivalent to product_distribution([fit_mle(Exponential, x[1,:]), fit_mle(Normal, x[2, :])]). Again parameters of d are not used. ``` + Note that for standard distributions, the values of the distribution parameters `d` are not used in `fit_mle` only the “structure” of `d` is passed into `fit_mle`. However, for complex Maximum Likelihood estimation requiring optimization, e.g., EM algorithm, one could use `D` as an initial guess. diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 5e6b264f3..8d9607c4e 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -63,16 +63,3 @@ function product_distribution(dists::V) where {S<:ValueSupport,T<:UnivariateDist return Product{S,T,V}(dists) end -#### Fitting - -""" - fit_mle(g::Product, x::AbstractMatrix) - fit_mle(g::Product, x::AbstractMatrix, γ::AbstractVector) - -The `fit_mle` for a multivariate Product distributions `g` is the `product_distribution` of `fit_mle` of each components of `g`. -""" -function fit_mle(g::Product, x::AbstractMatrix, args...) - d = size(x, 1) - length(g) == d || throw(DimensionMismatch("The dimensions of g and x are inconsistent.")) - return product_distribution([fit_mle(g.v[s], y, args...) for (s, y) in enumerate(eachrow(x))]) -end diff --git a/src/product.jl b/src/product.jl index 7a4904ae7..def59f3e7 100644 --- a/src/product.jl +++ b/src/product.jl @@ -244,3 +244,22 @@ function product_distribution(dists::AbstractVector{<:Normal}) σ2 = map(var, dists) return MvNormal(µ, Diagonal(σ2)) end + +#### Fitting +promote_sample(::Type{dT}, x::AbstractArray{T}) where {T<:Real, dT<:Real} = T <: dT ? x : convert.(dT, x) + +""" + fit_mle(dists::ArrayOfUnivariateDistribution, x::AbstractArray) + fit_mle(dists::ArrayOfUnivariateDistribution, x::AbstractArray, γ::AbstractVector) + +The `fit_mle` for a `ArrayOfUnivariateDistribution` distributions `dists` is the `product_distribution` of `fit_mle` of each components of `dists`. +""" +function fit_mle(dists::VectorOfUnivariateDistribution, x::AbstractMatrix{<:Real}, args...) + length(dists) == size(x, 1) || throw(DimensionMismatch("The dimensions of dists and x are inconsistent.")) + return product_distribution([fit_mle(d, promote_sample(eltype(d), x[s, :]), args...) for (s, d) in enumerate(dists.dists)]) +end + +function fit_mle(dists::ArrayOfUnivariateDistribution, x::AbstractArray, args...) + size(dists) == size(first(x)) || throw(DimensionMismatch("The dimensions of dists and x are inconsistent.")) + return product_distribution([fit_mle(d, promote_sample(eltype(d), [x[i][s] for i in eachindex(x)]), args...) for (s, d) in enumerate(dists.dists)]) +end \ No newline at end of file diff --git a/src/univariate/continuous/laplace.jl b/src/univariate/continuous/laplace.jl index 245b29f20..2a7bf04a4 100644 --- a/src/univariate/continuous/laplace.jl +++ b/src/univariate/continuous/laplace.jl @@ -136,15 +136,3 @@ function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}) xc .= abs.(x .- m) return Laplace(m, mean(xc)) end - -function fit_mle(::Type{<:Laplace}, x::AbstractArray{<:Real}, w::AbstractArray{<:Real}) - n = length(x) - if n != length(w) - throw(DimensionMismatch("Inconsistent array lengths.")) - end - xc = similar(x) - copyto!(xc, x) - m = median(xc, weights(w)) # https://github.com/JuliaStats/StatsBase.jl/blob/0b64a9c8a16355da16469d0fe5016e0fdf099af5/src/weights.jl#L788 - xc .= abs.(x .- m) - return Laplace(m, mean(xc, weights(w))) -end diff --git a/src/univariate/discrete/dirac.jl b/src/univariate/discrete/dirac.jl index adf385740..94d082b0f 100644 --- a/src/univariate/discrete/dirac.jl +++ b/src/univariate/discrete/dirac.jl @@ -56,15 +56,3 @@ cf(d::Dirac, t) = cis(t * d.value) #### Sampling rand(rng::AbstractRNG, d::Dirac) = d.value - -#### Fitting - -fit_mle(::Type{<:Dirac}, x::AbstractArray{T}) where {T<:Real} = length(unique(x)) == 1 ? Dirac(first(x)) : Dirac(NaN) - -function fit_mle(::Type{<:Dirac}, x::AbstractArray{T}, w::AbstractArray{Float64}) where {T<:Real} - n = length(x) - if n != length(w) - throw(DimensionMismatch("Inconsistent array lengths.")) - end - return length(unique(x[findall(!iszero, w)])) == 1 ? Dirac(first(x)) : Dirac(NaN) -end \ No newline at end of file From d892d9876b3b1e192505ce5d56b22b8917653119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:35:48 +0100 Subject: [PATCH 14/17] add test --- src/multivariate/product.jl | 13 +++++++++++++ src/product.jl | 2 ++ test/fit.jl | 26 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 8d9607c4e..5e6b264f3 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -63,3 +63,16 @@ function product_distribution(dists::V) where {S<:ValueSupport,T<:UnivariateDist return Product{S,T,V}(dists) end +#### Fitting + +""" + fit_mle(g::Product, x::AbstractMatrix) + fit_mle(g::Product, x::AbstractMatrix, γ::AbstractVector) + +The `fit_mle` for a multivariate Product distributions `g` is the `product_distribution` of `fit_mle` of each components of `g`. +""" +function fit_mle(g::Product, x::AbstractMatrix, args...) + d = size(x, 1) + length(g) == d || throw(DimensionMismatch("The dimensions of g and x are inconsistent.")) + return product_distribution([fit_mle(g.v[s], y, args...) for (s, y) in enumerate(eachrow(x))]) +end diff --git a/src/product.jl b/src/product.jl index def59f3e7..e862abc32 100644 --- a/src/product.jl +++ b/src/product.jl @@ -74,6 +74,8 @@ end size(d::ProductDistribution) = d.size +params(d::ArrayOfUnivariateDistribution) = params.(d.dists) + mean(d::ProductDistribution) = reshape(mapreduce(vec ∘ mean, vcat, d.dists), size(d)) var(d::ProductDistribution) = reshape(mapreduce(vec ∘ var, vcat, d.dists), size(d)) cov(d::ProductDistribution) = Diagonal(vec(var(d))) diff --git a/test/fit.jl b/test/fit.jl index 4483dd1ec..d587d0d87 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -115,6 +115,11 @@ end @test d isa D @test ntrials(d) == 100 @test succprob(d) ≈ 0.3 atol=0.01 + + d2 = @inferred fit_mle(D(100, 0.5), x) + @test d2 isa D + @test ntrials(d2) == 100 + @test succprob(d2) ≈ 0.3 atol = 0.01 end end end @@ -141,6 +146,10 @@ end @test isa(d2, Categorical) @test probs(d2) == probs(d) + d3 = fit_mle(Categorical(p), x) + @test isa(d3, Categorical) + @test probs(d3) == probs(d) + ss = suffstats(Categorical, (3, x), w) h = Float64[sum(w[x .== i]) for i = 1 : 3] @test isa(ss, Distributions.CategoricalStats) @@ -414,6 +423,23 @@ end end end +@testset "Testing fit_mle for ProductDistribution" begin + dists = [product_distribution([Exponential(0.5), Normal(11.3, 3.2)]), product_distribution([Exponential(0.5) Normal(11.3, 3.2) + Bernoulli(0.2) Normal{Float32}(0f0,0f0)])] + for func in funcs, dist in dists + x = rand(dist, N) + w = func[1](N) + + d = fit_mle(dist, x) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + + d = fit_mle(dist, x, w) + @test isa(d, typeof(dist)) + @test isapprox(collect.(params(d)), collect.(params(dist)), atol=0.1) + end +end + @testset "Testing fit for InverseGaussian" begin for func in funcs, dist in (InverseGaussian, InverseGaussian{Float64}) x = rand(dist(3.9, 2.1), n0) From 2f8139fe5bf788b11446aa3454345d3cc320f4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:08:41 +0100 Subject: [PATCH 15/17] Update docs/src/fit.md Co-authored-by: David Widmann --- docs/src/fit.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/fit.md b/docs/src/fit.md index 3c1903c22..b04341667 100644 --- a/docs/src/fit.md +++ b/docs/src/fit.md @@ -60,7 +60,6 @@ The `fit_mle` method has been implemented for the following distributions: - [`Multinomial`](@ref) - [`MvNormal`](@ref) - [`Dirichlet`](@ref) -- [`ProductDistribution`](@ref) For most of these distributions, the usage is as described above. For a few special distributions that require additional information for estimation, we have to use a modified interface: From da8929f180eeb0e052ec753513de75f55d809c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:16:18 +0100 Subject: [PATCH 16/17] Update src/genericfit.jl Co-authored-by: David Widmann --- src/genericfit.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genericfit.jl b/src/genericfit.jl index c84af46b1..f5cb7c72b 100644 --- a/src/genericfit.jl +++ b/src/genericfit.jl @@ -30,7 +30,7 @@ fit_mle(dt::Type{D}, x::AbstractArray, w::AbstractArray) where {D<:UnivariateDis fit_mle(dt::Type{D}, x::AbstractMatrix) where {D<:MultivariateDistribution} = fit_mle(D, suffstats(D, x)) fit_mle(dt::Type{D}, x::AbstractMatrix, w::AbstractArray) where {D<:MultivariateDistribution} = fit_mle(D, suffstats(D, x, w)) -fit_mle(dt::D, args...) where {D<:Distribution} = fit_mle(typeof(dt), args...) +fit_mle(dist::Distribution, args...) = fit_mle(typeof(dist), args...) fit(dt::Type{D}, x) where {D<:Distribution} = fit_mle(D, x) fit(dt::Type{D}, args...) where {D<:Distribution} = fit_mle(D, args...) From 1a57ad481e884a9a1f836a8d806464cbc9dabd32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=A9tivier?= <46794064+dmetivie@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:16:37 +0100 Subject: [PATCH 17/17] Update src/multivariate/product.jl Co-authored-by: David Widmann --- src/multivariate/product.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multivariate/product.jl b/src/multivariate/product.jl index 5e6b264f3..32366fed3 100644 --- a/src/multivariate/product.jl +++ b/src/multivariate/product.jl @@ -27,7 +27,7 @@ function Product(v::V) where {S<:ValueSupport,T<:UnivariateDistribution{S},V<:Ab "`Product(v)` is deprecated, please use `product_distribution(v)`", :Product, ) - return Product{S, T ,V}(v) + return Product{S, T, V}(v) end length(d::Product) = length(d.v)