Commit bfb7401
Merge pull request #33 from bdeonovic/master
Power Divergence Test
simonbyrne committed Mar 6, 2015
2 parents 0eefbc7 + 520e95d commit bfb7401
Showing 6 changed files with 530 additions and 1 deletion.
3 changes: 2 additions & 1 deletion REQUIRE
@@ -1,2 +1,3 @@
Distributions
Roots
Roots
StatsBase
46 changes: 46 additions & 0 deletions doc/parametric/power_divergence_test.rst
@@ -0,0 +1,46 @@
Power Divergence Test
=============================================

.. function:: PowerDivergenceTest(x[, y][, lambda][, theta0])

If ``x`` is a matrix with one row or column, or if ``x`` is a vector and ``y`` is
not given, then a goodness-of-fit test is performed (``x`` is treated as a
one-dimensional contingency table). The entries of ``x`` must be non-negative
integers. In this case, the hypothesis tested is whether the population
probabilities equal those in ``theta0``, or are all equal if ``theta0`` is not
given.
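
For example, a hypothetical goodness-of-fit call on a vector of observed
counts, first against equal cell probabilities and then against a specified
``theta0`` (the counts and probabilities here are purely illustrative)::

    x = [28, 21, 23, 28]
    PowerDivergenceTest(x)                                 # H0: all cells equally likely
    PowerDivergenceTest(x, theta0 = [0.3, 0.2, 0.2, 0.3])  # H0: given probabilities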

If ``x`` is a matrix with at least two rows and columns, it is taken as a
two-dimensional contingency table: the entries of ``x`` must be non-negative
integers. Otherwise, ``x`` and ``y`` must be vectors of the same length, and the
contingency table is calculated using ``counts`` from ``StatsBase``. A power
divergence test is then performed of the null hypothesis that the joint
distribution of the cell counts in the 2-dimensional contingency table is the
product of the row and column marginals.
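
For example, a hypothetical test of independence, run either on a contingency
table directly or on two equal-length vectors of integer-coded observations::

    d = [20 15; 10 25]                     # 2x2 contingency table
    PowerDivergenceTest(d)

    x = [1, 2, 1, 1, 2, 2]
    y = [1, 1, 2, 1, 2, 2]
    PowerDivergenceTest(x, y, (1:2, 1:2))  # table built with counts(x, y, levels)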

The power divergence test statistic is given by

.. math::
    \dfrac{2}{\lambda(\lambda+1)}\sum_{i=1}^I \sum_{j=1}^J n_{ij}\left[(n_{ij}/\hat{n}_{ij})^\lambda - 1\right]

where :math:`n_{ij}` is the cell count in the :math:`i`\ th row and :math:`j`\ th column and :math:`\lambda \in \mathbb{R}`.
Note that when :math:`\lambda = 1`, this is equal to Pearson's chi-squared
statistic; as :math:`\lambda \to 0`, it converges to the likelihood ratio test
statistic; as :math:`\lambda \to -1`, it converges to the minimum discrimination
information statistic (Gokhale and Kullback 1978); for :math:`\lambda = -2` it
equals the Neyman modified chi-squared statistic (Neyman 1949); and for
:math:`\lambda = -1/2` it equals the Freeman-Tukey statistic (Freeman and Tukey 1950).
Under regularity conditions, their asymptotic distributions are identical
(see Drost et al. 1989). The chi-squared null approximation works best for
:math:`\lambda` near 2/3.
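
The ``lambda`` keyword selects the member of the family; for instance, with
``d`` a contingency table as above::

    PowerDivergenceTest(d)                 # lambda = 1: Pearson chi-squared
    PowerDivergenceTest(d, lambda = 0.0)   # multinomial likelihood ratio
    PowerDivergenceTest(d, lambda = 2/3)   # Cressie-Read recommendation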

Implements: :ref:`pvalue<pvalue>`, :ref:`ci<ci>`

References:

- Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013.


.. function:: ChisqTest(x[, y][, theta0])

Convenience function for the power divergence test with :math:`\lambda=1`.

.. function:: MultinomialLRT(x[, y][, theta0])

Convenience function for the power divergence test with :math:`\lambda=0`.
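
Both are thin wrappers around ``PowerDivergenceTest``; for a table ``d`` as
above, a hypothetical session::

    ChisqTest(d)       # equivalent to PowerDivergenceTest(d, lambda = 1.0)
    MultinomialLRT(d)  # equivalent to PowerDivergenceTest(d, lambda = 0.0)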
2 changes: 2 additions & 0 deletions src/HypothesisTests.jl
@@ -26,6 +26,7 @@ module HypothesisTests

using Distributions
using Roots
using StatsBase

export testname, pvalue, ci
abstract HypothesisTest
@@ -134,4 +135,5 @@ include("kruskal_wallis.jl")
include("mann_whitney.jl")
include("t.jl")
include("wilcoxon.jl")
include("power_divergence.jl")
end
313 changes: 313 additions & 0 deletions src/power_divergence.jl
@@ -0,0 +1,313 @@
export PowerDivergenceTest, ChisqTest, MultinomialLRT

# clamp a proportion to the interval [0, 1]
function boundproportion{T<:Real}(x::T)
    max(min(x, 1), 0)
end


immutable PowerDivergenceTest <: HypothesisTest
    lambda::Float64
    theta0::Vector{Float64}
    stat::Float64
    df::Int64
    observed::Matrix{Int64}
    n::Int64
    thetahat::Vector{Float64}

    expected::Matrix{Float64}
    residuals::Matrix{Float64}
    stdresiduals::Matrix{Float64}
end

# test name
function testname(x::PowerDivergenceTest)
    if x.lambda == 1
        return("Pearson's Chi-square Test")
    elseif x.lambda == 0
        return("Multinomial Likelihood Ratio Test")
    elseif x.lambda == -1
        return("Minimum Discrimination Information Test")
    elseif x.lambda == -2
        return("Neyman Modified Chi-square Test")
    elseif x.lambda == -.5
        return("Freeman-Tukey Test")
    else
        return("Power Divergence Test")
    end
end

# parameter of interest: name, value under h0, point estimate
population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities", x.theta0, x.thetahat)

# p-value from the asymptotic Chisq(df) null distribution (right tail by default)
pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df), x.stat; tail=tail)

function ci(x::PowerDivergenceTest, alpha::Float64=0.05; tail::Symbol=:both, method::Symbol=:sison_glaz, correct::Bool=true, bootstrap_iters::Int64=10000, GC::Bool=true)
    check_alpha(alpha)

    m = length(x.thetahat)

    if tail == :left
        i = ci(x, alpha*2, method=method, GC=GC)
        (Float64,Float64)[ (0.0, i[j][2]) for j in 1:m]
    elseif tail == :right
        i = ci(x, alpha*2, method=method, GC=GC)
        (Float64,Float64)[ (i[j][1], 1.0) for j in 1:m]
    elseif tail == :both
        if method == :gold
            ci_gold(x, alpha, correct=correct, GC=GC)
        elseif method == :sison_glaz
            ci_sison_glaz(x, alpha, skew_correct=correct)
        elseif method == :quesenberry_hurst
            ci_quesenberry_hurst(x, alpha, GC=GC)
        elseif method == :bootstrap
            ci_bootstrap(x, alpha, bootstrap_iters)
        else
            throw(ArgumentError("method=$(method) is invalid or not implemented yet"))
        end
    else
        throw(ArgumentError("tail=$(tail) is invalid"))
    end
end
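
# Hypothetical usage sketch for the keywords above (illustrative, not part of
# this commit); each call returns a vector of (lower, upper) tuples, one per cell:
#
#   t = PowerDivergenceTest([20 15; 10 25])
#   ci(t)                                       # Sison-Glaz, two-sided, alpha = 0.05
#   ci(t, 0.01, method = :quesenberry_hurst)    # tighter simultaneous level
#   ci(t, method = :bootstrap, bootstrap_iters = 5000)
#   ci(t, tail = :left)                         # one-sided: (0.0, upper) per cell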

# Bootstrap: percentile intervals from Multinomial(n, thetahat) resamples
function ci_bootstrap(x::PowerDivergenceTest, alpha::Float64, iters::Int64)
    m = mapslices(x -> quantile(x, [alpha/2, 1-alpha/2]), rand(Multinomial(x.n, convert(Vector{Float64}, x.thetahat)), iters)/x.n, 2)
    (Float64,Float64)[ (boundproportion(m[i,1]), boundproportion(m[i,2])) for i in 1:length(x.thetahat)]
end

# Quesenberry, Hurst (1964)
function ci_quesenberry_hurst(x::PowerDivergenceTest, alpha::Float64; GC::Bool=true)
    m = length(x.thetahat)
    cv = GC ? quantile(Chisq(1), 1-alpha/m) : quantile(Chisq(m-1), 1-alpha)

    u = (cv + 2*x.observed + sqrt( cv * (cv + 4 * x.observed .* (x.n - x.observed)/x.n)))/( 2*(x.n + cv))
    l = (cv + 2*x.observed - sqrt( cv * (cv + 4 * x.observed .* (x.n - x.observed)/x.n)))/( 2*(x.n + cv))
    (Float64,Float64)[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m]
end

# asymptotic simultaneous confidence interval
# Gold (1963)
function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true, GC::Bool=true)
    m = length(x.thetahat)
    cv = GC ? quantile(Chisq(1), 1-alpha/(2*m)) : quantile(Chisq(m-1), 1-alpha/2)

    u = [ x.thetahat[j] + sqrt(cv * x.thetahat[j]*(1-x.thetahat[j])/x.n) + (correct ? 1/(2*x.n) : 0) for j in 1:m]
    l = [ x.thetahat[j] - sqrt(cv * x.thetahat[j]*(1-x.thetahat[j])/x.n) - (correct ? 1/(2*x.n) : 0) for j in 1:m]
    (Float64,Float64)[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m]
end

# Simultaneous confidence interval
# Sison, Glaz (1995)
# adapted from SAS macro by May, Johnson (2000)
function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Bool=true)
    k = length(x.thetahat)
    probn = 1/pdf(Poisson(x.n), x.n)

    c = 0
    p_old = 0

    for c in 1:x.n
        # run truncpoi
        m = zeros(k,5)
        for i in 1:k
            lambda = x.observed[i]
            # run moments
            a = lambda + c
            b = max(lambda - c, 0)
            poislama = lambda > 0 ? cdf(Poisson(lambda), a) : 1
            poislamb = lambda > 0 ? cdf(Poisson(lambda), b-1) : 1
            den = b > 0 ? poislama - poislamb : poislama

            mu = zeros(4,1)
            for r in 1:4
                poisA = 0
                poisB = 0
                plar = lambda > 0 ? cdf(Poisson(lambda), a-r) : 1
                plbr = lambda > 0 ? cdf(Poisson(lambda), b-r-1) : 1
                poisA = a - r >= 0 ? poislama - plar : poislama
                if b - r - 1 >= 0
                    poisB = poislamb - plbr
                end
                if b - r - 1 < 0 && b-1 >= 0
                    poisB = poislamb
                end
                if b - r - 1 < 0 && b-1 < 0
                    poisB = 0
                end
                mu[r] = (lambda^r) * (1 - (poisA - poisB)/den)
            end
            # end of moments
            m[i,1] = mu[1]
            m[i,2] = mu[2] + mu[1] - mu[1]^2
            m[i,3] = mu[3] + mu[2]*(3 - 3*mu[1]) + (mu[1] - 3*mu[1]^2 + 2*mu[1]^3)
            m[i,4] = mu[4] + mu[3]*(6 - 4*mu[1]) + mu[2]*(7 - 12*mu[1] + 6*mu[1]^2) + mu[1] - 4*mu[1]^2 + 6*mu[1]^3 - 3*mu[1]^4
            m[i,5] = den
        end
        [ m[i,4] = m[i,4] - 3*m[i,2]^2 for i in 1:k]
        s1, s2, s3, s4 = mapslices(sum, m, 1)
        z = (x.n - s1)/sqrt(s2)
        g1 = s3/(s2^(3/2))
        g2 = s4/(s2^2)   # standardized fourth cumulant for the Edgeworth expansion

        poly = 1 + g1*(z^3 - 3*z)/6 + g2*(z^4 - 6*z^2 + 3)/24 + g1^2*(z^6 - 15*z^4 + 45*z^2 - 15)/72
        f = poly*exp(-z^2/2)/(sqrt(2*pi))
        probx = 1

        [ probx *= m[i,5] for i in 1:k]   # accumulate the product of the cell densities

        p = probn*probx*f/sqrt(s2)
        # end of truncpoi

        p > 1-alpha && p_old < 1-alpha ? break : nothing
        p_old = p
    end

    delta = (1 - alpha - p_old)/p_old
    out = zeros(k,5)
    num = zeros(k,1)

    c = c - 1

    vol1 = 1
    vol2 = 1
    for i in 1:k
        num[i,1] = i
        cn = c/x.n
        onen = 1/x.n
        out[i,1] = x.thetahat[i]

        out[i,2] = max(x.thetahat[i] - cn, 0)
        out[i,3] = min(x.thetahat[i] + cn + 2*delta/x.n, 1)

        out[i,4] = max(x.thetahat[i] - cn - onen, 0)
        out[i,5] = min(x.thetahat[i] + cn + onen, 1)
    end
    if skew_correct
        return( (Float64,Float64)[ (boundproportion(out[i,2]), boundproportion(out[i,3])) for i in 1:k] )
    else
        return( (Float64,Float64)[ (boundproportion(out[i,4]), boundproportion(out[i,5])) for i in 1:k] )
    end
end

# power divergence statistic for testing goodness of fit
# Cressie and Read 1984; Read and Cressie 1988
# lambda = 1: Pearson's Chi-square statistic
# lambda -> 0: Converges to Likelihood Ratio test stat
# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale, Kullback 1978)
# lambda = -2: Neyman Modified chi-squared statistic (Neyman 1949)
# lambda = -.5: Freeman-Tukey statistic (Freeman, Tukey 1950)

# Under regularity conditions, their asymptotic distributions are all the same (Drost et al. 1989)
# Chi-squared null approximation works best for lambda near 2/3
function PowerDivergenceTest{T<:Integer, U<:FloatingPoint}(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x))

    nrows, ncols = size(x)
    n = sum(x)

    # validate data
    any(x .< 0) || any(!isfinite(x)) ? error("all entries must be nonnegative and finite") : nothing
    n == 0 ? error("at least one entry must be positive") : nothing
    isfinite(nrows) && isfinite(ncols) && isfinite(nrows*ncols) ? nothing : error("Invalid number of rows or columns")

    df = thetahat = xhat = V = 0

    if nrows > 1 && ncols > 1
        rowsums = mapslices(sum, x, 2)
        colsums = mapslices(sum, x, 1)
        df = (nrows-1)*(ncols-1)
        thetahat = [ x[i]/n for i in 1:length(x)]
        xhat = rowsums * colsums / n
        theta0 = (xhat/n)[:]
        V = Float64[ colsums[j]*rowsums[i]*(n-rowsums[i])*(n-colsums[j])/n^3 for i in 1:nrows, j in 1:ncols]
    elseif nrows == 1 || ncols == 1
        df = length(x) - 1
        xhat = reshape(n * theta0, size(x))
        thetahat = x/n
        V = reshape(n * theta0 .* (1-theta0), size(x))

        abs(1 - sum(theta0)) <= sqrt(eps()) ? nothing : error("Probabilities must sum to one")
    else
        error("Number of rows or columns must be at least 1")
    end

    stat = 0
    if lambda == 0          # likelihood ratio statistic
        for i in 1:length(x)
            stat += x[i]*(log(x[i]) - log(xhat[i]))
        end
        stat *= 2
    elseif lambda == -1     # minimum discrimination information statistic
        for i in 1:length(x)
            stat += xhat[i]*(log(xhat[i]) - log(x[i]))
        end
        stat *= 2
    else                    # general power divergence statistic
        for i in 1:length(x)
            stat += x[i]*( (x[i]/(xhat[i]))^lambda - 1)
        end
        stat *= 2/(lambda*(lambda+1))
    end

    PowerDivergenceTest(lambda, theta0, stat, df, x, n, thetahat[:], xhat, (x - xhat)./sqrt(xhat), (x - xhat)./sqrt(V))
end

# convenience functions

# PDT
function PowerDivergenceTest{T<:Integer, U<:FloatingPoint}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T}); lambda::U=1.0)
    d = counts(x, y, levels)
    PowerDivergenceTest(d, lambda=lambda)
end

function PowerDivergenceTest{T<:Integer, U<:FloatingPoint}(x::AbstractVector{T}, y::AbstractVector{T}, k::T; lambda::U=1.0)
    d = counts(x, y, k)
    PowerDivergenceTest(d, lambda=lambda)
end

PowerDivergenceTest{T<:Integer, U<:FloatingPoint}(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) =
    PowerDivergenceTest(reshape(x, length(x), 1), lambda=lambda, theta0=theta0)

# ChisqTest
function ChisqTest{T<:Integer}(x::AbstractMatrix{T})
    PowerDivergenceTest(x, lambda=1.0)
end

function ChisqTest{T<:Integer}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T}))
    d = counts(x, y, levels)
    PowerDivergenceTest(d, lambda=1.0)
end

function ChisqTest{T<:Integer}(x::AbstractVector{T}, y::AbstractVector{T}, k::T)
    d = counts(x, y, k)
    PowerDivergenceTest(d, lambda=1.0)
end

ChisqTest{T<:Integer, U<:FloatingPoint}(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) =
    PowerDivergenceTest(reshape(x, length(x), 1), lambda=1.0, theta0=theta0)

# MultinomialLRT
function MultinomialLRT{T<:Integer}(x::AbstractMatrix{T})
    PowerDivergenceTest(x, lambda=0.0)
end

function MultinomialLRT{T<:Integer}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T}))
    d = counts(x, y, levels)
    PowerDivergenceTest(d, lambda=0.0)
end

function MultinomialLRT{T<:Integer}(x::AbstractVector{T}, y::AbstractVector{T}, k::T)
    d = counts(x, y, k)
    PowerDivergenceTest(d, lambda=0.0)
end

MultinomialLRT{T<:Integer, U<:FloatingPoint}(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) =
    PowerDivergenceTest(reshape(x, length(x), 1), lambda=0.0, theta0=theta0)

function show_params(io::IO, x::PowerDivergenceTest, ident="")
    println(io, ident, "Sample size: $(x.n)")
    println(io, ident, "statistic: $(x.stat)")
    println(io, ident, "degrees of freedom: $(x.df)")
    println(io, ident, "residuals: $(x.residuals[:])")
    println(io, ident, "std. residuals: $(x.stdresiduals[:])")
end
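
Putting the new API together, a hypothetical end-to-end session against this
commit (Julia 0.3-era syntax) might look like:

    using HypothesisTests

    # one-dimensional goodness-of-fit test against uniform cell probabilities
    t1 = PowerDivergenceTest([28, 21, 23, 28])
    pvalue(t1)

    # two-dimensional test of independence on a 2x2 table
    t2 = ChisqTest([20 15; 10 25])
    pvalue(t2)
    ci(t2)                   # Sison-Glaz simultaneous intervals by default
    ci(t2, method = :gold)   # Gold (1963) asymptotic intervals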

2 comments on commit bfb7401

@bdeonovic (Contributor)


@simonbyrne I forgot to update https://raw.githubusercontent.com/JuliaStats/HypothesisTests.jl/master/doc/parametric.rst to include power_divergence_test; could you update this?

@simonbyrne (Member, Author)


Done. Thanks again for doing all this.
