-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33 from bdeonovic/master
Power Divergence Test
- Loading branch information
Showing
6 changed files
with
530 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
Distributions | ||
Roots | ||
Roots | ||
StatsBase |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
Power Divergence Test | ||
============================================= | ||
|
||
.. function:: PowerDivergenceTest(x [,y] [, lambda] [,theta0] ) | ||
|
||
If ``x`` is a matrix with one row or column, or if ``x`` is a vector and ``y`` is | ||
not given, then a goodness-of-fit test is performed (``x`` is treated as a one- | ||
dimensional contingency table. The entries of ``x`` must be non-negative integers. | ||
In this case, the hypothesis tested is whether the population probabilities equal | ||
those in ``theta0``, or are all equal if ``theta0`` is not given. | ||
|
||
If ``x`` is a matrix with at least two rows and columns, it is taken as a two-dimensional | ||
contigency table: the entries of ``x`` must be non-negative integers. Otherwise, ``x`` | ||
and ``y`` must be vectors of the same length. The contigency table is calculated using | ||
``counts`` from ``Statsbase``. Then the power divergence test is performed of the null | ||
hypothesis that the joint distribution of the cell counts in a 2-dimensional contingency | ||
table is teh product of the row and column marginals. | ||
|
||
The power divergence test is given by | ||
|
||
.. math:: | ||
\dfrac{2}{\lambda(\lambda+1)}\sum_{i=1}^I \sum_{j=1}^J n_{ij}\left[(n_{ij}/\hat{n}_{ij})^\lambda -1\right] | ||
where :math:`n_{ij}` is the cell count in the :math:`i`th row and :math:`j`th column and :math:`\lambda \in \mathbb{R}`. | ||
Note that when :math:`\lambda = 1`, this is equal to Pearson's chi-squared statistic, as :math`\lambda \to 0`, it converges | ||
to the likelihood ratio test statistic, as :math:`\lambda \to -1` it converges to the minimum discrimination information | ||
statistic (Gokhale and Kullback 1978), for :math:`\lambda=-2` it equals Neyman modified chi-squared (Neyman 1949), and for | ||
:math:`\lambda=-1/2` it equals the Freeman-Tukey statistic (Freeman and Tukey 1950). Under regulairty conditions, their | ||
asymptotic distributions are identical (see Drost et. al. 1989). The chis-squared null approximation works best for | ||
:math:`\lambda` near 2/3. | ||
|
||
Implements: :ref:`pvalue<pvalue>`, :ref:`ci<ci>` | ||
|
||
References: | ||
|
||
- Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013. | ||
|
||
|
||
.. function:: ChisqTest(x [,y] [,theta0]) | ||
|
||
Convenience function for power divergence test with :math:`\lambda=1`. | ||
|
||
.. function:: MultinomialLRT(x [,y] [,theta0]) | ||
|
||
Convenience function for power divergence test with :math:`\lambda=0`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,313 @@ | ||
export PowerDivergenceTest, ChisqTest, MultinomialLRT | ||
|
||
function boundproportion{T<:Real}(x::T) | ||
max(min(x,1),0) | ||
end | ||
|
||
|
||
immutable PowerDivergenceTest <: HypothesisTest | ||
lambda::Float64 | ||
theta0::Vector{Float64} | ||
stat::Float64 | ||
df::Int64 | ||
observed::Matrix{Int64} | ||
n::Int64 | ||
thetahat::Vector{Float64} | ||
|
||
expected::Matrix{Float64} | ||
residuals::Matrix{Float64} | ||
stdresiduals::Matrix{Float64} | ||
end | ||
|
||
#test name | ||
function testname(x::PowerDivergenceTest) | ||
if x.lambda == 1 | ||
return("Pearson's Chi-square Test") | ||
elseif x.lambda == 0 | ||
return("Multinomial Likelihood Ratio Test") | ||
elseif x.lambda == -1 | ||
return("Minimum Discrimination Information Test") | ||
elseif x.lambda == -2 | ||
return("Neyman Modified Chi-square Test") | ||
elseif x.lambda == -.5 | ||
return("Freeman-Tukey Test") | ||
else | ||
return("Power Divergence Test") | ||
end | ||
end | ||
|
||
# parameter of interest: name, value under h0, point estimate | ||
population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities", x.theta0, x.thetahat) | ||
|
||
pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df),x.stat; tail=tail) | ||
|
||
function ci(x::PowerDivergenceTest, alpha::Float64=0.05; tail::Symbol=:both, method::Symbol=:sison_glaz, correct::Bool=true, bootstrap_iters::Int64=10000, GC::Bool=true) | ||
check_alpha(alpha) | ||
|
||
m = length(x.thetahat) | ||
|
||
if tail == :left | ||
i = ci(x, alpha*2,method=method, GC=GC) | ||
(Float64,Float64)[ (0.0, i[j][2]) for j in 1:m] | ||
elseif tail == :right | ||
i = ci(x, alpha*2,method=method, GC=GC) | ||
(Float64,Float64)[ (i[j][1], 1.0) for j in 1:m] | ||
elseif tail == :both | ||
if method == :gold | ||
ci_gold(x,alpha,correct=correct,GC=GC) | ||
elseif method == :sison_glaz | ||
ci_sison_glaz(x,alpha, skew_correct=correct) | ||
elseif method == :quesenberry_hurst | ||
ci_quesenberry_hurst(x,alpha,GC=GC) | ||
elseif method == :bootstrap | ||
ci_bootstrap(x,alpha,bootstrap_iters) | ||
else | ||
throw(ArgumentError("method=$(method) is invalid or not implemented yet")) | ||
end | ||
else | ||
throw(ArgumentError("tail=$(tail) is invalid")) | ||
end | ||
end | ||
|
||
# Bootstrap | ||
function ci_bootstrap(x::PowerDivergenceTest,alpha::Float64, iters::Int64) | ||
m = mapslices(x -> quantile(x,[alpha/2,1-alpha/2]), rand(Multinomial(x.n,convert(Vector{Float64},x.thetahat)),iters)/x.n, 2) | ||
(Float64,Float64)[ ( boundproportion(m[i,1]), boundproportion(m[i,2])) for i in 1:length(x.thetahat)] | ||
end | ||
|
||
# Quesenberry, Hurst (1964) | ||
function ci_quesenberry_hurst(x::PowerDivergenceTest,alpha::Float64; GC::Bool=true) | ||
m = length(x.thetahat) | ||
cv = GC ? quantile(Chisq(1),1-alpha/m) : quantile(Chisq(m-1), 1-alpha) | ||
|
||
u = (cv + 2*x.observed + sqrt( cv * (cv + 4 * x.observed .* (x.n - x.observed)/x.n)))/( 2*(x.n + cv)) | ||
l = (cv + 2*x.observed - sqrt( cv * (cv + 4 * x.observed .* (x.n - x.observed)/x.n)))/( 2*(x.n + cv)) | ||
(Float64,Float64)[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m] | ||
end | ||
|
||
# asymptotic simultaneous confidence interval | ||
# Gold (1963) | ||
function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true, GC::Bool=true) | ||
m = length(x.thetahat) | ||
cv = GC ? quantile(Chisq(1), 1-alpha/(2*m)) : quantile(Chisq(m-1), 1-alpha/2) | ||
|
||
u = [ x.thetahat[j] + sqrt(cv * x.thetahat[j]*(1-x.thetahat[j])/x.n) + (correct ? 1/(2*x.n) : 0 ) for j in 1:m] | ||
l = [ x.thetahat[j] - sqrt(cv * x.thetahat[j]*(1-x.thetahat[j])/x.n) - (correct ? 1/(2*x.n) : 0 ) for j in 1:m] | ||
(Float64,Float64)[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m] | ||
end | ||
|
||
# Simultaneous confidence interval | ||
# Sison, Glaz (1995) | ||
# adapted from SAS macro by May, Johnson (2000) | ||
function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Bool=true) | ||
k = length(x.thetahat) | ||
probn = 1/pdf( Poisson(x.n),x.n) | ||
|
||
c = 0 | ||
p_old = 0 | ||
|
||
for c in 1:x.n | ||
#run truncpoi | ||
m = zeros(k,5) | ||
for i in 1:k | ||
lambda = x.observed[i] | ||
#run moments | ||
a = lambda + c | ||
b = max(lambda - c, 0) | ||
poislama = lambda > 0 ? cdf(Poisson(lambda),a) : 1 | ||
poislamb = lambda > 0 ? cdf(Poisson(lambda),b-1) : 1 | ||
den = b > 0 ? poislama-poislamb : poislama | ||
|
||
mu = zeros(4,1) | ||
for r in 1:4 | ||
poisA = 0 | ||
poisB = 0 | ||
plar = lambda > 0 ? cdf(Poisson(lambda),a-r) : 1 | ||
plbr = lambda > 0 ? cdf(Poisson(lambda),b-r-1): 1 | ||
poisA = a - r >= 0 ? poislama-plar : poislama | ||
if b - r - 1 >= 0 | ||
poisB = poislamb - plbr | ||
end | ||
if b - r - 1 < 0 && b-1 >= 0 | ||
poisB = poislamb | ||
end | ||
if b - r - 1 < 0 && b-1 < 0 | ||
poisB = 0 | ||
end | ||
mu[r] = (lambda^r) * (1-(poisA-poisB)/den) | ||
end | ||
# end of moments | ||
m[i,1] = mu[1] | ||
m[i,2] = mu[2] + mu[1] - mu[1]^2 | ||
m[i,3] = mu[3]+mu[2]*(3-3*mu[1])+(mu[1]-3*mu[1]^2+2*mu[1]^3) | ||
m[i,4] = mu[4] + mu[3] * (6-4*mu[1]) + mu[2] * (7-12*mu[1]+6*mu[1]^2) + mu[1]-4*mu[1]^2+6*mu[1]^3-3*mu[1]^4 | ||
m[i,5] = den | ||
end | ||
[ m[i,4] = m[i,4] - 3*m[i,2]^2 for i in 1:k] | ||
s1,s2,s3,s4 = mapslices(sum,m,1) | ||
z = (x.n-s1)/sqrt(s2) | ||
g1 = s3/(s2^(3/2)) | ||
g2 = s4*(s2^2) | ||
|
||
poly = 1 + g1 * (z^3 - 3*z)/6 + g2*(z^4-6*z^2+3)/24 + g1^2 * (z^6 - 15*z^4 + 45*z^2 -15)/72 | ||
f = poly*exp(-z^2/2)/(sqrt(2*pi)) | ||
probx = 1 | ||
|
||
[ probx *= probx*m[i,5] for i in 1:k] | ||
|
||
p = probn*probx*f/sqrt(s2) | ||
# end of truncpoi | ||
|
||
p > 1-alpha && p_old < 1-alpha ? break : nothing | ||
p_old = p | ||
end | ||
|
||
delta = (1-alpha - p_old)/(p_old) | ||
out = zeros(k,5) | ||
num = zeros(k,1) | ||
|
||
c = c-1 | ||
|
||
vol1 = 1 | ||
vol2 = 1 | ||
for i in 1:k | ||
num[i,1] = i | ||
cn = c/x.n | ||
onen = 1/x.n | ||
out[i,1] = x.thetahat[i] | ||
|
||
out[i,2] = max(x.thetahat[i]-cn,0) | ||
out[i,3] = min(x.thetahat[i]+cn+2*delta/x.n,1) | ||
|
||
out[i,4] = max(x.thetahat[i] - cn - onen,0) | ||
out[i,5] = min(x.thetahat[i] + cn + onen,1) | ||
end | ||
if skew_correct | ||
return( (Float64,Float64)[ (boundproportion(out[i,2]),boundproportion(out[i,3])) for i in 1:k] ) | ||
else | ||
return( (Float64,Float64)[ (boundproportion(out[i,4]),boundproportion(out[i,5])) for i in 1:k] ) | ||
end | ||
end | ||
|
||
# power divergence statistic for testing goodness of fit | ||
# Cressie and Read 1984; Read and Cressie 1988 | ||
# lambda = 1: Pearson's Chi-square statistic | ||
# lambda -> 0: Converges to Likelihood Ratio test stat | ||
# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale, Kullback 1978) | ||
# lambda = -2: Neyman Modified chi-squared statistic (Neyman 1949) | ||
# lambda = -.5: Freeman-Tukey statistic (Freeman, Tukey 1950) | ||
|
||
# Under regularity conditions, their asymptotic distributions are all the same (Drost 1989) | ||
# Chi-squared null approximation works best for lambda near 2/3 | ||
function PowerDivergenceTest{T<: Integer, U<: FloatingPoint}(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) | ||
|
||
nrows, ncols = size(x) | ||
n = sum(x) | ||
|
||
#validate date | ||
any( x .< 0) || any( !isfinite(x)) ? error("all entries must be nonnegative and finite") : nothing | ||
n == 0 ? error("at least one entry must be positive") : nothing | ||
isfinite(nrows) && isfinite(ncols) && isfinite(nrows*ncols) ? nothing : error("Invalid number of rows or columns") | ||
|
||
df = thetahat = xhat = V = 0 | ||
|
||
if nrows > 1 && ncols > 1 | ||
rowsums = mapslices(sum, x, 2) | ||
colsums = mapslices(sum, x, 1) | ||
df = (nrows-1)*(ncols-1) | ||
thetahat = [ x[i]/n for i in 1:length(x)] | ||
xhat = rowsums * colsums / n | ||
theta0 = (xhat/n)[:] | ||
V = Float64[ colsums[j]*rowsums[i]*(n-rowsums[i])*(n-colsums[j])/n^3 for i in 1:nrows, j in 1:ncols] | ||
elseif nrows == 1 || ncols == 1 | ||
df = length(x) - 1 | ||
xhat = reshape(n * theta0, size(x)) | ||
thetahat = x/n | ||
V = reshape(n * theta0 .* (1-theta0), size(x)) | ||
|
||
abs( 1 - sum(theta0)) <= sqrt(eps()) ? nothing : error("Probabilities must sum to one") | ||
else | ||
error("Number of rows or columns must be at least 1") | ||
end | ||
|
||
stat = 0 | ||
if lambda == 0 | ||
for i in 1:length(x) | ||
stat += x[i]*(log(x[i]) - log(xhat[i])) | ||
end | ||
stat *= 2 | ||
elseif lambda == -1 | ||
for i in 1:length(x) | ||
stat += xhat[i]*(log(xhat[i]) - log(x[i])) | ||
end | ||
stat *= 2 | ||
else | ||
for i in 1:length(x) | ||
stat += x[i]*( (x[i]/(xhat[i]))^lambda - 1) | ||
end | ||
stat *= 2/(lambda*(lambda+1)) | ||
end | ||
|
||
PowerDivergenceTest(lambda, theta0, stat, df, x, n, thetahat[:], xhat, (x - xhat)./sqrt(xhat), (x-xhat)./sqrt(V)) | ||
end | ||
|
||
#convenience functions | ||
|
||
#PDT | ||
function PowerDivergenceTest{T<: Integer, U<: FloatingPoint}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T}); lambda::U=1.0) | ||
d = counts(x,y,levels) | ||
PowerDivergenceTest(d,lambda=lambda) | ||
end | ||
|
||
function PowerDivergenceTest{T<: Integer, U<: FloatingPoint}(x::AbstractVector{T}, y::AbstractVector{T}, k::T; lambda::U=1.0) | ||
d = counts(x,y,k) | ||
PowerDivergenceTest(d,lambda=lambda) | ||
end | ||
|
||
PowerDivergenceTest{T<: Integer, U<: FloatingPoint}(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) = | ||
PowerDivergenceTest(reshape(x,length(x),1),lambda=lambda,theta0=theta0) | ||
|
||
#ChisqTest | ||
function ChisqTest{T<: Integer}(x::AbstractMatrix{T}) | ||
PowerDivergenceTest(x,lambda=1.0) | ||
end | ||
|
||
function ChisqTest{T<: Integer}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T})) | ||
d = counts(x,y,levels) | ||
PowerDivergenceTest(d,lambda=1.0) | ||
end | ||
|
||
function ChisqTest{T<: Integer}(x::AbstractVector{T}, y::AbstractVector{T}, k::T) | ||
d = counts(x,y,k) | ||
PowerDivergenceTest(d,lambda=1.0) | ||
end | ||
|
||
ChisqTest{T<: Integer, U<: FloatingPoint}(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) = | ||
PowerDivergenceTest(reshape(x,length(x),1), lambda=1.0, theta0 = theta0) | ||
|
||
#MultinomialLRT | ||
function MultinomialLRT{T<: Integer}(x::AbstractMatrix{T}) | ||
PowerDivergenceTest(x,lambda=0.0) | ||
end | ||
|
||
function MultinomialLRT{T<: Integer}(x::AbstractVector{T}, y::AbstractVector{T}, levels::(Range1{T},Range1{T})) | ||
d = counts(x,y,levels) | ||
PowerDivergenceTest(d,lambda=0.0) | ||
end | ||
|
||
function MultinomialLRT{T<: Integer}(x::AbstractVector{T}, y::AbstractVector{T}, k::T) | ||
d = counts(x,y,k) | ||
PowerDivergenceTest(d,lambda=0.0) | ||
end | ||
|
||
MultinomialLRT{T<: Integer, U<: FloatingPoint}(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) = | ||
PowerDivergenceTest(reshape(x,length(x),1), lambda=0.0, theta0 = theta0) | ||
|
||
function show_params(io::IO, x::PowerDivergenceTest, ident="") | ||
println(io,ident, "Sample size: $(x.n)") | ||
println(io,ident, "statistic: $(x.stat)") | ||
println(io,ident, "degrees of freedom: $(x.df)") | ||
println(io,ident, "residuals: $(x.residuals[:])") | ||
println(io,ident, "std. residuals: $(x.stdresiduals[:])") | ||
end | ||
|
Oops, something went wrong.
bfb7401
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@simonbyrne I forgot to update the
https://raw.githubusercontent.com/JuliaStats/HypothesisTests.jl/master/doc/parametric.rst
to includepower_divergence_test
could you update this?bfb7401
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Thanks again for doing all this.