Merge pull request #41 from wildart/adt

Anderson-Darling test

andreasnoack committed Dec 2, 2015
2 parents bbe6143 + 76cd4ba commit b44aa94

Showing 9 changed files with 263 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .travis.yml
@@ -7,6 +7,9 @@ julia:
- nightly
notifications:
email: false
matrix:
allow_failures:
- julia: nightly
script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia -e 'Pkg.clone(pwd()); Pkg.build("HypothesisTests"); Pkg.test("HypothesisTests"; coverage=true)'
1 change: 1 addition & 0 deletions REQUIRE
@@ -3,3 +3,4 @@ Distributions
Roots
StatsBase
Compat
CurveFit
3 changes: 2 additions & 1 deletion doc/nonparametric.rst
@@ -13,4 +13,5 @@ parametric counterparts. The package contains the following nonparametric tests:
nonparametric/test_kruskal_wallis
nonparametric/test_mann_whitney_u
nonparametric/test_sign
nonparametric/test_signed_rank
nonparametric/test_anderson_darling
31 changes: 31 additions & 0 deletions doc/nonparametric/test_anderson_darling.rst
@@ -0,0 +1,31 @@
Anderson–Darling test
=============================================

The null hypothesis of the Anderson–Darling test is that a dataset comes from
a certain distribution; the reference distribution can be specified explicitly
(one-sample test). A k-sample Anderson–Darling test is also available for
testing whether several samples come from a single population, without having
to specify the underlying distribution.

.. function:: OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)

Perform a one-sample Anderson–Darling test of the null hypothesis that the data
in vector ``x`` come from the distribution ``d``, against the alternative
hypothesis that the sample is not drawn from ``d``.

Implements: :ref:`pvalue<pvalue>`
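
For example, a minimal usage sketch (the exact statistic and p-value depend on
the random sample drawn)::

    using HypothesisTests, Distributions

    x = rand(Normal(), 1000)
    t = OneSampleADTest(x, Normal())
    pvalue(t)  # large p-value: no evidence against normality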

.. function:: KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true)

Perform a k-sample Anderson–Darling test of the null hypothesis that the data
in vectors ``xs`` come from the same distribution, against the alternative
hypothesis that the samples come from different distributions.

The ``modified`` parameter selects a modified version of the test statistic
that adjusts the computation for ties in the pooled data.

Implements: :ref:`pvalue<pvalue>`
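
For example, with the four samples used in this package's test suite::

    using HypothesisTests

    samples = Any[
        [38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0],
        [39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8],
        [34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0],
        [34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]]
    t = KSampleADTest(samples...)
    pvalue(t)  # ≈ 0.002: strong evidence that the samples differ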

References:

- F. W. Scholz and M. A. Stephens, "K-Sample Anderson–Darling Tests", Journal of the American Statistical Association, Vol. 82, No. 399 (Sep. 1987), pp. 918–924.
2 changes: 2 additions & 0 deletions src/HypothesisTests.jl
@@ -27,6 +27,7 @@ VERSION >= v"0.4.0-dev+6521" && __precompile__()
module HypothesisTests

using Distributions, Roots, StatsBase, Compat
import CurveFit

export testname, pvalue, ci
abstract HypothesisTest
@@ -136,4 +137,5 @@ include("mann_whitney.jl")
include("t.jl")
include("wilcoxon.jl")
include("power_divergence.jl")
include("anderson_darling.jl")
end
163 changes: 163 additions & 0 deletions src/anderson_darling.jl
@@ -0,0 +1,163 @@
# Anderson-Darling test

export OneSampleADTest, KSampleADTest

abstract ADTest <: HypothesisTest

## ONE SAMPLE AD-TEST
### http://www.itl.nist.gov/div898/handbook/eda/section3/eda35e.htm
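###
### With the sample sorted as y₁ ≤ … ≤ yₙ and standardized by the sample mean
### and SD, so that zᵢ = F((yᵢ - μ)/σ) for the reference CDF F, the statistic is
### A² = -n - Σᵢ (2i-1)/n · [log(zᵢ) + log(1 - zₙ₊₁₋ᵢ)]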

function adstats{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)
n = length(x)
y = sort(x)
μ = mean(y)
σ = std(y)
z = cdf(d, (y-μ)/σ)
i = 1:n
S = ((2*i-1.0)/n) .* (log(z[i])+log(1-z[n+1-i]))
S[isinf(S)] = 0. # remove infinity
A² = -n-sum(S)
(n, μ, σ, A²)
end

immutable OneSampleADTest <: ADTest
n::Int # number of observations
μ::Float64 # sample mean
σ::Float64 # sample std
A²::Float64 # Anderson-Darling test statistic
end

function OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)
OneSampleADTest(adstats(x, d)...)
end

testname(::OneSampleADTest) = "One sample Anderson-Darling test"

function show_params(io::IO, x::OneSampleADTest, ident="")
println(io, ident, "number of observations: $(x.n)")
println(io, ident, "sample mean: $(x.μ)")
println(io, ident, "sample SD: $(x.σ)")
println(io, ident, "A² statistic: $(x.A²)")
end

### Ralph B. D'Agostino, Goodness-of-Fit-Techniques, CRC Press, Jun 2, 1986
### https://books.google.com/books?id=1BSEaGVBj5QC&pg=PA97, p.127
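### The p-value is computed from the adjusted statistic z = A²·(1 + 0.75/n + 2.25/n²)
### via a piecewise polynomial approximation (coefficients tabulated for the
### case of a normal reference distribution with estimated parameters).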
function pvalue(x::OneSampleADTest)
z = x.A²*(1.0 + .75/x.n + 2.25/x.n/x.n)

if z < .200
1.0 - exp(-13.436+101.14z-223.73z^2)
elseif z < .340
1.0 - exp(-8.318+42.796z-59.938z^2)
elseif z < .600
exp(0.9177-4.279z-1.38z^2)
else
exp(1.2937-5.709z+0.0186z^2)
end
end


## K-SAMPLE ANDERSON DARLING TEST
### k-Sample Anderson-Darling Tests, F. W. Scholz; M. A. Stephens, Journal of the American Statistical Association, Vol. 82, No. 399. (Sep., 1987), pp. 918-924.

immutable KSampleADTest <: ADTest
k::Int # number of samples
n::Int # number of observations
σ::Float64 # SD of A²k
A²k::Float64 # Anderson-Darling test statistic
end

function KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true)
KSampleADTest(a2_ksample(xs, modified)...)
end

testname(::KSampleADTest) = "k-sample Anderson-Darling test"

function show_params(io::IO, x::KSampleADTest, ident="")
println(io, ident, "number of samples: $(x.k)")
println(io, ident, "number of observations: $(x.n)")
println(io, ident, "SD of A²k: $(x.σ)")
println(io, ident, "A²k statistic: $(x.A²k)")
end

function pvalue(x::KSampleADTest)
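# Standardize the statistic as Tk = (A²k - m)/σ with m = k - 1, fit a quadratic
# of log(α) to the tabulated critical values tm = b0 + b1/√m + b2/m at the five
# significance levels in `sig`, and evaluate the fit at Tk (Scholz & Stephens).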
m = x.k - 1
Tk = (x.A²k - m) / x.σ
sig = [0.25, 0.1, 0.05, 0.025, 0.01]
b0 = [0.675, 1.281, 1.645, 1.96, 2.326]
b1 = [-0.245, 0.25, 0.678, 1.149, 1.822]
b2 = [-0.105, -0.305, -0.362, -0.391, -0.396]
tm = b0 + b1 / sqrt(m) + b2 / m
f = CurveFit.poly_fit(tm, log(sig), 2)
exp(f[1] + f[2]*Tk + f[3]*Tk^2)
end

function a2_ksample(samples, modified=true)
k = length(samples)
k < 2 && error("Need at least two samples")

n = map(length, samples)
Z = sort(vcat(samples...))
N = length(Z)
Z⁺ = unique(Z)
L = length(Z⁺)

L < 2 && error("Need more than 1 observation")
minimum(n) == 0 && error("One of the samples is empty")

# fij[i, j]: number of observations in sample i equal to the j-th distinct value Z⁺[j]
fij = zeros(Int, k, L)
for i in 1:k
for s in samples[i]
fij[i, searchsortedfirst(Z⁺, s)] += 1
end
end

A²k = 0.
if modified
# tie-adjusted statistic: counts are reduced by half the ties at each distinct value
for i in 1:k
inner = 0.
Mij = 0.
Bj = 0.
for j = 1:L
lj = sum(fij[:,j])
Mij += fij[i, j]
Bj += lj
Maij = Mij - fij[i, j]/2.
Baj = Bj - lj/2.
inner += lj/N * (N*Maij-n[i]*Baj)^2 / (Baj*(N-Baj) - N*lj/4.)
end
A²k += inner / n[i]
end
A²k *= (N - 1.) / N
else
# unmodified statistic (derived for a pooled sample without ties)
for i in 1:k
inner = 0.
Mij = 0.
Bj = 0.
for j = 1:L-1
lj = sum(fij[:,j])
Mij += fij[i, j]
Bj += lj
inner += lj/N * (N*Mij-n[i]*Bj)^2 / (Bj*(N-Bj))
end
A²k += inner / n[i]
end
end

# helper sums for the variance (Scholz & Stephens, 1987): H = Σᵢ 1/nᵢ, h = Σⱼ₌₁ᴺ⁻¹ 1/j
H = sum(map(i->1./i, n))
h = sum(1./(1:N-1))
g = 0.
for i in 1:N-2
for j in i+1:N-1
g += 1. / ((N - i) * j)
end
end

# exact variance of A²k under the null hypothesis (Scholz & Stephens, 1987)
a = (4*g - 6)*(k - 1) + (10 - 6*g)*H
b = (2*g - 4)*k^2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
c = (6*h + 2*g - 2)*k^2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
d = (2*h + 6)*k^2 - 4*h*k
σ²n = (a*N^3 + b*N^2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))

(k, N, sqrt(σ²n), A²k)
end
50 changes: 50 additions & 0 deletions test/anderson_darling.jl
@@ -0,0 +1,50 @@
using HypothesisTests, Distributions, Base.Test

# One sample test
n = 1000
srand(1984948)

x = rand(Normal(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 0.2013 0.1^4
@test_approx_eq_eps pvalue(t) 0.8811 0.1^4

x = rand(DoubleExponential(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 10.7439 0.1^4
@test_approx_eq_eps pvalue(t) 0.0 0.1^4

x = rand(Cauchy(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 278.0190 0.1^4

x = rand(LogNormal(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 85.5217 0.1^4

# k-sample test
samples = Any[
[38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0],
[39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8],
[34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0],
[34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]
]

t = KSampleADTest(samples...)
@test_approx_eq_eps t.A²k 8.3926 0.1^4
@test_approx_eq_eps t.σ 1.2038 0.1^4
@test_approx_eq_eps pvalue(t) 0.0020 0.1^4

t = KSampleADTest(samples..., modified = false)
@test_approx_eq_eps t.A²k 8.3559 0.1^4
@test_approx_eq_eps t.σ 1.2038 0.1^4
@test_approx_eq_eps pvalue(t) 0.0021 0.1^4

srand(31412455)
samples = Any[rand(Normal(), 50), rand(Normal(0.5), 30)]
t = KSampleADTest(samples...)
@test pvalue(t) < 0.05

samples = Any[rand(Normal(), 50), rand(Normal(), 30), rand(Normal(), 20)]
t = KSampleADTest(samples...)
@test pvalue(t) > 0.05
9 changes: 9 additions & 0 deletions test/common.jl
@@ -0,0 +1,9 @@
using HypothesisTests, Base.Test

@test_throws DimensionMismatch HypothesisTests.check_same_length([1], [])
@test_throws ArgumentError HypothesisTests.check_alpha(0.0)

type TestTest <: HypothesisTests.HypothesisTest end
result = HypothesisTests.population_param_of_interest(TestTest())
@test result[1] == "not implemented yet"
@test isnan(result[2])
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -8,3 +8,5 @@ include("mann_whitney.jl")
include("t.jl")
include("wilcoxon.jl")
include("power_divergence.jl")
include("anderson_darling.jl")
include("common.jl")
