Merge pull request #41 from wildart/adt

Anderson-Darling test

andreasnoack committed Dec 2, 2015
2 parents bbe6143 + 76cd4ba commit b44aa94

Showing 9 changed files with 263 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .travis.yml
@@ -7,6 +7,9 @@ julia:
- nightly
notifications:
email: false
matrix:
allow_failures:
- julia: nightly
script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia -e 'Pkg.clone(pwd()); Pkg.build("HypothesisTests"); Pkg.test("HypothesisTests"; coverage=true)'
1 change: 1 addition & 0 deletions REQUIRE
@@ -3,3 +3,4 @@ Distributions
Roots
StatsBase
Compat
CurveFit
3 changes: 2 additions & 1 deletion doc/nonparametric.rst
@@ -13,4 +13,5 @@ parametric counterparts. The package contains the following nonparametric tests:
nonparametric/test_kruskal_wallis
nonparametric/test_mann_whitney_u
nonparametric/test_sign
nonparametric/test_signed_rank
nonparametric/test_anderson_darling
31 changes: 31 additions & 0 deletions doc/nonparametric/test_anderson_darling.rst
@@ -0,0 +1,31 @@
Anderson–Darling test
=============================================

The null hypothesis of the Anderson–Darling test is that a dataset comes from
a certain distribution; the reference distribution can be specified explicitly
(one-sample test). A k-sample Anderson–Darling test is also available for
testing whether several samples come from a single population, without having
to specify the underlying distribution.

.. function:: OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)

Perform a one-sample Anderson–Darling test of the null hypothesis that the data
in vector ``x`` come from the distribution ``d``, against the alternative
hypothesis that the sample is not drawn from ``d``.

Implements: :ref:`pvalue<pvalue>`
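
For example, a minimal usage sketch (the exact statistic and p-value depend on
the random sample drawn)::

    using HypothesisTests, Distributions

    x = rand(Normal(), 1000)
    t = OneSampleADTest(x, Normal())
    pvalue(t)  # large p-value: no evidence against normality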

.. function:: KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true)

Perform a k-sample Anderson–Darling test of the null hypothesis that the data
in vectors ``xs`` come from the same distribution, against the alternative
hypothesis that the samples come from different distributions.

The ``modified`` parameter selects a modified version of the test statistic
that adjusts the computation for ties in the pooled data.

Implements: :ref:`pvalue<pvalue>`
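
For example, with the four samples used in this package's test suite::

    using HypothesisTests

    samples = Any[
        [38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0],
        [39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8],
        [34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0],
        [34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]]
    t = KSampleADTest(samples...)
    pvalue(t)  # ≈ 0.002: strong evidence that the samples differ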

References:

- F. W. Scholz and M. A. Stephens, "K-Sample Anderson–Darling Tests", Journal of the American Statistical Association, Vol. 82, No. 399 (Sep. 1987), pp. 918–924.
2 changes: 2 additions & 0 deletions src/HypothesisTests.jl
@@ -27,6 +27,7 @@ VERSION >= v"0.4.0-dev+6521" && __precompile__()
module HypothesisTests

using Distributions, Roots, StatsBase, Compat
import CurveFit

export testname, pvalue, ci
abstract HypothesisTest
@@ -136,4 +137,5 @@ include("mann_whitney.jl")
include("t.jl")
include("wilcoxon.jl")
include("power_divergence.jl")
include("anderson_darling.jl")
end
163 changes: 163 additions & 0 deletions src/anderson_darling.jl
@@ -0,0 +1,163 @@
# Anderson-Darling test

export OneSampleADTest, KSampleADTest

abstract ADTest <: HypothesisTest

## ONE SAMPLE AD-TEST
### http://www.itl.nist.gov/div898/handbook/eda/section3/eda35e.htm
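###
### With the sample sorted as y₁ ≤ … ≤ yₙ and standardized by the sample mean
### and SD, so that zᵢ = F((yᵢ - μ)/σ) for the reference CDF F, the statistic is
### A² = -n - Σᵢ (2i-1)/n · [log(zᵢ) + log(1 - zₙ₊₁₋ᵢ)]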

function adstats{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)
n = length(x)
y = sort(x)
μ = mean(y)
σ = std(y)
z = cdf(d, (y-μ)/σ)
i = 1:n
S = ((2*i-1.0)/n) .* (log(z[i])+log(1-z[n+1-i]))
S[isinf(S)] = 0. # remove infinity
A² = -n-sum(S)
(n, μ, σ, A²)
end

immutable OneSampleADTest <: ADTest
n::Int # number of observations
μ::Float64 # sample mean
σ::Float64 # sample std
A²::Float64 # Anderson-Darling test statistic
end

function OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution)
OneSampleADTest(adstats(x, d)...)
end

testname(::OneSampleADTest) = "One sample Anderson-Darling test"

function show_params(io::IO, x::OneSampleADTest, ident="")
println(io, ident, "number of observations: $(x.n)")
println(io, ident, "sample mean: $(x.μ)")
println(io, ident, "sample SD: $(x.σ)")
println(io, ident, "A² statistic: $(x.A²)")
end

### Ralph B. D'Agostino, Goodness-of-Fit-Techniques, CRC Press, Jun 2, 1986
### https://books.google.com/books?id=1BSEaGVBj5QC&pg=PA97, p.127
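### The p-value is computed from the adjusted statistic z = A²·(1 + 0.75/n + 2.25/n²)
### via a piecewise polynomial approximation (coefficients tabulated for the
### case of a normal reference distribution with estimated parameters).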
function pvalue(x::OneSampleADTest)
z = x.A²*(1.0 + .75/x.n + 2.25/x.n/x.n)

if z < .200
1.0 - exp(-13.436+101.14z-223.73z^2)
elseif z < .340
1.0 - exp(-8.318+42.796z-59.938z^2)
elseif z < .600
exp(0.9177-4.279z-1.38z^2)
else
exp(1.2937-5.709z+0.0186z^2)
end
end


## K-SAMPLE ANDERSON DARLING TEST
### k-Sample Anderson-Darling Tests, F. W. Scholz; M. A. Stephens, Journal of the American Statistical Association, Vol. 82, No. 399. (Sep., 1987), pp. 918-924.

immutable KSampleADTest <: ADTest
k::Int # number of samples
n::Int # number of observations
σ::Float64 # SD of A²k
A²k::Float64 # Anderson-Darling test statistic
end

function KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true)
KSampleADTest(a2_ksample(xs, modified)...)
end

testname(::KSampleADTest) = "k-sample Anderson-Darling test"

function show_params(io::IO, x::KSampleADTest, ident="")
println(io, ident, "number of samples: $(x.k)")
println(io, ident, "number of observations: $(x.n)")
println(io, ident, "SD of A²k: $(x.σ)")
println(io, ident, "A²k statistic: $(x.A²k)")
end

function pvalue(x::KSampleADTest)
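# Standardize the statistic as Tk = (A²k - m)/σ with m = k - 1, fit a quadratic
# of log(α) to the tabulated critical values tm = b0 + b1/√m + b2/m at the five
# significance levels in `sig`, and evaluate the fit at Tk (Scholz & Stephens).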
m = x.k - 1
Tk = (x.A²k - m) / x.σ
sig = [0.25, 0.1, 0.05, 0.025, 0.01]
b0 = [0.675, 1.281, 1.645, 1.96, 2.326]
b1 = [-0.245, 0.25, 0.678, 1.149, 1.822]
b2 = [-0.105, -0.305, -0.362, -0.391, -0.396]
tm = b0 + b1 / sqrt(m) + b2 / m
f = CurveFit.poly_fit(tm, log(sig), 2)
exp(f[1] + f[2]*Tk + f[3]*Tk^2)
end

function a2_ksample(samples, modified=true)
k = length(samples)
k < 2 && error("Need at least two samples")

n = map(length, samples)
Z = sort(vcat(samples...))
N = length(Z)
Z⁺ = unique(Z)
L = length(Z⁺)

L < 2 && error("Need more than 1 observation")
minimum(n) == 0 && error("One of the samples is empty")

# fij[i, j]: number of observations in sample i equal to the j-th distinct value Z⁺[j]
fij = zeros(Int, k, L)
for i in 1:k
for s in samples[i]
fij[i, searchsortedfirst(Z⁺, s)] += 1
end
end

A²k = 0.
if modified
# tie-adjusted statistic: counts are reduced by half the ties at each distinct value
for i in 1:k
inner = 0.
Mij = 0.
Bj = 0.
for j = 1:L
lj = sum(fij[:,j])
Mij += fij[i, j]
Bj += lj
Maij = Mij - fij[i, j]/2.
Baj = Bj - lj/2.
inner += lj/N * (N*Maij-n[i]*Baj)^2 / (Baj*(N-Baj) - N*lj/4.)
end
A²k += inner / n[i]
end
A²k *= (N - 1.) / N
else
# unmodified statistic (derived for a pooled sample without ties)
for i in 1:k
inner = 0.
Mij = 0.
Bj = 0.
for j = 1:L-1
lj = sum(fij[:,j])
Mij += fij[i, j]
Bj += lj
inner += lj/N * (N*Mij-n[i]*Bj)^2 / (Bj*(N-Bj))
end
A²k += inner / n[i]
end
end

# helper sums for the variance (Scholz & Stephens, 1987): H = Σᵢ 1/nᵢ, h = Σⱼ₌₁ᴺ⁻¹ 1/j
H = sum(map(i->1./i, n))
h = sum(1./(1:N-1))
g = 0.
for i in 1:N-2
for j in i+1:N-1
g += 1. / ((N - i) * j)
end
end

# exact variance of A²k under the null hypothesis (Scholz & Stephens, 1987)
a = (4*g - 6)*(k - 1) + (10 - 6*g)*H
b = (2*g - 4)*k^2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
c = (6*h + 2*g - 2)*k^2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
d = (2*h + 6)*k^2 - 4*h*k
σ²n = (a*N^3 + b*N^2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))

(k, N, sqrt(σ²n), A²k)
end
50 changes: 50 additions & 0 deletions test/anderson_darling.jl
@@ -0,0 +1,50 @@
using HypothesisTests, Distributions, Base.Test

# One sample test
n = 1000
srand(1984948)

x = rand(Normal(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 0.2013 0.1^4
@test_approx_eq_eps pvalue(t) 0.8811 0.1^4

x = rand(DoubleExponential(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 10.7439 0.1^4
@test_approx_eq_eps pvalue(t) 0.0 0.1^4

x = rand(Cauchy(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 278.0190 0.1^4

x = rand(LogNormal(), n)
t = OneSampleADTest(x, Normal())
@test_approx_eq_eps t.A² 85.5217 0.1^4

# k-sample test
samples = Any[
[38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0],
[39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8],
[34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0],
[34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8]
]

t = KSampleADTest(samples...)
@test_approx_eq_eps t.A²k 8.3926 0.1^4
@test_approx_eq_eps t.σ 1.2038 0.1^4
@test_approx_eq_eps pvalue(t) 0.0020 0.1^4

t = KSampleADTest(samples..., modified = false)
@test_approx_eq_eps t.A²k 8.3559 0.1^4
@test_approx_eq_eps t.σ 1.2038 0.1^4
@test_approx_eq_eps pvalue(t) 0.0021 0.1^4

srand(31412455)
samples = Any[rand(Normal(), 50), rand(Normal(0.5), 30)]
t = KSampleADTest(samples...)
@test pvalue(t) < 0.05

samples = Any[rand(Normal(), 50), rand(Normal(), 30), rand(Normal(), 20)]
t = KSampleADTest(samples...)
@test pvalue(t) > 0.05
9 changes: 9 additions & 0 deletions test/common.jl
@@ -0,0 +1,9 @@
using HypothesisTests, Base.Test

@test_throws DimensionMismatch HypothesisTests.check_same_length([1], [])
@test_throws ArgumentError HypothesisTests.check_alpha(0.0)

type TestTest <: HypothesisTests.HypothesisTest end
result = HypothesisTests.population_param_of_interest(TestTest())
@test result[1] == "not implemented yet"
@test isnan(result[2])
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -8,3 +8,5 @@ include("mann_whitney.jl")
include("t.jl")
include("wilcoxon.jl")
include("power_divergence.jl")
include("anderson_darling.jl")
include("common.jl")
