-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from wildart/adt
Anderson-Darling test
- Loading branch information
Showing
9 changed files
with
263 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,3 +3,4 @@ Distributions | |
Roots | ||
StatsBase | ||
Compat | ||
CurveFit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Anderson–Darling test | ||
============================================= | ||
|
||
The null hypothesis of the Anderson–Darling test is that a dataset comes from | ||
a certain distribution; the reference distribution can be specified explicitely | ||
(one-sample test). K-sample Anderson–Darling tests are available for testing whether | ||
several samples are coming from a single population drawn from the distribution function | ||
which does not have to be specified. | ||
|
||
.. function:: OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution) | ||
|
||
Perform a one sample Anderson–Darling test of the null hypothesis that the data | ||
in vector ``x`` comes from the distribution ``d`` against | ||
the alternative hypothesis that the sample is not drawn from ``d``. | ||
|
||
Implements: :ref:`pvalue<pvalue>` | ||
|
||
.. function:: KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true) | ||
|
||
Perform an k-sample Anderson–Darling test of the null hypothesis that the data | ||
in vectors ``xs`` comes from the same distribution against the alternative | ||
hypothesis that the samples comes from different distributions. | ||
|
||
``modified`` paramater enables a modified test calculation for samples whose | ||
observations do not all coincide. | ||
|
||
Implements: :ref:`pvalue<pvalue>` | ||
|
||
References: | ||
|
||
- k-Sample Anderson-Darling Tests, F. W. Scholz and M. A. Stephens, Journal of the American Statistical Association, Vol. 82, No. 399. (Sep., 1987), pp. 918-924. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
# Anderson-Darling test | ||
|
||
export OneSampleADTest, KSampleADTest | ||
|
||
abstract ADTest <: HypothesisTest | ||
|
||
## ONE SAMPLE AD-TEST | ||
### http://www.itl.nist.gov/div898/handbook/eda/section3/eda35e.htm | ||
|
||
function adstats{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution) | ||
n = length(x) | ||
y = sort(x) | ||
μ = mean(y) | ||
σ = std(y) | ||
z = cdf(d, (y-μ)/σ) | ||
i = 1:n | ||
S = ((2*i-1.0)/n) .* (log(z[i])+log(1-z[n+1-i])) | ||
S[isinf(S)] = 0. # remove infinity | ||
A² = -n-sum(S) | ||
(n, μ, σ, A²) | ||
end | ||
|
||
immutable OneSampleADTest <: ADTest | ||
n::Int # number of observations | ||
μ::Float64 # sample mean | ||
σ::Float64 # sample std | ||
A²::Float64 # Anderson-Darling test statistic | ||
end | ||
|
||
function OneSampleADTest{T<:Real}(x::AbstractVector{T}, d::UnivariateDistribution) | ||
OneSampleADTest(adstats(x, d)...) | ||
end | ||
|
||
testname(::OneSampleADTest) = "One sample Anderson-Darling test" | ||
|
||
function show_params(io::IO, x::OneSampleADTest, ident="") | ||
println(io, ident, "number of observations: $(x.n)") | ||
println(io, ident, "sample mean: $(x.μ)") | ||
println(io, ident, "sample SD: $(x.σ)") | ||
println(io, ident, "A² statistic: $(x.A²)") | ||
end | ||
|
||
### Ralph B. D'Agostino, Goodness-of-Fit-Techniques, CRC Press, Jun 2, 1986 | ||
### https://books.google.com/books?id=1BSEaGVBj5QC&pg=PA97, p.127 | ||
function pvalue(x::OneSampleADTest) | ||
z = x.A²*(1.0 + .75/x.n + 2.25/x.n/x.n) | ||
|
||
if z < .200 | ||
1.0 - exp(-13.436+101.14z-223.73z^2) | ||
elseif .200 < z < .340 | ||
1.0 - exp(-8.318+42.796z-59.938z^2) | ||
elseif .340 < z < .600 | ||
exp(0.9177-4.279z-1.38z^2) | ||
else | ||
exp(1.2937-5.709z+0.0186z^2) | ||
end | ||
end | ||
|
||
|
||
## K-SAMPLE ANDERSON DARLING TEST | ||
### k-Sample Anderson-Darling Tests, F. W. Scholz; M. A. Stephens, Journal of the American Statistical Association, Vol. 82, No. 399. (Sep., 1987), pp. 918-924. | ||
|
||
immutable KSampleADTest <: ADTest | ||
k::Int # number of samples | ||
n::Int # number of observations | ||
σ::Float64 # variance A²k | ||
A²k::Float64 # Anderson-Darling test statistic | ||
end | ||
|
||
function KSampleADTest{T<:Real}(xs::AbstractVector{T}...; modified=true) | ||
KSampleADTest(a2_ksample(xs, modified)...) | ||
end | ||
|
||
testname(::KSampleADTest) = "k-sample Anderson-Darling test" | ||
|
||
function show_params(io::IO, x::KSampleADTest, ident="") | ||
println(io, ident, "number of samples: $(x.k)") | ||
println(io, ident, "number of observations: $(x.n)") | ||
println(io, ident, "SD of A²k: $(x.σ)") | ||
println(io, ident, "A²k statistic: $(x.A²k)") | ||
end | ||
|
||
function pvalue(x::KSampleADTest) | ||
m = x.k - 1 | ||
Tk = (x.A²k - m) / x.σ | ||
sig = [0.25, 0.1, 0.05, 0.025, 0.01] | ||
b0 = [0.675, 1.281, 1.645, 1.96, 2.326] | ||
b1 = [-0.245, 0.25, 0.678, 1.149, 1.822] | ||
b2 = [-0.105, -0.305, -0.362, -0.391, -0.396] | ||
tm = b0 + b1 / sqrt(m) + b2 / m | ||
f = CurveFit.poly_fit(tm, log(sig), 2) | ||
exp(f[1] + f[2]*Tk + f[3]*Tk^2) | ||
end | ||
|
||
function a2_ksample(samples, modified=true) | ||
k = length(samples) | ||
k < 2 && error("Need at least two samples") | ||
|
||
n = map(length, samples) | ||
Z = sort(vcat(samples...)) | ||
N = length(Z) | ||
Z⁺ = unique(Z) | ||
L = length(Z⁺) | ||
|
||
L < 2 && error("Need more then 1 observation") | ||
minimum(n) == 0 && error("One of the samples is empty") | ||
|
||
fij = zeros(Int, k, L) | ||
for i in 1:k | ||
for s in samples[i] | ||
fij[i, searchsortedfirst(Z⁺, s)] += 1 | ||
end | ||
end | ||
|
||
A²k = 0. | ||
if modified | ||
for i in 1:k | ||
inner = 0. | ||
Mij = 0. | ||
Bj = 0. | ||
for j = 1:L | ||
lj = sum(fij[:,j]) | ||
Mij += fij[i, j] | ||
Bj += lj | ||
Maij = Mij - fij[i, j]/2. | ||
Baj = Bj - lj/2. | ||
inner += lj/N * (N*Maij-n[i]*Baj)^2 / (Baj*(N-Baj) - N*lj/4.) | ||
end | ||
A²k += inner / n[i] | ||
end | ||
A²k *= (N - 1.) / N | ||
else | ||
for i in 1:k | ||
inner = 0. | ||
Mij = 0. | ||
Bj = 0. | ||
for j = 1:L-1 | ||
lj = sum(fij[:,j]) | ||
Mij += fij[i, j] | ||
Bj += lj | ||
inner += lj/N * (N*Mij-n[i]*Bj)^2 / (Bj*(N-Bj)) | ||
end | ||
A²k += inner / n[i] | ||
end | ||
end | ||
|
||
H = sum(map(i->1./i, n)) | ||
h = sum(1./(1:N-1)) | ||
g = 0. | ||
for i in 1:N-2 | ||
for j in i+1:N-1 | ||
g += 1. / ((N - i) * j) | ||
end | ||
end | ||
|
||
a = (4*g - 6)*(k - 1) + (10 - 6*g)*H | ||
b = (2*g - 4)*k^2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6 | ||
c = (6*h + 2*g - 2)*k^2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h | ||
d = (2*h + 6)*k^2 - 4*h*k | ||
σ²n = (a*N^3 + b*N^2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.)) | ||
|
||
(k, N, sqrt(σ²n), A²k) | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
using HypothesisTests, Distributions, Base.Test | ||
|
||
# One sample test | ||
n = 1000 | ||
srand(1984948) | ||
|
||
x = rand(Normal(), n) | ||
t = OneSampleADTest(x, Normal()) | ||
@test_approx_eq_eps t.A² 0.2013 0.1^4 | ||
@test_approx_eq_eps pvalue(t) 0.8811 0.1^4 | ||
|
||
x = rand(DoubleExponential(), n) | ||
t = OneSampleADTest(x, Normal()) | ||
@test_approx_eq_eps t.A² 10.7439 0.1^4 | ||
@test_approx_eq_eps pvalue(t) 0.0 0.1^4 | ||
|
||
x = rand(Cauchy(), n) | ||
t = OneSampleADTest(x, Normal()) | ||
@test_approx_eq_eps t.A² 278.0190 0.1^4 | ||
|
||
x = rand(LogNormal(), n) | ||
t = OneSampleADTest(x, Normal()) | ||
@test_approx_eq_eps t.A² 85.5217 0.1^4 | ||
|
||
# k-sample test | ||
samples = Any[ | ||
[38.7, 41.5, 43.8, 44.5, 45.5, 46.0, 47.7, 58.0], | ||
[39.2, 39.3, 39.7, 41.4, 41.8, 42.9, 43.3, 45.8], | ||
[34.0, 35.0, 39.0, 40.0, 43.0, 43.0, 44.0, 45.0], | ||
[34.0, 34.8, 34.8, 35.4, 37.2, 37.8, 41.2, 42.8] | ||
] | ||
|
||
t = KSampleADTest(samples...) | ||
@test_approx_eq_eps t.A²k 8.3926 0.1^4 | ||
@test_approx_eq_eps t.σ 1.2038 0.1^4 | ||
@test_approx_eq_eps pvalue(t) 0.0020 0.1^4 | ||
|
||
t = KSampleADTest(samples..., modified = false) | ||
@test_approx_eq_eps t.A²k 8.3559 0.1^4 | ||
@test_approx_eq_eps t.σ 1.2038 0.1^4 | ||
@test_approx_eq_eps pvalue(t) 0.0021 0.1^4 | ||
|
||
srand(31412455) | ||
samples = Any[rand(Normal(), 50), rand(Normal(0.5), 30)] | ||
t = KSampleADTest(samples...) | ||
@test pvalue(t) < 0.05 | ||
|
||
samples = Any[rand(Normal(), 50), rand(Normal(), 30), rand(Normal(), 20)] | ||
t = KSampleADTest(samples...) | ||
@test pvalue(t) > 0.05 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
using HypothesisTests, Base.Test | ||
|
||
@test_throws DimensionMismatch HypothesisTests.check_same_length([1], []) | ||
@test_throws ArgumentError HypothesisTests.check_alpha(0.0) | ||
|
||
type TestTest <: HypothesisTests.HypothesisTest end | ||
result = HypothesisTests.population_param_of_interest(TestTest()) | ||
@test result[1] == "not implemented yet" | ||
@test isnan(result[2]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters