From 8508cb167cab4f2e66addef336ccdec664d9c862 Mon Sep 17 00:00:00 2001 From: Marius Pille Date: Mon, 17 Jul 2023 15:25:05 +0200 Subject: [PATCH 1/3] Add SpearmanCorrelationTest --- src/correlation.jl | 75 +++++++++++++++++++++++++++++++++++++++------ test/correlation.jl | 24 +++++++++++++++ 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/src/correlation.jl b/src/correlation.jl index 40db405a..26cf7f56 100644 --- a/src/correlation.jl +++ b/src/correlation.jl @@ -2,8 +2,11 @@ # TODO: Implement `CorrelationTest` for two arguments using Statistics: clampcor +using StatsBase: corspearman -export CorrelationTest +export CorrelationTest, SpearmanCorrelationTest + +abstract type AbstractCorrelationTest <: HypothesisTest end """ CorrelationTest(x, y) @@ -17,6 +20,7 @@ Perform a t-test for the hypothesis that ``\\text{Cor}(x,y|Z=z) = 0``, i.e. the correlation of vectors `x` and `y` given the matrix `Z` is zero. Implements `pvalue` for the t-test. + Implements `confint` using an approximate confidence interval based on Fisher's ``z``-transform. @@ -29,7 +33,7 @@ See also `partialcor` from StatsBase. * [Section testing using Student's t-distribution from Pearson correlation coefficient on Wikipedia](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Testing_using_Student's_t-distribution) * [K.J. Levy and S.C. Narula (1978): Testing Hypotheses concerning Partial Correlations: Some Methods and Discussion. International Statistical Review 46(2).](https://www.jstor.org/stable/1402814) """ -struct CorrelationTest{T<:Real} <: HypothesisTest +struct CorrelationTest{T<:Real} <: AbstractCorrelationTest r::T n::Int k::Int @@ -60,15 +64,15 @@ struct CorrelationTest{T<:Real} <: HypothesisTest end testname(p::CorrelationTest) = - string("Test for nonzero ", p.k != 0 ? "partial " : "", "correlation") + string("Test for nonzero ", p.k != 0 ? "partial " : "", "Pearson correlation") function population_param_of_interest(p::CorrelationTest) - param = p.k != 0 ? "Partial correlation" : "Correlation" + param = p.k != 0 ? "Partial Pearson correlation" : "Pearson correlation" (param, zero(p.r), p.r) end -StatsAPI.nobs(p::CorrelationTest) = p.n -StatsAPI.dof(p::CorrelationTest) = p.n - 2 - p.k +StatsAPI.nobs(p::AbstractCorrelationTest) = p.n +StatsAPI.dof(p::AbstractCorrelationTest) = p.n - 2 - p.k function StatsAPI.confint(test::CorrelationTest{T}, level::Float64=0.95) where T dof(test) > 1 || return (-one(T), one(T)) # Otherwise we can get NaNs @@ -80,12 +84,65 @@ function StatsAPI.confint(test::CorrelationTest{T}, level::Float64=0.95) where T return (elo, ehi) end -default_tail(::CorrelationTest) = :both -StatsAPI.pvalue(test::CorrelationTest; tail=:both) = pvalue(TDist(dof(test)), test.t, tail=tail) +default_tail(::AbstractCorrelationTest) = :both +StatsAPI.pvalue(test::AbstractCorrelationTest; tail=:both) = pvalue(TDist(dof(test)), test.t, tail=tail) -function show_params(io::IO, test::CorrelationTest, indent="") +function show_params(io::IO, test::AbstractCorrelationTest, indent="") println(io, indent, "number of observations: ", nobs(test)) println(io, indent, "number of conditional variables: ", test.k) println(io, indent, "t-statistic: ", test.t) println(io, indent, "degrees of freedom: ", dof(test)) end + +""" + SpearmanCorrelationTest(x, y) + +Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the rank-based Spearman correlation +of vectors `x` and `y` is zero. + +Implements `pvalue` for the t-test. + +Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation and which performs insufficient in the case of: + +* small sample sizes n < 25 +* a high true population Spearman correlation + +In these cases a bootstrap confidence interval can perform better [2]. + +# External resources +[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating pearson, kendall and spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183. + +[2] A. J. Bishara and J. B. Hittner, “Confidence intervals for correlations when data are not normal,” Behav Res, vol. 49, no. 1, pp. 294–309, Feb. 2017, doi: 10.3758/s13428-016-0702-8. + +""" +struct SpearmanCorrelationTest{T<:Real} <: AbstractCorrelationTest + r::T + n::Int + k::Int + t::T + + # Error checking is done in `corspearman` + function SpearmanCorrelationTest(x::AbstractVector, y::AbstractVector) + r = corspearman(x, y) + n = length(x) + t = r * sqrt((n - 2) / (1 - r^2)) + return new{typeof(r)}(r, n, 0, t) + end +end + +testname(p::SpearmanCorrelationTest) = "Spearman correlation" + +function population_param_of_interest(p::SpearmanCorrelationTest) + param = "Spearman Correlation" + (param, zero(p.r), p.r) +end + +function StatsAPI.confint(test::SpearmanCorrelationTest{T}, level::Float64=0.95) where T + dof(test) > 1 || return (-one(T), one(T)) # Otherwise we can get NaNs + q = quantile(Normal(), 1 - (1-level) / 2) + fisher = atanh(test.r) + bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q # Estimates variance as in Bonnet et al. (2000) + elo = clampcor(tanh(fisher - bound)) + ehi = clampcor(tanh(fisher + bound)) + return (elo, ehi) +end \ No newline at end of file diff --git a/test/correlation.jl b/test/correlation.jl index a8c10c3e..5116b1cb 100644 --- a/test/correlation.jl +++ b/test/correlation.jl @@ -26,6 +26,30 @@ using DelimitedFiles @test pvalue(x) ≈ 0.2948405 atol=1e-6 end +@testset "SpearmanCorrelation" begin + # Columns are line number, calcium, iron + nutrient = readdlm(joinpath(@__DIR__, "data", "nutrient.txt"))[:, 1:3] + w = SpearmanCorrelationTest(nutrient[:,2], nutrient[:,3]) + let out = sprint(show, w) + @test occursin("reject h_0", out) && !occursin("fail to", out) + end + # let ci = confint(w) + # @test first(ci) ≈ 0.3327138 atol=1e-6 + # @test last(ci) ≈ 0.4546639 atol=1e-6 + # end + @test nobs(w) == 737 + @test dof(w) == 735 + @test pvalue(w) < 1e-25 + + x = SpearmanCorrelationTest(nutrient[:,1], nutrient[:,2]) + @test occursin("fail to reject", sprint(show, x)) + # let ci = confint(x) + # @test first(ci) ≈ -0.1105478 atol=1e-6 + # @test last(ci) ≈ 0.0336730 atol=1e-6 + # end + @test pvalue(x) ≈ 0.09275 atol=1e-2 # value from R's cor.test(..., method="spearman") which does not use a t test algorithm AS 89 +end + @testset "Partial correlation" begin # Columns are information, similarities, arithmetic, picture completion wechsler = readdlm(joinpath(@__DIR__, "data", "wechsler.txt"))[:,2:end] From 88699006436402bad342cd430789c3aabc64f34b Mon Sep 17 00:00:00 2001 From: Marius Pille Date: Mon, 17 Jul 2023 15:35:25 +0200 Subject: [PATCH 2/3] Typo --- src/correlation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/correlation.jl b/src/correlation.jl index 26cf7f56..2ee2d801 100644 --- a/src/correlation.jl +++ b/src/correlation.jl @@ -133,7 +133,7 @@ end testname(p::SpearmanCorrelationTest) = "Spearman correlation" function population_param_of_interest(p::SpearmanCorrelationTest) - param = "Spearman Correlation" + param = "Spearman correlation" (param, zero(p.r), p.r) end From ed3313b7359ee3e9e1a3dfbb9734e754e1379549 Mon Sep 17 00:00:00 2001 From: Marius Pille Date: Tue, 1 Aug 2023 13:50:27 +0200 Subject: [PATCH 3/3] incorporate review --- docs/src/multivariate.md | 1 + src/correlation.jl | 37 ++++++++++++++++++++++++------------- test/correlation.jl | 27 ++++++++++++++++++--------- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/docs/src/multivariate.md b/docs/src/multivariate.md index 33422902..4e519b12 100644 --- a/docs/src/multivariate.md +++ b/docs/src/multivariate.md @@ -21,4 +21,5 @@ BartlettTest ```@docs CorrelationTest +SpearmanCorrelationTest ``` diff --git a/src/correlation.jl b/src/correlation.jl index 2ee2d801..7a50f16e 100644 --- a/src/correlation.jl +++ b/src/correlation.jl @@ -11,8 +11,8 @@ abstract type AbstractCorrelationTest <: HypothesisTest end """ CorrelationTest(x, y) -Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the correlation -of vectors `x` and `y` is zero. +Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the Pearson +correlation of vectors `x` and `y` is zero. CorrelationTest(x, y, Z) @@ -97,23 +97,34 @@ end """ SpearmanCorrelationTest(x, y) -Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the rank-based Spearman correlation -of vectors `x` and `y` is zero. +Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the Spearman rank +correlation ρₛ of vectors `x` and `y` is zero. Implements `pvalue` for the t-test. -Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation and which performs insufficient in the case of: +Implements `confint` using an approximate confidence interval adjusting for the +non-normality of the ranks based on [1]. This is still an approximation, which performs +insufficiently in the case of: -* small sample sizes n < 25 -* a high true population Spearman correlation +* sample sizes below 25 +* a true population Spearman correlation |ρₛ| above 0.95 +* ordinal data -In these cases a bootstrap confidence interval can perform better [2]. +In these cases a bootstrap confidence interval can perform better [2,3]. # External resources -[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating pearson, kendall and spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183. - -[2] A. J. Bishara and J. B. Hittner, “Confidence intervals for correlations when data are not normal,” Behav Res, vol. 49, no. 1, pp. 294–309, Feb. 2017, doi: 10.3758/s13428-016-0702-8. - +[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating Pearson, +Kendall and Spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, +doi: 10.1007/BF02294183. + +[2] A. J. Bishara and J. B. Hittner, “Confidence intervals for correlations when data are +not normal,” Behav Res, vol. 49, no. 1, pp. 294–309, Feb. 2017, +doi: 10.3758/s13428-016-0702-8. + +[3] J. Ruscio, “Constructing Confidence Intervals for Spearman’s Rank Correlation with +Ordinal Data: A Simulation Study Comparing Analytic and Bootstrap Methods,” +J. Mod. App. Stat. Meth., vol. 7, no. 2, pp. 416–434, Nov. 2008, +doi: 10.22237/jmasm/1225512360 """ struct SpearmanCorrelationTest{T<:Real} <: AbstractCorrelationTest r::T @@ -130,7 +141,7 @@ struct SpearmanCorrelationTest{T<:Real} <: AbstractCorrelationTest end end -testname(p::SpearmanCorrelationTest) = "Spearman correlation" +testname(p::SpearmanCorrelationTest) = "Spearman correlation" function population_param_of_interest(p::SpearmanCorrelationTest) param = "Spearman correlation" diff --git a/test/correlation.jl b/test/correlation.jl index 5116b1cb..4080247a 100644 --- a/test/correlation.jl +++ b/test/correlation.jl @@ -33,21 +33,30 @@ end let out = sprint(show, w) @test occursin("reject h_0", out) && !occursin("fail to", out) end - # let ci = confint(w) - # @test first(ci) ≈ 0.3327138 atol=1e-6 - # @test last(ci) ≈ 0.4546639 atol=1e-6 - # end + let ci = confint(w) + # Compare against values from R using library(spearmanCI) + @test first(ci) ≈ 0.3938044 atol=1e-3 + @test last(ci) ≈ 0.5178866 atol=1e-2 + # Test for consistency against our results + @test first(ci) ≈ 0.3934359 atol=1e-6 + @test last(ci) ≈ 0.5137945 atol=1e-6 + end @test nobs(w) == 737 @test dof(w) == 735 @test pvalue(w) < 1e-25 x = SpearmanCorrelationTest(nutrient[:,1], nutrient[:,2]) @test occursin("fail to reject", sprint(show, x)) - # let ci = confint(x) - # @test first(ci) ≈ -0.1105478 atol=1e-6 - # @test last(ci) ≈ 0.0336730 atol=1e-6 - # end - @test pvalue(x) ≈ 0.09275 atol=1e-2 # value from R's cor.test(..., method="spearman") which does not use a t test algorithm AS 89 + let ci = confint(x) + # Compare against values from R using library(spearmanCI) + @test first(ci) ≈ -0.1359917 atol=1e-2 + @test last(ci) ≈ 0.01197175 atol=1e-2 + # Test for consistency against our results + @test first(ci) ≈ -0.1333692 atol=1e-6 + @test last(ci) ≈ 0.01065576 atol=1e-6 + end + @test pvalue(x) ≈ 0.09274721 atol=1e-2 # value from R's cor.test(..., method="spearman") which does not use a t test algorithm AS 89 + @test pvalue(x) ≈ 0.09429494 atol=1e-6 # Test for consistency against our results end @testset "Partial correlation" begin