From 47a4457694d78d401816546b34c4aad1be4314a5 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Tue, 26 Jan 2021 12:14:24 +0000 Subject: [PATCH 01/23] Rewrote corkendall (issue 634) New version of corkendall is approx 4 times faster if both arguments are vectors and 7 times faster if at least one is a matrix. See issue 634 for details. --- src/rankcorr.jl | 238 ++++++++++++++++++++++++++++++----------------- test/rankcorr.jl | 57 ++++++++++-- 2 files changed, 206 insertions(+), 89 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index ea8041058..2ed9feb82 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -28,71 +28,76 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) ####################################### -# +# # Kendall correlation -# +# ####################################### -# Knight JASA (1966) - -function corkendall!(x::RealVector, y::RealVector) +# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” +# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. +# JSTOR, www.jstor.org/stable/2282833. Accessed 15 Jan. 2021. +function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) - if n != length(y) error("Vectors must have same length") end + if n ≠ length(y) error("Vectors must have same length") end # Initial sorting - pm = sortperm(y) - x[:] = x[pm] - y[:] = y[pm] - pm[:] = sortperm(x) - x[:] = x[pm] - - # Counting ties in x and y - iT = 1 - nT = 0 - iU = 1 - nU = 0 - for i = 2:n - if x[i] == x[i-1] - iT += 1 - else - nT += iT*(iT - 1) - iT = 1 - end - if y[i] == y[i-1] - iU += 1 - else - nU += iU*(iU - 1) - iU = 1 + x[:] = x[permx] + y[:] = y[permx] + + npairs = float(n) * (n - 1)/ 2 + #ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit + ntiesx, ntiesy, ndoubleties, k, nswaps = 0.0, 0.0, 0.0, 0, 0 + + @inbounds for i ∈ 2:n + if x[i - 1] == x[i] + k += 1 + elseif k > 0 + # Sort the corresponding chunk of y, so the rows of hcat(x,y) are + # sorted first on x, then (where x values are tied) on y. Hence + # double ties can be counted by calling countties. + sort!(view(y, (i - k - 1):(i - 1))) + ntiesx += float(k) * (k + 1)/2 + ndoubleties += countties(y, i - k - 1, i - 1) + k = 0 end end - if iT > 1 nT += iT*(iT - 1) end - nT = div(nT,2) - if iU > 1 nU += iU*(iU - 1) end - nU = div(nU,2) - - # Sort y after x - y[:] = y[pm] - - # Calculate double ties - iV = 1 - nV = 0 - jV = 1 - for i = 2:n - if x[i] == x[i-1] && y[i] == y[i-1] - iV += 1 - else - nV += iV*(iV - 1) - iV = 1 - end + if k > 0 + sort!(view(y, ((n - k):n))) + ntiesx += float(k) * (k + 1)/ 2 + ndoubleties += countties(y, n - k, n) end - if iV > 1 nV += iV*(iV - 1) end - nV = div(nV,2) - nD = div(n*(n - 1),2) - return (nD - nT - nU + nV - 2swaps!(y)) / (sqrt(nD - nT) * sqrt(nD - nU)) + nswaps = mergesort!(y, 1, n) + ntiesy = countties(y, 1, n) + + (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt((npairs - ntiesx) * (npairs - ntiesy)) end +""" + countties(x::RealVector,lo::Int64,hi::Int64) + +Assumes `x` is sorted. Returns the number of ties within `x[lo:hi]`. +""" +function countties(x::AbstractVector, lo::Integer, hi::Integer) + #avoid ovorflows on 32 bit by using floats + thistiecount, result = 0.0, 0.0 + (lo < 1 || hi > length(x)) && error("Bounds error") + @inbounds for i ∈ (lo + 1):hi + if x[i] == x[i - 1] + thistiecount += 1.0 + elseif thistiecount > 0 + result += thistiecount * (thistiecount + 1)/2 + thistiecount = 0.0 + end + end + + if thistiecount > 0 + result += thistiecount * (thistiecount + 1)/2 + end + result +end """ corkendall(x, y=x) @@ -102,19 +107,31 @@ matrices or vectors. """ corkendall(x::RealVector, y::RealVector) = corkendall!(float(copy(x)), float(copy(y))) -corkendall(X::RealMatrix, y::RealVector) = Float64[corkendall!(float(X[:,i]), float(copy(y))) for i in 1:size(X, 2)] - -corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y,2); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i])) for i in 1:n], 1, n)) +corkendall(X::RealMatrix, y::RealVector) = (permy = sortperm(y);Float64[corkendall!(float(copy(y)), float(X[:,i]), permy) for i in 1:size(X, 2)]) -corkendall(X::RealMatrix, Y::RealMatrix) = Float64[corkendall!(float(X[:,i]), float(Y[:,j])) for i in 1:size(X, 2), j in 1:size(Y, 2)] +corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y, 2); permx = sortperm(x); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i]), permx) for i in 1:n], 1, n)) function corkendall(X::RealMatrix) n = size(X, 2) - C = Matrix{eltype(X)}(I, n, n) - for j = 2:n - for i = 1:j-1 - C[i,j] = corkendall!(X[:,i],X[:,j]) - C[j,i] = C[i,j] + C = ones(float(eltype(X)), n, n)# avoids dependency on LinearAlgebra + @inbounds for j ∈ 2:n + permx = sortperm(X[:,j]) + for i ∈ 1:j - 1 + C[j,i] = corkendall!(X[:,j], X[:,i], permx) + C[i,j] = C[j,i] + end + end + return C +end + +function corkendall(X::RealMatrix, Y::RealMatrix) + nr = size(X, 2) + nc = size(Y, 2) + C = zeros(float(eltype(X)), nr, nc) + @inbounds for j ∈ 1:nr + permx = sortperm(X[:,j]) + for i ∈ 1:nc + C[j,i] = corkendall!(X[:,j], Y[:,i], permx) end end return C @@ -122,32 +139,87 @@ end # Auxilliary functions for Kendall's rank correlation -function swaps!(x::RealVector) - n = length(x) - if n == 1 return 0 end - n2 = div(n, 2) - xl = view(x, 1:n2) - xr = view(x, n2+1:n) - nsl = swaps!(xl) - nsr = swaps!(xr) - sort!(xl) - sort!(xr) - return nsl + nsr + mswaps(xl,xr) -end +# Tests appeared to show that a value of 64 is optimal, +# but note that the equivalent constant in base/sort.jl is 20. +const SMALL_THRESHOLD = 64 -function mswaps(x::RealVector, y::RealVector) - i = 1 - j = 1 - nSwaps = 0 - n = length(x) - while i <= n && j <= length(y) - if y[j] < x[i] - nSwaps += n - i + 1 +# Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 +""" + mergesort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + +Mutates `v` by sorting elements `x[lo:hi]` using the mergesort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on MergeSortAlg), +but amended to return the bubblesort distance. +""" +function mergesort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + #avoid overflow errors (if length(v)> 2^16) on 32 bit by using float + nswaps = 0.0 + @inbounds if lo < hi + hi - lo <= SMALL_THRESHOLD && return insertionsort!(v, lo, hi) + + m = midpoint(lo, hi) + (length(t) < m - lo + 1) && resize!(t, m - lo + 1) + + nswaps = mergesort!(v, lo, m, t) + nswaps += mergesort!(v, m + 1, hi, t) + + i, j = 1, lo + while j <= m + t[i] = v[j] + i += 1 j += 1 - else + end + + i, k = 1, lo + while k < j <= hi + if v[j] < t[i] + v[k] = v[j] + j += 1 + nswaps += m - lo + 1 - (i - 1) + else + v[k] = t[i] + i += 1 + end + k += 1 + end + while k < j + v[k] = t[i] + k += 1 i += 1 end end - return nSwaps + return nswaps end +# This implementation of `midpoint` is performance-optimized but safe only if `lo <= hi`. +# This function is also copied from base/sort.jl +midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) +midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) + +# Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 +""" + insertionsort!(v::AbstractVector, lo::Integer, hi::Integer) + +Mutates `v` by sorting elements `x[lo:hi]` using the insertionsort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on InsertionSortAlg), +amended to return the bubblesort distance. +""" +function insertionsort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi return 0 end + nswaps = 0.0 + @inbounds for i = lo + 1:hi + j = i + x = v[i] + while j > lo + if x < v[j - 1] + nswaps += 1.0 + v[j] = v[j - 1] + j -= 1 + continue + end + break + end + v[j] = x + end + return nswaps +end \ No newline at end of file diff --git a/test/rankcorr.jl b/test/rankcorr.jl index e5505c2a0..4c46de9f8 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -26,17 +26,62 @@ c22 = corspearman(x2, x2) # corkendall -@test corkendall(x1, y) ≈ -0.105409255338946 -@test corkendall(x2, y) ≈ -0.117851130197758 +# corkendall + +@test corkendall(x1, y) == -1 / sqrt(90) +@test corkendall(x2, y) == -1 / sqrt(72) +@test corkendall(X, y) == [-1 / sqrt(90), -1 / sqrt(72)] +@test corkendall(y, X) == [-1 / sqrt(90) -1 / sqrt(72)] -@test corkendall(X, y) ≈ [-0.105409255338946, -0.117851130197758] -@test corkendall(y, X) ≈ [-0.105409255338946 -0.117851130197758] +n = 100_000 # so vectors tested are 5 times this length +@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1 / sqrt(90) +@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1 / sqrt(72) +@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1 / sqrt(90), -1 / sqrt(72)] +@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1 / sqrt(90) -1 / sqrt(72)] c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) c22 = corkendall(x2, x2) -@test c11 ≈ 1.0 -@test c22 ≈ 1.0 +@test c11 == 1.0 +@test c22 == 1.0 +@test c12 == 6 / sqrt(80) + @test corkendall(X, X) ≈ [c11 c12; c12 c22] @test corkendall(X) ≈ [c11 c12; c12 c22] + +@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] +@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] + + +@test corkendall(collect(1:100_000), collect(1:100_000)) == 1.0 +@test corkendall(collect(1:100_000), reverse(collect(1:100_000))) == -1.0 + +@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 + +z = [1 1 1; + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + +@test corkendall(z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(z, z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(z[:,1], z) == [1 0 1 / 3] +@test corkendall(z, z[:,1]) == [1;0;1 / 3] + +z = float(z) +@test corkendall(z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(z, z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(z[:,1], z) == [1 0 1 / 3] +@test corkendall(z, z[:,1]) == [1;0;1 / 3] + +w = repeat(z, n) +@test corkendall(w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(w, w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] +@test corkendall(w[:,1], w) == [1 0 1 / 3] +@test corkendall(w, w[:,1]) == [1;0;1 / 3] + From e58ebe7210a1894e26ac5e3c4b5648821af94b7a Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Wed, 27 Jan 2021 11:32:16 +0000 Subject: [PATCH 02/23] fixed type instability in isort! --- src/rankcorr.jl | 41 ++++++++++++++++++++--------------------- test/rankcorr.jl | 5 +---- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 2ed9feb82..dd260de32 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -45,8 +45,8 @@ function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) x[:] = x[permx] y[:] = y[permx] - npairs = float(n) * (n - 1)/ 2 - #ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit + npairs = float(n) * (n - 1) / 2 + # ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit ntiesx, ntiesy, ndoubleties, k, nswaps = 0.0, 0.0, 0.0, 0, 0 @inbounds for i ∈ 2:n @@ -57,18 +57,18 @@ function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) # sorted first on x, then (where x values are tied) on y. Hence # double ties can be counted by calling countties. sort!(view(y, (i - k - 1):(i - 1))) - ntiesx += float(k) * (k + 1)/2 + ntiesx += float(k) * (k + 1) / 2 ndoubleties += countties(y, i - k - 1, i - 1) k = 0 end end if k > 0 sort!(view(y, ((n - k):n))) - ntiesx += float(k) * (k + 1)/ 2 + ntiesx += float(k) * (k + 1) / 2 ndoubleties += countties(y, n - k, n) end - nswaps = mergesort!(y, 1, n) + nswaps = msort!(y, 1, n) ntiesy = countties(y, 1, n) (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / @@ -81,20 +81,20 @@ end Assumes `x` is sorted. Returns the number of ties within `x[lo:hi]`. """ function countties(x::AbstractVector, lo::Integer, hi::Integer) - #avoid ovorflows on 32 bit by using floats + # avoid overflows on 32 bit by using floats thistiecount, result = 0.0, 0.0 (lo < 1 || hi > length(x)) && error("Bounds error") @inbounds for i ∈ (lo + 1):hi if x[i] == x[i - 1] thistiecount += 1.0 elseif thistiecount > 0 - result += thistiecount * (thistiecount + 1)/2 + result += thistiecount * (thistiecount + 1) / 2 thistiecount = 0.0 end end if thistiecount > 0 - result += thistiecount * (thistiecount + 1)/2 + result += thistiecount * (thistiecount + 1) / 2 end result end @@ -139,29 +139,29 @@ end # Auxilliary functions for Kendall's rank correlation -# Tests appeared to show that a value of 64 is optimal, +# Tests appear to show that a value of 64 is optimal, # but note that the equivalent constant in base/sort.jl is 20. const SMALL_THRESHOLD = 64 # Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 """ - mergesort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) -Mutates `v` by sorting elements `x[lo:hi]` using the mergesort algorithm. +Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on MergeSortAlg), but amended to return the bubblesort distance. """ -function mergesort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) - #avoid overflow errors (if length(v)> 2^16) on 32 bit by using float +function msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + # avoid overflow errors (if length(v)> 2^16) on 32 bit by using float nswaps = 0.0 @inbounds if lo < hi - hi - lo <= SMALL_THRESHOLD && return insertionsort!(v, lo, hi) + hi - lo <= SMALL_THRESHOLD && return isort!(v, lo, hi) m = midpoint(lo, hi) (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - nswaps = mergesort!(v, lo, m, t) - nswaps += mergesort!(v, m + 1, hi, t) + nswaps = msort!(v, lo, m, t) + nswaps += msort!(v, m + 1, hi, t) i, j = 1, lo while j <= m @@ -191,21 +191,20 @@ function mergesort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0) return nswaps end -# This implementation of `midpoint` is performance-optimized but safe only if `lo <= hi`. # This function is also copied from base/sort.jl midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) # Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 """ - insertionsort!(v::AbstractVector, lo::Integer, hi::Integer) + isort!(v::AbstractVector, lo::Integer, hi::Integer) -Mutates `v` by sorting elements `x[lo:hi]` using the insertionsort algorithm. +Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on InsertionSortAlg), amended to return the bubblesort distance. """ -function insertionsort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi return 0 end +function isort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi return 0.0 end nswaps = 0.0 @inbounds for i = lo + 1:hi j = i diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 4c46de9f8..b3e33faae 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -24,8 +24,6 @@ c22 = corspearman(x2, x2) @test corspearman(X) ≈ [c11 c12; c12 c22] -# corkendall - # corkendall @test corkendall(x1, y) == -1 / sqrt(90) @@ -83,5 +81,4 @@ w = repeat(z, n) @test corkendall(w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] @test corkendall(w, w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] @test corkendall(w[:,1], w) == [1 0 1 / 3] -@test corkendall(w, w[:,1]) == [1;0;1 / 3] - +@test corkendall(w, w[:,1]) == [1;0;1 / 3] \ No newline at end of file From 2fff9f2318a5de070d09d734695e5632bdcf1690 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Wed, 27 Jan 2021 14:50:52 +0000 Subject: [PATCH 03/23] Guidelines say "it is generally preferred to use ASCII operators and identifiers over Unicode equivalents whenever possible" --- src/rankcorr.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index dd260de32..14b747646 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -39,7 +39,7 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) - if n ≠ length(y) error("Vectors must have same length") end + if n != length(y) error("Vectors must have same length") end # Initial sorting x[:] = x[permx] @@ -49,7 +49,7 @@ function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) # ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit ntiesx, ntiesy, ndoubleties, k, nswaps = 0.0, 0.0, 0.0, 0, 0 - @inbounds for i ∈ 2:n + @inbounds for i = 2:n if x[i - 1] == x[i] k += 1 elseif k > 0 @@ -84,7 +84,7 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) # avoid overflows on 32 bit by using floats thistiecount, result = 0.0, 0.0 (lo < 1 || hi > length(x)) && error("Bounds error") - @inbounds for i ∈ (lo + 1):hi + @inbounds for i = (lo + 1):hi if x[i] == x[i - 1] thistiecount += 1.0 elseif thistiecount > 0 @@ -114,9 +114,9 @@ corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y, 2); permx = sortperm(x); function corkendall(X::RealMatrix) n = size(X, 2) C = ones(float(eltype(X)), n, n)# avoids dependency on LinearAlgebra - @inbounds for j ∈ 2:n + @inbounds for j = 2:n permx = sortperm(X[:,j]) - for i ∈ 1:j - 1 + for i = 1:j - 1 C[j,i] = corkendall!(X[:,j], X[:,i], permx) C[i,j] = C[j,i] end @@ -128,9 +128,9 @@ function corkendall(X::RealMatrix, Y::RealMatrix) nr = size(X, 2) nc = size(Y, 2) C = zeros(float(eltype(X)), nr, nc) - @inbounds for j ∈ 1:nr + @inbounds for j = 1:nr permx = sortperm(X[:,j]) - for i ∈ 1:nc + for i = 1:nc C[j,i] = corkendall!(X[:,j], Y[:,i], permx) end end From f4ff6da452d8ce9d17e388abc46e52f0fda8692b Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Wed, 27 Jan 2021 15:59:00 +0000 Subject: [PATCH 04/23] added test on unequal length vectors --- test/rankcorr.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index b3e33faae..f7c5c0c99 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -26,6 +26,8 @@ c22 = corspearman(x2, x2) # corkendall +@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4],[1,2,3]) + @test corkendall(x1, y) == -1 / sqrt(90) @test corkendall(x2, y) == -1 / sqrt(72) @test corkendall(X, y) == [-1 / sqrt(90), -1 / sqrt(72)] From c8672a3689027897abd3d209e5d9cdcd506d1f05 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Wed, 27 Jan 2021 16:46:11 +0000 Subject: [PATCH 05/23] added tests --- test/rankcorr.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index f7c5c0c99..fc7acf9c2 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -26,14 +26,16 @@ c22 = corspearman(x2, x2) # corkendall -@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4],[1,2,3]) +@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) +@test isnan(corkendall([1,2], [3,NaN])) +@test isnan(corkendall([1,1,1], [1,2,3])) @test corkendall(x1, y) == -1 / sqrt(90) @test corkendall(x2, y) == -1 / sqrt(72) @test corkendall(X, y) == [-1 / sqrt(90), -1 / sqrt(72)] @test corkendall(y, X) == [-1 / sqrt(90) -1 / sqrt(72)] -n = 100_000 # so vectors tested are 5 times this length +n = 100_000 @test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1 / sqrt(90) @test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1 / sqrt(72) @test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1 / sqrt(90), -1 / sqrt(72)] @@ -45,17 +47,17 @@ c22 = corkendall(x2, x2) @test c11 == 1.0 @test c22 == 1.0 -@test c12 == 6 / sqrt(80) +@test c12 == 3 / sqrt(20) @test corkendall(X, X) ≈ [c11 c12; c12 c22] @test corkendall(X) ≈ [c11 c12; c12 c22] @test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] -@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] +@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] - -@test corkendall(collect(1:100_000), collect(1:100_000)) == 1.0 -@test corkendall(collect(1:100_000), reverse(collect(1:100_000))) == -1.0 +@test corkendall(collect(1:n), collect(1:n)) == 1.0 +@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 +@test isnan(corkendall(repeat([1], n), collect(1:n))) @test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 From 8ed9553aaf15c28aa859f0f4aeb7acc487c2222c Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:55:17 +0000 Subject: [PATCH 06/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 14b747646..cf0c27956 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -35,7 +35,7 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) # Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” # Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. -# JSTOR, www.jstor.org/stable/2282833. Accessed 15 Jan. 2021. +# JSTOR, www.jstor.org/stable/2282833. function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) @@ -221,4 +221,4 @@ function isort!(v::AbstractVector, lo::Integer, hi::Integer) v[j] = x end return nswaps -end \ No newline at end of file +end From d17295fcbc0b7a0506758b18ab0b1924c168ae72 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:55:48 +0000 Subject: [PATCH 07/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index cf0c27956..1537d9c27 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -36,7 +36,7 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) # Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” # Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. # JSTOR, www.jstor.org/stable/2282833. -function corkendall!(x::RealVector, y::RealVector, permx=sortperm(x)) +function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integer}=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) if n != length(y) error("Vectors must have same length") end From 9e80325da9966a18f4c63f86461d83454723c652 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:02 +0000 Subject: [PATCH 08/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 1537d9c27..a50216865 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -42,8 +42,8 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ if n != length(y) error("Vectors must have same length") end # Initial sorting - x[:] = x[permx] - y[:] = y[permx] + permute!(x, permx) + permute!(y, permx) npairs = float(n) * (n - 1) / 2 # ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit From dbb3298d37eb3e0c009de94c2894305866c457bc Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:42 +0000 Subject: [PATCH 09/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index a50216865..f883547cf 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -76,7 +76,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ end """ - countties(x::RealVector,lo::Int64,hi::Int64) + countties(x::RealVector, lo::Integer, hi::Integer) Assumes `x` is sorted. Returns the number of ties within `x[lo:hi]`. """ From c31d7eada7e4bc28ec67140f47de954123799345 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:59 +0000 Subject: [PATCH 10/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index f883547cf..77d777ac3 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -78,7 +78,7 @@ end """ countties(x::RealVector, lo::Integer, hi::Integer) -Assumes `x` is sorted. Returns the number of ties within `x[lo:hi]`. +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. """ function countties(x::AbstractVector, lo::Integer, hi::Integer) # avoid overflows on 32 bit by using floats From 56cb219ff3c66ebdcc65465dcc53c7b824cd79e8 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Sun, 31 Jan 2021 18:44:31 +0000 Subject: [PATCH 11/23] changes following review by nalimilan --- src/rankcorr.jl | 32 ++++++++++++------------ test/rankcorr.jl | 64 +++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 77d777ac3..4e2ecae00 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -1,13 +1,13 @@ # Rank-based correlations -# +# # - Spearman's correlation # - Kendall's correlation -# +# ####################################### -# +# # Spearman correlation -# +# ####################################### """ @@ -28,9 +28,9 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) ####################################### -# +# # Kendall correlation -# +# ####################################### # Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” @@ -47,7 +47,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ npairs = float(n) * (n - 1) / 2 # ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit - ntiesx, ntiesy, ndoubleties, k, nswaps = 0.0, 0.0, 0.0, 0, 0 + ntiesx, ntiesy, ndoubleties, k, nswaps = widen(0), widen(0), widen(0), widen(0), widen(0) @inbounds for i = 2:n if x[i - 1] == x[i] @@ -57,14 +57,14 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # sorted first on x, then (where x values are tied) on y. Hence # double ties can be counted by calling countties. sort!(view(y, (i - k - 1):(i - 1))) - ntiesx += float(k) * (k + 1) / 2 + ntiesx += div(k * (k + 1), 2) ndoubleties += countties(y, i - k - 1, i - 1) k = 0 end end if k > 0 sort!(view(y, ((n - k):n))) - ntiesx += float(k) * (k + 1) / 2 + ntiesx += div(k * (k + 1), 2) ndoubleties += countties(y, n - k, n) end @@ -72,7 +72,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ ntiesy = countties(y, 1, n) (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / - sqrt((npairs - ntiesx) * (npairs - ntiesy)) + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) end """ @@ -81,20 +81,20 @@ end Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. """ function countties(x::AbstractVector, lo::Integer, hi::Integer) - # avoid overflows on 32 bit by using floats - thistiecount, result = 0.0, 0.0 + # avoid overflows on both 32 & 64 bit by using widen + thistiecount, result = widen(0), widen(0) (lo < 1 || hi > length(x)) && error("Bounds error") @inbounds for i = (lo + 1):hi if x[i] == x[i - 1] - thistiecount += 1.0 + thistiecount += 1 elseif thistiecount > 0 - result += thistiecount * (thistiecount + 1) / 2 - thistiecount = 0.0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = 0 end end if thistiecount > 0 - result += thistiecount * (thistiecount + 1) / 2 + result += div(thistiecount * (thistiecount + 1), 2) end result end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index fc7acf9c2..98dcdc304 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -30,16 +30,16 @@ c22 = corspearman(x2, x2) @test isnan(corkendall([1,2], [3,NaN])) @test isnan(corkendall([1,1,1], [1,2,3])) -@test corkendall(x1, y) == -1 / sqrt(90) -@test corkendall(x2, y) == -1 / sqrt(72) -@test corkendall(X, y) == [-1 / sqrt(90), -1 / sqrt(72)] -@test corkendall(y, X) == [-1 / sqrt(90) -1 / sqrt(72)] +@test corkendall(x1, y) == -1/sqrt(90) +@test corkendall(x2, y) == -1/sqrt(72) +@test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] +@test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] n = 100_000 -@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1 / sqrt(90) -@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1 / sqrt(72) -@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1 / sqrt(90), -1 / sqrt(72)] -@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1 / sqrt(90) -1 / sqrt(72)] +@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) +@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) +@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] +@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) @@ -47,7 +47,7 @@ c22 = corkendall(x2, x2) @test c11 == 1.0 @test c22 == 1.0 -@test c12 == 3 / sqrt(20) +@test c12 == 3/sqrt(20) @test corkendall(X, X) ≈ [c11 c12; c12 c22] @test corkendall(X) ≈ [c11 c12; c12 c22] @@ -55,34 +55,38 @@ c22 = corkendall(x2, x2) @test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] @test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] -@test corkendall(collect(1:n), collect(1:n)) == 1.0 +@test corkendall(collect(1:n), collect(1:n)) == 1.0 @test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 + @test isnan(corkendall(repeat([1], n), collect(1:n))) @test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 z = [1 1 1; - 1 1 2; - 1 2 2; - 1 2 2; - 1 2 1; - 2 1 2; - 1 1 2; - 2 2 2] - -@test corkendall(z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(z, z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(z[:,1], z) == [1 0 1 / 3] -@test corkendall(z, z[:,1]) == [1;0;1 / 3] + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] z = float(z) -@test corkendall(z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(z, z) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(z[:,1], z) == [1 0 1 / 3] -@test corkendall(z, z[:,1]) == [1;0;1 / 3] +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] w = repeat(z, n) -@test corkendall(w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(w, w) == [1 0 1 / 3; 0 1 0;1 / 3 0 1] -@test corkendall(w[:,1], w) == [1 0 1 / 3] -@test corkendall(w, w[:,1]) == [1;0;1 / 3] \ No newline at end of file +@test corkendall(w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w, w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w[:,1], w) == [1 0 1/3] +@test corkendall(w, w[:,1]) == [1; 0; 1/3] + +StatsBase.midpoint(1,10) == 5 +StatsBase.midpoint(1,widen(10)) == 5 From 23d56905a298420fae7020f16a606a361aaf29f4 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Sun, 31 Jan 2021 19:18:16 +0000 Subject: [PATCH 12/23] variable k back to being not wide (can't use wide integers as indexes into arrays) --- src/rankcorr.jl | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 4e2ecae00..df282707b 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -44,10 +44,11 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # Initial sorting permute!(x, permx) permute!(y, permx) - - npairs = float(n) * (n - 1) / 2 - # ntiesx, ntiesy, ndoubleties are floats to avoid overflows on 32bit - ntiesx, ntiesy, ndoubleties, k, nswaps = widen(0), widen(0), widen(0), widen(0), widen(0) + + # Use widen to avoid overflows on both 32bit and 64bit + npairs = widen(n) * (n - 1) / 2 + ntiesx, ntiesy, ndoubleties, nswaps = widen(0), widen(0), widen(0), widen(0) + k = 0 @inbounds for i = 2:n if x[i - 1] == x[i] @@ -81,7 +82,7 @@ end Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. """ function countties(x::AbstractVector, lo::Integer, hi::Integer) - # avoid overflows on both 32 & 64 bit by using widen + # Use widen to avoid overflows on both 32bit and 64bit thistiecount, result = widen(0), widen(0) (lo < 1 || hi > length(x)) && error("Bounds error") @inbounds for i = (lo + 1):hi @@ -113,7 +114,7 @@ corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y, 2); permx = sortperm(x); function corkendall(X::RealMatrix) n = size(X, 2) - C = ones(float(eltype(X)), n, n)# avoids dependency on LinearAlgebra + C = ones(float(eltype(X)), n, n)# Avoids dependency on LinearAlgebra @inbounds for j = 2:n permx = sortperm(X[:,j]) for i = 1:j - 1 @@ -152,8 +153,8 @@ This method is a copy-paste-edit of sort! in base/sort.jl (the method specialise but amended to return the bubblesort distance. """ function msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) - # avoid overflow errors (if length(v)> 2^16) on 32 bit by using float - nswaps = 0.0 + # Avoid overflow errors by using widen. + nswaps = widen(0) @inbounds if lo < hi hi - lo <= SMALL_THRESHOLD && return isort!(v, lo, hi) @@ -221,4 +222,4 @@ function isort!(v::AbstractVector, lo::Integer, hi::Integer) v[j] = x end return nswaps -end +end \ No newline at end of file From f9114c148de72a1f09e25cea0a97d8aebbda647c Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Mon, 1 Feb 2021 12:16:53 +0000 Subject: [PATCH 13/23] Update rankcorr.jl Incorporated suggestions from nalimilan. Also further changes to avoid overflow errors --- src/rankcorr.jl | 123 ++++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 56 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index df282707b..d8b978bd6 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -46,8 +46,8 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ permute!(y, permx) # Use widen to avoid overflows on both 32bit and 64bit - npairs = widen(n) * (n - 1) / 2 - ntiesx, ntiesy, ndoubleties, nswaps = widen(0), widen(0), widen(0), widen(0) + npairs = div(widen(n) * (n - 1), 2) + ntiesx, ndoubleties, nswaps = widen(0), widen(0), widen(0) k = 0 @inbounds for i = 2:n @@ -57,65 +57,50 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # Sort the corresponding chunk of y, so the rows of hcat(x,y) are # sorted first on x, then (where x values are tied) on y. Hence # double ties can be counted by calling countties. - sort!(view(y, (i - k - 1):(i - 1))) - ntiesx += div(k * (k + 1), 2) + sort!(view(y, (i - k - 1):(i - 1))) # Can't use wide integers here + ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here ndoubleties += countties(y, i - k - 1, i - 1) k = 0 end end if k > 0 sort!(view(y, ((n - k):n))) - ntiesx += div(k * (k + 1), 2) + ntiesx += div(widen(k) * (k + 1), 2) ndoubleties += countties(y, n - k, n) end - nswaps = msort!(y, 1, n) + nswaps = merge_sort!(y, 1, n) ntiesy = countties(y, 1, n) + # Calls to float below prevent possible overflow errors when + # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) end -""" - countties(x::RealVector, lo::Integer, hi::Integer) - -Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. -""" -function countties(x::AbstractVector, lo::Integer, hi::Integer) - # Use widen to avoid overflows on both 32bit and 64bit - thistiecount, result = widen(0), widen(0) - (lo < 1 || hi > length(x)) && error("Bounds error") - @inbounds for i = (lo + 1):hi - if x[i] == x[i - 1] - thistiecount += 1 - elseif thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - thistiecount = 0 - end - end - - if thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - end - result -end - """ corkendall(x, y=x) Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either matrices or vectors. """ -corkendall(x::RealVector, y::RealVector) = corkendall!(float(copy(x)), float(copy(y))) +corkendall(x::RealVector, y::RealVector) = corkendall!(copy(x), copy(y)) -corkendall(X::RealMatrix, y::RealVector) = (permy = sortperm(y);Float64[corkendall!(float(copy(y)), float(X[:,i]), permy) for i in 1:size(X, 2)]) +function corkendall(X::RealMatrix, y::RealVector) + permy = sortperm(y) + return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)]) +end -corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y, 2); permx = sortperm(x); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i]), permx) for i in 1:n], 1, n)) +function corkendall(x::RealVector, Y::RealMatrix) + n = size(Y, 2) + permx = sortperm(x) + return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n)) +end function corkendall(X::RealMatrix) n = size(X, 2) - C = ones(float(eltype(X)), n, n)# Avoids dependency on LinearAlgebra - @inbounds for j = 2:n + C = Matrix{float(eltype(X))}(I, n, n) + for j = 2:n permx = sortperm(X[:,j]) for i = 1:j - 1 C[j,i] = corkendall!(X[:,j], X[:,i], permx) @@ -128,8 +113,8 @@ end function corkendall(X::RealMatrix, Y::RealMatrix) nr = size(X, 2) nc = size(Y, 2) - C = zeros(float(eltype(X)), nr, nc) - @inbounds for j = 1:nr + C = Matrix{float(eltype(X))}(undef, nr, nc) + for j = 1:nr permx = sortperm(X[:,j]) for i = 1:nc C[j,i] = corkendall!(X[:,j], Y[:,i], permx) @@ -140,29 +125,56 @@ end # Auxilliary functions for Kendall's rank correlation +""" + countties(x::RealVector, lo::Integer, hi::Integer) + +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. +""" +function countties(x::AbstractVector, lo::Integer, hi::Integer) + # Use of widen below prevents possible overflow errors when + # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) + w0 = widen(0) + thistiecount, result = w0, w0 + checkbounds(x, lo:hi) + @inbounds for i = (lo + 1):hi + if x[i] == x[i - 1] + thistiecount += 1 + elseif thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = w0 + end + end + + if thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + end + result +end + # Tests appear to show that a value of 64 is optimal, # but note that the equivalent constant in base/sort.jl is 20. -const SMALL_THRESHOLD = 64 +const SMALL_THRESHOLD = 64 -# Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 +# merge_sort! copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) """ - msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on MergeSortAlg), -but amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. """ -function msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) - # Avoid overflow errors by using widen. +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + # Use of widen below prevents possible overflow errors when + # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) nswaps = widen(0) @inbounds if lo < hi - hi - lo <= SMALL_THRESHOLD && return isort!(v, lo, hi) + hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) m = midpoint(lo, hi) (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - nswaps = msort!(v, lo, m, t) - nswaps += msort!(v, m + 1, hi, t) + nswaps = merge_sort!(v, lo, m, t) + nswaps += merge_sort!(v, m + 1, hi, t) i, j = 1, lo while j <= m @@ -192,27 +204,26 @@ function msort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) return nswaps end -# This function is also copied from base/sort.jl +# insertion_sort! and midpoint copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) -# Copy was from https://github.com/JuliaLang/julia/commit/28330a2fef4d9d149ba0fd3ffa06347b50067647 dated 20 Sep 2020 """ - isort!(v::AbstractVector, lo::Integer, hi::Integer) + insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl (the method specialised on InsertionSortAlg), -amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. """ -function isort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi return 0.0 end - nswaps = 0.0 +function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi return widen(0) end + nswaps = widen(0) @inbounds for i = lo + 1:hi j = i x = v[i] while j > lo if x < v[j - 1] - nswaps += 1.0 + nswaps += 1 v[j] = v[j - 1] j -= 1 continue From 913843de3b0b9932573de87031ac7d9eb235c0c1 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:42:06 +0000 Subject: [PATCH 14/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index d8b978bd6..06b4de0b1 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -47,7 +47,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # Use widen to avoid overflows on both 32bit and 64bit npairs = div(widen(n) * (n - 1), 2) - ntiesx, ndoubleties, nswaps = widen(0), widen(0), widen(0) + ntiesx = ndoubleties = nswaps = widen(0) k = 0 @inbounds for i = 2:n @@ -233,4 +233,4 @@ function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) v[j] = x end return nswaps -end \ No newline at end of file +end From 3f5132e88044e7cd225fbfed8aca6473f21989e6 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:44:23 +0000 Subject: [PATCH 15/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 06b4de0b1..82c3bfba2 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -64,7 +64,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ end end if k > 0 - sort!(view(y, ((n - k):n))) + sort!(view(y, (n - k):n)) ntiesx += div(widen(k) * (k + 1), 2) ndoubleties += countties(y, n - k, n) end From 9628be375bdfe6465e08a053a795807bfbdc6b39 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:45:53 +0000 Subject: [PATCH 16/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 82c3bfba2..d1878a27e 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -158,12 +158,12 @@ const SMALL_THRESHOLD = 64 # merge_sort! copied from Julia Base # (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) """ - merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. """ -function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t=similar(v, 0)) +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) # Use of widen below prevents possible overflow errors when # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) nswaps = widen(0) From 7b743492c2ca58d9f7196b4df45278cb4d2a4078 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:56:13 +0000 Subject: [PATCH 17/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index d1878a27e..e8e6b5167 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -57,7 +57,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # Sort the corresponding chunk of y, so the rows of hcat(x,y) are # sorted first on x, then (where x values are tied) on y. Hence # double ties can be counted by calling countties. - sort!(view(y, (i - k - 1):(i - 1))) # Can't use wide integers here + sort!(view(y, (i - k - 1):(i - 1))) ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here ndoubleties += countties(y, i - k - 1, i - 1) k = 0 From 180ff30d3361fc69bc62b503a67b458cb38579b5 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:56:48 +0000 Subject: [PATCH 18/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index e8e6b5167..d3fa8083d 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -66,7 +66,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ if k > 0 sort!(view(y, (n - k):n)) ntiesx += div(widen(k) * (k + 1), 2) - ndoubleties += countties(y, n - k, n) + ndoubleties += countties(y, n - k, n) end nswaps = merge_sort!(y, 1, n) From 0ccd3bedc7c066d18699af57938a75d8eb76582f Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 1 Feb 2021 13:57:00 +0000 Subject: [PATCH 19/23] Update src/rankcorr.jl Co-authored-by: Milan Bouchet-Valat --- src/rankcorr.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index d3fa8083d..8ca41498d 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -133,8 +133,7 @@ Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. function countties(x::AbstractVector, lo::Integer, hi::Integer) # Use of widen below prevents possible overflow errors when # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) - w0 = widen(0) - thistiecount, result = w0, w0 + thistiecount = result = widen(0) checkbounds(x, lo:hi) @inbounds for i = (lo + 1):hi if x[i] == x[i - 1] From 746eaf6431450ada3da28e3f8c785942184f9831 Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Mon, 1 Feb 2021 14:25:50 +0000 Subject: [PATCH 20/23] return type always Float64 --- src/rankcorr.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 8ca41498d..7bdb5bc7d 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -99,7 +99,7 @@ end function corkendall(X::RealMatrix) n = size(X, 2) - C = Matrix{float(eltype(X))}(I, n, n) + C = Matrix{Float64}(I, n, n) for j = 2:n permx = sortperm(X[:,j]) for i = 1:j - 1 @@ -113,7 +113,7 @@ end function corkendall(X::RealMatrix, Y::RealMatrix) nr = size(X, 2) nc = size(Y, 2) - C = Matrix{float(eltype(X))}(undef, nr, nc) + C = Matrix{Float64}(undef, nr, nc) for j = 1:nr permx = sortperm(X[:,j]) for i = 1:nc @@ -140,7 +140,7 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) thistiecount += 1 elseif thistiecount > 0 result += div(thistiecount * (thistiecount + 1), 2) - thistiecount = w0 + thistiecount = widen(0) end end From beb289afad719b25151d38c78f48f2777c63d0cd Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Mon, 1 Feb 2021 17:33:41 +0000 Subject: [PATCH 21/23] Suggested improvements to tests/rankcorr.jl --- test/rankcorr.jl | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 98dcdc304..f03500b7f 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -23,45 +23,60 @@ c22 = corspearman(x2, x2) @test corspearman(X, X) ≈ [c11 c12; c12 c22] @test corspearman(X) ≈ [c11 c12; c12 c22] - # corkendall +#Check error, handling of NaN, Inf etc @test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) @test isnan(corkendall([1,2], [3,NaN])) @test isnan(corkendall([1,1,1], [1,2,3])) +@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 +#Test, with exact equality, some known results. +#RealVector,RealVector @test corkendall(x1, y) == -1/sqrt(90) @test corkendall(x2, y) == -1/sqrt(72) +#RealMatrix,RealVector @test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] +#RealVector, RealMatrix @test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] -n = 100_000 +#n = 78_000 tests for overflow errors on 32 bit. Testing for overflow errors on 64bit would require n be too large for practicality +#This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. +n = 78_000 +#Test with many repeats @test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) @test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) @test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] @test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] +@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 + +#Test with no repeats, note testing for exact equality +@test corkendall(collect(1:n), collect(1:n)) == 1.0 +@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 + +#All elements identical should yield NaN +@test isnan(corkendall(repeat([1], n), collect(1:n))) c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) c22 = corkendall(x2, x2) +#RealMatrix,RealMatrix +@test corkendall(X, X) ≈ [c11 c12; c12 c22] +#RealMatrix +@test corkendall(X) ≈ [c11 c12; c12 c22] + @test c11 == 1.0 @test c22 == 1.0 @test c12 == 3/sqrt(20) -@test corkendall(X, X) ≈ [c11 c12; c12 c22] -@test corkendall(X) ≈ [c11 c12; c12 c22] +#Finished testing for overflow, so redefine n for speedier tests +n = 100 @test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] @test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] -@test corkendall(collect(1:n), collect(1:n)) == 1.0 -@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 - -@test isnan(corkendall(repeat([1], n), collect(1:n))) - -@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 - +#All eight three-element permutations z = [1 1 1; 1 1 2; 1 2 2; From 704fcce41d6d8ec4914a4fb7d9ac4e11bebc72b7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 8 Feb 2021 16:20:26 +0100 Subject: [PATCH 22/23] Add spaces --- test/rankcorr.jl | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index f03500b7f..6110d4c41 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -25,58 +25,59 @@ c22 = corspearman(x2, x2) # corkendall -#Check error, handling of NaN, Inf etc +# Check error, handling of NaN, Inf etc @test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) @test isnan(corkendall([1,2], [3,NaN])) @test isnan(corkendall([1,1,1], [1,2,3])) @test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 -#Test, with exact equality, some known results. -#RealVector,RealVector +# Test, with exact equality, some known results. +# RealVector, RealVector @test corkendall(x1, y) == -1/sqrt(90) @test corkendall(x2, y) == -1/sqrt(72) -#RealMatrix,RealVector +# RealMatrix, RealVector @test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] -#RealVector, RealMatrix +# RealVector, RealMatrix @test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] -#n = 78_000 tests for overflow errors on 32 bit. Testing for overflow errors on 64bit would require n be too large for practicality -#This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. +# n = 78_000 tests for overflow errors on 32 bit +# Testing for overflow errors on 64bit would require n be too large for practicality +# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. n = 78_000 -#Test with many repeats +# Test with many repeats @test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) @test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) @test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] @test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] @test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 -#Test with no repeats, note testing for exact equality +# Test with no repeats, note testing for exact equality @test corkendall(collect(1:n), collect(1:n)) == 1.0 @test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 -#All elements identical should yield NaN +# All elements identical should yield NaN @test isnan(corkendall(repeat([1], n), collect(1:n))) c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) c22 = corkendall(x2, x2) -#RealMatrix,RealMatrix +# RealMatrix, RealMatrix @test corkendall(X, X) ≈ [c11 c12; c12 c22] -#RealMatrix +# RealMatrix @test corkendall(X) ≈ [c11 c12; c12 c22] @test c11 == 1.0 @test c22 == 1.0 @test c12 == 3/sqrt(20) -#Finished testing for overflow, so redefine n for speedier tests +# Finished testing for overflow, so redefine n for speedier tests n = 100 @test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] @test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] -#All eight three-element permutations +# All eight three-element permutations z = [1 1 1; 1 1 2; 1 2 2; From bd2cf5cc1d61aca64cc20e230a309538aea0d13b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 8 Feb 2021 16:50:06 +0100 Subject: [PATCH 23/23] Revert addition of spaces --- src/rankcorr.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 7bdb5bc7d..0592530b8 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -1,13 +1,13 @@ # Rank-based correlations -# +# # - Spearman's correlation # - Kendall's correlation -# +# ####################################### -# +# # Spearman correlation -# +# ####################################### """