From 5480034bd19f047eb09286310454fca7cf02b226 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Wed, 21 Mar 2012 11:04:37 -0700 Subject: [PATCH 1/2] A number of statistics functions. --- base/array.jl | 4 +- base/sort.jl | 11 ++ base/statistics.jl | 254 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 260 insertions(+), 9 deletions(-) diff --git a/base/array.jl b/base/array.jl index 994b9588504cc..d373750403718 100644 --- a/base/array.jl +++ b/base/array.jl @@ -1112,13 +1112,13 @@ function amap(f::Function, A::StridedArray, axis::Integer) end idx = ntuple(ndimsA, j -> j == axis ? 1 : 1:dimsA[j]) - r = f(slice(A, idx)) + r = f(sub(A, idx)) R = Array(typeof(r), axis_size) R[1] = r for i = 2:axis_size idx = ntuple(ndimsA, j -> j == axis ? i : 1:dimsA[j]) - R[i] = f(slice(A, idx)) + R[i] = f(sub(A, idx)) end return R diff --git a/base/sort.jl b/base/sort.jl index 341951404e7d4..9bdcbd3a2f66e 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -394,3 +394,14 @@ function searchsorted(a, x) end return isless(a[lo],x) ? hi : lo end + +function order(a::AbstractVector) + n = length(a) + p = sort_by(i -> a[i], 1:n) + o = Array(Integer, n) + for i = 1:n + o[p[i]] = i + end + return o +end + diff --git a/base/statistics.jl b/base/statistics.jl index fec6a9592ca59..9bbb1fb8a17f3 100644 --- a/base/statistics.jl +++ b/base/statistics.jl @@ -1,17 +1,49 @@ mean(v::AbstractArray) = sum(v) / numel(v) mean(v::AbstractArray, dim::Int) = sum(v,dim) / size(v,dim) -function std(v::AbstractVector) +weighted_mean(v::AbstractArray, w::AbstractArray) = + sum(v .* w) / sum(w) + +function median(v::AbstractVector) n = numel(v) - m = mean(v) - s = 0.0 - for i=1:n - s += (v[i]-m)^2 + if isodd(n) + return select(v, div(n, 2)) + else + vs = sort(v) + return (vs[div(n, 2)] + vs[div(n, 2) + 1]) / 2 end - return sqrt(s/(n-1)) end -median(v::AbstractVector) = select(v, div(numel(v)+1,2)) +function median(v::AbstractArray) + median(reshape(v, 1, numel(v))) +end + +# variance with known mean +function var(v::AbstractArray, m::Number) + n = numel(v) + d = 0.0 + for i = 1:n + d += (v[i] - m) ^ 2 + end + return d / (n - 1) +end + +# variance +var(v::AbstractArray) = var(v, mean(v)) + +# standard deviation with known mean +function std(v::AbstractArray, m::Number) + sqrt(var(v, m)) +end + +# median absolute deviation with known center +mad(v::AbstractArray, center::Number) = median(abs(v - center)) + +#median absolute deviation +mad(v::AbstractArray) = mad(v, median(v)) + +# standard deviation +std(v::AbstractVector) = std(v, mean(v)) ## hist ## @@ -74,3 +106,211 @@ function histc(A::StridedMatrix, edg) h end +# order (aka, rank), resolving ties using the mean rank +function tiedrank(v::AbstractArray) + n = length(v) + place = sort_by(i -> v[i], 1:n) + ord = Array(Float, n) + + i = 1 + while i <= n + j = i + while j + 1 <= n && v[place[i]] == v[place[j + 1]] + j += 1 + end + + if j > i + m = sum(i:j) / (j - i + 1) + for k = i:j + ord[place[k]] = m + end + else + ord[place[i]] = i + end + + i = j + 1 + end + + return ord +end + +# pearson covariance with known means +function _jl_cov_pearson1(x::AbstractVector, y::AbstractVector, mx::Number, my::Number) + n = numel(x) + r = 0.0 + for i = 1:n + r += (x[i] - mx) * (y[i] - my) + end + r / (n - 1) +end + +# pearson covariance +function cov_pearson(x::AbstractVector, y::AbstractVector) + if numel(x) != numel(y) + error("cov_pearson: incompatible dimensions") + end + + mx = mean(x) + my = mean(y) + _jl_cov_pearson1(x, y, mx, my) +end + +# pearson covariance over all pairs of columns +function _jl_cov_pearson{T}(x::AbstractMatrix, mxs::AbstractVector{T}) + (n,m) = size(x) + R = Array(T, (m,m)) + for i = 1:m + R[i,i] = _jl_cov_pearson1(sub(x, (1:n, i)), + sub(x, (1:n, i)), + mxs[i], mxs[i]) + + for j = (i+1):m + R[i,j] = _jl_cov_pearson1(sub(x, (1:n, i)), + sub(x, (1:n, j)), + mxs[i], mxs[j]) + R[j,i] = R[i,j] + end + end + return R +end +cov_pearson(x::AbstractMatrix) = _jl_cov_pearson(x, amap(mean, x, 2)) + +# pearson covariance over all pairs of columns with known means +function _jl_cov_pearson{T}(x::AbstractMatrix, y::AbstractMatrix, + mxs::AbstractVector{T}, mys::AbstractVector{T}) + (n,m) = size(x) + R = Array(T, (m,m)) + for i = 1:m + for j = 1:m + R[i,j] = _jl_cov_pearson1(sub(x, (1:n, i)), + sub(y, (1:n, j)), + mxs[i], mys[j]) + end + end + return R +end + +# pearson covariance over all pairs of columns +function cov_pearson(x::AbstractMatrix, y::AbstractMatrix) + if size(x) != size(y) + error("cov_pearson: incompatible dimensions") + end + + if is(x, y) + return cov_pearson(x) + end + + _jl_cov_pearson(x, y, amap(mean, x, 2), amap(mean, y, 2)) +end + +# spearman covariance +function cov_spearman(x::AbstractVector, y::AbstractVector) + cov_pearson(tiedrank(x), tiedrank(y)) +end + +# spearman covariance over all pairs of columns +function cov_spearman(x::AbstractMatrix) + cov_pearson(apply(hcat, amap(tiedrank, x, 2))) +end + +# spearman covariance over all pairs of columns +function cov_spearman(x::AbstractMatrix, y::AbstractMatrix) + if is(x, y) + return cov_spearman(x) + end + + cov_pearson( + apply(hcat, amap(tiedrank, x, 2)), + apply(hcat, amap(tiedrank, y, 2))) +end + +const cov = cov_pearson + +# pearson correlation +function cor_pearson(x::AbstractVector, y::AbstractVector) + if numel(x) != numel(y) + error("cor_pearson: incompatible dimensions") + end + + mx = mean(x) + my = mean(y) + sx = std(x, mx) + sy = std(y, my) + + r = _jl_cov_pearson1(x, y, mx, my) + r / (sx * sy) +end + +# pearson correlation over all pairs of columns +function cor_pearson(x::AbstractMatrix) + (n,m) = size(x) + mxs = amap(mean, x, 2) + sxs = similar(mxs) + for i = 1:m + sxs[i] = std(x[:,i], mxs[i]) + end + R = _jl_cov_pearson(x, mxs) + + for i = 1:m + R[i,i] = 1.0 + for j = (i+1):m + R[i,j] /= sxs[i] * sxs[j] + R[j,i] = R[i,j] + end + end + return R +end + +# pearson correlation over all pairs of columns +function cor_pearson(x::AbstractMatrix, y::AbstractMatrix) + if size(x) != size(y) + error("cor_pearson: incompatible dimensions") + end + + if is(x, y) + return cor_pearson(x) + end + + (n,m) = size(x) + mxs = amap(mean, x, 2) + mys = amap(mean, y, 2) + + sxs = similar(mxs) + sys = similar(mys) + for i = 1:m + sxs[i] = std(x[:,i], mxs[i]) + sys[i] = std(y[:,i], mys[i]) + end + R = _jl_cov_pearson(x, y, mxs, mys) + + for i = 1:m + for j = 1:m + R[i,j] /= sxs[i] * sys[j] + end + end + return R +end + +# spearman correlation +function cor_spearman(x::AbstractVector, y::AbstractVector) + cor_pearson(tiedrank(x), tiedrank(y)) +end + +# spearman correlation over all pairs of columns +function cor_spearman(x::AbstractMatrix) + cor_pearson(apply(hcat, amap(tiedrank, x, 2))) +end + +# spearman correlation over all pairs of columns +function cor_spearman(x::AbstractMatrix, y::AbstractMatrix) + if is(x, y) + return cor_spearman(x) + end + + cor_pearson( + apply(hcat, amap(tiedrank, x, 2)), + apply(hcat, amap(tiedrank, y, 2))) +end + +const cor = cor_pearson + From 17f32d36963ce82d88c9e96841040afa8c1425a8 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Wed, 21 Mar 2012 12:41:00 -0700 Subject: [PATCH 2/2] Make a couple Vector functions applicable to Arrays. --- base/sort.jl | 6 +++--- base/statistics.jl | 12 ++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/base/sort.jl b/base/sort.jl index 9bdcbd3a2f66e..8994841c0f4da 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -341,7 +341,7 @@ function issorted(v::AbstractVector) return true end -function _jl_quickselect(a::AbstractVector, k::Int, lo::Int, hi::Int) +function _jl_quickselect(a::AbstractArray, k::Int, lo::Int, hi::Int) if k < lo || k > hi; error("k is out of bounds"); end while true @@ -375,8 +375,8 @@ function _jl_quickselect(a::AbstractVector, k::Int, lo::Int, hi::Int) end -select(a::AbstractVector, k::Int) = _jl_quickselect(copy(a), k, 1, length(a)) -select!(a::AbstractVector, k::Int) = _jl_quickselect(a, k, 1, length(a)) +select(a::AbstractArray, k::Int) = _jl_quickselect(copy(a), k, 1, length(a)) +select!(a::AbstractArray, k::Int) = _jl_quickselect(a, k, 1, length(a)) function searchsorted(a, x) hi = length(a) diff --git a/base/statistics.jl b/base/statistics.jl index 9bbb1fb8a17f3..604a5f1b0af53 100644 --- a/base/statistics.jl +++ b/base/statistics.jl @@ -4,7 +4,7 @@ mean(v::AbstractArray, dim::Int) = sum(v,dim) / size(v,dim) weighted_mean(v::AbstractArray, w::AbstractArray) = sum(v .* w) / sum(w) -function median(v::AbstractVector) +function median(v::AbstractArray) n = numel(v) if isodd(n) return select(v, div(n, 2)) @@ -14,10 +14,6 @@ function median(v::AbstractVector) end end -function median(v::AbstractArray) - median(reshape(v, 1, numel(v))) -end - # variance with known mean function var(v::AbstractArray, m::Number) n = numel(v) @@ -36,15 +32,15 @@ function std(v::AbstractArray, m::Number) sqrt(var(v, m)) end +# standard deviation +std(v::AbstractArray) = std(v, mean(v)) + # median absolute deviation with known center mad(v::AbstractArray, center::Number) = median(abs(v - center)) #median absolute deviation mad(v::AbstractArray) = mad(v, median(v)) -# standard deviation -std(v::AbstractVector) = std(v, mean(v)) - ## hist ## function hist(v::StridedVector, nbins::Integer)