Skip to content
This repository has been archived by the owner on May 4, 2019. It is now read-only.

Commit

Permalink
Merge pull request #102 from JuliaStats/sjk/sort
Browse files Browse the repository at this point in the history
Efficient sorting of DataArrays
  • Loading branch information
simonster committed Jul 10, 2014
2 parents aca6c87 + 9ca7e13 commit 1359a53
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 9 deletions.
1 change: 1 addition & 0 deletions src/DataArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ module DataArrays
include("linalg.jl")
include("operators.jl")
include("broadcast.jl")
include("sort.jl")
include("extras.jl")
include("grouping.jl")
include("statistics.jl")
Expand Down
20 changes: 16 additions & 4 deletions src/dataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,24 @@ end
#'
#' dv = @data [false, false, true, false]
#' dv_new = copy(dv)
function Base.copy(d::DataArray) # -> DataArray{T}
    # Allocate a container with `similar(d)` and fill it through `copy!`,
    # so the element data and the NA mask are duplicated by one code path.
    return Base.copy!(similar(d), d)
end

function Base.copy!(dest::DataArray, src::DataArray) # -> DataArray{T}
    # Copy the element data and the NA mask of `src` into `dest`, then
    # return `dest`.
    if isbits(eltype(src)) && isbits(eltype(dest))
        # Arrays of bits types have no unassigned slots, so a bulk copy of
        # the raw data is safe.
        copy!(dest.data, src.data)
    else
        # Elements of src_data are not necessarily initialized, so
        # only copy initialized elements
        dest_data = dest.data
        src_data = src.data
        length(dest_data) >= length(src_data) || throw(BoundsError())
        src_chunks = src.na.chunks
        for i = 1:length(src_data)
            # A set bit marks slot `i` as NA; such slots are never read
            # because they may be unassigned.
            @inbounds if !Base.unsafe_bitgetindex(src_chunks, i)
                dest_data[i] = src_data[i]
            end
        end
    end
    copy!(dest.na, src.na)
    return dest
end
Expand Down
4 changes: 2 additions & 2 deletions src/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,8 @@ Base.isequal(::NAtype, ::NAtype) = true
# NA is never `isequal` to a non-NA value.
Base.isequal(::NAtype, b) = false
Base.isequal(a, ::NAtype) = false
# NA is not less than itself and every non-NA value is less than NA, so an
# `isless`-based ordering places NA after all other values.
Base.isless(::NAtype, ::NAtype) = false
Base.isless(::NAtype, b) = false
Base.isless(a, ::NAtype) = true

# This is for performance only; the definition in Base is sufficient
# for AbstractDataArrays
Expand Down
74 changes: 74 additions & 0 deletions src/sort.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# This code is heavily based on the floating point sort code in Base

# Gather the NAs at the end of the vector where they belong under ordering
# `o` and return the `(lo, hi)` bounds of the remaining region: NAs go to the
# right for forward orderings and to the left for reverse orderings. The
# `Perm` methods apply the same rule to a vector of indices, keyed on the
# ordering wrapped inside the `Perm`.
nas2end!(v::AbstractVector, o::Base.Order.ForwardOrdering) = nas2right!(v,o)
nas2end!(v::AbstractVector, o::Base.Order.ReverseOrdering) = nas2left!(v,o)
nas2end!{O<:Base.Order.ForwardOrdering}(v::AbstractVector{Int}, o::Base.Order.Perm{O}) = nas2right!(v,o)
nas2end!{O<:Base.Order.ReverseOrdering}(v::AbstractVector{Int}, o::Base.Order.Perm{O}) = nas2left!(v,o)

# Report whether bit `i` of the NA mask storage `chunks` is set.
myisna(o::Base.Order.Ordering, chunks, i::Int) = Base.unsafe_bitgetindex(chunks, i)

# Position in the NA mask that slot `i` of `data` refers to. When a
# DataVector is sorted directly, slot `i` is element `i`. When a permutation
# is sorted, slot `i` holds the index of the element it stands for, so the
# mask must be looked up through that index rather than through `i` itself.
naindex(o::Base.Order.DirectOrdering, data, i::Int) = i
naindex(o::Base.Order.Perm, data, i::Int) = data[i]

# Exchange step used while partitioning. For a direct sort the value stored
# in an NA slot is irrelevant, so the non-NA value at `i` is simply written
# into slot `j`. For a permutation both indices must survive, so the two
# entries are exchanged.
swap(o::Base.Order.DirectOrdering, data, i, j) = setindex!(data, data[i], j)
function swap(o::Base.Order.Perm, data, i, j)
    data[j], data[i] = data[i], data[j]
end

# Return the vector that gets rearranged together with the NA mask storage
# that describes it: the index vector plus the mask of the DataVector held by
# the `Perm` ordering, or the raw data plus the mask of the DataVector itself.
datachunks(o::Base.Order.Perm, v::AbstractVector{Int}) = (v, o.data.na.chunks)
datachunks(o::Base.Order.DirectOrdering, v::DataVector) = (v.data, v.na.chunks)

# Move every NA within `lo:hi` to the left end of that range and return the
# bounds `(i, hi)` of the non-NA entries that remain to be sorted.
function nas2left!(v::Union(AbstractVector{Int}, DataVector), o::Base.Order.Ordering, lo::Int=1, hi::Int=length(v))
    data, chunks = datachunks(o, v)

    # Skip the NAs that already sit at the left edge
    i = lo
    @inbounds while i <= hi && myisna(o, chunks, naindex(o, data, i))
        i += 1
    end
    # From here on slots lo:i-1 are NA slots and slots i:j-1 hold non-NA
    # entries; slot j has not been touched yet when it is inspected.
    j = i + 1
    @inbounds while j <= hi
        if myisna(o, chunks, naindex(o, data, j))
            swap(o, data, i, j)
            i += 1
        end
        j += 1
    end
    # Only a directly sorted DataVector has its own mask rewritten to match
    # the new layout; a permutation leaves the referenced mask untouched.
    if isa(o, Base.Order.DirectOrdering)
        v.na[lo:i-1] = true
        v.na[i:hi] = false
    end
    return i, hi
end

# Move every NA within `lo:hi` to the right end of that range and return the
# bounds `(lo, i)` of the non-NA entries that remain to be sorted.
function nas2right!(v::Union(AbstractVector{Int}, DataVector), o::Base.Order.Ordering, lo::Int=1, hi::Int=length(v))
    data, chunks = datachunks(o, v)

    # Skip the NAs that already sit at the right edge
    i = hi
    @inbounds while lo <= i && myisna(o, chunks, naindex(o, data, i))
        i -= 1
    end
    # From here on slots i+1:hi are NA slots and slots j+1:i hold non-NA
    # entries; slot j has not been touched yet when it is inspected.
    j = i - 1
    @inbounds while lo <= j
        if myisna(o, chunks, naindex(o, data, j))
            swap(o, data, i, j)
            i -= 1
        end
        j -= 1
    end
    # Only a directly sorted DataVector has its own mask rewritten to match
    # the new layout; a permutation leaves the referenced mask untouched.
    if isa(o, Base.Order.DirectOrdering)
        v.na[lo:i] = false
        v.na[i+1:hi] = true
    end
    return lo, i
end

# Sort the DataVector `v` in place and return it. `nas2end!` first gathers
# the NAs at one end and reports the bounds of the remaining span; only that
# span of the raw data is then sorted with algorithm `a` under ordering `o`.
function dasort!(v::DataVector, a::Base.Sort.Algorithm, o::Base.Order.DirectOrdering)
    bounds = nas2end!(v, o)
    sort!(v.data, bounds[1], bounds[2], a, o)
    return v
end

# Sort the index vector `v` in place under a `Perm` ordering over a
# DataVector and return the result of the underlying `sort!` call.
function dapermsort!{O<:Base.Order.DirectOrdering,T<:DataVector}(v::AbstractVector{Int}, a::Base.Sort.Algorithm, o::Base.Order.Perm{O,T})
    bounds = nas2end!(v, o)
    # The span reported by `nas2end!` is ordered by comparing the raw data
    # array directly instead of going through the DataVector.
    rawperm = Base.Order.Perm(o.order, o.data.data)
    return sort!(v, bounds[1], bounds[2], a, rawperm)
end

# Route Base's sorting entry points to the NA-aware implementations: sorting
# a DataVector under a direct ordering, and sorting an index vector under a
# `Perm` ordering whose data is a DataVector.
function Base.sort!(v::DataVector, a::Base.Sort.Algorithm, o::Base.Order.DirectOrdering)
    return dasort!(v, a, o)
end
function Base.sort!{O<:Base.Order.DirectOrdering,T<:DataVector}(v::Vector{Int}, a::Base.Sort.Algorithm, o::Base.Order.Perm{O,T})
    return dapermsort!(v, a, o)
end
3 changes: 0 additions & 3 deletions test/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ module TestAbstractArray
using DataArrays

unsorted_dv = @data [2, 1, NA]
# NA is ordered after every non-NA value, so it ends up last when sorting
sorted_dv = @data [1, 2, NA]

@assert isequal(sort(unsorted_dv), sorted_dv)
@assert isequal(sortperm(unsorted_dv), [2, 1, 3])
# TODO: Make this work
# tiedrank(dv)

Expand Down
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ my_tests = ["abstractarray.jl",
"padding.jl",
"pooleddataarray.jl",
"extras.jl",
"sort.jl",
"statistics.jl",
"literals.jl",
"newtests/dataarray.jl",
Expand Down
17 changes: 17 additions & 0 deletions test/sort.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module TestSort
using DataArrays, Base.Test

# Seed the global RNG so that a failing run can be reproduced exactly
srand(1234)

# Check `sort` and `sortperm` on DataArrays with a random NA pattern, in both
# forward and reverse order, for a bits type and a non-bits element type.
for T in (Float64, BigFloat)
    n = 1000
    na = randbool(n)
    nna = sum(na)
    # Only the non-NA slots of `a` receive values
    a = Array(T, n)
    ra = randn(n-nna)
    a[!na] = ra
    da = DataArray(a, na)

    # Expected layouts: sorted non-NA values followed by `nna` NAs, and the
    # mirror image for reverse order.
    # NOTE(review): presumes `DataArray(T, nna)` yields `nna` NA entries -- confirm
    fwd = [DataArray(sort(dropna(da))), DataArray(T, nna)]
    bwd = [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))]

    @test isequal(sort(da), fwd)
    @test isequal(da[sortperm(da)], fwd)
    @test isequal(sort(da, rev=true), bwd)
    @test isequal(da[sortperm(da, rev=true)], bwd)
end
end

0 comments on commit 1359a53

Please sign in to comment.