Skip to content

Commit

Permalink
Change unique to follow the AbstractArray interface
Browse files Browse the repository at this point in the history
Return unique values in their order of appearance, just like other arrays. The levels
function should be used when one wants the levels in their custom order.
  • Loading branch information
nalimilan committed Apr 10, 2018
1 parent e2db279 commit 688bb03
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 45 deletions.
30 changes: 11 additions & 19 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -616,34 +616,26 @@ end
function _unique(::Type{S},
refs::AbstractArray{T},
pool::CategoricalPool) where {S, T<:Integer}
seen = fill(false, length(index(pool))+1)
trackmissings = S >: Missing
nlevels = length(index(pool)) + 1
order = fill(0, nlevels) # 0 indicates not seen
# If we don't track missings, short-circuit even if none has been seen
seen[1] = !trackmissings
batch = 0
count = S >: Missing ? 0 : 1
@inbounds for i in refs
seen[i + 1] = true
# Only do a costly short-circuit check periodically
batch += 1
if batch > 1000
all(seen) && break
batch = 0
if order[i + 1] == 0
count += 1
order[i + 1] = count
count == nlevels && break
end
end
seenmissing = popfirst!(seen)
res = convert(Vector{S}, index(pool)[seen][sortperm(pool.order[seen])])
if trackmissings && seenmissing
push!(res, missing)
end
res
S[i == 1 ? missing : index(pool)[i - 1] for i in sortperm(order) if order[i] != 0]
end

"""
unique(A::CategoricalArray)
Return levels which appear in `A`, in the same order as [`levels`](@ref)
(and not in their order of appearance). This function is significantly slower than
[`levels`](@ref) since it needs to check whether levels are used or not.
Return levels which appear in `A` in their order of appearance.
This function is significantly slower than [`levels`](@ref)
since it needs to check whether levels are used or not.
"""
unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool)

Expand Down
27 changes: 15 additions & 12 deletions test/11_array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ using CategoricalArrays: DefaultRefType, catvaluetype, leveltype
@test catvaluetype(x) === CategoricalArrays.CategoricalString{R}
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -231,7 +232,7 @@ using CategoricalArrays: DefaultRefType, catvaluetype, leveltype

@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(a)
@test levels(x) == unique(x) == unique(a)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -356,6 +357,7 @@ using CategoricalArrays: DefaultRefType, catvaluetype, leveltype
@test x[3] === x.pool.valindex[3]
@test x[4] === x.pool.valindex[4]
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))

if ordered
@test_throws OrderedLevelsException x[1:2] = -1
Expand All @@ -367,6 +369,7 @@ using CategoricalArrays: DefaultRefType, catvaluetype, leveltype
@test x[3] === x.pool.valindex[3]
@test x[4] === x.pool.valindex[4]
@test levels(x) == vcat(unique(a), -1)
@test unique(x) == unique(collect(x))

if ordered
@test_throws OrderedLevelsException push!(x, 2.0)
Expand Down Expand Up @@ -410,7 +413,7 @@ using CategoricalArrays: DefaultRefType, catvaluetype, leveltype

@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(a)
@test levels(x) == unique(x) == unique(a)
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -645,13 +648,13 @@ end
x = CategoricalArray(["Old", "Young", "Middle", "Young"])
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) == levels(x) == ["Young", "Middle", "Old"]
@test unique(x) == ["Old", "Young", "Middle"]
@test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x
@test levels(x) == ["Young", "Middle", "Old", "Unused"]
@test unique(x) == ["Young", "Middle", "Old"]
@test unique(x) == ["Old", "Young", "Middle"]
@test levels!(x, ["Unused1", "Young", "Middle", "Old", "Unused2"]) === x
@test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"]
@test unique(x) == ["Young", "Middle", "Old"]
@test unique(x) == ["Old", "Young", "Middle"]

x = CategoricalArray(String[])
@test isa(levels(x), Vector{String}) && isempty(levels(x))
Expand All @@ -660,13 +663,13 @@ end
@test levels(x) == ["Young", "Middle", "Old"]
@test isa(unique(x), Vector{String}) && isempty(unique(x))

# To test short-circuit after 1000 elements
x = CategoricalArray(repeat(1:1500, inner=10))
@test levels(x) == collect(1:1500)
@test unique(x) == collect(1:1500)
@test levels!(x, [1600:-1:1; 2000]) === x
@test levels(x) == [1600:-1:1; 2000]
@test unique(x) == collect(1500:-1:1)
# To test short-circuiting
x = CategoricalArray(repeat(1:10, inner=10))
@test levels(x) == collect(1:10)
@test unique(x) == collect(1:10)
@test levels!(x, [19:-1:1; 20]) === x
@test levels(x) == [19:-1:1; 20]
@test unique(x) == collect(1:10)
end

end
31 changes: 18 additions & 13 deletions test/12_missingarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const ≅ = isequal
@test catvaluetype(x) === CategoricalArrays.CategoricalString{R}
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -250,6 +251,7 @@ const ≅ = isequal

@test x ≅ a
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -396,7 +398,7 @@ const ≅ = isequal

@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(a)
@test levels(x) == unique(x) == unique(a)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -536,6 +538,7 @@ const ≅ = isequal
@test x[3] === x.pool.valindex[3]
@test x[4] === x.pool.valindex[4]
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))

if ordered
@test_throws OrderedLevelsException x[1:2] = -1
Expand All @@ -547,6 +550,7 @@ const ≅ = isequal
@test x[3] === x.pool.valindex[3]
@test x[4] === x.pool.valindex[4]
@test levels(x) == vcat(unique(a), -1)
@test unique(x) == unique(collect(x))

if ordered
@test_throws OrderedLevelsException push!(x, 2.0)
Expand Down Expand Up @@ -592,7 +596,7 @@ const ≅ = isequal

@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(a)
@test levels(x) == unique(x) == unique(a)
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -733,6 +737,7 @@ const ≅ = isequal
@test x ≅ a
@test isordered(x) === ordered
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -1062,16 +1067,16 @@ end
@testset "unique() and levels()" begin
x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"])
@test levels(x) == ["Middle", "Old", "Young"]
@test unique(x) ≅ ["Middle", "Old", "Young", missing]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) ≅ ["Young", "Middle", "Old", missing]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
@test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x
@test levels(x) == ["Young", "Middle", "Old", "Unused"]
@test unique(x) ≅ ["Young", "Middle", "Old", missing]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
@test levels!(x, ["Unused1", "Young", "Middle", "Old", "Unused2"]) === x
@test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"]
@test unique(x) ≅ ["Young", "Middle", "Old", missing]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]

x = CategoricalArray((Union{String, Missing})[missing])
@test isa(levels(x), Vector{String}) && isempty(levels(x))
Expand All @@ -1080,14 +1085,14 @@ end
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) ≅ [missing]

# To test short-circuit after 1000 elements
x = CategoricalArray{Union{Int, Missing}}(repeat(1:1500, inner=10))
@test levels(x) == collect(1:1500)
@test unique(x) == collect(1:1500)
@test levels!(x, [1600:-1:1; 2000]) === x
# To test short-circuiting
x = CategoricalArray{Union{Int, Missing}}(repeat(1:10, inner=10))
@test levels(x) == collect(1:10)
@test unique(x) == collect(1:10)
@test levels!(x, [19:-1:1; 20]) === x
x[3] = missing
@test levels(x) == [1600:-1:1; 2000]
@test unique(x) ≅ [1500:-1:3; 2; 1; missing]
@test levels(x) == [19:-1:1; 20]
@test unique(x) ≅ [1; missing; 2:10]

# in
x = CategoricalArray{Int}(repeat(1:1500, inner=10))
Expand Down
2 changes: 1 addition & 1 deletion test/14_view.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ using CategoricalArrays
x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order)
v = view(x, inds)
@test levels(v) === levels(x)
@test unique(v) == (ndims(v) > 0 ? sort(unique(a[inds])) : [a[inds]])
@test unique(v) == (ndims(v) > 0 ? unique(a[inds]) : [a[inds]])
@test isordered(v) === isordered(x)
end

Expand Down

0 comments on commit 688bb03

Please sign in to comment.