Skip to content
This repository has been archived by the owner on May 4, 2019. It is now read-only.

Commit

Permalink
Change unique() to return values in the same ordering as levels for PDAs
Browse files (browse the repository at this point in the history)
While the generic unique() method is documented as preserving the order of
appearance, the ordering of levels is more likely to be useful for PDAs. In
particular, it will allow StatsModels to use unique() to get the levels present
in the data in the user-defined order, with the first level as the reference by default.

The new code is based on CategoricalArrays.
  • Loading branch information
nalimilan committed Feb 19, 2017
1 parent 2a83487 commit 6296769
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 50 deletions.
58 changes: 16 additions & 42 deletions src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,9 @@ end
#' @description
#'
#' Return a DataVector containing the unique values of a `PooledDataArray`,
#' in the order they appear in the data, including `NA` if any missing entries
#' in the order of `levels`, including `NA` if any missing entries
#' are encountered. For `PooledDataArray`s, this function is much less efficient
#' than `levels`, which does not return the values in the same order.
#' than `levels`.
#'
#' @param pda::PooledDataArray{T} `PooledDataArray` whose unique values are desired.
#'
Expand All @@ -286,50 +286,24 @@ end
#' pdv = @pdata [1, -2, 1, NA, 4]
#' distinct_values = unique(pdv)
function Base.unique{T}(pda::PooledDataArray{T})
n = length(pda)
nlevels = length(pda.pool)
unique_values = Vector{T}(0)
sizehint!(unique_values, nlevels)
seen = Set{eltype(pda.refs)}()

firstna = 0
for i in 1:n
if isna(pda, i)
if firstna == 0
firstna = length(unique_values) + 1
end
elseif !in(pda.refs[i], seen)
push!(seen, pda.refs[i])
push!(unique_values, pda.pool[pda.refs[i]])
else
continue
end

if firstna > 0 && length(unique_values) == nlevels
break
seen = fill(false, nlevels + 1)
batch = 0
@inbounds for i in pda.refs
seen[i + 1] = true
# Only do a costly short-circuit check periodically
batch += 1
if batch > 1000
all(seen) && break
batch = 0
end
end

if firstna > 0
res = DataArray(Vector{T}(nlevels + 1))
i = 0
for val in unique_values
i += 1
if i == firstna
res.na[i] = true
i += 1
end
res.data[i] = val
end

if firstna == nlevels + 1
res.na[nlevels + 1] = true
end

return res
else
return DataArray(unique_values)
seenna = shift!(seen)
res = DataArray(levels(pda)[seen])
if seenna
push!(res, NA)
end
res
end

#' @description
Expand Down
20 changes: 12 additions & 8 deletions test/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,20 @@ module TestPDA
@assert levels(setlevels!(@pdata([1.0, 2.0]), [3,4])) == [3.0, 4.0]

y = @pdata [1, NA, -2, 1, NA, 4, NA]
@assert isequal(unique(y), @pdata [1, NA, -2, 4])
@assert isequal(unique(reverse(y)), @data [NA, 4, 1, -2])
@assert isequal(unique(dropna(y)), @data [1, -2, 4])
@assert isequal(unique(reverse(dropna(y))), @data [4, 1, -2])
@assert isequal(unique(y), @pdata [-2, 1, 4, NA])
@assert isequal(unique(reverse(y)), @data [-2, 1, 4, NA])
@assert isequal(unique(dropna(y)), @data levels(dropna(y)))
@assert isequal(unique(reverse(dropna(y))), @data levels(reverse(dropna(y))))

z = @pdata ["frank", NA, "gertrude", "frank", NA, "herbert", NA]
@assert isequal(unique(z), @pdata ["frank", NA, "gertrude", "herbert"])
@assert isequal(unique(reverse(z)), @pdata [NA, "herbert", "frank", "gertrude"])
@assert isequal(unique(dropna(z)), @pdata ["frank", "gertrude", "herbert"])
@assert isequal(unique(reverse(dropna(z))), @pdata ["herbert", "frank", "gertrude"])
@assert isequal(unique(z), @pdata ["frank", "gertrude", "herbert", NA])
@assert isequal(unique(reverse(z)), @pdata ["frank", "gertrude", "herbert", NA])
@assert isequal(unique(dropna(z)), @data levels(dropna(z)))
@assert isequal(unique(reverse(dropna(z))), @data levels(reverse(dropna(z))))

# check case where some levels are not present in data
z[3] = "frank"
@assert isequal(unique(z), @pdata ["frank", "herbert", NA])

# check case where only NA occurs in final position
@assert isequal(unique(@pdata [1, 2, 1, NA]), @pdata [1, 2, NA])
Expand Down

0 comments on commit 6296769

Please sign in to comment.