Skip to content

Commit

Permalink
avoid CategoricalArrays dependency in aggregates
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Nov 7, 2020
1 parent 55533d1 commit a2ea650
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 10 deletions.
21 changes: 12 additions & 9 deletions src/groupeddataframe/fastaggregates.jl
Expand Up @@ -122,10 +122,10 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
# !ismissing check is purely an optimization to avoid a copy later
outcol = similar(incol, condf === !ismissing ? S : T, length(gd))
# Comparison is possible only between CatValues from the same pool
if incol isa CategoricalVector
U = Union{CategoricalArrays.leveltype(outcol),
eltype(outcol) >: Missing ? Missing : Union{}}
outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool)
incolT = typeof(incol).name
if incolT.name === :CategoricalArray &&
nameof(incolT.module) === :CategoricalArrays
outcol = Vector{eltype(incol)}(undef, length(gd))
end
# It is safe to use a non-missing init value
# since missing will poison the result if present
Expand Down Expand Up @@ -198,11 +198,14 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
if checkempty && any(iszero, counts)
throw(ArgumentError("some groups contain only missing values"))
end
# Undo pool sharing done by groupreduce_init
if res isa CategoricalVector && res.pool === incol.pool
V = Union{CategoricalArrays.leveltype(res),
eltype(res) >: Missing ? Missing : Union{}}
res = CategoricalArray{V, 1}(res.refs, copy(res.pool))
# Reallocate Vector created in groupreduce_init with min or max
# for CategoricalVector
incolT = typeof(incol).name
if incolT.name === :CategoricalArray &&
nameof(incolT.module) === :CategoricalArrays && res isa Vector
@assert op === min || op === max
# use the fact that broadcasted identity will create CategoricalVector here
res = identity.(res)
end
if isconcretetype(eltype(res))
return res
Expand Down
32 changes: 31 additions & 1 deletion test/grouping.jl
@@ -1,6 +1,6 @@
module TestGrouping

using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays
using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI
const ≅ = isequal

"""Check if passed data frames are `isequal` and have the same element types of columns"""
Expand Down Expand Up @@ -3173,4 +3173,34 @@ end
:min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
end

# Aggregating a CategoricalVector column with minimum/maximum/first/last must
# give CategoricalVector results that keep the source's orderedness and have a
# reference pool comparing `==` to the source's without being the identical
# object; the `.pool` fields are also required to compare as `!=`.
@testset "extra CategoricalArray aggregation tests" begin
    for isord in (true, false)
        src = DataFrame(id = [1, 1, 1, 2, 2, 2], x = categorical(1:6, ordered=isord))
        grouped = groupby_checked(src, :id)
        agg = combine(grouped, :x .=> [minimum, maximum, first, last, length])

        expected = DataFrame(id=[1,2], x_minimum=[1,4], x_maximum=[3,6],
                             x_first=[1,4], x_last=[3,6], x_length=[3,3])
        @test agg == expected

        # Columns produced by the value-returning aggregations; `x_length`
        # is left out because only these four are checked for categorical
        # properties.
        catcols = (agg.x_minimum, agg.x_maximum, agg.x_first, agg.x_last)

        for col in catcols
            @test col isa CategoricalVector
        end
        for col in catcols
            @test isordered(col) == isord
        end
        # Reference pools compare equal to the source pool ...
        for col in catcols
            @test DataAPI.refpool(col) == DataAPI.refpool(src.x)
        end
        # ... but are not the very same object as the source pool.
        for col in catcols
            @test DataAPI.refpool(col) !== DataAPI.refpool(src.x)
        end
        for col in catcols
            @test col.pool != src.x.pool
        end
    end
end

end # module

0 comments on commit a2ea650

Please sign in to comment.