avoid CategoricalArrays dependency in aggregates

JuliaData · Nov 7, 2020 · a2ea650 · a2ea650
1 parent 55533d1
commit a2ea650
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 10 deletions.
diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
@@ -122,10 +122,10 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
             # !ismissing check is purely an optimization to avoid a copy later
             outcol = similar(incol, condf === !ismissing ? S : T, length(gd))
             # Comparison is possible only between CatValues from the same pool
-            if incol isa CategoricalVector
-                U = Union{CategoricalArrays.leveltype(outcol),
-                          eltype(outcol) >: Missing ? Missing : Union{}}
-                outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool)
+            incolT = typeof(incol).name
+            if incolT.name === :CategoricalArray &&
+                nameof(incolT.module) === :CategoricalArrays
+                outcol = Vector{eltype(incol)}(undef, length(gd))
             end
             # It is safe to use a non-missing init value
             # since missing will poison the result if present
@@ -198,11 +198,14 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
     if checkempty && any(iszero, counts)
         throw(ArgumentError("some groups contain only missing values"))
     end
-    # Undo pool sharing done by groupreduce_init
-    if res isa CategoricalVector && res.pool === incol.pool
-        V = Union{CategoricalArrays.leveltype(res),
-                  eltype(res) >: Missing ? Missing : Union{}}
-        res = CategoricalArray{V, 1}(res.refs, copy(res.pool))
+    # Reallocate the Vector that groupreduce_init created for min or max
+    # on a CategoricalVector
+    incolT = typeof(incol).name
+    if incolT.name === :CategoricalArray &&
+        nameof(incolT.module) === :CategoricalArrays && res isa Vector
+        @assert op === min || op === max
+        # rely on the fact that broadcasting identity creates a CategoricalVector here
+        res = identity.(res)
     end
     if isconcretetype(eltype(res))
         return res

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -1,6 +1,6 @@
 module TestGrouping
 
-using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays
+using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI
 const ≅ = isequal
 
 """Check if passed data frames are `isequal` and have the same element types of columns"""
@@ -3173,4 +3173,34 @@ end
                     :min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
 end
 
+@testset "extra CategoricalArray aggregation tests" begin
+    for ord in (true, false)  # cover both ordered and unordered categorical input
+        df = DataFrame(id = [1, 1, 1, 2, 2, 2], x = categorical(1:6, ordered=ord))
+        gdf = groupby_checked(df, :id)
+        res = combine(gdf, :x .=> [minimum, maximum, first, last, length])
+        @test res == DataFrame(id=[1,2], x_minimum=[1,4], x_maximum=[3,6],
+                               x_first=[1,4], x_last=[3,6], x_length=[3,3])
+        @test res.x_minimum isa CategoricalVector  # results must stay categorical
+        @test res.x_maximum isa CategoricalVector
+        @test res.x_first isa CategoricalVector
+        @test res.x_last isa CategoricalVector
+        @test isordered(res.x_minimum) == ord  # orderedness must match the input column
+        @test isordered(res.x_maximum) == ord
+        @test isordered(res.x_first) == ord
+        @test isordered(res.x_last) == ord
+        @test DataAPI.refpool(res.x_minimum) == DataAPI.refpool(df.x)  # equal by value
+        @test DataAPI.refpool(res.x_maximum) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_first) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_last) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_minimum) !== DataAPI.refpool(df.x)  # not identical
+        @test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x)
+        @test res.x_minimum.pool != df.x.pool  # result pool compares unequal to input pool
+        @test res.x_maximum.pool != df.x.pool
+        @test res.x_first.pool != df.x.pool
+        @test res.x_last.pool != df.x.pool
+    end
+end
+
 end # module