Commit 18de384: Merge b46ed00 into 4b58684

nalimilan committed Sep 21, 2020
2 parents 4b58684 + b46ed00
Showing 4 changed files with 234 additions and 56 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -35,7 +35,7 @@ test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]

[compat]
julia = "1"
CategoricalArrays = "0.8"
CategoricalArrays = "0.8.3"
Compat = "2.2, 3"
DataAPI = "1.2"
InvertedIndices = "1"
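# Note: in Pkg [compat] syntax, "0.8.3" is a caret specifier allowing [0.8.3, 0.9.0);
# the bump presumably picks up a CategoricalArrays release with DataAPI.refpool support.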
158 changes: 115 additions & 43 deletions src/dataframerow/utils.jl
@@ -94,10 +94,20 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
# 4) whether groups are already sorted
# Optional `groups` vector is set to the group indices of each row (starting at 1)
# With skipmissing=true, rows with missing values are assigned index 0.
row_group_slots(cols::Tuple{Vararg{AbstractVector}},
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} =
row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort)
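# Illustrative sketch of how the dispatch argument behaves (assumes PooledArrays and
# CategoricalArrays versions with DataAPI support; outputs in comments are approximate).
# Pooled columns return an AbstractVector pool, plain vectors return `nothing`, so a
# tuple containing any `nothing` cannot match the optimized NTuple{N,<:AbstractVector}
# method defined further down and the generic hash-based fallback below is used:
#
#     using DataAPI, PooledArrays, CategoricalArrays
#     DataAPI.refpool(PooledArray(["a", "b", "a"]))   # pool of unique values
#     DataAPI.refpool(categorical(["a", "b", "a"]))   # a CategoricalRefPool, an AbstractVector
#     DataAPI.refpool(["a", "b", "a"])                # nothing: no pool available
#     DataAPI.refarray(PooledArray(["a", "b", "a"]))  # integer codes indexing into the pool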

# Generic fallback method based on open addressing hash table
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
refpools::Any,
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
@assert groups === nothing || length(groups) == length(cols[1])
rhashes, missings = hashrows(cols, skipmissing)
# inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
@@ -140,70 +150,133 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
return ngroups, rhashes, gslots, false
end
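# Worked illustration (a sketch, not part of the internal API surface) of the `groups`
# contract documented above, using the exported `groupindices` accessor, which exposes
# the same per-row assignment with `missing` in place of the internal 0:
#
#     using DataFrames
#     df = DataFrame(k = ["a", "b", "a", missing], v = 1:4)
#     gd = groupby(df, :k, skipmissing=true)
#     groupindices(gd)   # e.g. [1, 2, 1, missing]: the row with a missing key gets no group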

nlevels(x::PooledArray) = length(x.pool)
nlevels(x) = length(levels(x))

function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}},
# Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
function row_group_slots(cols::NTuple{N,<:AbstractVector},
refpools::NTuple{N,<:AbstractVector},
hash::Val{false},
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
# Computing neither hashes nor groups isn't very useful,
# and this method needs to allocate a groups vector anyway
@assert groups !== nothing && all(col -> length(col) == length(groups), cols)

refpools = map(DataAPI.refpool, cols)
refs = map(DataAPI.refarray, cols)
missinginds = map(refpools) do refpool
eltype(refpool) >: Missing ?
something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1
end

# If skipmissing=true, rows with missings all go to group 0,
# which will be removed by downstream functions
ngroupstup = map(cols) do c
nlevels(c) + (!skipmissing && eltype(c) >: Missing)
ngroupstup = map(refpools, missinginds) do refpool, missingind
len = length(refpool)
if skipmissing && missingind <= lastindex(refpool)
return len - 1
else
return len
end
end
ngroups = prod(ngroupstup)
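# Worked example (illustrative): for two key columns whose pools hold 3 and 4 values,
# with skipmissing=true and missing present only in the second pool,
# ngroupstup == (3, 4 - 1) and ngroups == prod((3, 3)) == 9 candidate combinations,
# many of which may turn out to be empty.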

# Fall back to hashing if there would be too many empty combinations.
# Fall back to hashing if there would be too many empty combinations
# or if the pools do not contain only unique values
# The first check ensures the computation of ngroups did not overflow.
# The rationale for the 2 threshold is that while the fallback method is always slower,
# it allocates a hash table of size length(groups) instead of the remap vector
# of size ngroups (i.e. the number of possible combinations) in this method:
# so it makes sense to allocate more memory for better performance,
# but it needs to remain reasonable compared with the size of the data frame.
if prod(Int128.(ngroupstup)) > typemax(Int) || ngroups > 2 * length(groups)
anydups = !all(allunique, refpools)
if prod(Int128.(ngroupstup)) > typemax(Int) ||
ngroups > 2 * length(groups) ||
anydups
# In the simplest case, we can work directly with the reference codes
newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) ||
sort ||
anydups ? cols : refs
return invoke(row_group_slots,
Tuple{Tuple{Vararg{AbstractVector}}, Val,
Union{Vector{Int}, Nothing}, Bool},
cols, hash, groups, skipmissing)
Tuple{Tuple{Vararg{AbstractVector}}, Any, Val,
Union{Vector{Int}, Nothing}, Bool, Bool},
newcols, refpools, hash, groups, skipmissing, sort)
end
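# The `invoke` above forces the generic hash-based method even though the argument
# types would otherwise dispatch back to this optimized method. A minimal sketch of
# that mechanism on a toy function (names are illustrative):
#
#     f(x::Integer) = "integer method"
#     f(x::Number) = "number method"
#     f(1)                         # "integer method" (ordinary dispatch)
#     invoke(f, Tuple{Number}, 1)  # "number method" (forces the less specific method)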

seen = fill(false, ngroups)
# Compute vector mapping missing to -1 if skipmissing=true
refmaps = map(cols) do col
nlevs = nlevels(col)
refmap = collect(-1:(nlevs-1))
# First value in refmap is only used by CategoricalArray
# (corresponds to ref 0, i.e. missing values)
refmap[1] = skipmissing ? -1 : nlevs
if col isa PooledArray{>: Missing} && skipmissing
missingind = get(col.invpool, missing, 0)
if missingind > 0
refmap[missingind+1] = -1
refmap[missingind+2:end] .-= 1
end
end
refmap
end
strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N,Int}
@inbounds for i in eachindex(groups)
local refs
let i=i # Workaround for julia#15276
refs = map(c -> c.refs[i], cols)
firstinds = map(firstindex, refpools)
if sort
nminds = map(refpools, missinginds) do refpool, missingind
missingind > lastindex(refpool) ?
eachindex(refpool) : setdiff(eachindex(refpool), missingind)
end
vals = map((m, r, s) -> m[r+1] * s, refmaps, refs, strides)
j = sum(vals) + 1
# x < 0 happens with -1 in refmap, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
if skipmissing
sorted = all(issorted(view(refpool, nmind))
for (refpool, nmind) in zip(refpools, nminds))
else
seen[j] = true
sorted = all(issorted, refpools)
end
else
sorted = false
end
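# Illustrative check behind `sorted`: when every pool is already in sorted order
# (ignoring a skipped missing entry), increasing reference codes already correspond
# to increasing key values, so the refs can be used directly and the remapping below
# is skipped:
#
#     issorted(["A", "B", "X"])   # true:  no refmap needed
#     issorted(["B", "A", "X"])   # false: build a refmap from sortperm below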
if sort && !sorted
# Compute vector mapping missing to -1 if skipmissing=true
refmaps = map(cols, refpools, missinginds, nminds) do col, refpool, missingind, nmind
refmap = collect(0:length(refpool)-1)
if skipmissing
fi = firstindex(refpool)
if missingind <= lastindex(refpool)
refmap[missingind-fi+1] = -1
refmap[missingind-fi+2:end] .-= 1
end
if sort
perm = sortperm(view(refpool, nmind))
invpermute!(view(refmap, nmind .- fi .+ 1), perm)
end
elseif sort
# collect is needed for CategoricalRefPool
invpermute!(refmap, sortperm(collect(refpool)))
end
refmap
end
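# Worked example (illustrative): for a pool ["B", "A", "X"] with sort=true and no
# missing values, sortperm(["B", "A", "X"]) == [2, 1, 3], so after invpermute! the
# refmap sends ref 1 ("B") to 1, ref 2 ("A") to 0 and ref 3 ("X") to 2, i.e. each
# code is remapped to its key's zero-based sorted rank:
#
#     refmap = collect(0:2)
#     invpermute!(refmap, sortperm(["B", "A", "X"]))
#     refmap   # [1, 0, 2]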
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(c -> c[i], refs)
end
vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds)
j = sum(vals) + 1
# x < 0 happens with -1 in refmap, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
else
seen[j] = true
end
groups[i] = j
end
else
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(refs, missinginds) do ref, missingind
r = Int(ref[i])
if skipmissing
return r == missingind ? -1 : (r > missingind ? r-1 : r)
else
return r
end
end
end
vals = map((r, s, fi) -> (r-fi) * s, refs_i, strides, firstinds)
j = sum(vals) + 1
# x < 0 happens with -1, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
else
seen[j] = true
end
groups[i] = j
end
groups[i] = j
end
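# Worked example (illustrative): with pools of sizes (3, 4), strides == (4, 1), so a
# row whose adjusted zero-based refs are (2, 3) lands in group 2*4 + 3*1 + 1 == 12,
# the same arithmetic as linear indexing into an array of group slots:
#
#     ngroupstup = (3, 4)
#     strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)   # (4, 1)
#     sum((2, 3) .* strides) + 1                                            # 12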
if !all(seen) # Compress group indices to remove unused ones
oldngroups = ngroups
@@ -220,8 +293,7 @@ function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}}
# To catch potential bugs inducing unnecessary computations
@assert oldngroups != ngroups
end
sorted = all(col -> col isa CategoricalVector, cols)
return ngroups, UInt[], Int[], sorted
return ngroups, UInt[], Int[], sort
end


@@ -267,7 +339,7 @@ end
function group_rows(df::AbstractDataFrame)
groups = Vector{Int}(undef, nrow(df))
ngroups, rhashes, gslots, sorted =
row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false)
row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false, false)
rperm, starts, stops = compute_indices(groups, ngroups)
return RowGroupDict(df, rhashes, gslots, groups, rperm, starts, stops)
end
3 changes: 2 additions & 1 deletion src/groupeddataframe/splitapplycombine.jl
@@ -160,7 +160,8 @@ function groupby(df::AbstractDataFrame, cols;

groups = Vector{Int}(undef, nrow(df))
ngroups, rhashes, gslots, sorted =
row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), groups, skipmissing)
row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
groups, skipmissing, sort)

gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing,
Threads.ReentrantLock())
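A user-level sketch of the code path this change enables (assumes PooledArrays is available): with sort=true the optimized method now handles ordering through the reference pools, so groups come back ordered by key value even when the pool itself is unsorted.

using DataFrames, PooledArrays

df = DataFrame(k = PooledArray(["b", "a", "b", "c"]), v = 1:4)
gd = groupby(df, :k, sort=true)
[key.k for key in keys(gd)]   # expected ["a", "b", "c"]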
127 changes: 116 additions & 11 deletions test/grouping.jl
@@ -594,6 +594,122 @@ end
end
end

@testset "grouping arrays that allow missing without missings" begin
xv = ["A", "B", "B", "B", "A", "B", "A", "A"]
yv = ["B", "A", "A", "B", "A", "B", "A", "A"]
xvars = (xv,
categorical(xv),
levels!(categorical(xv), ["A", "B", "X"]),
levels!(categorical(xv), ["X", "B", "A"]),
_levels!(PooledArray(xv), ["A", "B"]),
_levels!(PooledArray(xv), ["B", "A", "X"]),
_levels!(PooledArray(xv), ["X", "A", "B"]))
yvars = (yv,
categorical(yv),
levels!(categorical(yv), ["A", "B", "X"]),
levels!(categorical(yv), ["B", "X", "A"]),
_levels!(PooledArray(yv), ["A", "B"]),
_levels!(PooledArray(yv), ["A", "B", "X"]),
_levels!(PooledArray(yv), ["B", "A", "X"]))
for x in xvars, y in yvars,
fx in (identity, allowmissing),
fy in (identity, allowmissing)
df = DataFrame(Key1 = fx(x), Key2 = fy(y), Value = 1:8)

@testset "sort=false, skipmissing=false" begin
gd = groupby_checked(df, :Key1)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]),
])

gd = groupby_checked(df, [:Key1, :Key2])
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
end

@testset "sort=false, skipmissing=true" begin
gd = groupby_checked(df, :Key1, skipmissing=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6])
])

gd = groupby_checked(df, [:Key1, :Key2], skipmissing=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
end

@testset "sort=true, skipmissing=false" begin
gd = groupby_checked(df, :Key1, sort=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]),
])
@test issorted(vcat(gd...), :Key1)

gd = groupby_checked(df, [:Key1, :Key2], sort=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6]),
])
@test issorted(vcat(gd...), [:Key1, :Key2])
end

@testset "sort=true, skipmissing=true" begin
gd = groupby_checked(df, :Key1, sort=true, skipmissing=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6])
])
@test issorted(vcat(gd...), :Key1)

gd = groupby_checked(df, [:Key1, :Key2], sort=true, skipmissing=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
@test issorted(vcat(gd...), [:Key1, :Key2])
end
end
end
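# Companion sketch (illustrative, assumes PooledArrays): the user-visible effect of
# skipmissing when the pooled key actually contains missing values:
#
#     df = DataFrame(Key = PooledArray(["A", missing, "B"]), Value = 1:3)
#     length(groupby(df, :Key))                     # 3: missing forms its own group
#     length(groupby(df, :Key, skipmissing=true))   # 2: the missing row is dropped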

@testset "grouping refarray with fallback" begin
# The high number of categories compared to the number of rows triggers the use
# of the fallback grouping method
for x in ([3, 1, 2], [3, 1, missing])
df = DataFrame(x=categorical(x, levels=10000:-1:1),
x2=categorical(x, levels=3:-1:1),
y=[1, 2, 3])
for skipmissing in (true, false)
@test groupby(df, :x, sort=true, skipmissing=skipmissing) ≅
      groupby(df, :x, sort=true, skipmissing=skipmissing)
@test isequal_unordered(groupby(df, :x, skipmissing=skipmissing),
collect(AbstractDataFrame, groupby(df, :x, skipmissing=skipmissing)))
end
end
end
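# Illustrative arithmetic for why the fallback triggers here: the :x pool has 10_000
# candidate levels while the frame has only 3 rows, so ngroups is roughly 10_000,
# which exceeds 2 * length(groups) == 6, and row_group_slots falls back to hashing.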

@testset "grouping with three keys" begin
# We need many rows so that optimized CategoricalArray method is used
xv = rand(["A", "B", missing], 100)
@@ -632,17 +748,6 @@ end
dfs = [groupby_checked(dfb, [:Key1, :Key2, :Key3], sort=true, skipmissing=true)...]
@test isequal_unordered(gd, dfs)
@test issorted(vcat(gd...), [:Key1, :Key2, :Key3])

# This is an implementation detail but it allows checking
# that the optimized method is used
if df.Key1 isa CategoricalVector &&
df.Key2 isa CategoricalVector &&
df.Key3 isa CategoricalVector
@test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true) ≅
      groupby_checked(df, [:Key1, :Key2, :Key3], sort=false)
@test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true, skipmissing=true) ≅
      groupby_checked(df, [:Key1, :Key2, :Key3], sort=false, skipmissing=true)
end
end
end
