Skip to content

Commit

Permalink
Add skipnull argument to groupby()
Browse files Browse the repository at this point in the history
The new argument allows skipping rows which contain a null in at least one
of the grouping columns. Doing this in groupby() is more efficient
and more convenient than dropping those rows before calling groupby().
  • Loading branch information
nalimilan committed Oct 18, 2017
1 parent 8cf2be9 commit 2050f44
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 40 deletions.
101 changes: 69 additions & 32 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,26 @@ end

# "kernel" functions for hashrows()
# adjust row hashes by the hashes of column elements
# Fold the hash of each element of column `v` into the per-row hash
# accumulator `h`, and record null positions in `n`.
# `n` is either empty (nulls not tracked) or holds one flag per row;
# `n[i]` is set to `true` when `v[i]` is null.
# Returns `h` for chaining over columns.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractVector{T}) where T
    ln = length(n)
    @inbounds for i in eachindex(h)
        el = v[i]
        h[i] = hash(el, h[i])
        # Only check for nulls when the eltype can hold them and
        # null tracking is enabled (n is non-empty)
        if T >: Null && ln > 0
            # el isa Null should be redundant
            # but it gives much more efficient code on Julia 0.6
            n[i] |= (el isa Null || isnull(el))
        end
    end
    h
end

# should give the same hash as AbstractVector{T}
function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{T}) where T
function hashrows_col!(h::Vector{UInt},
n::Vector{Bool},
v::AbstractCategoricalVector{T}) where T
# TODO is it possible to optimize by hashing the pool values once?
@inbounds for (i, ref) in enumerate(v.refs)
h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
Expand All @@ -40,23 +51,30 @@ end

# should give the same hash as AbstractVector{T}
# enables efficient sequential memory access pattern
# Fold element hashes of a nullable categorical column into `h` and
# record null positions in `n` (empty when null tracking is disabled).
# Should give the same hash as the AbstractVector{T} method;
# iterating `v.refs` enables an efficient sequential memory access pattern.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractCategoricalVector{>: Null})
    ln = length(n)
    # TODO is it possible to optimize by hashing the pool values once?
    @inbounds for (i, ref) in enumerate(v.refs)
        if ref == 0
            # ref == 0 marks a null entry in a categorical vector
            h[i] = hash(null, h[i])
            ln > 0 && (n[i] = true)
        else
            h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
        end
    end
    h
end

# Calculate the vector of `df` rows hash values.
function hashrows(df::AbstractDataFrame)
res = zeros(UInt, nrow(df))
function hashrows(df::AbstractDataFrame, skipnull::Bool)
rhashes = zeros(UInt, nrow(df))
nulls = fill(false, skipnull ? nrow(df) : 0)
for col in columns(df)
hashrows_col!(res, col)
hashrows_col!(rhashes, nulls, col)
end
return res
return (rhashes, nulls)
end

# Helper function for RowGroupDict.
Expand All @@ -66,39 +84,50 @@ end
# 3) slot array for a hash map, non-zero values are
# the indices of the first row in a group
# Optional group vector is set to the group indices of each row
# Convenience wrapper: hash the rows of `df`, then dispatch to the
# column-tuple method. When `skipnull` is true, rows containing nulls
# in any column are grouped together (and later dropped by group_rows).
function row_group_slots(df::AbstractDataFrame,
                         groups::Union{Vector{Int}, Void} = nothing,
                         skipnull::Bool = false)
    rhashes, nulls = hashrows(df, skipnull)
    row_group_slots(ntuple(i -> df[i], ncol(df)), rhashes, nulls, groups, skipnull)
end

function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
rhashes::AbstractVector{UInt},
groups::Union{Vector{Int}, Void} = nothing)
nulls::AbstractVector{Bool},
groups::Union{Vector{Int}, Void} = nothing,
skipnull::Bool = false)
@assert groups === nothing || length(groups) == length(cols[1])
# inspired by Dict code from base cf. https://github.com/JuliaData/DataFrames.jl/pull/17#discussion_r102481481
sz = Base._tablesz(length(rhashes))
@assert sz >= length(rhashes)
szm1 = sz-1
gslots = zeros(Int, sz)
ngroups = 0
@inbounds for i in eachindex(rhashes)
# If nulls are to be skipped, they will all go to group 1
ngroups = skipnull ? 1 : 0
@inbounds for i in eachindex(rhashes, nulls)
# find the slot and group index for a row
slotix = rhashes[i] & szm1 + 1
gix = 0
# Use 0 for non-null values to catch bugs if group is not found
gix = skipnull && nulls[i] ? 1 : 0
probe = 0
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
        # Skip rows containing at least one null (assigning them to group 0)
if !skipnull || !nulls[i]
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
end
break
end
break
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
if groups !== nothing
groups[i] = gix
Expand All @@ -109,9 +138,9 @@ end

# Builds RowGroupDict for a given DataFrame.
# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
function group_rows(df::AbstractDataFrame)
function group_rows(df::AbstractDataFrame, skipnull::Bool = false)
groups = Vector{Int}(nrow(df))
ngroups, rhashes, gslots = row_group_slots(df, groups)
ngroups, rhashes, gslots = row_group_slots(df, groups, skipnull)

# count elements in each group
stops = zeros(Int, ngroups)
Expand All @@ -136,6 +165,14 @@ function group_rows(df::AbstractDataFrame)
stops[gix] += 1
end
stops .-= 1

# drop group 1 which contains rows with nulls in grouping columns
if skipnull
splice!(starts, 1)
splice!(stops, 1)
ngroups -= 1
end

return RowGroupDict(df, ngroups, rhashes, gslots, groups, rperm, starts, stops)
end

Expand Down
21 changes: 14 additions & 7 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ end
A view of an AbstractDataFrame split into row groups
```julia
groupby(d::AbstractDataFrame, cols)
groupby(cols)
groupby(d::AbstractDataFrame, cols; sort = false, skipnull = false)
groupby(cols; sort = false, skipnull = false)
```
### Arguments
* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort rows according to the values of the grouping columns `cols`
* `skipnull`: whether to skip rows with `null` values in one of the grouping columns `cols`
### Returns
Expand Down Expand Up @@ -79,9 +81,10 @@ df |> groupby([:a, :b]) |> [sum, length]
```
"""
function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) where T
function groupby(df::AbstractDataFrame, cols::Vector{T};
sort::Bool = false, skipnull::Bool = false) where T
sdf = df[cols]
df_groups = group_rows(sdf)
df_groups = group_rows(sdf, skipnull)
# sort the groups
if sort
group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts]))
Expand All @@ -91,11 +94,15 @@ function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) whe
GroupedDataFrame(df, cols, df_groups.rperm,
df_groups.starts, df_groups.stops)
end
# Single-column convenience method: wrap `cols` in a vector and forward.
groupby(d::AbstractDataFrame, cols;
        sort::Bool = false, skipnull::Bool = false) =
    groupby(d, [cols], sort = sort, skipnull = skipnull)

# add a function curry: `groupby(cols)` returns a closure usable as
# `df |> groupby(cols)`, forwarding the keyword arguments unchanged
groupby(cols::Vector{T}; sort::Bool = false, skipnull::Bool = false) where {T} =
    x -> groupby(x, cols, sort = sort, skipnull = skipnull)
groupby(cols; sort::Bool = false, skipnull::Bool = false) =
    x -> groupby(x, cols, sort = sort, skipnull = skipnull)

Base.start(gd::GroupedDataFrame) = 1
Base.next(gd::GroupedDataFrame, state::Int) =
Expand Down
2 changes: 1 addition & 1 deletion test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ module TestDataFrameRow


# check that hashrows() function generates the same hashes as DataFrameRow
df_rowhashes = DataFrames.hashrows(df)
df_rowhashes, _ = DataFrames.hashrows(df, false)
@test df_rowhashes == [hash(dr) for dr in eachrow(df)]

# test incompatible frames
Expand Down
65 changes: 65 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module TestGrouping
using Base.Test, DataFrames
const ≅ = isequal

srand(1)
df = DataFrame(a = repeat(Union{Int, Null}[1, 2, 3, 4], outer=[2]),
Expand Down Expand Up @@ -181,4 +182,68 @@ module TestGrouping
@test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
@test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

@testset "grouping with nulls" begin
    df = DataFrame(Key1 = ["A", null, "B", "B", "A"],
                   Key2 = CategoricalArray(["B", "A", "A", null, "A"]),
                   Value = 1:5)

    @testset "sort=false, skipnull=false" begin
        gd = groupby(df, :Key1)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2])
        @test length(gd) == 5
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=false, skipnull=true" begin
        gd = groupby(df, :Key1, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[3] == DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=true, skipnull=false" begin
        gd = groupby(df, :Key1, sort=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)
        @test gd[3] ≅ DataFrame(Key1=null, Key2="A", Value=2)

        gd = groupby(df, [:Key1, :Key2], sort=true)
        @test length(gd) == 5
        @test gd[1] ≅ DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1=null, Key2="A", Value=2)
    end

    # NOTE(review): label fixed — this testset passes sort=true, skipnull=true
    @testset "sort=true, skipnull=true" begin
        gd = groupby(df, :Key1, sort=true, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], sort=true, skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
    end
end
end

0 comments on commit 2050f44

Please sign in to comment.