Skip to content

Commit

Permalink
Merge 2050f44 into 86c6145
Browse files Browse the repository at this point in the history
  • Loading branch information
nalimilan committed Oct 18, 2017
2 parents 86c6145 + 2050f44 commit 6e96eee
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 40 deletions.
101 changes: 69 additions & 32 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,26 @@ end

# "kernel" functions for hashrows()
# adjust row hashes by the hashes of column elements
# Mix the hash of every element of column `v` into the per-row hash vector `h`.
# When `n` is non-empty and the element type can hold nulls, also record in
# `n[i]` whether row `i` holds a null in this column. Returns `h`.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractVector{T}) where T
    tracknulls = length(n) > 0
    @inbounds for i in eachindex(h)
        x = v[i]
        h[i] = hash(x, h[i])
        # `T >: Null` depends only on the type parameter, so the branch can be
        # compiled away entirely for columns which cannot contain nulls
        if T >: Null && tracknulls
            # `x isa Null` should be redundant with `isnull(x)`,
            # but it gives much more efficient code on Julia 0.6
            n[i] |= (x isa Null || isnull(x))
        end
    end
    return h
end

# should give the same hash as AbstractVector{T}
function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{T}) where T
function hashrows_col!(h::Vector{UInt},
n::Vector{Bool},
v::AbstractCategoricalVector{T}) where T
# TODO is it possible to optimize by hashing the pool values once?
@inbounds for (i, ref) in enumerate(v.refs)
h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
Expand All @@ -40,23 +51,30 @@ end

# should give the same hash as AbstractVector{T}
# enables efficient sequential memory access pattern
# Null-aware categorical variant: a ref code of 0 marks a null entry, so those
# rows get `hash(null)` and are flagged in `n` when null tracking is enabled.
# Must produce the same hashes as the generic AbstractVector{T} method.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractCategoricalVector{>: Null})
    marknulls = !isempty(n)
    # TODO is it possible to optimize by hashing the pool values once?
    levels = CategoricalArrays.index(v.pool)
    @inbounds for (i, ref) in enumerate(v.refs)
        if ref > 0
            h[i] = hash(levels[ref], h[i])
        else
            h[i] = hash(null, h[i])
            marknulls && (n[i] = true)
        end
    end
    return h
end

# Calculate the vector of `df` rows hash values.
# Compute one hash value per row of `df` by successively folding in the hash
# of each column's elements. Returns `(rhashes, nulls)`, where `nulls[i]`
# flags rows containing at least one null in this set of columns; `nulls` is
# left empty unless `skipnull` is true, since the flags are only needed then.
function hashrows(df::AbstractDataFrame, skipnull::Bool)
    len = nrow(df)
    rhashes = zeros(UInt, len)
    nulls = zeros(Bool, skipnull ? len : 0)
    foreach(col -> hashrows_col!(rhashes, nulls, col), columns(df))
    return (rhashes, nulls)
end

# Helper function for RowGroupDict.
Expand All @@ -66,39 +84,50 @@ end
# 3) slot array for a hash map, non-zero values are
# the indices of the first row in a group
# Optional group vector is set to the group indices of each row
# DataFrame entry point: hash all rows first (null flags are only tracked when
# `skipnull` is true), then delegate to the tuple-of-columns method which
# performs the actual slot assignment.
function row_group_slots(df::AbstractDataFrame,
                         groups::Union{Vector{Int}, Void} = nothing,
                         skipnull::Bool = false)
    hashes, nullflags = hashrows(df, skipnull)
    cols = ntuple(i -> df[i], ncol(df))
    return row_group_slots(cols, hashes, nullflags, groups, skipnull)
end

function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
rhashes::AbstractVector{UInt},
groups::Union{Vector{Int}, Void} = nothing)
nulls::AbstractVector{Bool},
groups::Union{Vector{Int}, Void} = nothing,
skipnull::Bool = false)
@assert groups === nothing || length(groups) == length(cols[1])
# inspired by Dict code from base cf. https://github.com/JuliaData/DataFrames.jl/pull/17#discussion_r102481481
sz = Base._tablesz(length(rhashes))
@assert sz >= length(rhashes)
szm1 = sz-1
gslots = zeros(Int, sz)
ngroups = 0
@inbounds for i in eachindex(rhashes)
# If nulls are to be skipped, they will all go to group 1
ngroups = skipnull ? 1 : 0
@inbounds for i in eachindex(rhashes, nulls)
# find the slot and group index for a row
slotix = rhashes[i] & szm1 + 1
gix = 0
# Use 0 for non-null values to catch bugs if group is not found
gix = skipnull && nulls[i] ? 1 : 0
probe = 0
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
# Skip rows containing at least one null (assigning them to group 0)
if !skipnull || !nulls[i]
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
end
break
end
break
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
if groups !== nothing
groups[i] = gix
Expand All @@ -109,9 +138,9 @@ end

# Builds RowGroupDict for a given DataFrame.
# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
function group_rows(df::AbstractDataFrame)
function group_rows(df::AbstractDataFrame, skipnull::Bool = false)
groups = Vector{Int}(nrow(df))
ngroups, rhashes, gslots = row_group_slots(df, groups)
ngroups, rhashes, gslots = row_group_slots(df, groups, skipnull)

# count elements in each group
stops = zeros(Int, ngroups)
Expand All @@ -136,6 +165,14 @@ function group_rows(df::AbstractDataFrame)
stops[gix] += 1
end
stops .-= 1

# drop group 1 which contains rows with nulls in grouping columns
if skipnull
splice!(starts, 1)
splice!(stops, 1)
ngroups -= 1
end

return RowGroupDict(df, ngroups, rhashes, gslots, groups, rperm, starts, stops)
end

Expand Down
21 changes: 14 additions & 7 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ end
A view of an AbstractDataFrame split into row groups
```julia
groupby(d::AbstractDataFrame, cols)
groupby(cols)
groupby(d::AbstractDataFrame, cols; sort = false, skipnull = false)
groupby(cols; sort = false, skipnull = false)
```
### Arguments
* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort rows according to the values of the grouping columns `cols`
* `skipnull`: whether to skip rows with `null` values in one of the grouping columns `cols`
### Returns
Expand Down Expand Up @@ -79,9 +81,10 @@ df |> groupby([:a, :b]) |> [sum, length]
```
"""
function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) where T
function groupby(df::AbstractDataFrame, cols::Vector{T};
sort::Bool = false, skipnull::Bool = false) where T
sdf = df[cols]
df_groups = group_rows(sdf)
df_groups = group_rows(sdf, skipnull)
# sort the groups
if sort
group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts]))
Expand All @@ -91,11 +94,15 @@ function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) whe
GroupedDataFrame(df, cols, df_groups.rperm,
df_groups.starts, df_groups.stops)
end
# Single-column convenience method: wrap the column specifier in a vector and
# forward to the main vector-of-columns method, passing the keywords through.
function groupby(d::AbstractDataFrame, cols;
                 sort::Bool = false, skipnull::Bool = false)
    return groupby(d, [cols], sort = sort, skipnull = skipnull)
end

# Curried forms: return a closure that groups its data frame argument by
# `cols`, enabling pipelines such as `df |> groupby(:a)`.
function groupby(cols::Vector{T}; sort::Bool = false, skipnull::Bool = false) where T
    return df -> groupby(df, cols, sort = sort, skipnull = skipnull)
end
function groupby(cols; sort::Bool = false, skipnull::Bool = false)
    return df -> groupby(df, cols, sort = sort, skipnull = skipnull)
end

Base.start(gd::GroupedDataFrame) = 1
Base.next(gd::GroupedDataFrame, state::Int) =
Expand Down
2 changes: 1 addition & 1 deletion test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ module TestDataFrameRow


# check that hashrows() function generates the same hashes as DataFrameRow
df_rowhashes = DataFrames.hashrows(df)
df_rowhashes, _ = DataFrames.hashrows(df, false)
@test df_rowhashes == [hash(dr) for dr in eachrow(df)]

# test incompatible frames
Expand Down
65 changes: 65 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module TestGrouping
using Base.Test, DataFrames
# `≅` is shorthand for `isequal`, which unlike `==` considers two `null`
# values equal — required by the grouping tests below that compare data
# frames containing nulls. (The operator name had been lost in transit,
# leaving invalid syntax `const = isequal`.)
const ≅ = isequal

srand(1)
df = DataFrame(a = repeat(Union{Int, Null}[1, 2, 3, 4], outer=[2]),
Expand Down Expand Up @@ -181,4 +182,68 @@ module TestGrouping
@test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
@test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

# Tests for null handling in groupby. Fixes two defects in this block:
# the `≅` (isequal) infix operator was missing from every comparison against
# frames containing nulls (invalid syntax), and the last testset label said
# "sort=false" while the calls use sort=true.
@testset "grouping with nulls" begin
    df = DataFrame(Key1 = ["A", null, "B", "B", "A"],
                   Key2 = CategoricalArray(["B", "A", "A", null, "A"]),
                   Value = 1:5)

    @testset "sort=false, skipnull=false" begin
        gd = groupby(df, :Key1)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        # `≅` (isequal) is needed since `==` propagates nulls
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2])
        @test length(gd) == 5
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=false, skipnull=true" begin
        gd = groupby(df, :Key1, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[3] == DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=true, skipnull=false" begin
        gd = groupby(df, :Key1, sort=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)
        @test gd[3] ≅ DataFrame(Key1=null, Key2="A", Value=2)

        gd = groupby(df, [:Key1, :Key2], sort=true)
        @test length(gd) == 5
        @test gd[1] ≅ DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1=null, Key2="A", Value=2)
    end

    # NOTE(review): label previously said "sort=false" but the calls below
    # pass sort=true — fixed to match the actual options under test.
    @testset "sort=true, skipnull=true" begin
        gd = groupby(df, :Key1, sort=true, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], sort=true, skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
    end
end
end

0 comments on commit 6e96eee

Please sign in to comment.