diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index c77e4da20b..fc6b758842 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -22,15 +22,26 @@ end # "kernel" functions for hashrows() # adjust row hashes by the hashes of column elements -function hashrows_col!(h::Vector{UInt}, v::AbstractVector) +function hashrows_col!(h::Vector{UInt}, + n::Vector{Bool}, + v::AbstractVector{T}) where T + ln = length(n) @inbounds for i in eachindex(h) - h[i] = hash(v[i], h[i]) + el = v[i] + h[i] = hash(el, h[i]) + if T >: Null && ln > 0 + # el isa Null should be redundant + # but it gives much more efficient code on Julia 0.6 + n[i] |= (el isa Null || isnull(el)) + end end h end # should give the same hash as AbstractVector{T} -function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{T}) where T +function hashrows_col!(h::Vector{UInt}, + n::Vector{Bool}, + v::AbstractCategoricalVector{T}) where T # TODO is it possible to optimize by hashing the pool values once? @inbounds for (i, ref) in enumerate(v.refs) h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i]) @@ -40,23 +51,30 @@ end # should give the same hash as AbstractVector{T} # enables efficient sequential memory access pattern -function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{>: Null}) +function hashrows_col!(h::Vector{UInt}, + n::Vector{Bool}, + v::AbstractCategoricalVector{>: Null}) + ln = length(n) # TODO is it possible to optimize by hashing the pool values once? @inbounds for (i, ref) in enumerate(v.refs) - h[i] = ref == 0 ? - hash(null, h[i]) : - hash(CategoricalArrays.index(v.pool)[ref], h[i]) + if ref == 0 + h[i] = hash(null, h[i]) + ln > 0 && (n[i] = true) + else + h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i]) + end end h end # Calculate the vector of `df` rows hash values. 
-function hashrows(df::AbstractDataFrame) - res = zeros(UInt, nrow(df)) +function hashrows(df::AbstractDataFrame, skipnull::Bool) + rhashes = zeros(UInt, nrow(df)) + nulls = fill(false, skipnull ? nrow(df) : 0) for col in columns(df) - hashrows_col!(res, col) + hashrows_col!(rhashes, nulls, col) end - return res + return (rhashes, nulls) end # Helper function for RowGroupDict. @@ -66,39 +84,50 @@ end # 3) slot array for a hash map, non-zero values are # the indices of the first row in a group # Optional group vector is set to the group indices of each row -row_group_slots(df::AbstractDataFrame, groups::Union{Vector{Int}, Void} = nothing) = - row_group_slots(ntuple(i -> df[i], ncol(df)), hashrows(df), groups) +function row_group_slots(df::AbstractDataFrame, + groups::Union{Vector{Int}, Void} = nothing, + skipnull::Bool = false) + rhashes, nulls = hashrows(df, skipnull) + row_group_slots(ntuple(i -> df[i], ncol(df)), rhashes, nulls, groups, skipnull) +end function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, rhashes::AbstractVector{UInt}, - groups::Union{Vector{Int}, Void} = nothing) + nulls::AbstractVector{Bool}, + groups::Union{Vector{Int}, Void} = nothing, + skipnull::Bool = false) @assert groups === nothing || length(groups) == length(cols[1]) # inspired by Dict code from base cf. https://github.com/JuliaData/DataFrames.jl/pull/17#discussion_r102481481 sz = Base._tablesz(length(rhashes)) @assert sz >= length(rhashes) szm1 = sz-1 gslots = zeros(Int, sz) - ngroups = 0 - @inbounds for i in eachindex(rhashes) + # If nulls are to be skipped, they will all go to group 1 + ngroups = skipnull ? 1 : 0 + @inbounds for i in eachindex(rhashes, nulls) # find the slot and group index for a row slotix = rhashes[i] & szm1 + 1 - gix = 0 + # Use 0 for non-null values to catch bugs if group is not found + gix = skipnull && nulls[i] ? 
1 : 0 probe = 0 - while true - g_row = gslots[slotix] - if g_row == 0 # unoccupied slot, current row starts a new group - gslots[slotix] = i - gix = ngroups += 1 - break - elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit - if isequal_row(cols, i, g_row) # hit - gix = groups !== nothing ? groups[g_row] : 0 + # Skip rows containing at least one null (assigning them to group 0) + if !skipnull || !nulls[i] + while true + g_row = gslots[slotix] + if g_row == 0 # unoccupied slot, current row starts a new group + gslots[slotix] = i + gix = ngroups += 1 + break + elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit + if isequal_row(cols, i, g_row) # hit + gix = groups !== nothing ? groups[g_row] : 0 + end + break end - break + slotix = slotix & szm1 + 1 # check the next slot + probe += 1 + @assert probe < sz end - slotix = slotix & szm1 + 1 # check the next slot - probe += 1 - @assert probe < sz end if groups !== nothing groups[i] = gix @@ -109,9 +138,9 @@ end # Builds RowGroupDict for a given DataFrame. # Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). 
-function group_rows(df::AbstractDataFrame) +function group_rows(df::AbstractDataFrame, skipnull::Bool = false) groups = Vector{Int}(nrow(df)) - ngroups, rhashes, gslots = row_group_slots(df, groups) + ngroups, rhashes, gslots = row_group_slots(df, groups, skipnull) # count elements in each group stops = zeros(Int, ngroups) @@ -136,6 +165,14 @@ function group_rows(df::AbstractDataFrame) stops[gix] += 1 end stops .-= 1 + + # drop group 1 which contains rows with nulls in grouping columns + if skipnull + splice!(starts, 1) + splice!(stops, 1) + ngroups -= 1 + end + return RowGroupDict(df, ngroups, rhashes, gslots, groups, rperm, starts, stops) end diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index dd509683b4..32952f813d 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -30,14 +30,16 @@ end A view of an AbstractDataFrame split into row groups ```julia -groupby(d::AbstractDataFrame, cols) -groupby(cols) +groupby(d::AbstractDataFrame, cols; sort = false, skipnull = false) +groupby(cols; sort = false, skipnull = false) ``` ### Arguments * `d` : an AbstractDataFrame to split (optional, see [Returns](#returns)) * `cols` : data table columns to group by +* `sort`: whether to sort rows according to the values of the grouping columns `cols` +* `skipnull`: whether to skip rows with `null` values in one of the grouping columns `cols` ### Returns @@ -79,9 +81,10 @@ df |> groupby([:a, :b]) |> [sum, length] ``` """ -function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) where T +function groupby(df::AbstractDataFrame, cols::Vector{T}; + sort::Bool = false, skipnull::Bool = false) where T sdf = df[cols] - df_groups = group_rows(sdf) + df_groups = group_rows(sdf, skipnull) # sort the groups if sort group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts])) @@ -91,11 +94,15 @@ function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) whe GroupedDataFrame(df, 
cols, df_groups.rperm, df_groups.starts, df_groups.stops) end -groupby(d::AbstractDataFrame, cols; sort::Bool = false) = groupby(d, [cols], sort = sort) +groupby(d::AbstractDataFrame, cols; + sort::Bool = false, skipnull::Bool = false) = + groupby(d, [cols], sort = sort, skipnull = skipnull) # add a function curry -groupby(cols::Vector{T}; sort::Bool = false) where {T} = x -> groupby(x, cols, sort = sort) -groupby(cols; sort::Bool = false) = x -> groupby(x, cols, sort = sort) +groupby(cols::Vector{T}; sort::Bool = false, skipnull::Bool = false) where {T} = + x -> groupby(x, cols, sort = sort, skipnull = skipnull) +groupby(cols; sort::Bool = false, skipnull::Bool = false) = + x -> groupby(x, cols, sort = sort, skipnull = skipnull) Base.start(gd::GroupedDataFrame) = 1 Base.next(gd::GroupedDataFrame, state::Int) = diff --git a/test/dataframerow.jl b/test/dataframerow.jl index b9b076c9fd..f073b9bfe6 100644 --- a/test/dataframerow.jl +++ b/test/dataframerow.jl @@ -44,7 +44,7 @@ module TestDataFrameRow # check that hashrows() function generates the same hashes as DataFrameRow - df_rowhashes = DataFrames.hashrows(df) + df_rowhashes, _ = DataFrames.hashrows(df, false) @test df_rowhashes == [hash(dr) for dr in eachrow(df)] # test incompatible frames diff --git a/test/grouping.jl b/test/grouping.jl index 194c0a5648..8b8f89e3ff 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1,5 +1,6 @@ module TestGrouping using Base.Test, DataFrames + const ≅ = isequal srand(1) df = DataFrame(a = repeat(Union{Int, Null}[1, 2, 3, 4], outer=[2]), @@ -181,4 +182,68 @@ module TestGrouping @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2) @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4) + + @testset "grouping with nulls" begin + df = DataFrame(Key1 = ["A", null, "B", "B", "A"], + Key2 = CategoricalArray(["B", "A", "A", null, "A"]), + Value = 1:5) + + @testset "sort=false, skipnull=false" begin + gd = groupby(df, :Key1) + 
@test length(gd) == 3 + @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5]) + @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2) + @test gd[3] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4) + + gd = groupby(df, [:Key1, :Key2]) + @test length(gd) == 5 + @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1) + @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2) + @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) + @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4) + @test gd[5] ≅ DataFrame(Key1="A", Key2="A", Value=5) + end + + @testset "sort=false, skipnull=true" begin + gd = groupby(df, :Key1, skipnull=true) + @test length(gd) == 2 + @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5]) + @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4) + + gd = groupby(df, [:Key1, :Key2], skipnull=true) + @test length(gd) == 3 + @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1) + @test gd[2] == DataFrame(Key1="B", Key2="A", Value=3) + @test gd[3] == DataFrame(Key1="A", Key2="A", Value=5) + end + + @testset "sort=true, skipnull=false" begin + gd = groupby(df, :Key1, sort=true) + @test length(gd) == 3 + @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5]) + @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4) + @test gd[3] ≅ DataFrame(Key1=null, Key2="A", Value=2) + + gd = groupby(df, [:Key1, :Key2], sort=true) + @test length(gd) == 5 + @test gd[1] ≅ DataFrame(Key1="A", Key2="A", Value=5) + @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1) + @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) + @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4) + @test gd[5] ≅ DataFrame(Key1=null, Key2="A", Value=2) + end + + @testset "sort=true, skipnull=true" begin + gd = groupby(df, :Key1, sort=true, skipnull=true) + @test length(gd) == 2 + @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5]) + @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", 
null], Value=3:4) + + gd = groupby(df, [:Key1, :Key2], sort=true, skipnull=true) + @test length(gd) == 3 + @test gd[1] == DataFrame(Key1="A", Key2="A", Value=5) + @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1) + @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) + end + end end