Skip to content

Commit

Permalink
Add skipnull argument to groupby()
Browse files Browse the repository at this point in the history
The new argument allows skipping rows which contain a null in at least one
of the grouping columns. Doing this in groupby() is more efficient
and more convenient than dropping those rows before calling groupby().
  • Loading branch information
nalimilan committed Oct 18, 2017
1 parent 8cf2be9 commit 2050f44
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 40 deletions.
101 changes: 69 additions & 32 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,26 @@ end

# "kernel" functions for hashrows()
# adjust row hashes by the hashes of column elements
# Fold the hash of each element of column `v` into the per-row hash
# accumulator `h`, and record null positions in `n`.
# `n` is either empty (nulls not tracked) or holds one flag per row;
# `n[i]` is set to `true` when `v[i]` is null.
# Returns `h` for chaining over columns.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractVector{T}) where T
    ln = length(n)
    @inbounds for i in eachindex(h)
        el = v[i]
        h[i] = hash(el, h[i])
        # Only check for nulls when the eltype can hold them and
        # null tracking is enabled (n is non-empty)
        if T >: Null && ln > 0
            # el isa Null should be redundant
            # but it gives much more efficient code on Julia 0.6
            n[i] |= (el isa Null || isnull(el))
        end
    end
    h
end

# should give the same hash as AbstractVector{T}
function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{T}) where T
function hashrows_col!(h::Vector{UInt},
n::Vector{Bool},
v::AbstractCategoricalVector{T}) where T
# TODO is it possible to optimize by hashing the pool values once?
@inbounds for (i, ref) in enumerate(v.refs)
h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
Expand All @@ -40,23 +51,30 @@ end

# should give the same hash as AbstractVector{T}
# enables efficient sequential memory access pattern
# Fold element hashes of a nullable categorical column into `h` and
# record null positions in `n` (empty when null tracking is disabled).
# Should give the same hash as the AbstractVector{T} method;
# iterating `v.refs` enables an efficient sequential memory access pattern.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractCategoricalVector{>: Null})
    ln = length(n)
    # TODO is it possible to optimize by hashing the pool values once?
    @inbounds for (i, ref) in enumerate(v.refs)
        if ref == 0
            # ref == 0 marks a null entry in a categorical vector
            h[i] = hash(null, h[i])
            ln > 0 && (n[i] = true)
        else
            h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
        end
    end
    h
end

# Calculate the vector of `df` rows hash values.
function hashrows(df::AbstractDataFrame)
res = zeros(UInt, nrow(df))
function hashrows(df::AbstractDataFrame, skipnull::Bool)
rhashes = zeros(UInt, nrow(df))
nulls = fill(false, skipnull ? nrow(df) : 0)
for col in columns(df)
hashrows_col!(res, col)
hashrows_col!(rhashes, nulls, col)
end
return res
return (rhashes, nulls)
end

# Helper function for RowGroupDict.
Expand All @@ -66,39 +84,50 @@ end
# 3) slot array for a hash map, non-zero values are
# the indices of the first row in a group
# Optional group vector is set to the group indices of each row
# Convenience wrapper: hash the rows of `df`, then dispatch to the
# column-tuple method. When `skipnull` is true, rows containing nulls
# in any column are grouped together (and later dropped by group_rows).
function row_group_slots(df::AbstractDataFrame,
                         groups::Union{Vector{Int}, Void} = nothing,
                         skipnull::Bool = false)
    rhashes, nulls = hashrows(df, skipnull)
    row_group_slots(ntuple(i -> df[i], ncol(df)), rhashes, nulls, groups, skipnull)
end

function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
rhashes::AbstractVector{UInt},
groups::Union{Vector{Int}, Void} = nothing)
nulls::AbstractVector{Bool},
groups::Union{Vector{Int}, Void} = nothing,
skipnull::Bool = false)
@assert groups === nothing || length(groups) == length(cols[1])
# inspired by Dict code from base cf. https://github.com/JuliaData/DataFrames.jl/pull/17#discussion_r102481481
sz = Base._tablesz(length(rhashes))
@assert sz >= length(rhashes)
szm1 = sz-1
gslots = zeros(Int, sz)
ngroups = 0
@inbounds for i in eachindex(rhashes)
# If nulls are to be skipped, they will all go to group 1
ngroups = skipnull ? 1 : 0
@inbounds for i in eachindex(rhashes, nulls)
# find the slot and group index for a row
slotix = rhashes[i] & szm1 + 1
gix = 0
# Use 0 for non-null values to catch bugs if group is not found
gix = skipnull && nulls[i] ? 1 : 0
probe = 0
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
        # Skip rows containing at least one null (assigning them to group 0)
if !skipnull || !nulls[i]
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
end
break
end
break
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
if groups !== nothing
groups[i] = gix
Expand All @@ -109,9 +138,9 @@ end

# Builds RowGroupDict for a given DataFrame.
# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
function group_rows(df::AbstractDataFrame)
function group_rows(df::AbstractDataFrame, skipnull::Bool = false)
groups = Vector{Int}(nrow(df))
ngroups, rhashes, gslots = row_group_slots(df, groups)
ngroups, rhashes, gslots = row_group_slots(df, groups, skipnull)

# count elements in each group
stops = zeros(Int, ngroups)
Expand All @@ -136,6 +165,14 @@ function group_rows(df::AbstractDataFrame)
stops[gix] += 1
end
stops .-= 1

# drop group 1 which contains rows with nulls in grouping columns
if skipnull
splice!(starts, 1)
splice!(stops, 1)
ngroups -= 1
end

return RowGroupDict(df, ngroups, rhashes, gslots, groups, rperm, starts, stops)
end

Expand Down
21 changes: 14 additions & 7 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ end
A view of an AbstractDataFrame split into row groups
```julia
groupby(d::AbstractDataFrame, cols)
groupby(cols)
groupby(d::AbstractDataFrame, cols; sort = false, skipnull = false)
groupby(cols; sort = false, skipnull = false)
```
### Arguments
* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort rows according to the values of the grouping columns `cols`
* `skipnull`: whether to skip rows with `null` values in one of the grouping columns `cols`
### Returns
Expand Down Expand Up @@ -79,9 +81,10 @@ df |> groupby([:a, :b]) |> [sum, length]
```
"""
function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) where T
function groupby(df::AbstractDataFrame, cols::Vector{T};
sort::Bool = false, skipnull::Bool = false) where T
sdf = df[cols]
df_groups = group_rows(sdf)
df_groups = group_rows(sdf, skipnull)
# sort the groups
if sort
group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts]))
Expand All @@ -91,11 +94,15 @@ function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) whe
GroupedDataFrame(df, cols, df_groups.rperm,
df_groups.starts, df_groups.stops)
end
# Single-column convenience method: wrap `cols` in a vector and forward.
groupby(d::AbstractDataFrame, cols;
        sort::Bool = false, skipnull::Bool = false) =
    groupby(d, [cols], sort = sort, skipnull = skipnull)

# add a function curry: `groupby(cols)` returns a closure usable as
# `df |> groupby(cols)`, forwarding the keyword arguments unchanged
groupby(cols::Vector{T}; sort::Bool = false, skipnull::Bool = false) where {T} =
    x -> groupby(x, cols, sort = sort, skipnull = skipnull)
groupby(cols; sort::Bool = false, skipnull::Bool = false) =
    x -> groupby(x, cols, sort = sort, skipnull = skipnull)

Base.start(gd::GroupedDataFrame) = 1
Base.next(gd::GroupedDataFrame, state::Int) =
Expand Down
2 changes: 1 addition & 1 deletion test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ module TestDataFrameRow


# check that hashrows() function generates the same hashes as DataFrameRow
df_rowhashes = DataFrames.hashrows(df)
df_rowhashes, _ = DataFrames.hashrows(df, false)
@test df_rowhashes == [hash(dr) for dr in eachrow(df)]

# test incompatible frames
Expand Down
65 changes: 65 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module TestGrouping
using Base.Test, DataFrames
const ≅ = isequal

srand(1)
df = DataFrame(a = repeat(Union{Int, Null}[1, 2, 3, 4], outer=[2]),
Expand Down Expand Up @@ -181,4 +182,68 @@ module TestGrouping
@test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
@test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

@testset "grouping with nulls" begin
    df = DataFrame(Key1 = ["A", null, "B", "B", "A"],
                   Key2 = CategoricalArray(["B", "A", "A", null, "A"]),
                   Value = 1:5)

    @testset "sort=false, skipnull=false" begin
        gd = groupby(df, :Key1)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2])
        @test length(gd) == 5
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=false, skipnull=true" begin
        gd = groupby(df, :Key1, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[3] == DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=true, skipnull=false" begin
        gd = groupby(df, :Key1, sort=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)
        @test gd[3] ≅ DataFrame(Key1=null, Key2="A", Value=2)

        gd = groupby(df, [:Key1, :Key2], sort=true)
        @test length(gd) == 5
        @test gd[1] ≅ DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1=null, Key2="A", Value=2)
    end

    # NOTE(review): label fixed — this testset passes sort=true, skipnull=true
    @testset "sort=true, skipnull=true" begin
        gd = groupby(df, :Key1, sort=true, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], sort=true, skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
    end
end
end

0 comments on commit 2050f44

Please sign in to comment.