Skip to content

Commit

Permalink
Merge 2050f44 into 86c6145
Browse files Browse the repository at this point in the history
  • Loading branch information
nalimilan committed Oct 18, 2017
2 parents 86c6145 + 2050f44 commit 6e96eee
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 40 deletions.
101 changes: 69 additions & 32 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,26 @@ end

# "kernel" functions for hashrows()
# adjust row hashes by the hashes of column elements
# Mix the hash of every element of column `v` into the per-row hash vector `h`.
# When `n` is non-empty and the element type can hold nulls, also record in
# `n[i]` whether row `i` holds a null in this column. Returns `h`.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractVector{T}) where T
    tracknulls = length(n) > 0
    @inbounds for i in eachindex(h)
        x = v[i]
        h[i] = hash(x, h[i])
        # `T >: Null` depends only on the type parameter, so the branch can be
        # compiled away entirely for columns which cannot contain nulls
        if T >: Null && tracknulls
            # `x isa Null` should be redundant with `isnull(x)`,
            # but it gives much more efficient code on Julia 0.6
            n[i] |= (x isa Null || isnull(x))
        end
    end
    return h
end

# should give the same hash as AbstractVector{T}
function hashrows_col!(h::Vector{UInt}, v::AbstractCategoricalVector{T}) where T
function hashrows_col!(h::Vector{UInt},
n::Vector{Bool},
v::AbstractCategoricalVector{T}) where T
# TODO is it possible to optimize by hashing the pool values once?
@inbounds for (i, ref) in enumerate(v.refs)
h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i])
Expand All @@ -40,23 +51,30 @@ end

# should give the same hash as AbstractVector{T}
# enables efficient sequential memory access pattern
# Null-aware categorical variant: a ref code of 0 marks a null entry, so those
# rows get `hash(null)` and are flagged in `n` when null tracking is enabled.
# Must produce the same hashes as the generic AbstractVector{T} method.
function hashrows_col!(h::Vector{UInt},
                       n::Vector{Bool},
                       v::AbstractCategoricalVector{>: Null})
    marknulls = !isempty(n)
    # TODO is it possible to optimize by hashing the pool values once?
    levels = CategoricalArrays.index(v.pool)
    @inbounds for (i, ref) in enumerate(v.refs)
        if ref > 0
            h[i] = hash(levels[ref], h[i])
        else
            h[i] = hash(null, h[i])
            marknulls && (n[i] = true)
        end
    end
    return h
end

# Calculate the vector of `df` rows hash values.
# Compute one hash value per row of `df` by successively folding in the hash
# of each column's elements. Returns `(rhashes, nulls)`, where `nulls[i]`
# flags rows containing at least one null in this set of columns; `nulls` is
# left empty unless `skipnull` is true, since the flags are only needed then.
function hashrows(df::AbstractDataFrame, skipnull::Bool)
    len = nrow(df)
    rhashes = zeros(UInt, len)
    nulls = zeros(Bool, skipnull ? len : 0)
    foreach(col -> hashrows_col!(rhashes, nulls, col), columns(df))
    return (rhashes, nulls)
end

# Helper function for RowGroupDict.
Expand All @@ -66,39 +84,50 @@ end
# 3) slot array for a hash map, non-zero values are
# the indices of the first row in a group
# Optional group vector is set to the group indices of each row
# DataFrame entry point: hash all rows first (null flags are only tracked when
# `skipnull` is true), then delegate to the tuple-of-columns method which
# performs the actual slot assignment.
function row_group_slots(df::AbstractDataFrame,
                         groups::Union{Vector{Int}, Void} = nothing,
                         skipnull::Bool = false)
    hashes, nullflags = hashrows(df, skipnull)
    cols = ntuple(i -> df[i], ncol(df))
    return row_group_slots(cols, hashes, nullflags, groups, skipnull)
end

function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
rhashes::AbstractVector{UInt},
groups::Union{Vector{Int}, Void} = nothing)
nulls::AbstractVector{Bool},
groups::Union{Vector{Int}, Void} = nothing,
skipnull::Bool = false)
@assert groups === nothing || length(groups) == length(cols[1])
# inspired by Dict code from base cf. https://github.com/JuliaData/DataFrames.jl/pull/17#discussion_r102481481
sz = Base._tablesz(length(rhashes))
@assert sz >= length(rhashes)
szm1 = sz-1
gslots = zeros(Int, sz)
ngroups = 0
@inbounds for i in eachindex(rhashes)
# If nulls are to be skipped, they will all go to group 1
ngroups = skipnull ? 1 : 0
@inbounds for i in eachindex(rhashes, nulls)
# find the slot and group index for a row
slotix = rhashes[i] & szm1 + 1
gix = 0
# Use 0 for non-null values to catch bugs if group is not found
gix = skipnull && nulls[i] ? 1 : 0
probe = 0
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
# Skip rows containing at least one null (assigning them to group 0)
if !skipnull || !nulls[i]
while true
g_row = gslots[slotix]
if g_row == 0 # unoccupied slot, current row starts a new group
gslots[slotix] = i
gix = ngroups += 1
break
elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
if isequal_row(cols, i, g_row) # hit
gix = groups !== nothing ? groups[g_row] : 0
end
break
end
break
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
slotix = slotix & szm1 + 1 # check the next slot
probe += 1
@assert probe < sz
end
if groups !== nothing
groups[i] = gix
Expand All @@ -109,9 +138,9 @@ end

# Builds RowGroupDict for a given DataFrame.
# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
function group_rows(df::AbstractDataFrame)
function group_rows(df::AbstractDataFrame, skipnull::Bool = false)
groups = Vector{Int}(nrow(df))
ngroups, rhashes, gslots = row_group_slots(df, groups)
ngroups, rhashes, gslots = row_group_slots(df, groups, skipnull)

# count elements in each group
stops = zeros(Int, ngroups)
Expand All @@ -136,6 +165,14 @@ function group_rows(df::AbstractDataFrame)
stops[gix] += 1
end
stops .-= 1

# drop group 1 which contains rows with nulls in grouping columns
if skipnull
splice!(starts, 1)
splice!(stops, 1)
ngroups -= 1
end

return RowGroupDict(df, ngroups, rhashes, gslots, groups, rperm, starts, stops)
end

Expand Down
21 changes: 14 additions & 7 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ end
A view of an AbstractDataFrame split into row groups
```julia
groupby(d::AbstractDataFrame, cols)
groupby(cols)
groupby(d::AbstractDataFrame, cols; sort = false, skipnull = false)
groupby(cols; sort = false, skipnull = false)
```
### Arguments
* `d` : an AbstractDataFrame to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort rows according to the values of the grouping columns `cols`
* `skipnull`: whether to skip rows with `null` values in one of the grouping columns `cols`
### Returns
Expand Down Expand Up @@ -79,9 +81,10 @@ df |> groupby([:a, :b]) |> [sum, length]
```
"""
function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) where T
function groupby(df::AbstractDataFrame, cols::Vector{T};
sort::Bool = false, skipnull::Bool = false) where T
sdf = df[cols]
df_groups = group_rows(sdf)
df_groups = group_rows(sdf, skipnull)
# sort the groups
if sort
group_perm = sortperm(view(sdf, df_groups.rperm[df_groups.starts]))
Expand All @@ -91,11 +94,15 @@ function groupby(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false) whe
GroupedDataFrame(df, cols, df_groups.rperm,
df_groups.starts, df_groups.stops)
end
# Single-column convenience method: wrap the column specifier in a vector and
# forward to the main vector-of-columns method, passing the keywords through.
function groupby(d::AbstractDataFrame, cols;
                 sort::Bool = false, skipnull::Bool = false)
    return groupby(d, [cols], sort = sort, skipnull = skipnull)
end

# Curried forms: return a closure that groups its data frame argument by
# `cols`, enabling pipelines such as `df |> groupby(:a)`.
function groupby(cols::Vector{T}; sort::Bool = false, skipnull::Bool = false) where T
    return df -> groupby(df, cols, sort = sort, skipnull = skipnull)
end
function groupby(cols; sort::Bool = false, skipnull::Bool = false)
    return df -> groupby(df, cols, sort = sort, skipnull = skipnull)
end

Base.start(gd::GroupedDataFrame) = 1
Base.next(gd::GroupedDataFrame, state::Int) =
Expand Down
2 changes: 1 addition & 1 deletion test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ module TestDataFrameRow


# check that hashrows() function generates the same hashes as DataFrameRow
df_rowhashes = DataFrames.hashrows(df)
df_rowhashes, _ = DataFrames.hashrows(df, false)
@test df_rowhashes == [hash(dr) for dr in eachrow(df)]

# test incompatible frames
Expand Down
65 changes: 65 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module TestGrouping
using Base.Test, DataFrames
# `≅` is shorthand for `isequal`, which unlike `==` considers two `null`
# values equal — required by the grouping tests below that compare data
# frames containing nulls. (The operator name had been lost in transit,
# leaving invalid syntax `const = isequal`.)
const ≅ = isequal

srand(1)
df = DataFrame(a = repeat(Union{Int, Null}[1, 2, 3, 4], outer=[2]),
Expand Down Expand Up @@ -181,4 +182,68 @@ module TestGrouping
@test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
@test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

# Tests for null handling in groupby. Fixes two defects in this block:
# the `≅` (isequal) infix operator was missing from every comparison against
# frames containing nulls (invalid syntax), and the last testset label said
# "sort=false" while the calls use sort=true.
@testset "grouping with nulls" begin
    df = DataFrame(Key1 = ["A", null, "B", "B", "A"],
                   Key2 = CategoricalArray(["B", "A", "A", null, "A"]),
                   Value = 1:5)

    @testset "sort=false, skipnull=false" begin
        gd = groupby(df, :Key1)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        # `≅` (isequal) is needed since `==` propagates nulls
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2])
        @test length(gd) == 5
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] ≅ DataFrame(Key1=null, Key2="A", Value=2)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=false, skipnull=true" begin
        gd = groupby(df, :Key1, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[2] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[3] == DataFrame(Key1="A", Key2="A", Value=5)
    end

    @testset "sort=true, skipnull=false" begin
        gd = groupby(df, :Key1, sort=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)
        @test gd[3] ≅ DataFrame(Key1=null, Key2="A", Value=2)

        gd = groupby(df, [:Key1, :Key2], sort=true)
        @test length(gd) == 5
        @test gd[1] ≅ DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
        @test gd[4] ≅ DataFrame(Key1="B", Key2=null, Value=4)
        @test gd[5] ≅ DataFrame(Key1=null, Key2="A", Value=2)
    end

    # NOTE(review): label previously said "sort=false" but the calls below
    # pass sort=true — fixed to match the actual options under test.
    @testset "sort=true, skipnull=true" begin
        gd = groupby(df, :Key1, sort=true, skipnull=true)
        @test length(gd) == 2
        @test gd[1] == DataFrame(Key1=["A", "A"], Key2=["B", "A"], Value=[1, 5])
        @test gd[2] ≅ DataFrame(Key1=["B", "B"], Key2=["A", null], Value=3:4)

        gd = groupby(df, [:Key1, :Key2], sort=true, skipnull=true)
        @test length(gd) == 3
        @test gd[1] == DataFrame(Key1="A", Key2="A", Value=5)
        @test gd[2] == DataFrame(Key1="A", Key2="B", Value=1)
        @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
    end
end
end

0 comments on commit 6e96eee

Please sign in to comment.