Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.


Restore old grouping algorithm and improve it
Browse files Browse the repository at this point in the history
Follow the strategy used by Pandas. The new implementation is more efficient
since it avoids creating a NullableCategoricalArray: the integer codes are
combined on the fly with those computed from previous columns. Hashing only
happens once by giving arbitrary codes to levels in the first pass; after that,
only integer codes are used.

Move the per-column operations to separate functions which can be specialized
by the compiler for each column type. This also allows using a more efficient
method for CategoricalArray.

Fix ordering of CategoricalArray levels when levels have been reordered,
and sort null values last for consistency with other nullable arrays. Enable
sorting by default since its cost is relatively small compared with the rest.

Avoid some allocations by using in-place operations; use Base.unique!().
  • Loading branch information
nalimilan committed Jun 18, 2017
1 parent 302d779 commit d6104d7
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 29 deletions.
150 changes: 127 additions & 23 deletions src/groupeddatatable/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,111 @@ end
# Split

"""
    groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)

Counting sort of the integer group codes in `x`, translated from Wes McKinney's
`groupsort_indexer` in pandas (file: src/groupby.pyx).

Every code in `x` must lie in `0:ngroups`; code 0 is the NULL group.

### Arguments

* `x` : vector of group codes, one per row
* `ngroups` : number of non-NULL groups (largest code that may appear in `x`)
* `null_last` : place the rows of the NULL group after all other groups instead of first

### Returns

A tuple `(result, offsets, counts)`:

* `result` : permutation of `1:length(x)` that lists row indices group by group
  (rows inside a group keep their original relative order)
* `offsets` : `offsets[k + 1]` is the position in `result` just past the last row of
  code `k`, i.e. where the following group starts
* `counts` : `counts[k + 1]` is the number of rows with code `k`
"""
function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
    # count group sizes, location 0 for NULL
    n = length(x)
    counts = fill(0, ngroups + 1)
    for i = 1:n
        counts[x[i] + 1] += 1
    end

    # mark the start of each contiguous group of like-indexed data
    offsets = fill(1, ngroups + 1)
    if null_last && ngroups > 0
        # Non-NULL groups come first (group 1 starts at position 1) and the NULL group
        # is placed right after the last one. Without any non-NULL group the NULL slot
        # is also the last slot, so the generic branch below must be used instead;
        # otherwise the NULL offset would point past the end of `result`.
        for i = 3:ngroups+1
            offsets[i] = offsets[i - 1] + counts[i - 1]
        end
        offsets[1] = offsets[end] + counts[end]
    else
        for i = 2:ngroups+1
            offsets[i] = offsets[i - 1] + counts[i - 1]
        end
    end

    # this is our indexer
    result = fill(0, n)
    for i = 1:n
        label = x[i] + 1
        result[offsets[label]] = i
        # advance the write cursor of this group
        offsets[label] += 1
    end
    return result, offsets, counts
end

# Assign an integer code to each distinct value of `col`, and combine these codes
# with the existing code vector `x` (in place) by adding `code * ngroups` to each entry.
# NOTE(review): the `end`/`else` lines and the final return expression are missing from
# this rendering of the diff. The caller multiplies `ngroups` by the return value, so
# presumably the number of distinct values `n` is returned — confirm against the repository.
function combine_col!{T}(x::AbstractVector, col::AbstractVector{T},
ngroups::Integer, sort::Bool)
# d maps each distinct value to a 0-based code, in order of first appearance
d = Dict{T, UInt32}()
# y holds the code of each row
y = Vector{UInt32}(length(x))
n = 0
# Note: using get! instead triggers lots of allocations
@inbounds for i in eachindex(x)
v = col[i]
# Base.ht_keyindex is a Dict internal; a negative result is treated as "key absent"
index = Base.ht_keyindex(d, v)
if index < 0 # new level
@inbounds y[i] = d[v] = n
n += 1
# already-seen value: reuse the stored code (presumably the `else` branch)
y[i] = d.vals[index]

if sort
# compute mapping from unsorted to sorted codes
tmp = sortperm(collect(keys(d)))
perm = ipermute!(collect(0:(n-1)), tmp)
# sortperm! overwrites tmp in place; `refperm` is not referenced again in the visible code
refperm = sortperm!(tmp, collect(values(d)))
permute!(perm, tmp)

@inbounds for i in eachindex(x)
# `+ 1` because codes are 0-based while `perm` is indexed from 1
x[i] += perm[y[i] + 1] * ngroups
# unsorted path (presumably the `else` branch of `if sort`): use first-appearance codes
@inbounds for i in eachindex(x)
x[i] += y[i] * ngroups


# More efficient method which can use the references directly
# Levels are always sorted (the `sort` argument is not referenced in the visible body)
# Adds `code * ngroups` to each entry of `x` in place; the last visible expression is
# the number of levels, plus one if a null was seen in a Nullable-eltype column.
# NOTE(review): the `end` lines are missing from this rendering of the diff.
function combine_col!(x::AbstractVector,
col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
ngroups::Integer, sort::Bool)
nlevels = length(levels(col))
order = CategoricalArrays.order(col.pool)
# codes is a lookup table indexed by `ref + 1`
codes = similar(order, length(order)+1)
codes[1] = nlevels # Sort nulls last, only used if present
# remaining slots hold 0-based codes derived from `order`
codes[2:end] .= order .- 1
anynulls = false
@inbounds for i in eachindex(x)
ref = col.refs[i]
x[i] += codes[ref + 1] * ngroups
if eltype(col) <: Nullable
# presumably ref == 0 marks a null entry (it maps to codes[1]) — confirm
anynulls |= (ref == 0)
nlevels + anynulls

A view of an AbstractDataTable split into row groups
groupby(d::AbstractDataTable, cols)
groupby(d::AbstractDataTable, cols; sort = true)
groupby(cols; sort = true)
### Arguments
* `d` : an AbstractDataTable to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort row groups; disable sorting for maximum performance
### Returns
Expand Down Expand Up @@ -79,17 +172,24 @@ dt |> groupby([:a, :b]) |> [sum, length]
function groupby{T}(dt::AbstractDataTable, cols::Vector{T}; sort::Bool = false)
sdt = dt[cols]
dt_groups = group_rows(sdt)
# sort the groups
if sort
group_perm = sortperm(view(sdt, dt_groups.rperm[dt_groups.starts]))
permute!(dt_groups.starts, group_perm)
Base.permute!!(dt_groups.stops, group_perm)
function groupby{T}(d::AbstractDataTable, cols::Vector{T}; sort::Bool = true)
## a subset of Wes McKinney's algorithm here:

x = ones(UInt32, nrow(d))
ngroups = 1
for j in length(cols):-1:1
# also compute the number of groups, which is the product of the set lengths
ngroups *= combine_col!(x, d[cols[j]], ngroups, sort)
# TODO if ngroups is really big, shrink it
GroupedDataTable(dt, cols, dt_groups.rperm,
dt_groups.starts, dt_groups.stops)
(idx, starts) = groupsort_indexer(x, ngroups)
# Remove zero-length groupings
starts = _groupedunique!(starts)
ends = starts[2:end]
ends .-= 1
GroupedDataTable(d, cols, idx, starts, ends)
# Convenience method for a single column indicator: wraps `cols` in a vector and forwards.
# NOTE(review): the default `sort = false` here differs from the `sort::Bool = true`
# default of the `cols::Vector{T}` method and from the documented `sort = true`
# signature — confirm which default is intended.
groupby(d::AbstractDataTable, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)

Expand Down Expand Up @@ -263,8 +363,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
based on columns `col`
by(d::AbstractDataTable, cols, f::Function; sort::Bool = false)
by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
by(d::AbstractDataTable, cols, f::Function; sort::Bool = true)
by(f::Function, d::AbstractDataTable, cols; sort::Bool = true)
### Arguments
Expand All @@ -273,7 +373,7 @@ by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
* `f` : a function to be applied to groups; expects each argument to
be an AbstractDataTable
* `sort`: sort row groups (no sorting by default)
* `sort`: whether to sort row groups; disable sorting for maximum performance
`f` can return a value, a vector, or a DataTable. For a value or
vector, these are merged into a column along with the `cols` keys. For
Expand Down Expand Up @@ -321,8 +421,8 @@ Split-apply-combine that applies a set of functions over columns of an
AbstractDataTable or GroupedDataTable
aggregate(d::AbstractDataTable, cols, fs)
aggregate(gd::GroupedDataTable, fs)
aggregate(d::AbstractDataTable, cols, fs; sort::Bool=true)
aggregate(gd::GroupedDataTable, fs; sort::Bool=true)
### Arguments
Expand All @@ -332,6 +432,7 @@ aggregate(gd::GroupedDataTable, fs)
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
* `fs` : a function or vector of functions to be applied to vectors
within groups; expects each argument to be a column vector
* `sort`: whether to sort row groups; disable sorting for maximum performance
Each `fs` should return a value or vector. All returns must be the
same length.
Expand All @@ -353,15 +454,17 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent
aggregate(d::AbstractDataTable, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=false)
aggregate(d::AbstractDataTable, fs::Function; sort::Bool=true) =
aggregate(d, [fs], sort=sort)
function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=true)
headers = _makeheaders(fs, _names(d))
_aggregate(d, fs, headers, sort)

# Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable
aggregate(gd::GroupedDataTable, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=false)
aggregate(gd::GroupedDataTable, f::Function; sort::Bool=true) =
aggregate(gd, [f], sort=sort)
function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=true)
headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
sort && sort!(res, cols=headers)
Expand All @@ -375,7 +478,7 @@ end
function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
cols::Union{S, AbstractVector{S}},
fs::Union{T, Vector{T}};
aggregate(groupby(d, cols, sort=sort), fs)

Expand All @@ -384,7 +487,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
[Symbol(colname,'_',fname) for fname in fnames for colname in cn]

function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false)
function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T},
headers::Vector{Symbol}, sort::Bool=true)
res = DataTable(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
sort && sort!(res, cols=headers)
Expand Down
20 changes: 20 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,26 @@ function countnull(a::CategoricalArray)
return res

# _groupedunique!(A) removes duplicates in place from a vector whose equal elements are
# already adjacent (e.g. a sorted vector) and returns `A`.
if !isdefined(Base, :unique!) # Julia < 0.7
    # Fallback for old Julia versions: collapse each run of consecutive `isequal`
    # elements to its first element, then shrink `A` to the number of runs kept.
    function _groupedunique!(A::AbstractVector)
        isempty(A) && return A
        idxs = eachindex(A)
        y = first(A)
        state = start(idxs)
        # `i` is the index of the last element kept so far
        i, state = next(idxs, state)
        for x in A
            if !isequal(x, y)
                # first element of a new run: store it in the next free slot
                i, state = next(idxs, state)
                y = A[i] = x
            end
        end
        resize!(A, i - first(idxs) + 1)
    end
else
    # unique!() includes a fast path for sorted vectors
    _groupedunique!(A::AbstractVector) = unique!(A)
end

# Gets the name of a function. Used in groupeddatatable/grouping.jl
function _fnames{T<:Function}(fs::Vector{T})
λcounter = 0
Expand Down
35 changes: 29 additions & 6 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,34 @@ module TestGrouping
levels!(dt[:Key1], ["Z", "B", "A"])
levels!(dt[:Key2], ["Z", "B", "A"])
gd = groupby(dt, :Key1)
@test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
@test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
@test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
@test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
gd = groupby(dt, [:Key1, :Key2])
@test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
@test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
@test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
@test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))

# test NullableArray and NullableCategoricalArray with nulls
for (S, T) in ((NullableArray, NullableArray),
(NullableCategoricalArray, NullableCategoricalArray),
(NullableArray, NullableCategoricalArray),
(NullableCategoricalArray, NullableArray))
dt = DataTable(Key1 = S(["A", "A", "B", Nullable(), Nullable()]),
Key2 = T(["A", "B", "A", Nullable(), "A"]),
Value = 1:5)
gd = groupby(dt, :Key1)
@test isequal(gd[1], DataTable(Key1=Nullable{String}["A", "A"],
Key2=Nullable{String}["A", "B"], Value=1:2))
@test isequal(gd[2], DataTable(Key1=Nullable{String}["B"],
Key2=Nullable{String}["A"], Value=3))
@test isequal(gd[3], DataTable(Key1=[Nullable(), Nullable()],
Key2=Nullable{String}[Nullable(), "A"], Value=4:5))
gd = groupby(dt, [:Key1, :Key2])
@test isequal(gd[1], DataTable(Key1=Nullable("A"), Key2=Nullable("A"), Value=1))
@test isequal(gd[2], DataTable(Key1=Nullable("A"), Key2=Nullable("B"), Value=2))
@test isequal(gd[3], DataTable(Key1=Nullable("B"), Key2=Nullable("A"), Value=3))
@test isequal(gd[4], DataTable(Key1=Nullable(), Key2=Nullable("A"), Value=5))
@test isequal(gd[5], DataTable(Key1=Nullable(), Key2=Nullable(), Value=4))

0 comments on commit d6104d7

Please sign in to comment.