Skip to content

Commit

Permalink
Modify aggregate for efficiency
Browse files Browse the repository at this point in the history
Bypassing map and combine in aggregate speeds up the code and reduces
memory allocations by ~1-2 orders of magnitude. The `By` function
still uses map and combine so that it continues to support do blocks
and more complex anonymous functions that can return DataFrames.
Functions for naming new columns were inlined, and their Julia 0.4
compatibility code was removed. Added tests.
  • Loading branch information
cjprybol committed Oct 9, 2017
1 parent eb3d10a commit 2b42e2d
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 41 deletions.
31 changes: 12 additions & 19 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs)
* `fs` : a function or vector of functions to be applied to vectors
within groups; expects each argument to be a column vector
Each `fs` should return a value or vector. All returns must be the
same length.
### Returns
* `::DataFrame`
Expand All @@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))] # equivalent
"""
function aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false)
    # A single function is handled as a one-element vector of functions.
    return aggregate(d, [fs]; sort=sort)
end
# Apply every function in `fs` to every column of `d` and collect the results
# in a new DataFrame with one column per (function, column) pair, named
# `<column>_<function>`. Columns are ordered function-major: all columns of
# `d` for the first function, then all columns for the next one.
# With `sort=true` the rows of the result are sorted via `sort!`.
function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
    # Look the column names up once; they drive both the headers and the values,
    # so both comprehensions must iterate in the same order.
    cols = names(d)
    headers = [Symbol(c, "_", f) for f in fs for c in cols]
    res = DataFrame(Any[f(d[c]) for f in fs for c in cols], headers)
    sort && sort!(res)
    res
end

# Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame
function aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false)
    # A single function is handled as a one-element vector of functions.
    return aggregate(gd, [f]; sort=sort)
end
# Aggregate every non-key column of each group with every function in `fs`.
# The result starts from the key columns (`gd.cols`) of the parent, taken at
# the rows `gd.idx[gd.starts]` (presumably the first row of each group --
# confirm against GroupedDataFrame), and gains one `<column>_<function>` column
# per (function, column) pair holding one value per group.
# With `sort=true` the rows of the result are sorted via `sort!`.
function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
    res = gd.parent[gd.idx[gd.starts], gd.cols]
    # Only columns that are not grouping keys get aggregated.
    cols = setdiff(names(gd.parent), gd.cols)
    for f in fs, c in cols
        res[Symbol(c, "_", f)] = [f(g[c]) for g in gd]
    end
    sort && sort!(res)
    res
end

Expand All @@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame,
sort::Bool=false) where {S<:ColumnIndex, T <:Function}
aggregate(groupby(d, cols, sort=sort), fs)
end

# Build the output column names for the functions `fs` applied to the columns
# `cn`: one `<column>_<function>` symbol per pair, ordered function-major
# (every column for the first function, then every column for the next).
function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function
    headers = Symbol[]
    for fname in _fnames(fs), colname in cn  # _fnames: see other/utils.jl
        push!(headers, Symbol(colname, '_', fname))
    end
    return headers
end

# Apply each function in `fs` to each column of `d` (function-major order) and
# assemble the results into a DataFrame whose columns are named by `headers`.
# Each result is passed through `vcat`, so a scalar result becomes a
# one-element vector. When `sort` is true the result is sorted on `headers`.
function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function
    columns = Any[]
    for f in fs, i in 1:size(d, 2)
        push!(columns, vcat(f(d[i])))
    end
    res = DataFrame(columns, headers)
    if sort
        sort!(res, cols=headers)
    end
    return res
end
15 changes: 0 additions & 15 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,18 +141,3 @@ function countnull(a::CategoricalArray)
end
return res
end

# Return the printed name of each function in `fs`, in order.
# Used in groupeddataframe/grouping.jl to build column headers.
function _fnames(fs::Vector{T}) where T<:Function
    labels = String[]
    λcounter = 0
    for f in fs
        label = string(f)
        if label == "(anonymous function)" # Anonymous functions with Julia < 0.5
            # Such functions all print the same text, so number them instead.
            λcounter += 1
            label = "λ$(λcounter)"
        end
        push!(labels, label)
    end
    return labels
end

36 changes: 36 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -181,4 +181,40 @@ module TestGrouping
@test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
@test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
@test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)

@testset "aggregate" begin
    # Result columns are named via Symbol(f): a named function contributes its
    # own name, an anonymous one a "#N"-style identifier.
    @test map(Symbol, [mean, sum]) == [:mean, :sum]
    @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x))))

    # Three groups of four rows each, with keys in descending order c, b, a.
    letters = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1)

    # Without sorting, the groups come back in the c, b, a order of the input.
    unsorted = DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10])
    @test aggregate(groupby(letters, :group), sum) ==
          aggregate(letters, :group, sum) ==
          unsorted

    # With sort = true, the groups come back as a, b, c.
    sorted = DataFrame(group = 'a':'c', x_sum = [10, 26, 42])
    @test aggregate(groupby(letters, :group), sum, sort = true) ==
          aggregate(letters, :group, sum, sort = true) ==
          sorted

    # Ungrouped aggregation yields a single row with one column per input column.
    @test aggregate(letters, length) == DataFrame(group_length = 12, x_length = 12)
    anonymous = x -> length(x)
    @test aggregate(letters, anonymous) ==
          DataFrame(Symbol("group_$anonymous") => 12, Symbol("x_$anonymous") => 12)

    # Four years of twelve months; `a` counts rows and `b` is constant.
    calendar = DataFrame(year = repeat(1:4, inner = 12),
                         month = repeat(1:12, outer = 4),
                         a = 1:48, b = fill(24.5, 48))

    # Several functions: all columns for `sum` first, then all for `length`.
    @test aggregate(calendar, [sum, length]) ==
          DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176,
                    year_length = 48, month_length = 48, a_length = 48, b_length = 48)

    # Grouping by a key column leaves that column out of the aggregated ones.
    by_year = DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510],
                        b_sum = fill(294, 4), month_length = fill(12, 4),
                        a_length = fill(12, 4), b_length = fill(12, 4))
    @test aggregate(calendar, [:year], [sum, length]) == by_year

    by_month = DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120),
                         b_sum = fill(98, 12), year_length = fill(4, 12),
                         a_length = fill(4, 12), b_length = fill(4, 12))
    @test aggregate(calendar, [:month], [sum, length]) == by_month
end
end
7 changes: 0 additions & 7 deletions test/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,6 @@ module TestUtils
pdata[1:end] = null
@test DataFrames.countnull(pdata) == 20

funs = [mean, sum, var, x -> sum(x)]
# Named functions keep their own name. The anonymous one is expected to be
# numbered as "λ1" when it prints as "(anonymous function)" (Julia < 0.5),
# and to keep its printed name otherwise.
@test DataFrames._fnames(funs) ==
      ["mean", "sum", "var",
       string(funs[end]) == "(anonymous function)" ? "λ1" : string(funs[end])]

@testset "describe" begin
io = IOBuffer()
df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),
Expand Down

0 comments on commit 2b42e2d

Please sign in to comment.