From 2b42e2d90ded57b9244f141666851b26a402ffc8 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sun, 8 Oct 2017 21:47:19 -0700 Subject: [PATCH] Modify aggregate for efficiency Bypassing map and combine in aggregate speeds up the code and reduces memory allocations by ~1-2 orders of magnitude. The `by` function still uses map and combine so that it can accept do blocks and more complex anonymous functions that return DataFrames. Functions for naming new columns were inlined, and their Julia 0.4 compatibility code was removed. Added tests. --- src/groupeddataframe/grouping.jl | 31 +++++++++++---------------- src/other/utils.jl | 15 ------------- test/grouping.jl | 36 ++++++++++++++++++++++++++++++++ test/utils.jl | 7 ------- 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 63a63f5224..3fee71a8a5 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs) * `fs` : a function or vector of functions to be applied to vectors within groups; expects each argument to be a column vector -Each `fs` should return a value or vector. All returns must be the -same length. 
- ### Returns * `::DataFrame` @@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))] # equivalent """ aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort) function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function - headers = _makeheaders(fs, _names(d)) - _aggregate(d, fs, headers, sort) + headers = [Symbol(c, "_", f) for f in fs for c in names(d)] + res = DataFrame(Any[f(d[c]) for f in fs for c in names(d)], headers) + sort && sort!(res) + res end # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort) function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function - headers = _makeheaders(fs, setdiff(_names(gd), gd.cols)) - res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) - sort && sort!(res, cols=headers) + res = gd.parent[gd.idx[gd.starts], gd.cols] + cols = setdiff(names(gd.parent), gd.cols) + for f in fs + for c in cols + res[Symbol(c, "_", f)] = [f(g[c]) for g in gd] + end + end + sort && sort!(res) res end @@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame, sort::Bool=false) where {S<:ColumnIndex, T <:Function} aggregate(groupby(d, cols, sort=sort), fs) end - -function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function - fnames = _fnames(fs) # see other/utils.jl - [Symbol(colname,'_',fname) for fname in fnames for colname in cn] -end - -function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function - res = DataFrame(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers) - sort && sort!(res, cols=headers) - res -end diff --git a/src/other/utils.jl b/src/other/utils.jl index 8aba353e1a..597f59810d 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -141,18 +141,3 @@ function countnull(a::CategoricalArray) 
end return res end - -# Gets the name of a function. Used in groupeDataFrame/grouping.jl -function _fnames(fs::Vector{T}) where T<:Function - λcounter = 0 - names = map(fs) do f - name = string(f) - if name == "(anonymous function)" # Anonymous functions with Julia < 0.5 - λcounter += 1 - name = "λ$(λcounter)" - end - name - end - names -end - diff --git a/test/grouping.jl b/test/grouping.jl index a6021a21cd..5460a83edb 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -181,4 +181,40 @@ module TestGrouping @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2) @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4) + + @testset "aggregate" begin + # test converting functions to valid column names + @test Symbol.([mean, sum]) == [:mean, :sum] + @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x)))) + + dt = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1) + + @test aggregate(groupby(dt, :group), sum) == + aggregate(dt, :group, sum) == + DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10]) + + @test aggregate(groupby(dt, :group), sum, sort = true) == + aggregate(dt, :group, sum, sort = true) == + DataFrame(group = 'a':'c', x_sum = [10, 26, 42]) + + @test aggregate(dt, length) == DataFrame(group_length = 12, x_length = 12) + anonymous = x -> length(x) + @test aggregate(dt, anonymous) == DataFrame(Symbol("group_$anonymous") => 12, + Symbol("x_$anonymous") => 12) + + dt = DataFrame(year = repeat(1:4, inner = 12), month = repeat(1:12, outer = 4), + a = 1:48, b = fill(24.5, 48)) + @test aggregate(dt, [sum, length]) == + DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176, + year_length = 48, month_length = 48, a_length = 48, b_length = 48) + @test aggregate(dt, [:year], [sum, length]) == + DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510], + b_sum = fill(294, 4), month_length = fill(12, 4), + a_length = fill(12, 4), b_length = fill(12, 4)) + + @test 
aggregate(dt, [:month], [sum, length]) == + DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120), + b_sum = fill(98, 12), year_length = fill(4, 12), + a_length = fill(4, 12), b_length = fill(4, 12)) + end end diff --git a/test/utils.jl b/test/utils.jl index ae15b1af29..40c4f9be7d 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -49,13 +49,6 @@ module TestUtils pdata[1:end] = null @test DataFrames.countnull(pdata) == 20 - funs = [mean, sum, var, x -> sum(x)] - if string(funs[end]) == "(anonymous function)" # Julia < 0.5 - @test DataFrames._fnames(funs) == ["mean", "sum", "var", "λ1"] - else - @test DataFrames._fnames(funs) == ["mean", "sum", "var", string(funs[end])] - end - @testset "describe" begin io = IOBuffer() df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),