From 2b42e2d90ded57b9244f141666851b26a402ffc8 Mon Sep 17 00:00:00 2001
From: Cameron Prybol <cameron.prybol@gmail.com>
Date: Sun, 8 Oct 2017 21:47:19 -0700
Subject: [PATCH] Modify aggregate for efficiency

Bypassing `map` and `combine` in `aggregate` speeds up the code and
reduces memory allocations by ~1-2 orders of magnitude. The `by`
function still uses `map` and `combine`, so that it keeps supporting do
blocks and more complex anonymous functions that can return DataFrames.
The helper functions for naming new columns were inlined, and their
Julia 0.4 compatibility code was removed. Tests for `aggregate` were added.
---
 src/groupeddataframe/grouping.jl | 31 +++++++++++----------------
 src/other/utils.jl               | 15 -------------
 test/grouping.jl                 | 36 ++++++++++++++++++++++++++++++++
 test/utils.jl                    |  7 -------
 4 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
index 63a63f5224..3fee71a8a5 100644
--- a/src/groupeddataframe/grouping.jl
+++ b/src/groupeddataframe/grouping.jl
@@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs)
 * `fs` : a function or vector of functions to be applied to vectors
   within groups; expects each argument to be a column vector
 
-Each `fs` should return a value or vector. All returns must be the
-same length.
-
 ### Returns
 
 * `::DataFrame`
@@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))]   # equivalent
 """
 aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
 function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, _names(d))
-    _aggregate(d, fs, headers, sort)
+    headers = [Symbol(c, "_", f) for f in fs for c in names(d)]
+    res = DataFrame(Any[f(d[c]) for f in fs for c in names(d)], headers)
+    sort && sort!(res)
+    res
 end
 
 # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame
 aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
 function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
-    res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
-    sort && sort!(res, cols=headers)
+    res = gd.parent[gd.idx[gd.starts], gd.cols]
+    cols = setdiff(names(gd.parent), gd.cols)
+    for f in fs
+        for c in cols
+            res[Symbol(c, "_", f)] = [f(g[c]) for g in gd]
+        end
+    end
+    sort && sort!(res)
     res
 end
 
@@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame,
                    sort::Bool=false) where {S<:ColumnIndex, T <:Function}
     aggregate(groupby(d, cols, sort=sort), fs)
 end
-
-function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function
-    fnames = _fnames(fs) # see other/utils.jl
-    [Symbol(colname,'_',fname) for fname in fnames for colname in cn]
-end
-
-function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function
-    res = DataFrame(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
-    sort && sort!(res, cols=headers)
-    res
-end
diff --git a/src/other/utils.jl b/src/other/utils.jl
index 8aba353e1a..597f59810d 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -141,18 +141,3 @@ function countnull(a::CategoricalArray)
     end
     return res
 end
-
-# Gets the name of a function. Used in groupeDataFrame/grouping.jl
-function _fnames(fs::Vector{T}) where T<:Function
-    λcounter = 0
-    names = map(fs) do f
-        name = string(f)
-        if name == "(anonymous function)" # Anonymous functions with Julia < 0.5
-            λcounter += 1
-            name = "λ$(λcounter)"
-        end
-        name
-    end
-    names
-end
-
diff --git a/test/grouping.jl b/test/grouping.jl
index a6021a21cd..5460a83edb 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -181,4 +181,40 @@ module TestGrouping
     @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
     @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
     @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)
+
+    @testset "aggregate" begin
+        # test converting functions to valid column names
+        @test Symbol.([mean, sum]) == [:mean, :sum]
+        @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x))))
+
+        dt = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1)
+
+        @test aggregate(groupby(dt, :group), sum) ==
+              aggregate(dt, :group, sum)  ==
+            DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10])
+
+        @test aggregate(groupby(dt, :group), sum, sort = true) ==
+              aggregate(dt, :group, sum, sort = true)  ==
+            DataFrame(group = 'a':'c', x_sum = [10, 26, 42])
+
+        @test aggregate(dt, length) == DataFrame(group_length = 12, x_length = 12)
+        anonymous = x -> length(x)
+        @test aggregate(dt, anonymous) == DataFrame(Symbol("group_$anonymous") => 12,
+                                                    Symbol("x_$anonymous") => 12)
+
+        dt = DataFrame(year = repeat(1:4, inner = 12), month = repeat(1:12, outer = 4),
+                       a = 1:48, b = fill(24.5, 48))
+        @test aggregate(dt, [sum, length]) ==
+            DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176,
+                      year_length = 48, month_length = 48, a_length = 48, b_length = 48)
+        @test aggregate(dt, [:year], [sum, length]) ==
+            DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510],
+                      b_sum = fill(294, 4), month_length = fill(12, 4),
+                      a_length = fill(12, 4), b_length = fill(12, 4))
+
+        @test aggregate(dt, [:month], [sum, length]) ==
+            DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120),
+                      b_sum = fill(98, 12), year_length = fill(4, 12),
+                      a_length = fill(4, 12), b_length = fill(4, 12))
+    end
 end
diff --git a/test/utils.jl b/test/utils.jl
index ae15b1af29..40c4f9be7d 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -49,13 +49,6 @@ module TestUtils
     pdata[1:end] = null
     @test DataFrames.countnull(pdata) == 20
 
-    funs = [mean, sum, var, x -> sum(x)]
-    if string(funs[end]) == "(anonymous function)" # Julia < 0.5
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", "λ1"]
-    else
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", string(funs[end])]
-    end
-
     @testset "describe" begin
         io = IOBuffer()
         df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),