JuliaData · cjprybol · Oct 9, 2017 · nalimilan · Oct 9, 2017 · cjprybol
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs)
 * `fs` : a function or vector of functions to be applied to vectors
   within groups; expects each argument to be a column vector
 
-Each `fs` should return a value or vector. All returns must be the
-same length.
-
 ### Returns
 
 * `::DataFrame`
@@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))]   # equivalent
 """
 aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
 function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, _names(d))
-    _aggregate(d, fs, headers, sort)
+    headers = [Symbol(c, "_", f) for f in fs for c in names(d)]  # "<column>_<function>"; f is the outer loop
+    res = DataFrame(Any[f(d[c]) for f in fs for c in names(d)], headers)  # same f-outer, c-inner order as headers
+    sort && sort!(res)  # only when sort=true; no sort columns are specified
+    res
 end
 
 # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame
 aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
 function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
-    res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
-    sort && sort!(res, cols=headers)
+    res = gd.parent[gd.idx[gd.starts], gd.cols]  # key columns; presumably the first row of each group (TODO confirm)
+    cols = setdiff(names(gd.parent), gd.cols)  # parent columns that are not in gd.cols
+    for f in fs
+        for c in cols
+            res[Symbol(c, "_", f)] = [f(g[c]) for g in gd]  # f applied to column c of each group in gd
+        end
+    end
+    sort && sort!(res)  # only when sort=true; no sort columns are specified
     res
 end
 
@@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame,
                    sort::Bool=false) where {S<:ColumnIndex, T <:Function}
     aggregate(groupby(d, cols, sort=sort), fs)
 end
-
-function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function
-    fnames = _fnames(fs) # see other/utils.jl
-    [Symbol(colname,'_',fname) for fname in fnames for colname in cn]
-end
-
-function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function
-    res = DataFrame(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
-    sort && sort!(res, cols=headers)
-    res
-end
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -141,18 +141,3 @@ function countnull(a::CategoricalArray)
     end
     return res
 end
-
-# Gets the name of a function. Used in groupeDataFrame/grouping.jl
-function _fnames(fs::Vector{T}) where T<:Function
-    λcounter = 0
-    names = map(fs) do f
-        name = string(f)
-        if name == "(anonymous function)" # Anonymous functions with Julia < 0.5
-            λcounter += 1
-            name = "λ$(λcounter)"
-        end
-        name
-    end
-    names
-end
-
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -181,4 +181,40 @@ module TestGrouping
     @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
     @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
     @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)
+
+    @testset "aggregate" begin
+        # functions (named and anonymous) must convert to Symbols usable in column names
+        @test Symbol.([mean, sum]) == [:mean, :sum]
+        @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x))))
+
+        dt = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1)  # groups appear as c, b, a
+
+        @test aggregate(groupby(dt, :group), sum) ==
+              aggregate(dt, :group, sum)  ==
+            DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10])  # unsorted: order of appearance
+
+        @test aggregate(groupby(dt, :group), sum, sort = true) ==
+              aggregate(dt, :group, sum, sort = true)  ==
+            DataFrame(group = 'a':'c', x_sum = [10, 26, 42])  # sort=true: groups come out a, b, c
+
+        @test aggregate(dt, length) == DataFrame(group_length = 12, x_length = 12)  # no keys: all columns aggregated
+        anonymous = x -> length(x)  # its string form is interpolated into the expected names below
+        @test aggregate(dt, anonymous) == DataFrame(Symbol("group_$anonymous") => 12,
+                                                    Symbol("x_$anonymous") => 12)
+
+        dt = DataFrame(year = repeat(1:4, inner = 12), month = repeat(1:12, outer = 4),
+                       a = 1:48, b = fill(24.5, 48))  # b is Float64; its sums below are whole numbers
+        @test aggregate(dt, [sum, length]) ==
+            DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176,
+                      year_length = 48, month_length = 48, a_length = 48, b_length = 48)  # all sums, then all lengths
+        @test aggregate(dt, [:year], [sum, length]) ==
+            DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510],
+                      b_sum = fill(294, 4), month_length = fill(12, 4),
+                      a_length = fill(12, 4), b_length = fill(12, 4))  # key column first, not aggregated
+
+        @test aggregate(dt, [:month], [sum, length]) ==
+            DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120),
+                      b_sum = fill(98, 12), year_length = fill(4, 12),
+                      a_length = fill(4, 12), b_length = fill(4, 12))  # key column first, not aggregated
+    end
 end
diff --git a/test/utils.jl b/test/utils.jl
@@ -49,13 +49,6 @@ module TestUtils
     pdata[1:end] = null
     @test DataFrames.countnull(pdata) == 20
 
-    funs = [mean, sum, var, x -> sum(x)]
-    if string(funs[end]) == "(anonymous function)" # Julia < 0.5
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", "λ1"]
-    else
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", string(funs[end])]
-    end
-
     @testset "describe" begin
         io = IOBuffer()
         df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),