Modify aggregate for efficiency

Bypassing `map` and `combine` in `aggregate` speeds up the code and reduces memory allocations by roughly 1-2 orders of magnitude. The `by` function still uses `map` and `combine`, so it continues to support do blocks and more complex anonymous functions, including ones that return DataFrames. The helper functions for naming new columns were inlined, and their Julia 0.4 compatibility code was removed. Tests for `aggregate` were added.
JuliaData · Oct 9, 2017 · 2b42e2d · 2b42e2d
1 parent eb3d10a
commit 2b42e2d
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 41 deletions.
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs)
 * `fs` : a function or vector of functions to be applied to vectors
   within groups; expects each argument to be a column vector
 
-Each `fs` should return a value or vector. All returns must be the
-same length.
-
 ### Returns
 
 * `::DataFrame`
@@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))]   # equivalent
 """
 aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
 function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, _names(d))
-    _aggregate(d, fs, headers, sort)
+    headers = [Symbol(c, "_", f) for f in fs for c in names(d)]
+    res = DataFrame(Any[f(d[c]) for f in fs for c in names(d)], headers)
+    sort && sort!(res)
+    res
 end
 
 # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame
 aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
 function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function
-    headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
-    res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
-    sort && sort!(res, cols=headers)
+    res = gd.parent[gd.idx[gd.starts], gd.cols]
+    cols = setdiff(names(gd.parent), gd.cols)
+    for f in fs
+        for c in cols
+            res[Symbol(c, "_", f)] = [f(g[c]) for g in gd]
+        end
+    end
+    sort && sort!(res)
     res
 end
 
@@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame,
                    sort::Bool=false) where {S<:ColumnIndex, T <:Function}
     aggregate(groupby(d, cols, sort=sort), fs)
 end
-
-function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function
-    fnames = _fnames(fs) # see other/utils.jl
-    [Symbol(colname,'_',fname) for fname in fnames for colname in cn]
-end
-
-function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function
-    res = DataFrame(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
-    sort && sort!(res, cols=headers)
-    res
-end
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -141,18 +141,3 @@ function countnull(a::CategoricalArray)
     end
     return res
 end
-
-# Gets the name of a function. Used in groupeDataFrame/grouping.jl
-function _fnames(fs::Vector{T}) where T<:Function
-    λcounter = 0
-    names = map(fs) do f
-        name = string(f)
-        if name == "(anonymous function)" # Anonymous functions with Julia < 0.5
-            λcounter += 1
-            name = "λ$(λcounter)"
-        end
-        name
-    end
-    names
-end
-
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -181,4 +181,40 @@ module TestGrouping
     @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2)
     @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3)
     @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4)
+
+    @testset "aggregate" begin
+        # test converting functions to valid column names
+        @test Symbol.([mean, sum]) == [:mean, :sum]
+        @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x))))
+
+        dt = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1)
+
+        @test aggregate(groupby(dt, :group), sum) ==
+              aggregate(dt, :group, sum)  ==
+            DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10])
+
+        @test aggregate(groupby(dt, :group), sum, sort = true) ==
+              aggregate(dt, :group, sum, sort = true)  ==
+            DataFrame(group = 'a':'c', x_sum = [10, 26, 42])
+
+        @test aggregate(dt, length) == DataFrame(group_length = 12, x_length = 12)
+        anonymous = x -> length(x)
+        @test aggregate(dt, anonymous) == DataFrame(Symbol("group_$anonymous") => 12,
+                                                    Symbol("x_$anonymous") => 12)
+
+        dt = DataFrame(year = repeat(1:4, inner = 12), month = repeat(1:12, outer = 4),
+                       a = 1:48, b = fill(24.5, 48))
+        @test aggregate(dt, [sum, length]) ==
+            DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176,
+                      year_length = 48, month_length = 48, a_length = 48, b_length = 48)
+        @test aggregate(dt, [:year], [sum, length]) ==
+            DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510],
+                      b_sum = fill(294, 4), month_length = fill(12, 4),
+                      a_length = fill(12, 4), b_length = fill(12, 4))
+
+        @test aggregate(dt, [:month], [sum, length]) ==
+            DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120),
+                      b_sum = fill(98, 12), year_length = fill(4, 12),
+                      a_length = fill(4, 12), b_length = fill(4, 12))
+    end
 end
diff --git a/test/utils.jl b/test/utils.jl
@@ -49,13 +49,6 @@ module TestUtils
     pdata[1:end] = null
     @test DataFrames.countnull(pdata) == 20
 
-    funs = [mean, sum, var, x -> sum(x)]
-    if string(funs[end]) == "(anonymous function)" # Julia < 0.5
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", "λ1"]
-    else
-        @test DataFrames._fnames(funs) == ["mean", "sum", "var", string(funs[end])]
-    end
-
     @testset "describe" begin
         io = IOBuffer()
         df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),