diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 63a63f5224..3fee71a8a5 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -320,9 +320,6 @@ aggregate(gd::GroupedDataFrame, fs) * `fs` : a function or vector of functions to be applied to vectors within groups; expects each argument to be a column vector -Each `fs` should return a value or vector. All returns must be the -same length. - ### Returns * `::DataFrame` @@ -342,16 +339,23 @@ df |> groupby(:a) |> [sum, x->mean(Nulls.skip(x))] # equivalent """ aggregate(d::AbstractDataFrame, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort) function aggregate(d::AbstractDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function - headers = _makeheaders(fs, _names(d)) - _aggregate(d, fs, headers, sort) + headers = [Symbol(c, "_", f) for f in fs for c in names(d)] + res = DataFrame(Any[f(d[c]) for f in fs for c in names(d)], headers) + sort && sort!(res) + res end # Applies aggregate to non-key cols of each SubDataFrame of a GroupedDataFrame aggregate(gd::GroupedDataFrame, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort) function aggregate(gd::GroupedDataFrame, fs::Vector{T}; sort::Bool=false) where T<:Function - headers = _makeheaders(fs, setdiff(_names(gd), gd.cols)) - res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) - sort && sort!(res, cols=headers) + res = gd.parent[gd.idx[gd.starts], gd.cols] + cols = setdiff(names(gd.parent), gd.cols) + for f in fs + for c in cols + res[Symbol(c, "_", f)] = [f(g[c]) for g in gd] + end + end + sort && sort!(res) res end @@ -365,14 +369,3 @@ function aggregate(d::AbstractDataFrame, sort::Bool=false) where {S<:ColumnIndex, T <:Function} aggregate(groupby(d, cols, sort=sort), fs) end - -function _makeheaders(fs::Vector{T}, cn::Vector{Symbol}) where T<:Function - fnames = _fnames(fs) # see other/utils.jl - [Symbol(colname,'_',fname) for fname in fnames for colname in cn] -end - -function _aggregate(d::AbstractDataFrame, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) where T<:Function - res = DataFrame(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers) - sort && sort!(res, cols=headers) - res -end diff --git a/src/other/utils.jl b/src/other/utils.jl index 8aba353e1a..597f59810d 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -141,18 +141,3 @@ function countnull(a::CategoricalArray) end return res end - -# Gets the name of a function. Used in groupeDataFrame/grouping.jl -function _fnames(fs::Vector{T}) where T<:Function - λcounter = 0 - names = map(fs) do f - name = string(f) - if name == "(anonymous function)" # Anonymous functions with Julia < 0.5 - λcounter += 1 - name = "λ$(λcounter)" - end - name - end - names -end - diff --git a/test/grouping.jl b/test/grouping.jl index a6021a21cd..5460a83edb 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -181,4 +181,40 @@ module TestGrouping @test gd[2] == DataFrame(Key1="A", Key2="B", Value=2) @test gd[3] == DataFrame(Key1="B", Key2="A", Value=3) @test gd[4] == DataFrame(Key1="B", Key2="B", Value=4) + + @testset "aggregate" begin + # test converting functions to valid column names + @test Symbol.([mean, sum]) == [:mean, :sum] + @test ismatch(r"#\d+", string(Symbol(x -> reduce(^, x)))) + + dt = DataFrame(group = repeat('c':-1:'a', inner = 4), x = 12:-1:1) + + @test aggregate(groupby(dt, :group), sum) == + aggregate(dt, :group, sum) == + DataFrame(group = 'c':-1:'a', x_sum = [42, 26, 10]) + + @test aggregate(groupby(dt, :group), sum, sort = true) == + aggregate(dt, :group, sum, sort = true) == + DataFrame(group = 'a':'c', x_sum = [10, 26, 42]) + + @test aggregate(dt, length) == DataFrame(group_length = 12, x_length = 12) + anonymous = x -> length(x) + @test aggregate(dt, anonymous) == DataFrame(Symbol("group_$anonymous") => 12, + Symbol("x_$anonymous") => 12) + + dt = DataFrame(year = repeat(1:4, inner = 12), month = repeat(1:12, outer = 4), + a = 1:48, b = fill(24.5, 48)) + @test aggregate(dt, [sum, length]) == + DataFrame(year_sum = 120, month_sum = 312, a_sum = 1176, b_sum = 1176, + year_length = 48, month_length = 48, a_length = 48, b_length = 48) + @test aggregate(dt, [:year], [sum, length]) == + DataFrame(year = 1:4, month_sum = fill(78, 4), a_sum = [78, 222, 366, 510], + b_sum = fill(294, 4), month_length = fill(12, 4), + a_length = fill(12, 4), b_length = fill(12, 4)) + + @test aggregate(dt, [:month], [sum, length]) == + DataFrame(month = 1:12, year_sum = fill(10, 12), a_sum = collect(76:4:120), + b_sum = fill(98, 12), year_length = fill(4, 12), + a_length = fill(4, 12), b_length = fill(4, 12)) + end end diff --git a/test/utils.jl b/test/utils.jl index ae15b1af29..40c4f9be7d 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -49,13 +49,6 @@ module TestUtils pdata[1:end] = null @test DataFrames.countnull(pdata) == 20 - funs = [mean, sum, var, x -> sum(x)] - if string(funs[end]) == "(anonymous function)" # Julia < 0.5 - @test DataFrames._fnames(funs) == ["mean", "sum", "var", "λ1"] - else - @test DataFrames._fnames(funs) == ["mean", "sum", "var", string(funs[end])] - end - @testset "describe" begin io = IOBuffer() df = DataFrame(Any[collect(1:4), Vector{Union{Int, Null}}(2:5),