From f409948ed16c8a692e29feb679c98cc259319628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 12 Oct 2020 16:31:19 +0200 Subject: [PATCH 01/10] part 1 of implementation --- src/groupeddataframe/splitapplycombine.jl | 455 ++++++++++++---------- 1 file changed, 257 insertions(+), 198 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 5664e7449e..58a94885d4 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -453,52 +453,39 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum, renamecols=false) │ 8 │ 4 │ 1 │ 8 │ 9 │ ``` """ -function combine(f::Base.Callable, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) - return combine_helper(f, gd, keepkeys=keepkeys, ungroup=ungroup, - copycols=true, keeprows=false, renamecols=renamecols) -end - -combine(f::typeof(nrow), gd::GroupedDataFrame; +combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = - combine(gd, [nrow => :nrow], keepkeys=keepkeys, ungroup=ungroup, - renamecols=renamecols) - -function combine(p::Pair, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) - # move handling of aggregate to specialized combine - p_from, p_to = p - - # verify if it is not better to use a fast path, which we achieve - # by moving to combine(::GroupedDataFrame, ::AbstractVector) method - # note that even if length(gd) == 0 we can do this step - if isagg(p_from => (p_to isa Pair ? 
first(p_to) : p_to), gd) || p_from === nrow - return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup, renamecols=renamecols) - end - - if p_from isa Tuple - cs = collect(p_from) - # an explicit error is thrown as this was allowed in the past - throw(ArgumentError("passing a Tuple $p_from as column selector is not supported" * - ", use a vector $cs instead")) - else - cs = p_from - end - return combine_helper(cs => p_to, gd, keepkeys=keepkeys, ungroup=ungroup, - copycols=true, keeprows=false, renamecols=renamecols) -end + return combine(gd, f, keepkeys=keepkeys, ungroup=ungroup, renamecols=renamecols) combine(gd::GroupedDataFrame, - cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; + cs::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex}...; keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = _combine_prepare(gd, cs..., keepkeys=keepkeys, ungroup=ungroup, copycols=true, keeprows=false, renamecols=renamecols) +function gen_groups(idx::Vector{Int}) + groups = zeros(Int, length(idx)) + groups[1] = 1 + j = 1 + last_idx = idx[1] + @inbounds for i in 2:length(idx) + cur_idx = idx[i] + j += cur_idx != last_idx + last_idx = cur_idx + groups[i] = j + end + return groups +end + function _combine_prepare(gd::GroupedDataFrame, - @nospecialize(cs::Union{Pair, typeof(nrow), + @nospecialize(cs::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex}...); keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool, renamecols::Bool) + if !ungroup && !keepkeys + throw(ArgumentError("keepkeys=false when ungroup=false is not allowed")) + end + cs_vec = [] for p in cs if p === nrow @@ -514,91 +501,32 @@ function _combine_prepare(gd::GroupedDataFrame, # an explicit error is thrown as this was allowed in the past throw(ArgumentError("passing a Tuple $(first(x)) as column selector is not supported" * ", use a vector $(collect(first(x))) instead")) - for (i, v) in enumerate(cs_vec) - if first(v) isa Tuple - cs_vec[i] = collect(first(v)) => 
last(v) - end - end end - cs_norm_pre = [normalize_selection(index(parent(gd)), c, renamecols) for c in cs_vec] - seen_cols = Set{Symbol}() - process_vectors = false - for v in cs_norm_pre - if v isa Pair - out_col = last(last(v)) - if out_col in seen_cols - throw(ArgumentError("Duplicate output column name $out_col requested")) + + cs_norm = [] + optional_transform = Bool[] + for arg in [normalize_selection(index(parent(gd)), c, renamecols) for c in cs_vec] + if arg isa AbstractVector{Int} + for col_idx in arg + push!(cs_norm, col_idx => identity => _names(gd)[col_idx]) + push!(optional_transform, true) end - push!(seen_cols, out_col) else - @assert v isa AbstractVector{Int} - process_vectors = true + push!(cs_norm, arg) + push!(optional_transform, false) end end - processed_cols = Set{Symbol}() - if process_vectors - cs_norm = Pair[] - for (i, v) in enumerate(cs_norm_pre) - if v isa Pair - push!(cs_norm, v) - push!(processed_cols, last(last(v))) - else - @assert v isa AbstractVector{Int} - for col_idx in v - col_name = _names(gd)[col_idx] - if !(col_name in processed_cols) - push!(processed_cols, col_name) - if col_name in seen_cols - trans_idx = findfirst(cs_norm_pre) do p - p isa Pair || return false - last(last(p)) == col_name - end - @assert !isnothing(trans_idx) && trans_idx > i - push!(cs_norm, cs_norm_pre[trans_idx]) - # it is safe to delete from cs_norm_pre - # as we have not reached trans_idx index yet - deleteat!(cs_norm_pre, trans_idx) - else - push!(cs_norm, col_idx => identity => col_name) - end - end - end - end - end - else - cs_norm = collect(Pair, cs_norm_pre) - end - f = Pair[first(x) => first(last(x)) for x in cs_norm] - nms = Symbol[last(last(x)) for x in cs_norm] - return combine_helper(f, gd, nms, keepkeys=keepkeys, ungroup=ungroup, - copycols=copycols, keeprows=keeprows, renamecols=renamecols) -end -function gen_groups(idx::Vector{Int}) - groups = zeros(Int, length(idx)) - groups[1] = 1 - j = 1 - last_idx = idx[1] - @inbounds for i in 
2:length(idx) - cur_idx = idx[i] - j += cur_idx != last_idx - last_idx = cur_idx - groups[i] = j - end - return groups -end + # cs_norm holds now either src => fun => dst or just fun + # if optional_transform[i] is true then the transformation will be skipped + # if earlier column with a column with the same name was created + + idx, valscat = _combine(gd, cs_norm, optional_transform, copycols, keeprows, renamecols) -function combine_helper(f, gd::GroupedDataFrame, - nms::Union{AbstractVector{Symbol},Nothing}=nothing; - keepkeys::Bool, ungroup::Bool, - copycols::Bool, keeprows::Bool, renamecols::Bool) - if !ungroup && !keepkeys - throw(ArgumentError("keepkeys=false when ungroup=false is not allowed")) - end - idx, valscat = _combine(f, gd, nms, copycols, keeprows, renamecols) !keepkeys && ungroup && return valscat - keys = groupcols(gd) - for key in keys + + gd_keys = groupcols(gd) + for key in gd_keys if hasproperty(valscat, key) if (keeprows && !isequal(valscat[!, key], parent(gd)[!, key])) || (!keeprows && !isequal(valscat[!, key], view(parent(gd)[!, key], idx))) @@ -612,17 +540,17 @@ function combine_helper(f, gd::GroupedDataFrame, else newparent = length(gd) > 0 ? parent(gd)[idx, gd.cols] : parent(gd)[1:0, gd.cols] end - added_cols = select(valscat, Not(intersect(keys, _names(valscat))), copycols=false) + added_cols = select(valscat, Not(intersect(gd_keys, _names(valscat))), copycols=false) hcat!(newparent, length(gd) > 0 ? 
added_cols : similar(added_cols, 0), copycols=false) ungroup && return newparent - if length(idx) == 0 && !(keeprows && length(keys) > 0) + if length(idx) == 0 && !(keeprows && length(gd_keys) > 0) @assert nrow(newparent) == 0 return GroupedDataFrame(newparent, copy(gd.cols), Int[], Int[], Int[], Int[], 0, Dict{Any,Int}(), Threads.ReentrantLock()) elseif keeprows - @assert length(keys) > 0 || idx == gd.idx + @assert length(gd_keys) > 0 || idx == gd.idx # in this case we are sure that the result GroupedDataFrame has the # same structure as the source except that grouping columns are at the start return Threads.lock(gd.lazy_lock) do @@ -1117,8 +1045,9 @@ function (agg::Aggregate{typeof(length)})(incol::AbstractVector, gd::GroupedData end end -isagg((col, fun)::Pair, gdf::GroupedDataFrame) = - col isa ColumnIndex && check_aggregate(fun, parent(gdf)[!, col]) isa AbstractAggregate +isagg((col, (fun, outcol))::Pair{<:ColumnIndex, <:Pair{<:Any, <:SymbolOrString}}, gdf::GroupedDataFrame) = + check_aggregate(fun, parent(gdf)[!, col]) isa AbstractAggregate +isagg(::Any, gdf::GroupedDataFrame) = false function _agg2idx_map_helper(idx, idx_agg) agg2idx_map = fill(-1, length(idx)) @@ -1150,14 +1079,16 @@ function prepare_idx_keeprows(idx::AbstractVector{<:Integer}, return idx_keeprows end -function _combine(f::AbstractVector{<:Pair}, - gd::GroupedDataFrame, nms::AbstractVector{Symbol}, - copycols::Bool, keeprows::Bool, renamecols::Bool) - # here f should be normalized and in a form of source_cols => fun - @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f) - @assert all(x -> last(x) isa Base.Callable, f) +struct TransRes + col_idx::Vector{Int} # index for a column + col::AbstractVector # computed value of a column + name::Symbol # name of a column + optional::Bool # if a column is allowed to be replaced in the future +end - if isempty(f) +function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform::Vector{Bool}, + copycols::Bool, 
keeprows::Bool, renamecols::Bool) + if isempty(cs_norm) if keeprows && nrow(parent(gd)) > 0 && minimum(gd.groups) == 0 throw(ArgumentError("select and transform do not support " * "`GroupedDataFrame`s from which some groups have "* @@ -1178,30 +1109,100 @@ function _combine(f::AbstractVector{<:Pair}, end idx_agg = nothing - if length(gd) > 0 && any(x -> isagg(x, gd), f) + if length(gd) > 0 && any(x -> isagg(x, gd), cs_norm) # Compute indices of representative rows only once for all AbstractAggregates idx_agg = Vector{Int}(undef, length(gd)) fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd) - elseif length(gd) == 0 || !all(x -> isagg(x, gd), f) + elseif length(gd) == 0 || !all(x -> isagg(x, gd), cs_norm) # Trigger computation of indices # This can speed up some aggregates that would not trigger this on their own @assert gd.idx !== nothing end - res = Vector{Any}(undef, length(f)) + + trans_res = Vector{TransRes}() + + # seen_cols keeps an information about lotacion of columns already processed + # and if a given column can be replaced in the future + seen_cols = Dict{Symbol, Tuple{Bool, Int}}() + parentdf = parent(gd) - for (i, p) in enumerate(f) - source_cols, fun = p - if length(gd) > 0 && isagg(p, gd) - incol = parentdf[!, source_cols] - agg = check_aggregate(last(p), incol) + for i in eachindex(cs_norm, optional_transform) + cs_i = cs_norm[i] + ot_i = optional_transform[i] + + if length(gd) > 0 && isagg(cs_i, gd) + @assert !ot_i + out_col_name = last(last(cs_i)) + incol = parentdf[!, first(cs_i)] + agg = check_aggregate(first(last(cs_i)), incol) outcol = agg(incol, gd) - res[i] = idx_agg, outcol - elseif keeprows && fun === identity && !(source_cols isa AsTable) + + if haskey(seen_cols, out_col_name) + optional, loc = seen_cols[out_col_name] + # we have seen this col but it is not allowed to replace it + optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) + @assert trans_res[loc].optional && trans_res[loc].name == out_col_name + 
trans_res[loc] = TransRes(idx_agg, outcol, out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + else + push!(trans_res, TransRes(idx_agg, outcol, out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end + elseif keeprows && cs_i isa Pair && first(last(cs_i)) === identity && + !(first(cs_i) isa AsTable) && (last(last(cs_i)) isa Symbol) + # this is a fast path used when we pass a column or rename a column in select or transform + source_cols = first(cs_i) + out_col_name = last(last(cs_i)) @assert source_cols isa Union{Int, AbstractVector{Int}} @assert length(source_cols) == 1 outcol = parentdf[!, first(source_cols)] - res[i] = idx_keeprows, copycols ? copy(outcol) : outcol + + if haskey(seen_cols, out_col_name) + optional, loc = seen_cols[out_col_name] + @assert trans_res[loc].name == out_col_name + if optional + if !ot_i + @assert trans_res[loc].optional + trans_res[loc] = TransRes(idx_keeprows, copycols ? copy(outcol) : outcol, + out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + end + else + # if ot_i is true, then we ignore processing this column + ot_i || throw(ArgumentError("duplicate output column name: :$out_col_name")) + end + else + push!(trans_res, TransRes(idx_keeprows, copycols ? copy(outcol) : outcol, + out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end + elseif cs_i isa Base.Callable + firstres = length(gd) > 0 ? 
cs_i(gd[1]) : cs_i(similar(parent(gd), 0)) + idx, outcols, nms = _combine_multicol(firstres, cs_i, gd, nothing) + @assert length(outcols) == length(nms) + for j in eachindex(outcols) + outcol = outcols[j] + out_col_name = nms[j] + if haskey(seen_cols, out_col_name) + optional, loc = seen_cols[out_col_name] + # if column was seen and it is optional now ignore it + if !ot_i + optional, loc = seen_cols[out_col_name] + # we have seen this col but it is not allowed to replace it + optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) + @assert trans_res[loc].optional && trans_res[loc].name == out_col_name + trans_res[loc] = TransRes(idx, outcol, out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + end + else + push!(trans_res, TransRes(idx, outcol, out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end + end else + @assert cs_i isa Pair + source_cols, (fun, out_col_name) = cs_i + out_col_name isa Symbol || throw(ArgumentError("returning multiple columns is not supported yet")) if source_cols isa Int incols = (parentdf[!, source_cols],) elseif source_cols isa AsTable @@ -1217,8 +1218,7 @@ function _combine(f::AbstractVector{<:Pair}, do_call(fun, Int[], 1:1, 0:0, gd, incols, 1) firstmulticol = firstres isa MULTI_COLS_TYPE if firstmulticol - throw(ArgumentError("a single value or vector result is required when " * - "passing multiple functions (got $(typeof(res)))")) + throw(ArgumentError("a single value or vector result is required (got $(typeof(res)))")) end # if idx_agg was not computed yet it is nothing # in this case if we are not passed a vector compute it. @@ -1237,28 +1237,44 @@ function _combine(f::AbstractVector{<:Pair}, Val(firstmulticol), firstres isa AbstractVector ? 
nothing : idx_agg) @assert length(outcols) == 1 - res[i] = idx, outcols[1] + outcol = outcols[1] + + if haskey(seen_cols, out_col_name) + # if column was seen and it is optional now ignore it + if !ot_i + optional, loc = seen_cols[out_col_name] + # we have seen this col but it is not allowed to replace it + optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) + @assert trans_res[loc].optional && trans_res[loc].name == out_col_name + trans_res[loc] = TransRes(idx, outcol, out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + end + else + push!(trans_res, TransRes(idx, outcol, out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end end end + # idx_agg === nothing then we have only functions that # returned multiple rows and idx_loc = 1 - idx_loc = findfirst(x -> x[1] !== idx_agg, res) + idx_loc = findfirst(x -> x.col_idx !== idx_agg, trans_res) if !keeprows && isnothing(idx_loc) @assert !isnothing(idx_agg) idx = idx_agg else - idx = keeprows ? idx_keeprows : res[idx_loc][1] + idx = keeprows ? 
idx_keeprows : trans_res[idx_loc].col_idx agg2idx_map = nothing - for i in 1:length(res) - if res[i][1] !== idx && res[i][1] != idx - if res[i][1] === idx_agg + for i in 1:length(trans_res) + if trans_res[i].col_idx !== idx + if trans_res[i].col_idx === idx_agg # we perform pseudo broadcasting here # keep -1 as a sentinel for errors if isnothing(agg2idx_map) agg2idx_map = _agg2idx_map_helper(idx, idx_agg) end - res[i] = idx_agg, res[i][2][agg2idx_map] - elseif idx != res[i][1] + trans_res[i] = TransRes(idx_agg, trans_res[i].col[agg2idx_map], trans_res[i].name, trans_res[i].optional) + elseif idx != trans_res[i].col_idx if keeprows throw(ArgumentError("all functions must return vectors with " * "as many values as rows in each group")) @@ -1270,70 +1286,34 @@ function _combine(f::AbstractVector{<:Pair}, end end - # here first field in res[i] is used to keep track how the column was generated + # here first field in trans_res[i] is used to keep track how the column was generated # a correct index is stored in idx variable - for (i, (col_idx, col)) in enumerate(res) - if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column + for i in eachindex(trans_res) + col_idx = trans_res[i].col_idx + col = trans_res[i].col + if keeprows && col_idx !== idx_keeprows # we need to reorder the column newcol = similar(col) # we can probably make it more efficient, but I leave it as an optimization for the future gd_idx = gd.idx - for j in eachindex(gd.idx, col) - newcol[gd_idx[j]] = col[j] + k = 0 + for (s, e) in zip(gd.starts, gd.ends) + for j in s:e + k += 1 + newcol[gd_idx[j]] = col[k] + end end - res[i] = (col_idx, newcol) + @assert k == length(gd_idx) + trans_res[i] = TransRes(col_idx, newcol, trans_res[i].name, trans_res[i].optional) end end - outcols = map(x -> x[2], res) + + outcols = AbstractVector[x.col for x in trans_res] + nms = Symbol[x.name for x in trans_res] # this check is redundant given we check idx above # but it is safer to double check and it is 
cheap @assert all(x -> length(x) == length(outcols[1]), outcols) - return idx, DataFrame(collect(AbstractVector, outcols), nms, copycols=false) -end - -function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, - copycols::Bool, keeprows::Bool, renamecols::Bool) - @assert copycols && !keeprows - # use `similar` as `gd` might have been subsetted - firstres = length(gd) > 0 ? fun(gd[1]) : fun(similar(parent(gd), 0)) - idx, outcols, nms = _combine_multicol(firstres, fun, gd, nothing) - valscat = DataFrame(collect(AbstractVector, outcols), nms) - return idx, valscat -end - -function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing, - copycols::Bool, keeprows::Bool, renamecols::Bool) - # here p should not be normalized as we allow tabular return value from fun - # map and combine should not dispatch here if p is isagg - @assert copycols && !keeprows - source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p, renamecols) - parentdf = parent(gd) - if source_cols isa Int - incols = (parent(gd)[!, source_cols],) - elseif source_cols isa AsTable - incols = Tables.columntable(select(parentdf, - source_cols.cols, - copycols=false)) - else - @assert source_cols isa AbstractVector{Int} - incols = ntuple(i -> parent(gd)[!, source_cols[i]], length(source_cols)) - end - firstres = length(gd) > 0 ? 
- do_call(fun, gd.idx, gd.starts, gd.ends, gd, incols, 1) : - do_call(fun, Int[], 1:1, 0:0, gd, incols, 1) - idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) - # disallow passing target column name to genuine tables - if firstres isa MULTI_COLS_TYPE - if p isa Pair{<:Any, <:Pair{<:Any, <:SymbolOrString}} - throw(ArgumentError("setting column name for tabular return value is disallowed")) - end - else - # fetch auto generated or passed target column name to nms overwritting - # what _combine_with_first produced - nms = [out_col] - end - valscat = DataFrame(collect(AbstractVector, outcols), nms) - return idx, valscat + return idx, DataFrame(outcols, nms, copycols=false) end function _combine_multicol(firstres, fun::Any, gd::GroupedDataFrame, @@ -1716,6 +1696,15 @@ julia> select(gd, :, AsTable(Not(:a)) => sum, renamecols=false) │ 8 │ 2 │ 1 │ 8 │ 9 │ ``` """ +function select(f::Base.Callable, gd::GroupedDataFrame; copycols::Bool=true, + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) + if f isa Colon + throw(ArgumentError("First argument must be a transformation if the second argument is a grouped data frame")) + end + return select(gd, f, copycols=copycols, keepkeys=keepkeys, ungroup=ungroup) +end + + select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys, @@ -1733,6 +1722,14 @@ but keeps the columns of `parent(gd)` in their original order. 
[`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref) """ +function transform(f::Base.Callable, gd::GroupedDataFrame; copycols::Bool=true, + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) + if f isa Colon + throw(ArgumentError("First argument must be a transformation if the second argument is a grouped data frame")) + end + return transform(gd, f, copycols=copycols, keepkeys=keepkeys, ungroup=ungroup) +end + function transform(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) res = select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, @@ -1758,6 +1755,13 @@ using the same parent data frame they might get corrupt. [`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref) """ +function select!(f::Base.Callable, gd::GroupedDataFrame; ungroup::Bool=true, renamecols::Bool=true) + if f isa Colon + throw(ArgumentError("First argument must be a transformation if the second argument is a grouped data frame")) + end + return select!(gd, f, ungroup=ungroup) +end + function select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) newdf = select(gd, args..., copycols=false, renamecols=renamecols) @@ -1778,6 +1782,13 @@ and keeps the columns of `parent(gd)` in their original order. 
[`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref) """ +function transform!(f::Base.Callable, gd::GroupedDataFrame; ungroup::Bool=true, renamecols::Bool=true) + if f isa Colon + throw(ArgumentError("First argument must be a transformation if the second argument is a grouped data frame")) + end + return transform!(gd, f, ungroup=ungroup) +end + function transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) newdf = select(gd, :, args..., copycols=false, renamecols=renamecols) @@ -1786,3 +1797,51 @@ function transform!(gd::GroupedDataFrame{DataFrame}, args...; _replace_columns!(df, newdf) return ungroup ? df : gd end + + +#### OLD CODE: remove later + +# function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, +# copycols::Bool, keeprows::Bool, renamecols::Bool) +# @assert copycols && !keeprows +# # use `similar` as `gd` might have been subsetted +# firstres = length(gd) > 0 ? fun(gd[1]) : fun(similar(parent(gd), 0)) +# idx, outcols, nms = _combine_multicol(firstres, fun, gd, nothing) +# valscat = DataFrame(collect(AbstractVector, outcols), nms) +# return idx, valscat +# end + +# function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing, +# copycols::Bool, keeprows::Bool, renamecols::Bool) +# # here p should not be normalized as we allow tabular return value from fun +# # map and combine should not dispatch here if p is isagg +# @assert copycols && !keeprows +# source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p, renamecols) +# parentdf = parent(gd) +# if source_cols isa Int +# incols = (parent(gd)[!, source_cols],) +# elseif source_cols isa AsTable +# incols = Tables.columntable(select(parentdf, +# source_cols.cols, +# copycols=false)) +# else +# @assert source_cols isa AbstractVector{Int} +# incols = ntuple(i -> parent(gd)[!, source_cols[i]], length(source_cols)) +# end +# firstres = length(gd) > 0 ? 
+# do_call(fun, gd.idx, gd.starts, gd.ends, gd, incols, 1) : +# do_call(fun, Int[], 1:1, 0:0, gd, incols, 1) +# idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) +# # disallow passing target column name to genuine tables +# if firstres isa MULTI_COLS_TYPE +# if p isa Pair{<:Any, <:Pair{<:Any, <:SymbolOrString}} +# throw(ArgumentError("setting column name for tabular return value is disallowed")) +# end +# else +# # fetch auto generated or passed target column name to nms overwritting +# # what _combine_with_first produced +# nms = [out_col] +# end +# valscat = DataFrame(collect(AbstractVector, outcols), nms) +# return idx, valscat +# end From 208a79bf4fbd9ece3e512a01ff83570c0d2d9e83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 12 Oct 2020 17:12:06 +0200 Subject: [PATCH 02/10] add tests of reordered GroupedDataFrame --- test/grouping.jl | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/test/grouping.jl b/test/grouping.jl index d5f1934506..30ae81f39d 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3020,4 +3020,59 @@ end @test_throws MethodError select(gdf, AsTable([]) => ByRow(inc0) => :bin) end +@testset "aggregation of reordered groups" begin + df = DataFrame(id = [1, 2, 3, 1, 3, 2], x=1:6) + gdf = groupby(df, :id) + @test select(df, :id, :x => x -> 2x) == select(gdf, :x => x -> 2x) + @test select(df, identity) == select(gdf, identity) + @test select(df, :id, x -> (a=x.x, b=x.x)) == select(gdf, x -> (a=x.x, b=x.x)) + @test transform(df, :x => x -> 2x) == transform(gdf, :x => x -> 2x) + @test transform(df, identity) == transform(gdf, identity) + @test transform(df, x -> (a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) + @test combine(gdf, :x => x -> 2x) == + DataFrame(id=[1, 1, 2, 2, 3, 3], x_function=[2, 8, 4, 12, 6, 10]) + @test combine(gdf, identity) == df + @test combine(gdf, x -> (a=x.x, b=x.x)) == + DataFrame(id=[1, 1, 2, 2, 3, 3], a=[1, 4, 2, 
6, 3, 5], b=[1, 4, 2, 6, 3, 5]) + gdf = groupby(df, :id)[[3, 1, 2]] + @test select(df, :id, :x => x -> 2x) == select(gdf, :x => x -> 2x) + @test select(df, identity) == select(gdf, identity) + @test select(df, :id, x -> (a=x.x, b=x.x)) == select(gdf, x -> (a=x.x, b=x.x)) + @test transform(df, :x => x -> 2x) == transform(gdf, :x => x -> 2x) + @test transform(df, identity) == transform(gdf, identity) + @test transform(df, x -> (a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) + @test combine(gdf, :x => x -> 2x) == + DataFrame(id=[3, 3, 1, 1, 2, 2], x_function=[6, 10, 2, 8, 4, 12]) + @test combine(gdf, identity) == df[[3, 5, 1, 4, 2, 6], :] + @test combine(gdf, x -> (a=x.x, b=x.x)) == + DataFrame(id=[3, 3, 1, 1, 2, 2], a=[3, 5, 1, 4, 2, 6], b=[3, 5, 1, 4, 2, 6]) + + df = DataFrame(id = [3, 2, 1, 3, 1, 2], x=1:6) + gdf = groupby(df, :id, sort=true) + @test select(df, :id, :x => x -> 2x) == select(gdf, :x => x -> 2x) + @test select(df, identity) == select(gdf, identity) + @test select(df, :id, x -> (a=x.x, b=x.x)) == select(gdf, x -> (a=x.x, b=x.x)) + @test transform(df, :x => x -> 2x) == transform(gdf, :x => x -> 2x) + @test transform(df, identity) == transform(gdf, identity) + @test transform(df, x -> (a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) + @test combine(gdf, :x => x -> 2x) == + DataFrame(id=[1, 1, 2, 2, 3, 3], x_function=[6, 10, 4, 12, 2, 8]) + @test combine(gdf, identity) == DataFrame(id=[1, 1, 2, 2, 3, 3], x=[3, 5, 2, 6, 1, 4]) + @test combine(gdf, x -> (a=x.x, b=x.x)) == + DataFrame(id=[1, 1, 2, 2, 3, 3], a=[3, 5, 2, 6, 1, 4], b=[3, 5, 2, 6, 1, 4]) + + gdf = groupby(df, :id)[[3, 1, 2]] + @test select(df, :id, :x => x -> 2x) == select(gdf, :x => x -> 2x) + @test select(df, identity) == select(gdf, identity) + @test select(df, :id, x -> (a=x.x, b=x.x)) == select(gdf, x -> (a=x.x, b=x.x)) + @test transform(df, :x => x -> 2x) == transform(gdf, :x => x -> 2x) + @test transform(df, identity) == transform(gdf, identity) + @test transform(df, x -> 
(a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) + @test combine(gdf, :x => x -> 2x) == + DataFrame(id=[1, 1, 3, 3, 2, 2], x_function=[6, 10, 2, 8, 4, 12]) + @test combine(gdf, identity) == DataFrame(id=[1, 1, 3, 3, 2, 2], x=[3, 5, 1, 4, 2, 6]) + @test combine(gdf, x -> (a=x.x, b=x.x)) == + DataFrame(id=[1, 1, 3, 3, 2, 2], a=[3, 5, 1, 4, 2, 6], b=[3, 5, 1, 4, 2, 6]) +end + end # module From 610dce7fcb5a84a4630d76c163b34d7dce1b228d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 12 Oct 2020 17:53:11 +0200 Subject: [PATCH 03/10] add @inbounds comment --- src/groupeddataframe/splitapplycombine.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 58a94885d4..d07459e588 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1297,6 +1297,7 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform # we can probably make it more efficient, but I leave it as an optimization for the future gd_idx = gd.idx k = 0 + # consider adding @inbounds later for (s, e) in zip(gd.starts, gd.ends) for j in s:e k += 1 From 96ebb094df246982ee091ca42d5113bb4550e132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 13 Oct 2020 16:44:56 +0200 Subject: [PATCH 04/10] support AsTable and multicolumn return values --- src/groupeddataframe/splitapplycombine.jl | 174 ++++++++++++---------- 1 file changed, 93 insertions(+), 81 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index d07459e588..6ce42824da 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1202,7 +1202,7 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform else @assert cs_i isa Pair source_cols, (fun, out_col_name) = cs_i - out_col_name isa Symbol || 
throw(ArgumentError("returning multiple columns is not supported yet")) + if source_cols isa Int incols = (parentdf[!, source_cols],) elseif source_cols isa AsTable @@ -1213,45 +1213,105 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform @assert source_cols isa AbstractVector{Int} incols = ntuple(i -> parentdf[!, source_cols[i]], length(source_cols)) end + firstres = length(gd) > 0 ? do_call(fun, gd.idx, gd.starts, gd.ends, gd, incols, 1) : do_call(fun, Int[], 1:1, 0:0, gd, incols, 1) firstmulticol = firstres isa MULTI_COLS_TYPE - if firstmulticol - throw(ArgumentError("a single value or vector result is required (got $(typeof(res)))")) - end - # if idx_agg was not computed yet it is nothing - # in this case if we are not passed a vector compute it. - if !(firstres isa AbstractVector) && isnothing(idx_agg) - idx_agg = Vector{Int}(undef, length(gd)) - fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd) - end - # TODO: if firstres is a vector we recompute idx for every function - # this could be avoided - it could be computed only the first time - # and later we could just check if lengths of groups match this first idx - - # the last argument passed to _combine_with_first informs it about precomputed - # idx. Currently we do it only for single-row return values otherwise we pass - # nothing to signal that idx has to be computed in _combine_with_first - idx, outcols, _ = _combine_with_first(wrap(firstres), fun, gd, incols, - Val(firstmulticol), - firstres isa AbstractVector ? 
nothing : idx_agg) - @assert length(outcols) == 1 - outcol = outcols[1] - if haskey(seen_cols, out_col_name) - # if column was seen and it is optional now ignore it - if !ot_i - optional, loc = seen_cols[out_col_name] - # we have seen this col but it is not allowed to replace it - optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) - @assert trans_res[loc].optional && trans_res[loc].name == out_col_name - trans_res[loc] = TransRes(idx, outcol, out_col_name, ot_i) - seen_cols[out_col_name] = (ot_i, loc) + if out_col_name isa Symbol + if firstmulticol + throw(ArgumentError("a single value or vector result is required (got $(typeof(firstres)))")) + end + # if idx_agg was not computed yet it is nothing + # in this case if we are not passed a vector compute it. + if !(firstres isa AbstractVector) && isnothing(idx_agg) + idx_agg = Vector{Int}(undef, length(gd)) + fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd) + end + # TODO: if firstres is a vector we recompute idx for every function + # this could be avoided - it could be computed only the first time + # and later we could just check if lengths of groups match this first idx + + # the last argument passed to _combine_with_first informs it about precomputed + # idx. Currently we do it only for single-row return values otherwise we pass + # nothing to signal that idx has to be computed in _combine_with_first + idx, outcols, _ = _combine_with_first(wrap(firstres), fun, gd, incols, + Val(firstmulticol), + firstres isa AbstractVector ? 
nothing : idx_agg) + @assert length(outcols) == 1 + outcol = outcols[1] + + if haskey(seen_cols, out_col_name) + # if column was seen and it is optional now ignore it + if !ot_i + optional, loc = seen_cols[out_col_name] + # we have seen this col but it is not allowed to replace it + optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) + @assert trans_res[loc].optional && trans_res[loc].name == out_col_name + trans_res[loc] = TransRes(idx, outcol, out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + end + else + push!(trans_res, TransRes(idx, outcol, out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end + elseif out_col_name == AsTable || out_col_name isa AbstractVector{Symbol} + if firstres isa AbstractVector + idx, outcol_vec, _ = _combine_with_first(wrap(firstres), fun, gd, incols, + Val(firstmulticol), nothing) + @assert length(outcol_vec) == 1 + res = outcol_vec[1] + @assert length(res) > 0 + + kp1 = keys(res[1]) + prepend = all(x -> x isa Integer, kp1) + if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) + throw(ArgumentError("keys of the returned elements must be " * + "`Symbol`s, strings or integers")) + end + if any(x -> !isequal(keys(x), kp1), res) + throw(ArgumentError("keys of the returned elements must be identical")) + end + outcols = [[x[n] for x in res] for n in kp1] + nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1] + else + if !firstmulticol + firstres = Tables.columntable(firstres) + fun = (x...) 
-> Tables.columntable(fun(x...)) + end + idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) + @assert length(outcols) == length(nms) + end + if out_col_name isa AbstractVector{Symbol} + if length(out_col_name) != length(nms) + throw(ArgumentError("Number of returned columns does not " * + "match the length of requested output")) + else + nms = out_col_name + end + end + for j in eachindex(outcols) + outcol = outcols[j] + out_col_name = nms[j] + if haskey(seen_cols, out_col_name) + optional, loc = seen_cols[out_col_name] + # if column was seen and it is optional now ignore it + if !ot_i + optional, loc = seen_cols[out_col_name] + # we have seen this col but it is not allowed to replace it + optional || throw(ArgumentError("duplicate output column name: :$out_col_name")) + @assert trans_res[loc].optional && trans_res[loc].name == out_col_name + trans_res[loc] = TransRes(idx, outcol, out_col_name, ot_i) + seen_cols[out_col_name] = (ot_i, loc) + end + else + push!(trans_res, TransRes(idx, outcol, out_col_name, ot_i)) + seen_cols[out_col_name] = (ot_i, length(trans_res)) + end end else - push!(trans_res, TransRes(idx, outcol, out_col_name, ot_i)) - seen_cols[out_col_name] = (ot_i, length(trans_res)) + throw(ArgumentError("unsupported target column name specifier $out_col_name")) end end end @@ -1798,51 +1858,3 @@ function transform!(gd::GroupedDataFrame{DataFrame}, args...; _replace_columns!(df, newdf) return ungroup ? df : gd end - - -#### OLD CODE: remove later - -# function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, -# copycols::Bool, keeprows::Bool, renamecols::Bool) -# @assert copycols && !keeprows -# # use `similar` as `gd` might have been subsetted -# firstres = length(gd) > 0 ? 
fun(gd[1]) : fun(similar(parent(gd), 0)) -# idx, outcols, nms = _combine_multicol(firstres, fun, gd, nothing) -# valscat = DataFrame(collect(AbstractVector, outcols), nms) -# return idx, valscat -# end - -# function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing, -# copycols::Bool, keeprows::Bool, renamecols::Bool) -# # here p should not be normalized as we allow tabular return value from fun -# # map and combine should not dispatch here if p is isagg -# @assert copycols && !keeprows -# source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p, renamecols) -# parentdf = parent(gd) -# if source_cols isa Int -# incols = (parent(gd)[!, source_cols],) -# elseif source_cols isa AsTable -# incols = Tables.columntable(select(parentdf, -# source_cols.cols, -# copycols=false)) -# else -# @assert source_cols isa AbstractVector{Int} -# incols = ntuple(i -> parent(gd)[!, source_cols[i]], length(source_cols)) -# end -# firstres = length(gd) > 0 ? -# do_call(fun, gd.idx, gd.starts, gd.ends, gd, incols, 1) : -# do_call(fun, Int[], 1:1, 0:0, gd, incols, 1) -# idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) -# # disallow passing target column name to genuine tables -# if firstres isa MULTI_COLS_TYPE -# if p isa Pair{<:Any, <:Pair{<:Any, <:SymbolOrString}} -# throw(ArgumentError("setting column name for tabular return value is disallowed")) -# end -# else -# # fetch auto generated or passed target column name to nms overwritting -# # what _combine_with_first produced -# nms = [out_col] -# end -# valscat = DataFrame(collect(AbstractVector, outcols), nms) -# return idx, valscat -# end From 07d7dd36befd3926fd37e80cf8fa47d7e2aa9288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 14 Oct 2020 17:47:19 +0200 Subject: [PATCH 05/10] improve handling of corner cases when mixing transformation --- src/groupeddataframe/splitapplycombine.jl | 37 ++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git 
a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 6ce42824da..83c695fada 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -570,7 +570,18 @@ end # Wrapping automatically adds column names when the value returned # by the user-provided function lacks them -wrap(x::Union{AbstractDataFrame, NamedTuple, DataFrameRow}) = x +wrap(x::Union{AbstractDataFrame, DataFrameRow}) = x +wrap(x::NamedTuple) = x +function wrap(x::NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}) + if !isempty(x) + len1 = length(x[1]) + for i in 2:length(x) + length(x[i]) == len1 || throw(DimensionMismatch("all vectors returned in a" * + "NamedTuple must have the same length")) + end + end + return x +end wrap(x::AbstractMatrix) = NamedTuple{Tuple(gennames(size(x, 2)))}(Tuple(view(x, :, i) for i in 1:size(x, 2))) wrap(x::Any) = (x1=x,) @@ -1179,6 +1190,18 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform elseif cs_i isa Base.Callable firstres = length(gd) > 0 ? cs_i(gd[1]) : cs_i(similar(parent(gd), 0)) idx, outcols, nms = _combine_multicol(firstres, cs_i, gd, nothing) + + if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + # if idx_agg was not computed yet it is nothing + # in this case if we are not passed a vector compute it. + if isnothing(idx_agg) + idx_agg = Vector{Int}(undef, length(gd)) + fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd) + end + @assert idx == idx_agg + idx = idx_agg + end @assert length(outcols) == length(nms) for j in eachindex(outcols) outcol = outcols[j] @@ -1281,6 +1304,18 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform fun = (x...) 
-> Tables.columntable(fun(x...)) end idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) + + if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame, + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + # if idx_agg was not computed yet it is nothing + # in this case if we are not passed a vector compute it. + if isnothing(idx_agg) + idx_agg = Vector{Int}(undef, length(gd)) + fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd) + end + @assert idx == idx_agg + idx = idx_agg + end @assert length(outcols) == length(nms) end if out_col_name isa AbstractVector{Symbol} From 0527e77198924e41253fd5f404c6062a36054727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 14 Oct 2020 23:54:04 +0200 Subject: [PATCH 06/10] start rewriting tests --- src/groupeddataframe/splitapplycombine.jl | 14 ++++-- test/grouping.jl | 53 +++++++++++++---------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 83c695fada..7384516370 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -453,9 +453,13 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum, renamecols=false) │ 8 │ 4 │ 1 │ 8 │ 9 │ ``` """ -combine(f::Base.Callable, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = +function combine(f::Base.Callable, gd::GroupedDataFrame; + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) + if f isa Colon + throw(ArgumentError("First argument must be a transformation if the second argument is a grouped data frame")) + end return combine(gd, f, keepkeys=keepkeys, ungroup=ungroup, renamecols=renamecols) +end combine(gd::GroupedDataFrame, cs::Union{Pair, Base.Callable, ColumnIndex, MultiColumnIndex}...; @@ -576,7 +580,7 @@ function wrap(x::NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}) if !isempty(x) len1 = length(x[1]) for i in 2:length(x) 
- length(x[i]) == len1 || throw(DimensionMismatch("all vectors returned in a" * + length(x[i]) == len1 || throw(DimensionMismatch("all vectors returned in a " * "NamedTuple must have the same length")) end end @@ -1301,7 +1305,8 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform else if !firstmulticol firstres = Tables.columntable(firstres) - fun = (x...) -> Tables.columntable(fun(x...)) + oldfun = fun + fun = (x...) -> Tables.columntable(oldfun(x...)) end idx, outcols, nms = _combine_multicol(firstres, fun, gd, incols) @@ -1351,6 +1356,7 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform end end + isempty(trans_res) && return Int[], DataFrame() # idx_agg === nothing then we have only functions that # returned multiple rows and idx_loc = 1 idx_loc = findfirst(x -> x.col_idx !== idx_agg, trans_res) diff --git a/test/grouping.jl b/test/grouping.jl index 30ae81f39d..e0d120662a 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -770,65 +770,70 @@ end # Only test that different combine syntaxes work, # and rely on tests below for deeper checks @test combine(gd, :c => sum) == - combine(:c => sum, gd) == combine(gd, :c => sum => :c_sum) == - combine(:c => sum => :c_sum, gd) == combine(gd, [:c => sum]) == combine(gd, [:c => sum => :c_sum]) == - combine(d -> (c_sum=sum(d.c),), gd) - @test_throws MethodError combine(gd, d -> (c_sum=sum(d.c),)) + combine(d -> (c_sum=sum(d.c),), gd) == + combine(gd, d -> (c_sum=sum(d.c),)) == + combine(gd, d -> (c_sum=[sum(d.c)],)) == + combine(gd, d -> DataFrame(c_sum=sum(d.c))) == + combine(gd, :c => (x -> [sum(x)]) => [:c_sum]) == + combine(gd, :c => (x -> [(c_sum=sum(x),)]) => AsTable) == + combine(gd, :c => (x -> fill(sum(x),1,1)) => [:c_sum]) == + combine(gd, :c => (x -> [Dict(:c_sum => sum(x))]) => AsTable) + @test_throws MethodError combine(:c => sum, gd) + @test_throws ArgumentError combine(:, gd) @test combine(gd, :c => vexp) == - combine(:c => vexp, gd) == 
combine(gd, :c => vexp => :c_function) == - combine(:c => vexp => :c_function, gd) == - combine(:c => c -> (c_function = vexp(c),), gd) == combine(gd, [:c => vexp]) == combine(gd, [:c => vexp => :c_function]) == - combine(d -> (c_function=exp.(d.c),), gd) + combine(d -> (c_function=exp.(d.c),), gd) == + combine(gd, d -> (c_function=exp.(d.c),)) == + combine(gd, :c => (x -> (c_function=exp.(x),)) => AsTable) == + combine(gd, :c => ByRow(exp) => :c_function) == + combine(gd, :c => ByRow(x -> [exp(x)]) => [:c_function]) @test_throws ArgumentError combine(gd, :c => c -> (c_function = vexp(c),)) - @test_throws MethodError combine(gd, d -> (c_function=exp.(d.c),)) @test combine(gd, :b => sum, :c => sum) == combine(gd, :b => sum => :b_sum, :c => sum => :c_sum) == combine(gd, [:b => sum, :c => sum]) == combine(gd, [:b => sum => :b_sum, :c => sum => :c_sum]) == - combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd) - @test_throws MethodError combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) + combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd) == + combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) == + combine(gd, d -> (b_sum=sum(d.b),), d -> (c_sum=sum(d.c),)) @test combine(gd, :b => vexp, :c => identity) == combine(gd, :b => vexp => :b_function, :c => identity => :c_identity) == combine(gd, [:b => vexp, :c => identity]) == combine(gd, [:b => vexp => :b_function, :c => identity => :c_identity]) == combine(d -> (b_function=vexp(d.b), c_identity=d.c), gd) == - combine([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c), gd) - @test_throws MethodError combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c)) + combine(gd, [:b, :c] => ((b, c) -> (b_function=vexp(b), c_identity=c)) => AsTable) == + combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c)) @test_throws ArgumentError combine(gd, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c)) - @test combine(x -> extrema(x.c), gd) == combine(:c => (x -> extrema(x)) => :x1, gd) - @test combine(x -> x.b+x.c, gd) == 
combine([:b,:c] => (+) => :x1, gd) - @test combine(x -> (p=x.b, q=x.c), gd) == - combine([:b,:c] => (b,c) -> (p=b,q=c), gd) - @test_throws MethodError combine(gd, x -> (p=x.b, q=x.c)) + @test combine(x -> extrema(x.c), gd) == combine(gd, :c => (x -> extrema(x)) => :x1) + @test combine(x -> hcat(extrema(x.c)...), gd) == combine(gd, :c => (x -> [extrema(x)]) => AsTable) + @test combine(x -> x.b+x.c, gd) == combine(gd, [:b,:c] => (+) => :x1) + @test combine(x -> (p=x.b, q=x.c), gd) == combine(gd, [:b,:c] => ((b,c) -> (p=b,q=c)) => AsTable) @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> (p=b,q=c)) @test combine(x -> DataFrame(p=x.b, q=x.c), gd) == - combine([:b,:c] => (b,c) -> DataFrame(p=b,q=c), gd) - @test_throws MethodError combine(gd, x -> DataFrame(p=x.b, q=x.c)) + combine(gd, [:b,:c] => ((b,c) -> DataFrame(p=b,q=c)) => AsTable) == + combine(gd, x -> DataFrame(p=x.b, q=x.c)) @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> DataFrame(p=b,q=c)) @test combine(x -> [1 2; 3 4], gd) == - combine([:b,:c] => (b,c) -> [1 2; 3 4], gd) - @test_throws MethodError combine(gd, x -> [1 2; 3 4]) + combine(gd, [:b,:c] => ((b,c) -> [1 2; 3 4]) => AsTable) @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> [1 2; 3 4]) @test combine(nrow, gd) == combine(gd, nrow) == combine(gd, [nrow => :nrow]) == combine(gd, 1 => length => :nrow) - @test combine(nrow => :res, gd) == combine(gd, nrow => :res) == + @test combine(gd, nrow => :res) == combine(gd, [nrow => :res]) == combine(gd, 1 => length => :res) @test combine(gd, nrow => :res, nrow, [nrow => :res2]) == combine(gd, 1 => length => :res, 1 => length => :nrow, 1 => length => :res2) - @test_throws ArgumentError combine([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, gd) + @test_throws MethodError combine([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, gd) @test_throws ArgumentError combine(gd, [:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx) @test_throws ArgumentError combine(gd, nrow, nrow) @test_throws ArgumentError 
combine(gd, [nrow]) From c38a33a83a348c8ec179245a7fd826663ec55bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 15 Oct 2020 12:53:00 +0200 Subject: [PATCH 07/10] finish first round of tests --- test/grouping.jl | 153 +++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 66 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index e0d120662a..bc56a54e61 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -839,64 +839,54 @@ end @test_throws ArgumentError combine(gd, [nrow]) for col in (:c, 3) - @test combine(col => sum, gd) == combine(d -> (c_sum=sum(d.c),), gd) - @test combine(col => x -> sum(x), gd) == combine(d -> (c_function=sum(d.c),), gd) - @test combine(col => x -> (z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd) - @test combine(col => x -> DataFrame(z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd) - @test combine(col => identity, gd) == combine(d -> (c_identity=d.c,), gd) - @test combine(col => x -> (z=x,), gd) == combine(d -> (z=d.c,), gd) - - @test combine(col => sum => :xyz, gd) == - combine(d -> (xyz=sum(d.c),), gd) - @test combine(col => (x -> sum(x)) => :xyz, gd) == - combine(d -> (xyz=sum(d.c),), gd) - @test combine(col => (x -> (sum(x),)) => :xyz, gd) == - combine(d -> (xyz=(sum(d.c),),), gd) + @test combine(gd, col => sum) == combine(d -> (c_sum=sum(d.c),), gd) + @test combine(gd, col => x -> sum(x)) == combine(d -> (c_function=sum(d.c),), gd) + @test combine(gd, col => (x -> (z=sum(x),)) => AsTable) == combine(d -> (z=sum(d.c),), gd) + @test combine(gd, col => (x -> DataFrame(z=sum(x),)) => AsTable) == combine(d -> (z=sum(d.c),), gd) + @test combine(gd, col => identity) == combine(d -> (c_identity=d.c,), gd) + @test combine(gd, col => (x -> (z=x,)) => AsTable) == combine(d -> (z=d.c,), gd) + + @test combine(gd, col => sum => :xyz) == combine(d -> (xyz=sum(d.c),), gd) + @test combine(gd, col => (x -> sum(x)) => :xyz) == combine(d -> (xyz=sum(d.c),), gd) + @test 
combine(gd, col => (x -> (sum(x),)) => :xyz) == combine(d -> (xyz=(sum(d.c),),), gd) @test combine(nrow, gd) == combine(d -> (nrow=length(d.c),), gd) - @test combine(nrow => :res, gd) == combine(d -> (res=length(d.c),), gd) - @test combine(col => sum => :res, gd) == combine(d -> (res=sum(d.c),), gd) - @test combine(col => (x -> sum(x)) => :res, gd) == combine(d -> (res=sum(d.c),), gd) - @test_throws ArgumentError combine(col => (x -> (z=sum(x),)) => :xyz, gd) - @test_throws ArgumentError combine(col => (x -> DataFrame(z=sum(x),)) => :xyz, gd) - @test_throws ArgumentError combine(col => (x -> (z=x,)) => :xyz, gd) - @test_throws ArgumentError combine(col => x -> (z=1, xzz=[1]), gd) + @test combine(gd, nrow => :res) == combine(d -> (res=length(d.c),), gd) + @test combine(gd, col => sum => :res) == combine(d -> (res=sum(d.c),), gd) + @test combine(gd, col => (x -> sum(x)) => :res) == combine(d -> (res=sum(d.c),), gd) + + @test_throws ArgumentError combine(gd, col => (x -> (z=sum(x),)) => :xyz) + @test_throws ArgumentError combine(gd, col => (x -> DataFrame(z=sum(x),)) => :xyz) + @test_throws ArgumentError combine(gd, col => (x -> (z=x,)) => :xyz) + @test_throws ArgumentError combine(gd, col => x -> (z=1, xzz=[1])) end + for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), ungroup in (true, false) - @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, ungroup=ungroup) == - combine(d -> (y=exp.(d.b), z=d.c), gd, ungroup=ungroup) - @test combine(cols => (b,c) -> [exp.(b) c], gd, ungroup=ungroup) == + @test combine(gd, cols => ((b,c) -> (y=exp.(b), z=c)) => AsTable, ungroup=ungroup) == + combine(gd, d -> (y=exp.(d.b), z=d.c), ungroup=ungroup) + @test combine(gd, cols => ((b,c) -> [exp.(b) c]) => AsTable, ungroup=ungroup) == combine(d -> [exp.(d.b) d.c], gd, ungroup=ungroup) - @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, ungroup=ungroup) == + @test combine(gd, cols => ((b,c) -> sum(b) + sum(c)) => :xyz, ungroup=ungroup) == combine(d -> (xyz=sum(d.b) + 
sum(d.c),), gd, ungroup=ungroup) - if eltype(cols) === Bool - cols2 = [[false, true, false], [false, false, true]] - @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => sum), - gd, ungroup=ungroup) - @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[1] => sum), - gd, ungroup=ungroup) - @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)), - gd, ungroup=ungroup) - else - cols2 = cols - @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, ungroup=ungroup) == + if eltype(cols) !== Bool + @test combine(gd, cols[1] => sum => :xyz, cols[2] => sum => :xzz, ungroup=ungroup) == combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, ungroup=ungroup) - @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, ungroup=ungroup) == + @test combine(gd, cols[1] => sum => :xyz, cols[1] => sum => :xzz, ungroup=ungroup) == combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, ungroup=ungroup) - @test combine(gd, cols2[1] => sum => :xyz, - cols2[2] => (x -> first(x)) => :xzz, ungroup=ungroup) == + @test combine(gd, cols[1] => sum => :xyz, + cols[2] => (x -> first(x)) => :xzz, ungroup=ungroup) == combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, ungroup=ungroup) - @test combine(gd, cols2[1] => vexp => :xyz, - cols2[2] => sum => :xzz, ungroup=ungroup) == + @test combine(gd, cols[1] => vexp => :xyz, + cols[2] => sum => :xzz, ungroup=ungroup) == combine(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))), gd, ungroup=ungroup) end - @test_throws ArgumentError combine(cols => (b,c) -> (y=exp.(b), z=sum(c)), - gd, ungroup=ungroup) - @test_throws ArgumentError combine(cols2 => ((b,c) -> DataFrame(y=exp.(b), - z=sum(c))) => :xyz, gd, ungroup=ungroup) - @test_throws ArgumentError combine(cols2 => ((b,c) -> [exp.(b) c]) => :xyz, - gd, ungroup=ungroup) + @test_throws ArgumentError combine(gd, cols => (b,c) -> (y=exp.(b), z=sum(c)), + ungroup=ungroup) + @test_throws ArgumentError combine(gd, cols => ((b,c) 
-> DataFrame(y=exp.(b), + z=sum(c))) => :xyz, ungroup=ungroup) + @test_throws ArgumentError combine(gd, cols => ((b,c) -> [exp.(b) c]) => :xyz, + ungroup=ungroup) end end @@ -1446,9 +1436,9 @@ end @test gdf[:] == gdf @test gdf[1:1] == gdf - @test validate_gdf(combine(nrow => :x1, gdf, ungroup=false)) == + @test validate_gdf(combine(gdf, nrow => :x1, ungroup=false)) == groupby_checked(DataFrame(x1=3), []) - @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, ungroup=false)) == + @test validate_gdf(combine(gdf, :x2 => identity => :x2_identity, ungroup=false)) == groupby_checked(DataFrame(x2_identity=[1,1,2]), []) @test isequal_typed(DataFrame(gdf), df) @@ -1843,9 +1833,9 @@ end @test res == DataFrame(validate_gdf(combine(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a), ungroup=false))) if fr isa AbstractVector && df.x1[1] - @test res == combine(:x1 => (x1 -> x1[1] ? fr : er) => :x1, gdf) + @test res == combine(gdf, :x1 => (x1 -> x1[1] ? fr : er) => :x1) else - @test res == combine(:x1 => x1 -> x1[1] ? fr : er, gdf) + @test res == combine(gdf, :x1 => (x1 -> x1[1] ? 
fr : er) => AsTable) end if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1) @test res == DataFrame(a=[]) @@ -1872,9 +1862,8 @@ end @test combine(gdf, r"x" => cor) == DataFrame(g=[1,2], x1_x2_cor = [1.0, 1.0]) @test combine(gdf, Not(:g) => ByRow(/)) == DataFrame(:g => [1,1,1,2,2,2], Symbol("x1_x2_/") => 1.0) @test combine(gdf, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1) - @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == combine(:x1 => :z, gdf) == - DataFrame(g=[1,1,1,2,2,2], z=1:6) - @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), ungroup=false)) == + @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == DataFrame(g=[1,1,1,2,2,2], z=1:6) + @test validate_gdf(combine(groupby_checked(df, :g), :x1 => :z, ungroup=false)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) end @@ -1884,10 +1873,10 @@ end gdf = groupby_checked(df, :b) res = combine(sdf -> sdf.x[1:2], gdf) @test names(res) == ["b", "x1"] - res2 = combine(:x => x -> x[1:2], gdf) + res2 = combine(gdf, :x => x -> x[1:2]) @test names(res2) == ["b", "x_function"] @test Matrix(res) == Matrix(res2) - res2 = combine(:x => (x -> x[1:2]) => :z, gdf) + res2 = combine(gdf, :x => (x -> x[1:2]) => :z) @test names(res2) == ["b", "z"] @test Matrix(res) == Matrix(res2) @@ -1921,8 +1910,8 @@ end end for i in 1:2, v1 in [1, 1:2], v2 in [1, 1:2] - @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v, gdf) - @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (v=x[v2],)) => :v, gdf) + @test_throws ArgumentError combine(gdf, [:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v) + @test_throws ArgumentError combine(gdf, [:b, :x] => ((b,x) -> b[1] == i ? 
x[v1] : (v=x[v2],)) => :v) end end @@ -1932,8 +1921,8 @@ end @test_throws ArgumentError combine(gdf, :x1 => x -> DataFrame()) @test_throws ArgumentError combine(gdf, :x1 => x -> (x=1, y=2)) @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1], y=[2])) - @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1],y=2)) - @test_throws ArgumentError combine(:x1 => x -> (x=[1], y=2), gdf) + @test_throws ArgumentError combine(gdf, :x1 => (x -> (x=[1],y=2)) => AsTable) + @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1], y=2)) @test_throws ArgumentError combine(gdf, :x1 => x -> ones(2, 2)) @test_throws ArgumentError combine(gdf, :x1 => x -> df[1, Not(:g)]) end @@ -2075,9 +2064,9 @@ end # whole column 4 options of single pair passed @test combine(gdf , AsTable([:x, :y]) => Ref) == - combine(AsTable([:x, :y]) => Ref, gdf) == + combine(gdf, AsTable([:x, :y]) => Ref) == DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])]) - @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, ungroup=false)) == + @test validate_gdf(combine(gdf, AsTable([:x, :y]) => Ref, ungroup=false)) == groupby_checked(combine(gdf, AsTable([:x, :y]) => Ref), :g) @test combine(gdf, AsTable(1) => Ref) == @@ -2086,10 +2075,10 @@ end # ByRow 4 options of single pair passed @test combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) == - combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) == + combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) == DataFrame(g=[1,1,1,2,2], x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]]) - @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, ungroup=false)) == + @test validate_gdf(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x]), ungroup=false)) == groupby_checked(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])), :g) # whole column and ByRow test for multiple pairs passed @@ -2829,7 +2818,7 @@ end @testset "disallowed tuple column selector" begin df = DataFrame(g=1:3) gdf = groupby(df, :g) - 
@test_throws ArgumentError combine((:g, :g) => identity, gdf) + @test_throws MethodError combine((:g, :g) => identity, gdf) @test_throws ArgumentError combine(gdf, (:g, :g) => identity) end @@ -2970,7 +2959,7 @@ end DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) @test combine(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) == DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) - @test combine([:a, :b] => +, gdf, renamecols=false) == DataFrame(a=1:3, a_b=5:2:9) + @test combine(gdf, [:a, :b] => +, renamecols=false) == DataFrame(a=1:3, a_b=5:2:9) @test combine(identity, gdf, renamecols=false) == df df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) @@ -3036,7 +3025,7 @@ end @test transform(df, x -> (a=x.x, b=x.x)) == transform(gdf, x -> (a=x.x, b=x.x)) @test combine(gdf, :x => x -> 2x) == DataFrame(id=[1, 1, 2, 2, 3, 3], x_function=[2, 8, 4, 12, 6, 10]) - @test combine(gdf, identity) == df + @test combine(gdf, identity) == DataFrame(gdf) @test combine(gdf, x -> (a=x.x, b=x.x)) == DataFrame(id=[1, 1, 2, 2, 3, 3], a=[1, 4, 2, 6, 3, 5], b=[1, 4, 2, 6, 3, 5]) gdf = groupby(df, :id)[[3, 1, 2]] @@ -3080,4 +3069,36 @@ end DataFrame(id=[1, 1, 3, 3, 2, 2], a=[3, 5, 1, 4, 2, 6], b=[3, 5, 1, 4, 2, 6]) end +@testset "new rules tests" begin + df = DataFrame(id = [1, 2, 3, 1, 3, 2], x=1:6) + gdf = groupby(df, :id) + + @test combine(gdf, x -> reshape(1:4, 2, 2)) == + DataFrame(id=[1,1,2,2,3,3], x1=[1,2,1,2,1,2], x2=[3,4,3,4,3,4]) + @test combine(gdf, x -> DataFrame(a=1:2, b=3:4)) == + DataFrame(id=[1,1,2,2,3,3], a=[1,2,1,2,1,2], b=[3,4,3,4,3,4]) + @test combine(gdf, x -> DataFrame(a=1:2, b=3:4)[1, :]) == + DataFrame(id=[1,2,3], a=[1,1,1], b=[3,3,3]) + @test combine(gdf, x -> (a=1, b=3)) == + DataFrame(id=[1,2,3], a=[1,1,1], b=[3,3,3]) + @test combine(gdf, x -> (a=1:2, b=3:4)) == + DataFrame(id=[1,1,2,2,3,3], a=[1,2,1,2,1,2], b=[3,4,3,4,3,4]) + @test combine(gdf, :x => (x -> Dict(:a => 1:2, :b => 3:4)) => AsTable) == + DataFrame(id=[1,1,2,2,3,3], a=[1,2,1,2,1,2], 
b=[3,4,3,4,3,4]) + @test combine(gdf, :x => ByRow(x -> [x,x+1,x+2]) => AsTable) == + DataFrame(id=[1,1,2,2,3,3], x1=[1,4,2,6,3,5], x2=[2,5,3,7,4,6], x3=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> (x,x+1,x+2)) => AsTable) == + DataFrame(id=[1,1,2,2,3,3], x1=[1,4,2,6,3,5], x2=[2,5,3,7,4,6], x3=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> (a=x,b=x+1,c=x+2)) => AsTable) == + DataFrame(id=[1,1,2,2,3,3], a=[1,4,2,6,3,5], b=[2,5,3,7,4,6], c=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> [x,x+1,x+2]) => [:p, :q, :r]) == + DataFrame(id=[1,1,2,2,3,3], p=[1,4,2,6,3,5], q=[2,5,3,7,4,6], r=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> (x,x+1,x+2)) => [:p, :q, :r]) == + DataFrame(id=[1,1,2,2,3,3], p=[1,4,2,6,3,5], q=[2,5,3,7,4,6], r=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> (a=x,b=x+1,c=x+2)) => [:p, :q, :r]) == + DataFrame(id=[1,1,2,2,3,3], p=[1,4,2,6,3,5], q=[2,5,3,7,4,6], r=[3,6,4,8,5,7]) + @test combine(gdf, :x => ByRow(x -> 1) => [:p]) == DataFrame(id=[1,1,2,2,3,3], p=1) + @test_throws ArgumentError combine(gdf, :x => (x -> 1) => [:p]) +end + end # module From 98bd976dafa72e4d39346109108ba4e4f7ced46b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 15 Oct 2020 20:15:48 +0200 Subject: [PATCH 08/10] update string tests --- test/string.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/string.jl b/test/string.jl index 589d4ca825..94e47d054f 100644 --- a/test/string.jl +++ b/test/string.jl @@ -169,18 +169,18 @@ end @test combine(gdf, :a) == combine(gdf, "a") == combine(gdf, [:a]) == combine(gdf, ["a"]) - @test combine("a" => identity, gdf, ungroup=false) == - combine(:a => identity, gdf, ungroup=false) - @test combine(["a"] => identity, gdf, ungroup=false) == - combine([:a] => identity, gdf, ungroup=false) - @test combine(nrow => :n, gdf, ungroup=false) == - combine(nrow => "n", gdf, ungroup=false) - - @test combine("a" => identity, gdf) == combine(:a => 
identity, gdf) == + @test combine(gdf, "a" => identity, ungroup=false) == + combine(gdf, :a => identity, ungroup=false) + @test combine(gdf, ["a"] => identity, ungroup=false) == + combine(gdf, [:a] => identity, ungroup=false) + @test combine(gdf, nrow => :n, ungroup=false) == + combine(gdf, nrow => "n", ungroup=false) + + @test combine(gdf, "a" => identity) == combine(gdf, :a => identity) == combine(gdf, "a" => identity) == combine(gdf, :a => identity) - @test combine(["a"] => identity, gdf) == combine([:a] => identity, gdf) == + @test combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) == combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) - @test combine(nrow => :n, gdf) == combine(nrow => "n", gdf) == + @test combine(gdf, nrow => :n) == combine(gdf, nrow => "n") == combine(gdf, nrow => :n) == combine(gdf, nrow => "n") end From 360aee3b38c347f13955ae7e2f9216ab2281e2af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 15 Oct 2020 21:24:48 +0200 Subject: [PATCH 09/10] update the manual entry --- docs/src/man/split_apply_combine.md | 149 ++++++++++++++++++++-------- 1 file changed, 105 insertions(+), 44 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 1eb02f4889..0f94b86f08 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -13,6 +13,15 @@ In order to perform operations by groups you first need to create a `GroupedData object from your data frame using the `groupby` function that takes two arguments: (1) a data frame to be grouped, and (2) a set of columns to group by. +!!! note + + All operations described for `GroupedDataFrame` in this section of the manual + are also supported for `AbstractDataFrame` in which case it is considered as + being grouped by no columns (typically meaning that it has one group except + when the data frame has zero rows in which case it is treated as having zero groups). 
+ The only difference is that in this case the `keepkeys` and `ungroup` keyword + arguments are not supported and always a data frame is returned. + Operations can then be applied on each group using one of the following functions: * `combine`: does not put restrictions on number of rows returned, the order of rows is specified by the order of groups in `GroupedDataFrame`; it is typically used @@ -26,59 +35,103 @@ Operations can then be applied on each group using one of the following function All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: -1. standard column selectors (integers, symbols, vectors of integers, vectors of symbols, +1. standard column selectors (integers, `Symbol`s, vectors of integers, vectors of symbols, `All`, `:`, `Between`, `Not` and regular expressions) -2. a `cols => function` pair indicating that `function` should be called with - positional arguments holding columns `cols`, which can be a any valid column selector -3. a `cols => function => target_col` form additionally - specifying the name of the target column (this assumes that `function` returns a single - value or a vector) -4. a `col => target_col` pair, which renames the column `col` to `target_col` -5. a `nrow` or `nrow => target_col` form which efficiently computes the number of rows - in a group (without `target_col` the new column is called `:nrow`) -6. several arguments of the forms given above, or vectors thereof -7. a function which will be called with a `SubDataFrame` corresponding to each group; +2. a `cols => function => target_cols` form additionally specifying the target column or columns +3. 
a `cols => function` pair indicating that `function` should be called with + positional arguments holding columns `cols`, which can be any valid column selector; + in this case target column name is automatically generated and it is assumed that + `function` returns a single value or a vector; the generated name is created by concatenating + source column name and `function` name where possible (see examples below). +4. a `col => target_cols` pair, which renames the column `col` to `target_cols` +5. a `nrow` or `nrow => target_cols` form which efficiently computes the number of rows + in a group (without `target_cols` the new column is called `:nrow`) +6. vectors or matrices transformations specified by `Pair` syntax described in points 2 to 5 +8. a function which will be called with a `SubDataFrame` corresponding to each group; this form should be avoided due to its poor performance unless a very large number of columns are processed (in which case `SubDataFrame` avoids excessive compilation) -As a special rule that applies to `cols => function` syntax, if `cols` is wrapped -in an `AsTable` object then a `NamedTuple` containing columns selected by `cols` is -passed to `function`. - -In all of these cases, `function` can return either a single row or multiple rows. -`function` can always generate a single column by returning a single value or a vector. -Additionally, if `combine` is passed exactly one `function`, `cols => function`, -or `cols => function => outcol` as a first argument -and `target_col` is not specified, -`function` can return multiple columns in the form of an `AbstractDataFrame`, -`AbstractMatrix`, `NamedTuple` or `DataFrameRow`. +All functions have two types of signatures. One of them takes a `GroupedDataFrame` +as a first argument and an arbitrary number of transfomations described above +as following arguments.
The second type of signature is when `Function` or `Type` +is passed as a first argument and `GroupedDataFrame` is a second argument (in a +similar fashion like it is passed in e.g. `map` function). + +As a special rule that applies to `cols => function` and `cols => function => +target_cols` syntaxes is the following. If `cols` is wrapped in an `AsTable` +object then a `NamedTuple` containing columns selected by `cols` is passed to +`function`. + +What is allowed for `function` to return is determined by the `target_cols` value +in the following way: +1. If just a `function` is passed as an argument then returning a data frame, + a matrix, a `NamedTuple`, or a `DataFrameRow` will produce multiple columns in the + result. Returning any other value produces a single column. +2. If `target_cols` is a `Symbol` or a string then the function is assumed to return + a single column. In this case returning a data frame, a matrix, a `NamedTuple`, + or a `DataFrameRow` raises an error. +3. If `target_cols` is a vector of `Symbol`s or strings or `AsTable` it is assumed + that `function` returns multiple columns. + If `function` returns one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`, + `AbstractMatrix` then rules described in point 1 above apply. + If `function` returns an `AbstractVector` then each element of this vector must + support the `keys` function, which must return a collection of `Symbol`s, strings + or integers; the return value of `keys` must be identical for all elements. + Then as many columns are created as there are elements in the return value + of the `keys` function. If `target_cols` is `AsTable` then their names + are set to be equal to the key names except if `keys` returns integers, in + which case they are prefixed by `x` (so the column names are e.g. `x1`, + `x2`, ...). 
If `target_cols` is a vector of `Symbol`s or strings then + column names produced using the rules above are ignored and replaced by + `target_cols` (the number of columns must be the same as the length of + `target_cols` in this case). + If `function` returns a value of any other type then it is assumed that it is a + table conforming to the Tables.jl API and the `Tables.columntable` function + is called on it to get the resulting columns and their names. The names are + retained when `target_cols` is `AsTable` and are replaced if + `target_cols` is a vector of `Symbol`s or strings. + +In all of these cases, `function` can return either a single row or multiple +rows. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional +`AbstractArray` are unwrapped and then treated as a single row. `select`/`select!` and `transform`/`transform!` always return a `DataFrame` -with the same number of rows as the source. -For `combine`, the shape of the resulting `DataFrame` is determined -according to the following rules: -- a single value produces a single row and column per group -- a named tuple or `DataFrameRow` produces a single row and one column per field -- a vector produces a single column with one row per entry -- a named tuple of vectors produces one column per field with one row per entry in the vectors -- a `DataFrame` or a matrix produces as many rows and columns as it contains; - note that this option should be avoided due to its poor performance when the number - of groups is large +with the same number and order of rows as the source (even if `GroupedDataFrame` +had its groups reordered). -The kind of return value and the number and names of columns must be the same for all groups. +For `combine` return value is ordered by the order of groups in `GroupedDataFrame` +and for each group the functions can return an arbibrary number of rows (provided +that these numbers are consistent).
It is allowed to mix single values and vectors if multiple transformations -are requested. In this case single value will be broadcasted to match the length +are requested. In this case single value will be repeated to match the length of columns specified by returned vectors. -As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` -are unwrapped and then broadcasted. -If a single value or a vector is returned by the `function` and `target_col` is not -provided, it is generated automatically, by concatenating source column name and -`function` name where possible (see examples below). +To apply `function` to each row instead of whole columns, it can be wrapped in a +`ByRow` struct. In this case if `cols` is a `Symbol`, a string, or an +integer then `function` is applied to each element (row) of `cols` using +broadcasting. Otherwise `cols` can be any column indexing syntax, in +which case `function` will be passed one argument for each of the columns +specified by `cols`. If `ByRow` is used it is allowed for +`cols` to select an empty set of columns, in which case `function` + is called for each row without any arguments. -We show several examples of the `by` function applied to the `iris` dataset below: +The kind of return value and the number and names of columns must be the same for all groups. + +There the following keyword arguments are supported by the transformation functions +(not all keyword arguments are supported in all cases; in general they are allowed +in situations when they are meaningful, see the documentation of the specific functions +for details): +- `keepkeys` : if grouping columns should be kept in the returned data frame. +- `ungroup` : if the retun value of the operation should be a data frame or a + `GroupedDataFrame`. +- `copycols` : if columns of the source data frame should be copied if no transformation + is applied to them. 
+- `renamecols` : if in `cols => funcion` form the automatically generated column name + should include the name of transformation function or not. + +We show several examples of these functions applied to the `iris` dataset below: ```jldoctest sac julia> using DataFrames, CSV, Statistics @@ -176,8 +229,8 @@ julia> combine(gdf, nrow, :PetalLength => mean => :mean) │ 2 │ Iris-versicolor │ 50 │ 4.26 │ │ 3 │ Iris-virginica │ 50 │ 5.552 │ -julia> combine([:PetalLength, :SepalLength] => (p, s) -> (a=mean(p)/mean(s), b=sum(p)), - gdf) # multiple columns are passed as arguments +julia> combine(gdf, [:PetalLength, :SepalLength] => ((p, s) -> (a=mean(p)/mean(s), b=sum(p))) => + AsTable) # multiple columns are passed as arguments 3×3 DataFrame │ Row │ Species │ a │ b │ │ │ String │ Float64 │ Float64 │ @@ -215,6 +268,14 @@ julia> combine(gdf, 1:2 => cor, nrow) │ 2 │ Iris-versicolor │ 0.525911 │ 50 │ │ 3 │ Iris-virginica │ 0.457228 │ 50 │ +julia> combine(gdf, :PetalLength => (x -> [extrema(x)]) => [:min, :max]) +3×3 DataFrame +│ Row │ Species │ min │ max │ +│ │ String │ Float64 │ Float64 │ +├─────┼─────────────────┼─────────┼─────────┤ +│ 1 │ Iris-setosa │ 1.0 │ 1.9 │ +│ 2 │ Iris-versicolor │ 3.0 │ 5.1 │ +│ 3 │ Iris-virginica │ 4.5 │ 6.9 │ ``` Contrary to `combine`, the `select` and `transform` functions always return @@ -268,7 +329,7 @@ julia> transform(gdf, :Species => x -> chop.(x, head=5, tail=0)) │ 150 │ Iris-virginica │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │ ``` -The `combine` function also supports the `do` block form. However, as noted above, +All functions also support the `do` block form. However, as noted above, this form is slow and should therefore be avoided when performance matters. 
```jldoctest sac @@ -385,7 +446,7 @@ julia> combine(gd, valuecols(gd) .=> mean) │ 2 │ Iris-versicolor │ 5.936 │ 2.77 │ 4.26 │ 1.326 │ │ 3 │ Iris-virginica │ 6.588 │ 2.974 │ 5.552 │ 2.026 │ -julia> combine(gd, valuecols(gd) .=> (x -> (x .- mean(x)) ./ std(x)) .=> valuecols(gd)) +julia> combine(gd, valuecols(gd) .=> (x -> (x .- mean(x)) ./ std(x)), renamecols=false) 150×5 DataFrame │ Row │ Species │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ │ │ String │ Float64 │ Float64 │ Float64 │ Float64 │ From 27ea3bda85b55145bf1696e6ce42f46777ede0f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 18 Oct 2020 19:01:15 +0200 Subject: [PATCH 10/10] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/split_apply_combine.md | 39 +++++++++++------------ src/groupeddataframe/splitapplycombine.jl | 2 +- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 0f94b86f08..4cfb6015f8 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -17,10 +17,9 @@ object from your data frame using the `groupby` function that takes two argument All operations described for `GroupedDataFrame` in this section of the manual are also supported for `AbstractDataFrame` in which case it is considered as - being grouped by no columns (typically meaning that it has one group except - when the data frame has zero rows in which case it is treated as having zero groups). + being grouped on no columns (meaning it has a single group, or zero groups if it is empty). The only difference is that in this case the `keepkeys` and `ungroup` keyword - arguments are not supported and always a data frame is returned. + arguments are not supported and a data frame is always returned. 
Operations can then be applied on each group using one of the following functions: * `combine`: does not put restrictions on number of rows returned, the order of rows @@ -46,7 +45,7 @@ each subset of the `DataFrame`. This specification can be of the following forms 4. a `col => target_cols` pair, which renames the column `col` to `target_cols` 5. a `nrow` or `nrow => target_cols` form which efficiently computes the number of rows in a group (without `target_cols` the new column is called `:nrow`) -6. vectors or matrices transformations specified by `Pair` syntax described in points 2 to 5 +6. vectors or matrices containing transformations specified by the `Pair` syntax described in points 2 to 5 8. a function which will be called with a `SubDataFrame` corresponding to each group; this form should be avoided due to its poor performance unless a very large number of columns are processed (in which case `SubDataFrame` avoids excessive @@ -55,17 +54,16 @@ each subset of the `DataFrame`. This specification can be of the following forms All functions have two types of signatures. One of them takes a `GroupedDataFrame` as a first argument and an arbitrary number of transfomations described above as following arguments. The second type of signature is when `Function` or `Type` -is passed as a first argument and `GroupedDataFrame` is a second argument (in a -similar fashion like it is passed in e.g. `map` function). +is passed as a first argument and `GroupedDataFrame` is the second argument +(similar to how it is passed to `map`). -As a special rule that applies to `cols => function` and `cols => function => -target_cols` syntaxes is the following. If `cols` is wrapped in an `AsTable` +As a special rule, with the `cols => function` and `cols => function => +target_cols` syntaxes, if `cols` is wrapped in an `AsTable` object then a `NamedTuple` containing columns selected by `cols` is passed to `function`. 
-What is allowed for `function` to return is determined by the `target_cols` value -in the following way: -1. If just a `function` is passed as an argument then returning a data frame, +What is allowed for `function` to return is determined by the `target_cols` value: +1. If both `cols` and `target_cols` are omitted (so only a `function` is passed), then returning a data frame, a matrix, a `NamedTuple`, or a `DataFrameRow` will produce multiple columns in the result. Returning any other value produces a single column. 2. If `target_cols` is a `Symbol` or a string then the function is assumed to return @@ -100,9 +98,10 @@ rows. As a particular rule, values wrapped in a `Ref` or a `0`-dimensional with the same number and order of rows as the source (even if `GroupedDataFrame` had its groups reordered). -For `combine` return value is ordered by the order of groups in `GroupedDataFrame` -and for each group the functions can return an arbibrary number of rows (provided -that these numbers are consistent). +For `combine`, rows in the returned object appear in the order of +groups in the `GroupedDataFrame`. The functions can return an arbitrary number +of rows for each group, but the kind of returned object and the number +and names of columns must be the same for all groups. It is allowed to mix single values and vectors if multiple transformations are requested. In this case single value will be repeated to match the length @@ -117,19 +116,17 @@ specified by `cols`. If `ByRow` is used it is allowed for `cols` to select an empty set of columns, in which case `function` is called for each row without any arguments. -The kind of return value and the number and names of columns must be the same for all groups. 
- There the following keyword arguments are supported by the transformation functions (not all keyword arguments are supported in all cases; in general they are allowed in situations when they are meaningful, see the documentation of the specific functions for details): -- `keepkeys` : if grouping columns should be kept in the returned data frame. -- `ungroup` : if the retun value of the operation should be a data frame or a +- `keepkeys` : whether grouping columns should be kept in the returned data frame. +- `ungroup` : whether the return value of the operation should be a data frame or a `GroupedDataFrame`. -- `copycols` : if columns of the source data frame should be copied if no transformation +- `copycols` : whether columns of the source data frame should be copied if no transformation is applied to them. -- `renamecols` : if in `cols => funcion` form the automatically generated column name - should include the name of transformation function or not. +- `renamecols` : whether in the `cols => function` form automatically generated column names + should include the name of transformation functions or not. We show several examples of these functions applied to the `iris` dataset below: diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 7384516370..2e61bceb22 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1136,7 +1136,7 @@ function _combine(gd::GroupedDataFrame, cs_norm::Vector{Any}, optional_transform trans_res = Vector{TransRes}() - # seen_cols keeps an information about lotacion of columns already processed + # seen_cols keeps information about the location of columns already processed # and if a given column can be replaced in the future seen_cols = Dict{Symbol, Tuple{Bool, Int}}()