Skip to content

Commit

Permalink
allow passing empty sets of columns to ByRow and filter (#2476)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Oct 12, 2020
1 parent 9a292ff commit b11fe97
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 52 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -74,6 +74,8 @@
which if set to `true` makes them retun a `SubDataFrame` view into the passed
data frame.
* add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449))
* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476))

## Deprecated

Expand Down
28 changes: 17 additions & 11 deletions src/abstractdataframe/abstractdataframe.jl
Expand Up @@ -994,9 +994,10 @@ end
@inline function Base.filter((cols, f)::Pair, df::AbstractDataFrame; view::Bool=false)
int_cols = index(df)[cols] # it will be AbstractVector{Int} or Int
if length(int_cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f() for _ in axes(df, 1)]
else
rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...)
end
rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...)
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand All @@ -1006,9 +1007,10 @@ end
AbstractVector{<:Symbol}}},
df::AbstractDataFrame; view::Bool=false)
if length(cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f() for _ in axes(df, 1)]
else
rowidxs = _filter_helper(f, (df[!, i] for i in cols)...)
end
rowidxs = _filter_helper(f, (df[!, i] for i in cols)...)
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand All @@ -1018,9 +1020,10 @@ _filter_helper(f, cols...)::BitVector = ((x...) -> f(x...)::Bool).(cols...)
view::Bool=false)
df_tmp = select(df, cols.cols, copycols=false)
if ncol(df_tmp) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = [f(NamedTuple()) for _ in axes(df, 1)]
else
rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp))
end
rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp))
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

Expand Down Expand Up @@ -1101,7 +1104,7 @@ julia> filter!(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df)
│ 3 │ 1 │ b │
```
"""
Base.filter!(f, df::AbstractDataFrame) = _filter!_helper(df, f, eachrow(df))
Base.filter!(f, df::AbstractDataFrame) = delete!(df, findall(!f, eachrow(df)))
Base.filter!((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) =
_filter!_helper(df, f, df[!, col])
Base.filter!((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) =
Expand All @@ -1115,17 +1118,20 @@ Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) =

function _filter!_helper(df::AbstractDataFrame, f, cols...)
if length(cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
rowidxs = findall(x -> !f(), axes(df, 1))
else
rowidxs = findall(((x...) -> !(f(x...)::Bool)).(cols...))
end
return delete!(df, findall(((x...) -> !(f(x...)::Bool)).(cols...)))
return delete!(df, rowidxs)
end

function Base.filter!((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame)
dff = select(df, cols.cols, copycols=false)
if ncol(dff) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
return delete!(df, findall(x -> !f(NamedTuple()), axes(df, 1)))
else
return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f)
end
return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f)
end

_filter!_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) =
Expand Down
76 changes: 42 additions & 34 deletions src/abstractdataframe/selection.jl
Expand Up @@ -67,10 +67,20 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin
normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool)

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable,
<:Union{Symbol, AbstractString, DataType,
AbstractVector{Symbol},
AbstractVector{<:AbstractString}}}},
sel::Pair{<:ColumnIndex,
<:Pair{<:Base.Callable,
<:Union{Symbol, AbstractString}}},
renamecols::Bool)
src, (fun, dst) = sel
return idx[src] => fun => Symbol(dst)
end

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,
<:Pair{<:Base.Callable,
<:Union{Symbol, AbstractString, DataType,
AbstractVector{Symbol},
AbstractVector{<:AbstractString}}}},
renamecols::Bool)
lls = last(last(sel))
if lls isa DataType
Expand Down Expand Up @@ -170,31 +180,29 @@ function normalize_selection(idx::AbstractIndex,
return (wanttable ? AsTable(c) : c) => fun => newcol
end

function _transformation_helper(df::AbstractDataFrame,
col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable},
@nospecialize(fun))
if col_idx === nothing
return fun(df)
elseif col_idx isa Int
return fun(df[!, col_idx])
elseif col_idx isa AsTable
tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
if isempty(tbl) && fun isa ByRow
return [fun.fun(NamedTuple()) for _ in 1:nrow(df)]
else
return fun(tbl)
end
_transformation_helper(df::AbstractDataFrame, col_idx::Nothing, fun) = fun(df)
_transformation_helper(df::AbstractDataFrame, col_idx::Int, fun) = fun(df[!, col_idx])

_empty_astable_helper(fun, len) = [fun(NamedTuple()) for _ in 1:len]

function _transformation_helper(df::AbstractDataFrame, col_idx::AsTable, fun)
tbl = Tables.columntable(select(df, col_idx.cols, copycols=false))
if isempty(tbl) && fun isa ByRow
return _empty_astable_helper(fun.fun, nrow(df))
else
# it should be fast enough here as we do not expect to do it millions of times
@assert col_idx isa AbstractVector{Int}
if isempty(col_idx) && fun isa ByRow
return [fun.fun() for _ in 1:nrow(df)]
else
cdf = eachcol(df)
return fun(map(c -> cdf[c], col_idx)...)
end
return fun(tbl)
end
end

_empty_selector_helper(fun, len) = [fun() for _ in 1:len]

function _transformation_helper(df::AbstractDataFrame, col_idx::AbstractVector{Int}, fun)
if isempty(col_idx) && fun isa ByRow
return _empty_selector_helper(fun.fun, nrow(df))
else
cdf = eachcol(df)
return fun(map(c -> cdf[c], col_idx)...)
end
throw(ErrorException("unreachable reached"))
end

function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol},
Expand Down Expand Up @@ -656,7 +664,7 @@ julia> select!(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat
```
"""
select!(df::DataFrame, args...; renamecols::Bool=true) =
select!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) =
_replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols))

function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
Expand All @@ -676,7 +684,7 @@ Equivalent to `select!(df, :, args...)`.
See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform!(df::DataFrame, args...; renamecols::Bool=true) =
transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) =
select!(df, :, args..., renamecols=renamecols)

function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
Expand Down Expand Up @@ -810,7 +818,7 @@ julia> select(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats
```
"""
select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
select(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) =
manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols)

function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
Expand All @@ -831,7 +839,7 @@ Equivalent to `select(df, :, args..., copycols=copycols)`.
See [`select`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) =
select(df, :, args..., copycols=copycols, renamecols=renamecols)

function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
Expand Down Expand Up @@ -935,7 +943,7 @@ julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat
│ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │
```
"""
combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =
combine(df::AbstractDataFrame, @nospecialize(args...); renamecols::Bool=true) =
manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols)

function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true)
Expand Down Expand Up @@ -964,7 +972,7 @@ manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool,
renamecols::Bool) =
manipulate(df, [c], copycols=copycols, keeprows=keeprows, renamecols=renamecols)

function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool)
function manipulate(df::DataFrame, @nospecialize(cs...); copycols::Bool, keeprows::Bool, renamecols::Bool)
cs_vec = []
for v in cs
if v isa AbstractVecOrMat{<:Pair}
Expand Down Expand Up @@ -1061,7 +1069,7 @@ function manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
end
end

function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool,
function manipulate(dfv::SubDataFrame, @nospecialize(args...); copycols::Bool, keeprows::Bool,
renamecols::Bool)
if copycols
cs_vec = []
Expand Down
14 changes: 11 additions & 3 deletions src/groupeddataframe/splitapplycombine.jl
Expand Up @@ -712,7 +712,11 @@ end
function do_call(f::Any, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::Tuple{}, i::Integer)
f()
if f isa ByRow
return [f.fun() for _ in 1:(ends[i] - starts[i] + 1)]
else
return f()
end
end

function do_call(f::Any, idx::AbstractVector{<:Integer},
Expand Down Expand Up @@ -754,8 +758,12 @@ end
function do_call(f::Any, idx::AbstractVector{<:Integer},
starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer},
gd::GroupedDataFrame, incols::NamedTuple, i::Integer)
idx = idx[starts[i]:ends[i]]
return f(map(c -> view(c, idx), incols))
if f isa ByRow && isempty(incols)
return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)]
else
idx = idx[starts[i]:ends[i]]
return f(map(c -> view(c, idx), incols))
end
end

function do_call(f::Any, idx::AbstractVector{<:Integer},
Expand Down
42 changes: 38 additions & 4 deletions test/data.jl
Expand Up @@ -453,11 +453,45 @@ end
@test filter(AsTable("x") => testfun, df) == DataFrame(x=[3, 2], y=["b", "a"])
filter!(AsTable("x") => testfun, df)
@test df == DataFrame(x=[3, 2], y=["b", "a"])
end

@testset "empty arg to filter and filter!" begin
df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])

@test filter([] => () -> true, df) == df
@test filter(AsTable(r"z") => x -> true, df) == df
@test filter!([] => () -> true, copy(df)) == df
@test filter!(AsTable(r"z") => x -> true, copy(df)) == df

flipflop0 = let
state = false
() -> (state = !state)
end

flipflop1 = let
state = false
x -> (state = !state)
end

@test_throws ArgumentError filter([] => () -> true, df)
@test_throws ArgumentError filter(AsTable(r"z") => () -> true, df)
@test_throws ArgumentError filter!([] => () -> true, df)
@test_throws ArgumentError filter!(AsTable(r"z") => () -> true, df)
@test filter([] => flipflop0, df) == df[[1,3], :]
@test filter(Int[] => flipflop0, df) == df[[1,3], :]
@test filter(String[] => flipflop0, df) == df[[1,3], :]
@test filter(Symbol[] => flipflop0, df) == df[[1,3], :]
@test filter(r"z" => flipflop0, df) == df[[1,3], :]
@test filter(Not(All()) => flipflop0, df) == df[[1,3], :]
@test filter(AsTable(r"z") => flipflop1, df) == df[[1,3], :]
@test filter(AsTable([]) => flipflop1, df) == df[[1,3], :]
@test filter!([] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Int[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(String[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Symbol[] => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(r"z" => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(Not(All()) => flipflop0, copy(df)) == df[[1,3], :]
@test filter!(AsTable(r"z") => flipflop1, copy(df)) == df[[1,3], :]
@test filter!(AsTable([]) => flipflop1, copy(df)) == df[[1,3], :]

@test_throws MethodError filter([] => flipflop1, df)
@test_throws MethodError filter(AsTable([]) => flipflop0, df)
end

@testset "names with cols" begin
Expand Down
41 changes: 41 additions & 0 deletions test/grouping.jl
Expand Up @@ -2979,4 +2979,45 @@ end
@test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30)
end

@testset "empty ByRow" begin
inc0 = let
state = 0
() -> (state += 1)
end

inc1 = let
state = 0
x -> (state += 1)
end

df = DataFrame(a=[1,1,1,2,2,3,4,4,5,5,5,5], b=1:12)
gdf = groupby_checked(df, :a)

@test select(gdf, [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a, bin=1:12)
@test combine(gdf, [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a, bin=13:24)
@test select(gdf, AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a, bin=1:12)
@test combine(gdf, AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a, bin=13:24)
@test combine(gdf[Not(2)], [] => ByRow(inc0) => :bin) ==
DataFrame(a=df.a[Not(4:5)], bin=25:34)
@test combine(gdf[Not(2)], AsTable([]) => ByRow(inc1) => :bin) ==
DataFrame(a=df.a[Not(4:5)], bin=25:34)

# note that type inference in a comprehension does not always work
@test isequal_coltyped(combine(gdf[[]], [] => ByRow(inc0) => :bin),
DataFrame(a=Int[], bin=Any[]))
@test isequal_coltyped(combine(gdf[[]], [] => ByRow(rand) => :bin),
DataFrame(a=Int[], bin=Float64[]))
@test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(inc1) => :bin),
DataFrame(a=Int[], bin=Any[]))
@test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(x -> rand()) => :bin),
DataFrame(a=Int[], bin=Float64[]))

@test_throws MethodError select(gdf, [] => ByRow(inc1) => :bin)
@test_throws MethodError select(gdf, AsTable([]) => ByRow(inc0) => :bin)
end

end # module

0 comments on commit b11fe97

Please sign in to comment.