Skip to content

Commit

Permalink
Avoid method dispatch ambiguities in DataFrames.jl (#3179)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Sep 26, 2022
1 parent 515a348 commit c3fd232
Show file tree
Hide file tree
Showing 13 changed files with 167 additions and 36 deletions.
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@
([#3081](https://github.com/JuliaData/DataFrames.jl/pull/3081))
* Make `subset` preserves group ordering when `ungroup=false` like `subset!` already does
([#3094](https://github.com/JuliaData/DataFrames.jl/pull/3094))
* Fix incorrect behavior of `GroupDataFrame` indexing in corner cases
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
* Fix errors in `insertcols!` when no columns to add are passed
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))
* Fix errors in `minimum` and `maximum` aggregates
when processing `GroupedDataFrame` with `combine` in corner cases
([#3179](https://github.com/JuliaData/DataFrames.jl/pull/3179))

## Performance

Expand Down
40 changes: 28 additions & 12 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3097,7 +3097,7 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
3 │ c 9 4 5 3
```
"""
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol, <:Any}...;
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
if !is_column_insertion_allowed(df)
throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
Expand Down Expand Up @@ -3222,31 +3222,47 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
return df
end

insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString, <:Any}...;
insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol, <:Any}...;
insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
insertcols!(df, ncol(df)+1, name_cols..., after=after,
makeunique=makeunique, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString, <:Any}...;
insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)

function insertcols!(df::AbstractDataFrame, col::Int=ncol(df)+1; makeunique::Bool=false, name_cols...)
if !(0 < col <= ncol(df) + 1)
throw(ArgumentError("attempt to insert a column to a data frame with " *
"$(ncol(df)) columns at index $col"))
function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
if col isa SymbolOrString
col_ind = Int(columnindex(df, col))
if col_ind == 0
throw(ArgumentError("column $col does not exist in data frame"))
end
else
col_ind = Int(col)
end
if !isempty(name_cols)
# an explicit error is thrown as keyword argument was supported in the past
throw(ArgumentError("inserting columns using a keyword argument is not supported, " *
"pass a Pair as a positional argument instead"))

if after
col_ind += 1
end

if !(0 < col_ind <= ncol(df) + 1)
throw(ArgumentError("attempt to insert a column to a data frame with " *
"$(ncol(df)) columns at index $col_ind"))
end

_drop_all_nonnote_metadata!(parent(df))
return df
end

function insertcols!(df::AbstractDataFrame; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
_drop_all_nonnote_metadata!(parent(df))
return df
end
4 changes: 4 additions & 0 deletions src/abstractdataframe/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,10 @@ function Base.issorted(df::AbstractDataFrame, cols=All();
end
end

Base.issorted(::AbstractDataFrame, ::Base.Order.Ordering) =
throw(ArgumentError("second positional argument of `issorted` on " *
"a data frame must be a column selector"))

"""
sort(df::AbstractDataFrame, cols=All();
alg::Union{Algorithm, Nothing}=nothing,
Expand Down
24 changes: 8 additions & 16 deletions src/groupeddataframe/fastaggregates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,14 @@ check_aggregate(f::typeof(prod), ::AbstractVector{<:Union{Missing, Number}}) =
Reduce(Base.mul_prod)
check_aggregate(f::typeof(prodskipmissing), ::AbstractVector{<:Union{Missing, Number}}) =
Reduce(Base.mul_prod, !ismissing)
check_aggregate(f::typeof(maximum),
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
check_aggregate(f::typeof(maximum), v::AbstractVector{<:Union{Missing, Real}}) =
eltype(v) === Any ? f : Reduce(max)
check_aggregate(f::typeof(maximumskipmissing),
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
check_aggregate(f::typeof(maximumskipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
eltype(v) === Any ? f : Reduce(max, !ismissing, nothing, true)
check_aggregate(f::typeof(minimum),
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
check_aggregate(f::typeof(minimum), v::AbstractVector{<:Union{Missing, Real}}) =
eltype(v) === Any ? f : Reduce(min)
check_aggregate(f::typeof(minimumskipmissing),
::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = f
check_aggregate(f::typeof(minimumskipmissing), v::AbstractVector{<:Union{Missing, Real}}) =
eltype(v) === Any ? f : Reduce(min, !ismissing, nothing, true)
check_aggregate(f::typeof(maximum), ::AbstractVector{<:Union{Missing, Real}}) =
Reduce(max)
check_aggregate(f::typeof(maximumskipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
Reduce(max, !ismissing, nothing, true)
check_aggregate(f::typeof(minimum), ::AbstractVector{<:Union{Missing, Real}}) =
Reduce(min)
check_aggregate(f::typeof(minimumskipmissing), ::AbstractVector{<:Union{Missing, Real}}) =
Reduce(min, !ismissing, nothing, true)
check_aggregate(f::typeof(mean), ::AbstractVector{<:Union{Missing, Number}}) =
Reduce(Base.add_sum, nothing, /)
check_aggregate(f::typeof(meanskipmissing), ::AbstractVector{<:Union{Missing, Number}}) =
Expand Down
28 changes: 24 additions & 4 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -748,14 +748,13 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear()
return GroupKey(parent(gk), i)
end


#
# Non-standard indexing
#

# Non-standard indexing relies on converting to integer indices first
# The full version (to_indices) is required rather than to_index even though
# GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not.
# GroupedDataFrame behaves as a 1D array due to the behavior of Not.
# Note that this behavior would be the default if it was <:AbstractArray
function Base.getindex(gd::GroupedDataFrame, idx...)
length(idx) == 1 || throw(ArgumentError("GroupedDataFrame requires a single index"))
Expand All @@ -767,6 +766,10 @@ const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple, AbstractDict{Symbol}, A
# All allowed scalar index types
const GroupIndexTypes = Union{Integer, GroupKeyTypes}

# GroupedDataFrame is not a multidimensional array, so it does not support cartesian indexing
Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{CartesianIndex}) =
throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))"))

# Find integer index for dictionary keys
function Base.to_index(gd::GroupedDataFrame, key::GroupKey)
gd === parent(key) && return getfield(key, :idx)
Expand Down Expand Up @@ -864,13 +867,30 @@ end
# ambiguity in dispatch
function Base.to_indices(gd::GroupedDataFrame,
(idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}})
(findall(!, idx.skip),)
if length(idx.skip) != length(gd)
throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
"with $(length(idx.skip))-element Boolean vector"))
end
return (findall(!, idx.skip),)
end
function Base.to_indices(gd::GroupedDataFrame,
(idx,)::Tuple{Not{<:AbstractVector{Bool}}})
(findall(!, idx.skip),)
if length(idx.skip) != length(gd)
throw(BoundsError("attempt to index $(length(gd))-group GroupedDataFrame " *
"with $(length(idx.skip))-element Boolean vector"))
end
return (findall(!, idx.skip),)
end

# Needed to avoid ambiguity
@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx{1}}}) =
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))

@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:InvertedIndices.NIdx}}) =
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))

@inline Base.to_indices(gd::GroupedDataFrame, I::Tuple{Not{<:Union{Array{Bool}, BitArray}}}) =
throw(ArgumentError("attempt to index GroupedDataFrame with $(typeof(I))"))

#
# Dictionary interface
Expand Down
10 changes: 10 additions & 0 deletions src/other/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.BroadcastStyle)
Base.Broadcast.BroadcastStyle(::Base.Broadcast.BroadcastStyle, ::DataFrameStyle) =
DataFrameStyle()
Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::DataFrameStyle) = DataFrameStyle()
# The method below is added to avoid dispatch ambiguity
Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.Unknown) =
DataFrameStyle()

function copyto_widen!(res::AbstractVector{T}, bc::Base.Broadcast.Broadcasted,
pos, col) where T
Expand Down Expand Up @@ -225,6 +228,9 @@ function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src)
return src
end

# The method below is added to avoid dispatch ambiguity
Base.Broadcast.broadcast_unalias(::Nothing, src::AbstractDataFrame) = src

function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataFrame)
wascopied = false
for (i, col) in enumerate(eachcol(src))
Expand Down Expand Up @@ -371,6 +377,10 @@ end
Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src) =
Base.Broadcast.broadcast_unalias(parent(dest), src)

# this is currently impossible but is added to avoid potential dispatch ambiguity in the future
Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src::AbstractDataFrame) =
Base.Broadcast.broadcast_unalias(parent(dest), src)

function Base.copyto!(dfr::DataFrameRow, bc::Base.Broadcast.Broadcasted)
bc′ = Base.Broadcast.preprocess(dfr, bc)
for I in eachindex(bc′)
Expand Down
4 changes: 4 additions & 0 deletions src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ struct SubDataFrame{D<:AbstractDataFrame, S<:AbstractIndex, T<:AbstractVector{In
rows::T # maps from subdf row indexes to parent row indexes
end

# this method should be never called by DataFrames.jl code, but is added for safety
SubDataFrame(parent::SubDataFrame, colindex::AbstractIndex, rows::AbstractVector{Int}) =
throw(ArgumentError("Creation of a SubDataFrame from a SubDataFrame is not allowed"))

Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector{Int}, cols)
@boundscheck if !checkindex(Bool, axes(parent, 1), rows)
throw(BoundsError(parent, (rows, cols)))
Expand Down
10 changes: 10 additions & 0 deletions test/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1957,4 +1957,14 @@ end
@test_throws ArgumentError dfv.c .= [1, 2]
end

@testset "test coverage for corner cases that are not normally called" begin
@test Base.Broadcast.BroadcastStyle(DataFrames.DataFrameStyle(),
Base.Broadcast.Unknown()) isa DataFrames.DataFrameStyle
df = DataFrame(a=1)
@test Base.Broadcast.broadcast_unalias(nothing, df) === df
@test Base.Broadcast.broadcast_unalias(df[1, :], df) == df
@test Base.Broadcast.broadcast_unalias(df[1, :], df) !== df
@test Base.Broadcast.broadcast_unalias(copy(df)[1, :], df) === df
end

end # module
19 changes: 15 additions & 4 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ end
dfc = copy(df)
@test insertcols!(df, 2) == dfc
@test_throws ArgumentError insertcols!(df, 10)
@test_throws ArgumentError insertcols!(df, 2, a=1, b=2)
@test_throws MethodError insertcols!(df, 2, a=1, b=2)

df = DataFrame()
@test insertcols!(df, 1, :x=>[1]) == DataFrame(x=[1])
Expand Down Expand Up @@ -348,7 +348,7 @@ end
@test insertcols!(df, "a" => 2, makeunique=true) == DataFrame(a=1, a_1=2)
end

@testset "insertcols!" begin
@testset "insertcols! old tests" begin
df = DataFrame(a=1:3, b=4:6)
df2 = insertcols(df, :c => 1)
@test df == DataFrame(a=1:3, b=4:6)
Expand All @@ -361,9 +361,20 @@ end
@test df2[!, 1] === x
end

@testset "unsupported insertcols!" begin
@testset "insertcols! with no cols" begin
df = DataFrame(x=1:2)
@test_throws ArgumentError insertcols!(df, 2, y=2:3)
@test_throws ArgumentError insertcols!(df, 0)
@test insertcols!(df, 2) === df
@test insertcols!(df, 2) == DataFrame(x=1:2)
@test insertcols!(df, :x) == DataFrame(x=1:2)
@test insertcols!(df, "x") == DataFrame(x=1:2)
@test insertcols!(df, "x", after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
@test insertcols!(df, 0, after=true) == DataFrame(x=1:2)
@test_throws ArgumentError insertcols!(df, 2, after=true)
@test insertcols!(df) === df
@test insertcols!(df) == DataFrame(x=1:2)
@test insertcols!(df, after=true, makeunique=true, copycols=true) == DataFrame(x=1:2)
@test_throws ArgumentError insertcols!(DataFrame(), :b)
end

@testset "insertcols! after" begin
Expand Down
36 changes: 36 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4276,4 +4276,40 @@ end
end
end

@testset "maximum and minimum on missing" begin
df = DataFrame(id=[1,1,2,2], x=fill(missing, 4))
gdf = groupby_checked(df, :id)
@test combine(gdf, :x => maximum => :x) DataFrame(id=1:2, x=fill(missing, 2))
@test combine(gdf, :x => minimum => :x) DataFrame(id=1:2, x=fill(missing, 2))
@test_throws ArgumentError combine(gdf, :x => maximumskipmissing)
@test_throws ArgumentError combine(gdf, :x => minimumskipmissing)
end

@testset "corner cases of indexing" begin
df = DataFrame(id=1:4)
gdf = groupby_checked(df, :id)
@test_throws ArgumentError gdf[CartesianIndex(1)]
@test_throws ArgumentError gdf[CartesianIndex(1, 1)]
@test_throws ArgumentError gdf[[CartesianIndex(1)]]
@test_throws ArgumentError gdf[[CartesianIndex(1, 1)]]
@test_throws ArgumentError gdf[Any[CartesianIndex(1)]]
@test_throws ArgumentError gdf[Any[CartesianIndex(1, 1)]]

@test_throws ArgumentError gdf[Not(CartesianIndex(1))]
@test_throws ArgumentError gdf[Not(CartesianIndex(1, 1))]
@test_throws ArgumentError gdf[Not([CartesianIndex(1)])]
@test_throws ArgumentError gdf[Not([CartesianIndex(1, 1)])]
@test_throws ArgumentError gdf[Not(Any[CartesianIndex(1)])]
@test_throws ArgumentError gdf[Not(Any[CartesianIndex(1, 1)])]

@test_throws BoundsError gdf[[true]]
@test_throws BoundsError gdf[Not([true])]
@test_throws BoundsError gdf[trues(1)]
@test_throws BoundsError gdf[Not(trues(1))]
@test_throws BoundsError gdf[view([true], 1:1)]
@test_throws BoundsError gdf[Not(view([true], 1:1))]
@test_throws BoundsError gdf[[true true true true]]
@test_throws ArgumentError gdf[Not([true true true true])]
end

end # module
14 changes: 14 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,20 @@ else
@show Threads.nthreads()
end

ambiguities_vec = Test.detect_ambiguities(DataFrames, recursive=true)
if !isempty(ambiguities_vec)
@error "Method ambiguities:"
display(ambiguities_vec)
throw(AssertionError("method dispatch ambiguities found"))
end

unbound_args_vec = Test.detect_unbound_args(DataFrames, recursive=true)
if !isempty(unbound_args_vec)
@error "Unbound type parameters:"
display(unbound_args_vec)
throw(AssertionError("unbound type parameters found"))
end

my_tests = ["utils.jl",
"cat.jl",
"data.jl",
Expand Down
1 change: 1 addition & 0 deletions test/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ end
@test issorted(df, rev=fill(false, ncol(df)))
@test issorted(df, order=Base.Forward)
@test issorted(df, order=fill(Base.Forward, ncol(df)))
@test_throws ArgumentError issorted(df, Base.Order.Forward)

@test issorted(df, :x, by=identity)
@test issorted(df, :x, by=[identity])
Expand Down
6 changes: 6 additions & 0 deletions test/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -301,4 +301,10 @@ end
end
end

@testset "disallowed use of SubDataFrame constructor" begin
df = DataFrame(a=1)
sdf = view(df, :, :)
@test_throws ArgumentError SubDataFrame(sdf, DataFrames.index(sdf), [1])
end

end # module

0 comments on commit c3fd232

Please sign in to comment.