add Cols support (#2495)

JuliaData · Oct 27, 2020 · d23077a · d23077a
1 parent 40c368d
commit d23077a
Show file tree

Hide file tree

Showing 21 changed files with 171 additions and 21 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -77,6 +77,7 @@
 * passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
   with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476))
 * add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447))
+* add support for `Cols` from DataAPI.jl ([#2495](https://github.com/JuliaData/DataFrames.jl/pull/2495))
 
 ## Deprecated
 

diff --git a/Project.toml b/Project.toml
@@ -37,7 +37,7 @@ test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]
 julia = "1"
 CategoricalArrays = "0.8.3"
 Compat = "3.17"
-DataAPI = "1.2"
+DataAPI = "1.3"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2"

diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md
@@ -26,7 +26,7 @@ The rules for a valid type of index into a column are the following:
     * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`;
     * a regular expression, which gets expanded to a vector of matching column names;
     * a `Not` expression (see [InvertedIndices.jl](https://github.com/mbauman/InvertedIndices.jl));
-    * an `All` or `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
+    * a `Cols`, `All` or `Between` expression (see [DataAPI.jl](https://github.com/JuliaData/DataAPI.jl));
     * a colon literal `:`.
 
 The rules for a valid type of index into a row are the following:

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
@@ -456,7 +456,11 @@ julia> df[!, :A] == df[:, :A]
 true
 ```
 
-In the first case, `[:A]` is a vector, indicating that the resulting object should be a `DataFrame`. On the other hand, `:A` is a single symbol, indicating that a single column vector should be extracted. Note that in the first case a vector is required to be passed (not just any iterable), so e.g. `df[:, (:x1, :x2)]` is not allowed, but `df[:, [:x1, :x2]]` is valid.
+In the first case, `[:A]` is a vector, indicating that the resulting object
+should be a `DataFrame`. On the other hand, `:A` is a single symbol, indicating
+that a single column vector should be extracted. Note that in the first case a
+vector is required to be passed (not just any iterable), so e.g. `df[:, (:x1,
+:x2)]` is not allowed, but `df[:, [:x1, :x2]]` is valid.
 
 It is also possible to use a regular expression as a selector of columns matching it:
 ```jldoctest dataframe
@@ -475,7 +479,9 @@ julia> df[!, r"x"]
 │ 1   │ 1     │ 2     │
 ```
 
-A `Not` selector (from the [InvertedIndices](https://github.com/mbauman/InvertedIndices.jl) package) can be used to select all columns excluding a specific subset:
+A `Not` selector (from the
+[InvertedIndices](https://github.com/mbauman/InvertedIndices.jl) package) can be
+used to select all columns excluding a specific subset:
 
 ```jldoctest dataframe
 julia> df[!, Not(:x1)]
@@ -486,8 +492,13 @@ julia> df[!, Not(:x1)]
 │ 1   │ 2     │ 3     │
 ```
 
-Finally, you can use `Not`, `Between`, and `All` selectors in more complex column selection scenarios.
-The following examples move all columns whose names match `r"x"` regular expression respectively to the front and to the end of a data frame:
+Finally, you can use `Not`, `Between`, `Cols` and `All` selectors in more
+complex column selection scenarios (note that `Cols()` selects no columns while
+`All()` selects all columns; therefore `Cols` is the preferred selector if you
+write generic code). The following examples move all columns whose names match
+the `r"x"` regular expression respectively to the front and to the end of a data
+frame:
+
 ```
 julia> df = DataFrame(r=1, x1=2, x2=3, y=4)
 1×4 DataFrame
@@ -496,14 +507,14 @@ julia> df = DataFrame(r=1, x1=2, x2=3, y=4)
 ├─────┼───────┼───────┼───────┼───────┤
 │ 1   │ 1     │ 2     │ 3     │ 4     │
 
-julia> df[:, All(r"x", :)]
+julia> df[:, Cols(r"x", :)]
 1×4 DataFrame
 │ Row │ x1    │ x2    │ r     │ y     │
 │     │ Int64 │ Int64 │ Int64 │ Int64 │
 ├─────┼───────┼───────┼───────┼───────┤
 │ 1   │ 2     │ 3     │ 1     │ 4     │
 
-julia> df[:, All(Not(r"x"), :)]
+julia> df[:, Cols(Not(r"x"), :)]
 1×4 DataFrame
 │ Row │ r     │ y     │ x1    │ x2    │
 │     │ Int64 │ Int64 │ Int64 │ Int64 │

diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md
@@ -128,7 +128,7 @@ julia> last(iris, 4)
 Keywords used above include `rev` (to sort in reverse),
 and `by` (to apply a function to values before comparing them).
 Each keyword can either be a single value, a vector with values corresponding to
-individual columns, or a selector: `:`, `All`, `Not`, `Between`, or `Regex`.
+individual columns, or a selector: `:`, `Cols`, `All`, `Not`, `Between`, or `Regex`.
 
 As an alternative to using a vector of values you can use `order` to specify
 an ordering for a particular column within a set of columns.

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -27,7 +27,7 @@ Operations can then be applied on each group using one of the following function
 All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
 1. standard column selectors (integers, symbols, vectors of integers, vectors of symbols,
-   `All`, `:`, `Between`, `Not` and regular expressions)
+   `All`, `Cols`, `:`, `Between`, `Not` and regular expressions)
 2. a `cols => function` pair indicating that `function` should be called with
    positional arguments holding columns `cols`, which can be any valid column selector
 3. a `cols => function => target_col` form additionally

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -11,6 +11,7 @@ using Markdown
 import DataAPI,
        DataAPI.All,
        DataAPI.Between,
+       DataAPI.Cols,
        DataAPI.describe,
        Tables,
        Tables.columnindex,
@@ -21,6 +22,7 @@ export AbstractDataFrame,
        AsTable,
        Between,
        ByRow,
+       Cols,
        DataFrame,
        DataFrameRow,
        GroupedDataFrame,

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -67,7 +67,7 @@ abstract type AbstractDataFrame end
 Return a freshly allocated `Vector{String}` of names of columns contained in `df`.
 
 If `cols` is passed then restrict returned column names to those matching the
-selector (this is useful in particular with regular expressions, `Not`, and `Between`).
+selector (this is useful in particular with regular expressions, `Cols`, `Not`, and `Between`).
 `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR)
 or a `Type`, in which case columns whose `eltype` is a subtype of `cols` are returned.
 

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -1106,6 +1106,6 @@ function manipulate(dfv::SubDataFrame, @nospecialize(args...); copycols::Bool, k
                 push!(newinds, newind)
             end
         end
-        return view(dfv, :, isempty(newinds) ? [] : All(newinds...))
+        return view(dfv, :, Cols(newinds...))
     end
 end
diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl
@@ -224,7 +224,7 @@ end
 
 Base.@propagate_inbounds Base.getindex(r::DataFrameRow, ::Colon) = r
 
-for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon)
+for T in MULTICOLUMNINDEX_TUPLE
     @eval function Base.setindex!(df::DataFrame,
                                   v::Union{DataFrameRow, NamedTuple, AbstractDict},
                                   row_ind::Integer,

diff --git a/src/other/index.jl b/src/other/index.jl
@@ -11,11 +11,11 @@ Base.summary(io::IO, idx::AbstractIndex) = print(io, summary(idx))
 
 const SymbolOrString = Union{Symbol, AbstractString}
 const ColumnIndex = Union{Signed, Unsigned, SymbolOrString}
-const MultiColumnIndex = Union{AbstractVector, Regex, Not, Between, All, Colon}
-const MULTICOLUMNINDEX_TUPLE = (:AbstractVector, :Regex, :Not, :Between, :All, :Colon)
+const MultiColumnIndex = Union{AbstractVector, Regex, Not, Between, All, Cols, Colon}
+const MULTICOLUMNINDEX_TUPLE = (:AbstractVector, :Regex, :Not, :Between, :All, :Cols, :Colon)
 
 const COLUMNINDEX_STR = "`Symbol`, string or integer"
-const MULTICOLUMNINDEX_STR = "`:`, `All`, `Between`, `Not`, a regular expression," *
+const MULTICOLUMNINDEX_STR = "`:`, `Cols`, `All`, `Between`, `Not`, a regular expression," *
                           " or a vector of `Symbol`s, strings or integers"
 
 struct Index <: AbstractIndex   # an OrderedDict would be nice here...
@@ -219,6 +219,8 @@ end
 @inline Base.getindex(x::AbstractIndex, idx::Between) = x[idx.first]:x[idx.last]
 @inline Base.getindex(x::AbstractIndex, idx::All) =
     isempty(idx.cols) ? (1:length(x)) : union(getindex.(Ref(x), idx.cols)...)
+@inline Base.getindex(x::AbstractIndex, idx::Cols) =
+    isempty(idx.cols) ? Int[] : union(getindex.(Ref(x), idx.cols)...)
 
 @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer})
     if any(v -> v isa Bool, idx)

diff --git a/test/data.jl b/test/data.jl
@@ -479,6 +479,7 @@ end
     @test filter(Symbol[] => flipflop0, df) == df[[1,3], :]
     @test filter(r"z" => flipflop0, df) == df[[1,3], :]
     @test filter(Not(All()) => flipflop0, df) == df[[1,3], :]
+    @test filter(Cols() => flipflop0, df) == df[[1,3], :]
     @test filter(AsTable(r"z") => flipflop1, df) == df[[1,3], :]
     @test filter(AsTable([]) => flipflop1, df) == df[[1,3], :]
     @test filter!([] => flipflop0, copy(df)) == df[[1,3], :]
@@ -487,6 +488,7 @@ end
     @test filter!(Symbol[] => flipflop0, copy(df)) == df[[1,3], :]
     @test filter!(r"z" => flipflop0, copy(df)) == df[[1,3], :]
     @test filter!(Not(All()) => flipflop0, copy(df)) == df[[1,3], :]
+    @test filter!(Cols() => flipflop0, copy(df)) == df[[1,3], :]
     @test filter!(AsTable(r"z") => flipflop1, copy(df)) == df[[1,3], :]
     @test filter!(AsTable([]) => flipflop1, copy(df)) == df[[1,3], :]
 
@@ -502,13 +504,15 @@ end
         @test names(v, Between(:x1, :x3)) == ["x1", "x2", "x3"]
         @test names(v, Not(:a)) == names(v, r"x") == ["x1", "x2", "x3", "x4"]
         @test names(v, :x1) == names(v, 2) == ["x1"]
+        @test names(v, Cols()) == names(v, Cols()) == []
     end
 
     for v in [view(df, :, [4,3,2,1]), groupby(view(df, :, [4,3,2,1]), 1), view(df, 1, [4,3,2,1])]
         @test names(v, All()) == names(v, :) == names(v) ==  ["x3", "x2", "x1", "a"]
         @test names(v, Between(:x2, :x1)) == ["x2", "x1"]
         @test names(v, Not(:a)) == names(v, r"x") == ["x3", "x2", "x1"]
         @test names(v, :x1) == names(v, 3) == ["x1"]
+        @test names(v, Cols()) == names(v, Cols()) == []
     end
 end
 

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -1552,107 +1552,151 @@ end
     # we check dispatch here only
     df = DataFrame(a=1, b=2, c=3)
     completecases(df, All())
+    completecases(df, Cols(:))
     completecases(df, Between(1, 2))
     dropmissing(df, All())
+    dropmissing(df, Cols(:))
     dropmissing(df, Between(1, 2))
     dropmissing!(df, All())
+    dropmissing!(df, Cols(:))
     dropmissing!(df, Between(1, 2))
     disallowmissing(df, All())
+    disallowmissing(df, Cols(:))
     disallowmissing(df, Between(1, 2))
     allowmissing(df, All())
+    allowmissing(df, Cols(:))
     allowmissing(df, Between(1, 2))
 
     df[1, All()]
+    df[1, Cols(:)]
     df[1, Between(1,2)]
     df[1:1, All()]
+    df[1:1, Cols(:)]
     df[1:1, Between(1,2)]
     df[Not(1), All()]
+    df[Not(1), Cols(:)]
     df[Not(1), Between(1,2)]
     df[:, All()]
+    df[:, Cols(:)]
     df[:, Between(1,2)]
     df[!, All()]
+    df[!, Cols(:)]
     df[!, Between(1,2)]
 
     @view df[1, All()]
+    @view df[1, Cols(:)]
     @view df[1, Between(1,2)]
     @view df[1:1, All()]
+    @view df[1:1, Cols(:)]
     @view df[1:1, Between(1,2)]
     @view df[Not(1), All()]
+    @view df[Not(1), Cols(:)]
     @view df[Not(1), Between(1,2)]
     @view df[:, All()]
+    @view df[:, Cols(:)]
     @view df[:, Between(1,2)]
     @view df[!, All()]
+    @view df[!, Cols(:)]
     @view df[!, Between(1,2)]
 
     df[1, All()] = (a=1, b=2, c=3)
+    df[1, Cols(:)] = (a=1, b=2, c=3)
     df[1, Between(1,2)] = (a=1, b=2)
     df[1:1, All()] = df
+    df[1:1, Cols(:)] = df
     df[1:1, Between(1,2)] = df[!, 1:2]
     df[:, All()] = df
+    df[:, Cols(:)] = df
     df[:, Between(1,2)] = df[!, 1:2]
     df[1:1, All()] = Matrix(df)
+    df[1:1, Cols(:)] = Matrix(df)
     df[1:1, Between(1,2)] = Matrix(df[!, 1:2])
     df[:, All()] = Matrix(df)
+    df[:, Cols(:)] = Matrix(df)
     df[:, Between(1,2)] = Matrix(df[!, 1:2])
 
     df2 = vcat(df, df)
     df2[Not(1), All()] = df
+    df2[Not(1), Cols(:)] = df
     df2[Not(1), Between(1,2)] = df[!, 1:2]
     df2[Not(1), All()] = Matrix(df)
+    df2[Not(1), Cols(:)] = Matrix(df)
     df2[Not(1), Between(1,2)] = Matrix(df[!,1:2])
 
     allowmissing!(df2, All())
+    allowmissing!(df2, Cols(:))
     allowmissing!(df2, Between(1,2))
     disallowmissing!(df2, All())
+    disallowmissing!(df2, Cols(:))
     disallowmissing!(df2, Between(1,2))
 
     dfr = df[1, :]
     dfr[All()]
+    dfr[Cols(:)]
     dfr[Between(1,2)]
     dfr[All()] = (a=1, b=2, c=3)
+    dfr[Cols(:)] = (a=1, b=2, c=3)
     dfr[Between(1,2)] = (a=1, b=2)
     @view dfr[All()]
+    @view dfr[Cols(:)]
     @view dfr[Between(1,2)]
 
     dfv = view(df, :, :)
 
     dfv[1, All()]
+    dfv[1, Cols(:)]
     dfv[1, Between(1,2)]
     dfv[1:1, All()]
+    dfv[1:1, Cols(:)]
     dfv[1:1, Between(1,2)]
     dfv[Not(1), All()]
+    dfv[Not(1), Cols(:)]
     dfv[Not(1), Between(1,2)]
     dfv[:, All()]
+    dfv[:, Cols(:)]
     dfv[:, Between(1,2)]
     dfv[!, All()]
+    dfv[!, Cols(:)]
     dfv[!, Between(1,2)]
 
     @view dfv[1, All()]
+    @view dfv[1, Cols(:)]
     @view dfv[1, Between(1,2)]
     @view dfv[1:1, All()]
+    @view dfv[1:1, Cols(:)]
     @view dfv[1:1, Between(1,2)]
     @view dfv[Not(1), All()]
+    @view dfv[Not(1), Cols(:)]
     @view dfv[Not(1), Between(1,2)]
     @view dfv[:, All()]
+    @view dfv[:, Cols(:)]
     @view dfv[:, Between(1,2)]
     @view dfv[!, All()]
+    @view dfv[!, Cols(:)]
     @view dfv[!, Between(1,2)]
 
     dfv[1, All()] = (a=1, b=2, c=3)
+    dfv[1, Cols(:)] = (a=1, b=2, c=3)
     dfv[1, Between(1,2)] = (a=1, b=2)
     dfv[1:1, All()] = df
+    dfv[1:1, Cols(:)] = df
     dfv[1:1, Between(1,2)] = df[!, 1:2]
     dfv[:, All()] = df
+    dfv[:, Cols(:)] = df
     dfv[:, Between(1,2)] = df[!, 1:2]
     dfv[1:1, All()] = Matrix(df)
+    dfv[1:1, Cols(:)] = Matrix(df)
     dfv[1:1, Between(1,2)] = Matrix(df[!, 1:2])
     dfv[:, All()] = Matrix(df)
+    dfv[:, Cols(:)] = Matrix(df)
     dfv[:, Between(1,2)] = Matrix(df[!, 1:2])
 
     df2v = view(vcat(df, df), :, :)
     df2v[Not(1), All()] = df
+    df2v[Not(1), Cols(:)] = df
     df2v[Not(1), Between(1,2)] = df[!, 1:2]
     df2v[Not(1), All()] = Matrix(df)
+    df2v[Not(1), Cols(:)] = Matrix(df)
     df2v[Not(1), Between(1,2)] = Matrix(df[!, 1:2])
 end
 

diff --git a/test/deprecated.jl b/test/deprecated.jl
@@ -218,11 +218,13 @@ end
     @test df.x1 isa CategoricalVector{Int}
 end
 
-@testset "categorical with All and Between" begin
+@testset "categorical with Cols, All and Between" begin
     df = DataFrame(x1=["a", "b"], y=[2, 3])
     categorical(df, All())
+    categorical(df, Cols())
     categorical(df, Between(1, 2))
     categorical!(df, All())
+    categorical!(df, Cols())
     categorical!(df, Between(1,2))
 end
 

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -2013,15 +2013,15 @@ end
     df.g = shuffle!([1,2,2,3,3,3,4,4,4,4])
     gdf = groupby_checked(df, :g)
 
-    for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
+    for selector in [Cols(:), All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
                      [1, 2, 3, 4], [true, true, true, true, false]]
         @test combine(gdf, selector, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3) ==
               combine(gdf) do sdf
                   DataFrame(x1 = sin.(sdf.x1), x2 = sdf.x2, x3 = sin.(sdf.x2), x4 = sdf.x4)
               end
     end
 
-    for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
+    for selector in [Cols(:), All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
                      [1, 2, 3, 4], [true, true, true, true, false]]
         @test combine(gdf, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, selector) ==
               combine(gdf) do sdf
@@ -2960,6 +2960,8 @@ end
 
     @test select(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) ==
           DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30)
+    @test select(gdf, :a => +, [:a, :b] => +, Cols(:) => +, renamecols=false) ==
+          DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30)
     @test_throws ArgumentError select(gdf, [] => () -> 10, renamecols=false)
     @test transform(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) ==
           DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30)