JuliaData · bkamins · Mar 19, 2020 · Jan 6, 2020 · Jan 6, 2020 · Jan 6, 2020
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -73,6 +73,7 @@ include("dataframerow/utils.jl")
 
 include("other/broadcasting.jl")
 
+include("abstractdataframe/selection.jl")
 include("abstractdataframe/iteration.jl")
 include("abstractdataframe/join.jl")
 include("abstractdataframe/reshape.jl")

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -0,0 +1,361 @@
+# TODO:
+# * add transform and transform! functions
+# * add `Col` wrapper for whole column operations
+# * update documentation
+# * add tests
+
+# The normalize_selection function makes sure that, whatever the input format of
+# the selector is, it ends up in one of five canonical forms:
+# 1) Int
+# 2) AbstractVector{Int}
+# 3) Pair{Int, Pair{ColRename, Symbol}}
+# 4) Pair{Int, <:Pair{<:Base.Callable, Symbol}}
+# 5) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
+# In this way we can later easily decide on the code path using type signatures.
+
+"""
+    ColRename
+
+A singleton type indicating that a column renaming operation was requested in `select`.
+"""
+struct ColRename end
+
+# Fallback: a plain selector is resolved by indexing into the index object.
+function normalize_selection(idx::AbstractIndex, sel)
+    return idx[sel]
+end
+
+# `cols => fun => newname`: only the source selector has to be resolved.
+function normalize_selection(idx::AbstractIndex,
+                             sel::Pair{<:Any,<:Pair{<:Base.Callable,Symbol}})
+    source, target = sel
+    return idx[source] => target
+end
+
+# `col => newname`: tag the operation as a rename with the `ColRename` singleton.
+function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:Symbol})
+    oldcol, newname = sel
+    return idx[oldcol] => ColRename() => newname
+end
+
+# `col => fun` shorthand for a single column: the target column name is generated
+# as `<source column name>_<funname(fun)>`.
+function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:Base.Callable})
+    source, fun = sel
+    pos = idx[source]
+    target = Symbol(_names(idx)[pos], "_", funname(fun))
+    return pos => fun => target
+end
+
+# `cols => fun` shorthand for any other column selector: the target name joins the
+# source column names with `_` and appends the function name. When more than two
+# columns are selected only the first two names are used, followed by `⋯`.
+function normalize_selection(idx::AbstractIndex, sel::Pair{<:Any, <:Base.Callable})
+    source, fun = sel
+    positions = idx[source]
+    allnames = _names(idx)
+    target = length(positions) > 2 ?
+        Symbol(join(allnames[positions[1:2]], "_"), "_⋯_", funname(fun)) :
+        Symbol(join(allnames[positions], "_"), "_", funname(fun))
+    return positions => fun => target
+end
+
+# Carries the row type `T` as a type parameter so that it is available for dispatch.
+struct TypeHolder{T} end
+
+# Apply `fun` to each of the `n` rows formed from `cols` (a tuple of columns).
+# Row `i` is built as `T((cols[1][i], cols[2][i], ...))`; the results are collected
+# into a vector.
+function select_transform_helper(th::TypeHolder{T}, cols, fun, n) where T
+    ncols = length(cols)
+    makerow(i) = T(ntuple(j -> cols[j][i], ncols))
+    # the `row::T` annotation makes the call go through a method specific to `T`
+    applyrow(f, row::T) = f(row)
+    return [applyrow(fun, makerow(i)) for i in 1:n]
+end
+
+# Produce the single renamed or transformed column described by `nc` in `newdf`.
+#
+# `nc` is a normalized selection `source => (fun => newname)`, where `source` is a
+# column position (or a vector of positions) in `df`:
+# * `Int => ColRename() => newname` stores the source column under `newname`
+#   (copied when `copycols` is true, shared with `df` otherwise);
+# * `Int => fun => newname` broadcasts `fun` over the source column;
+# * `AbstractVector{Int} => fun => newname` calls `fun` once per row with a
+#   `NamedTuple` of the selected columns' values, or with no arguments when no
+#   column is selected.
+#
+# `transformed_cols` maps every requested target name to its pending operation; the
+# entry is replaced with `nothing` once the column has been produced. `_select` can
+# request the same operation twice (early, when a plain selection names the target
+# column, and again when it reaches `nc` itself), so a completed operation returns
+# immediately instead of calling `fun` a second time and overwriting the column.
+function select_transform!(nc::Union{Pair{Int, Pair{ColRename, Symbol}},
+                                     Pair{<:Union{Int, AbstractVector{Int}},
+                                          <:Pair{<:Base.Callable, Symbol}}},
+                           df::DataFrame, newdf::DataFrame,
+                           transformed_cols::Dict{Any, Any}, copycols::Bool)
+    col_idx, (fun, newname) = nc
+    # already produced by an earlier call: nothing left to do
+    transformed_cols[newname] === nothing && return nothing
+    # a target name may be requested only once, so it cannot be present yet
+    @assert !hasproperty(newdf, newname)
+    if nc isa Pair{Int, Pair{ColRename, Symbol}}
+        newdf[!, newname] = copycols ? df[:, col_idx] : df[!, col_idx]
+    elseif nc isa Pair{Int, <:Pair{<:Base.Callable, Symbol}}
+        newdf[!, newname] = fun.(df[!, col_idx])
+    elseif nc isa Pair{<:AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
+        if length(col_idx) == 0
+            newdf[!, newname] = [fun() for _ in axes(df, 1)]
+        else
+            cols = ntuple(i -> _columns(df)[col_idx[i]], length(col_idx))
+            colnames = ntuple(i -> _names(df)[col_idx[i]], length(col_idx))
+            rowtype = NamedTuple{colnames, Tuple{eltype.(cols)...}}
+            newdf[!, newname] = select_transform_helper(TypeHolder{rowtype}(),
+                                                        cols, fun, nrow(df))
+        end
+    else
+        throw(ErrorException("code should never reach this branch"))
+    end
+    # mark the operation as done
+    transformed_cols[newname] = nothing
+end
+
+"""
+    select!(df::DataFrame, inds...)
+
+Mutate `df` in place to retain only columns specified by `inds...` and return it.
+
+Arguments passed as `inds...` can be any index that is allowed for column indexing
+provided that the columns requested in each of them are unique and present in `df`.
+In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported.
+
+Column renaming and transformations are supported.
+The syntax for column renaming is `old_column=>new_column_name`.
+The syntax for column transformations is `old_column=>fun=>new_column_name`.
+`new_column_name` must be a `Symbol`.
+If `old_column` is a `Symbol` or an integer then `fun` must be a callable
+that is applied row by row to the values of `old_column`.
+Otherwise `old_column` can be any column indexing syntax, but in this case `fun`
+will be passed each row of the columns selected by `old_column` as a `NamedTuple`.
+
+Column transformation syntax also supports a short `old_column=>fun` form, in which
+case `new_column_name` is automatically generated by joining `old_column` with `fun` name
+with `_`.
+
+If more than one argument is passed then duplicates are accepted except for
+column renaming and transformation operations, where it is not allowed to rename/transform
+into the same column name.
+For example if `:col` is present in `df` a call `select!(df, :col, :)` is valid
+and moves the column `:col` to be the first one in-place.
+
+Note that including the same column several times in the data frame will create aliases.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a=1:3, b=4:6)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 2     │ 5     │
+│ 3   │ 3     │ 6     │
+
+julia> select!(df, 2)
+3×1 DataFrame
+│ Row │ b     │
+│     │ Int64 │
+├─────┼───────┤
+│ 1   │ 4     │
+│ 2   │ 5     │
+│ 3   │ 6     │
+
+julia> df = DataFrame(a=1:3, b=4:6)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 2     │ 5     │
+│ 3   │ 3     │ 6     │
+
+julia> select!(df, :a=><(1.5)=>:c, :b)
+3×2 DataFrame
+│ Row │ c    │ b     │
+│     │ Bool │ Int64 │
+├─────┼──────┼───────┤
+│ 1   │ 1    │ 4     │
+│ 2   │ 0    │ 5     │
+│ 3   │ 0    │ 6     │
+```
+
+"""
+function select!(df::DataFrame, inds::AbstractVector{Int})
+    cols = _columns(df)
+    idx = index(df)
+    # an empty selection drops every column
+    if isempty(inds)
+        empty!(cols)
+        empty!(idx)
+        return df
+    end
+    lo, hi = extrema(inds)
+    lo < 1 && throw(ArgumentError("indices must be positive"))
+    hi > ncol(df) &&
+        throw(ArgumentError("indices must not be greater than number of columns"))
+    allunique(inds) || throw(ArgumentError("indices must not contain duplicates"))
+    # indexing with `inds` builds new vectors first, so copying back in place is safe
+    copy!(cols, cols[inds])
+    copy!(_names(idx), _names(df)[inds])
+    # rebuild the name => position lookup for the new column order
+    empty!(idx.lookup)
+    for (pos, name) in enumerate(idx.names)
+        idx.lookup[name] = pos
+    end
+    return df
+end
+
+# A single integer position is handled as a one-element vector of positions.
+function select!(df::DataFrame, c::Int)
+    return select!(df, [c])
+end
+
+# Other column selectors are first resolved through the index of `df`.
+function select!(df::DataFrame,
+                 c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
+                          Colon, All, Not, Between, Regex})
+    positions = index(df)[c]
+    return select!(df, positions)
+end
+
+# General case: build the result with `select` (without copying columns) and then
+# move its columns and names into `df`, rebuilding the name => position lookup.
+function select!(df::DataFrame, cs...)
+    selected = select(df, cs..., copycols=false)
+    idx = index(df)
+    copy!(_columns(df), _columns(selected))
+    copy!(_names(idx), _names(selected))
+    empty!(idx.lookup)
+    for (pos, name) in enumerate(idx.names)
+        idx.lookup[name] = pos
+    end
+    return df
+end
+
+"""
+    select(df::AbstractDataFrame, inds...; copycols::Bool=true)
+
+Create a new data frame that contains columns from `df`
+specified by `inds` and return it.
+
+Arguments passed as `inds...` can be any index that is allowed for column indexing
+provided that the columns requested in each of them are unique and present in `df`.
+In particular, regular expressions, `All`, `Between`, and `Not` selectors  are supported.
+
+Also if `df` is a `DataFrame` or `copycols=true` then column renaming and transformations
+are supported. The syntax for column renaming is `old_column=>new_column_name`.
+The syntax for column transformations is `old_column=>fun=>new_column_name`.
+If `old_column` is a `Symbol` or an integer then `fun` must be a callable
+that is applied row by row to the values of `old_column`.
+Otherwise `old_column` can be any column indexing syntax, but in this case `fun`
+will be passed each row of the columns selected by `old_column` as a `NamedTuple`.
+
+Column transformation syntax also supports a short `old_column=>fun` form, in which
+case `new_column_name` is automatically generated by joining `old_column` with `fun` name
+with `_`.
+
+If more than one argument is passed then duplicates are accepted except for
+column renaming and transformation operations, where it is not allowed to rename/transform
+into the same column name.
+For example if `:col` is present in `df` a call `select(df, :col, :)` is valid
+and creates a new data frame with column `:col` moved to be the first.
+
+If `df` is a `DataFrame` a new `DataFrame` is returned.
+If `copycols=true` (the default), then returned `DataFrame` is guaranteed not to share columns with `df`.
+If `copycols=false`, then returned `DataFrame` shares column vectors with `df` where possible.
+
+If `df` is a `SubDataFrame` then a `SubDataFrame` is returned if `copycols=false`
+and a `DataFrame` with freshly allocated columns otherwise.
+
+Note that if `df` is a `DataFrame` and `copycols=false` then including the same column several times
+in the resulting data frame will create aliases.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a=1:3, b=4:6)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 2     │ 5     │
+│ 3   │ 3     │ 6     │
+
+julia> select(df, :b)
+3×1 DataFrame
+│ Row │ b     │
+│     │ Int64 │
+├─────┼───────┤
+│ 1   │ 4     │
+│ 2   │ 5     │
+│ 3   │ 6     │
+
+julia> select(df, Not(:b)) # drop column :b from df
+3×1 DataFrame
+│ Row │ a     │
+│     │ Int64 │
+├─────┼───────┤
+│ 1   │ 1     │
+│ 2   │ 2     │
+│ 3   │ 3     │
+
+julia> select(df, :a=>:c, :b)
+3×2 DataFrame
+│ Row │ c     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 2     │ 5     │
+│ 3   │ 3     │ 6     │
+
+julia> select(df, :b, :a=><(1.5)=>:c)
+3×2 DataFrame
+│ Row │ b     │ c    │
+│     │ Int64 │ Bool │
+├─────┼───────┼──────┤
+│ 1   │ 4     │ 1    │
+│ 2   │ 5     │ 0    │
+│ 3   │ 6     │ 0    │
+```
+
+"""
+function select(df::DataFrame, inds::AbstractVector{Int}; copycols::Bool=true)
+    newcols = _columns(df)[inds]
+    newindex = Index(_names(df)[inds])
+    return DataFrame(newcols, newindex, copycols=copycols)
+end
+
+# Other column selectors are first resolved through the index of `df`.
+function select(df::DataFrame,
+                c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
+                         Colon, All, Not, Between, Regex};
+                copycols::Bool=true)
+    positions = index(df)[c]
+    return select(df, positions, copycols=copycols)
+end
+
+# A single column is handled as a one-element vector selection.
+function select(df::DataFrame, c::ColumnIndex; copycols::Bool=true)
+    return select(df, [c], copycols=copycols)
+end
+
+# General case: normalize every selector against the index of `df` and let
+# `_select` assemble the result.
+function select(df::DataFrame, cs...; copycols::Bool=true)
+    idx = index(df)
+    normalized = [normalize_selection(idx, sel) for sel in cs]
+    return _select(df, normalized, copycols)
+end
+
+# Assemble a new `DataFrame` from the normalized selections `ncs`.
+#
+# A first pass registers the target name of every rename/transformation (a `Pair`)
+# and rejects duplicated targets. A second pass walks `ncs` in order: plain column
+# positions add the column unless its name is already present, and if that name is
+# the target of a registered operation, the operation is run at that point instead;
+# every `Pair` is handed to `select_transform!`.
+function _select(df::DataFrame, ncs, copycols::Bool)
+    result = DataFrame()
+    # it should be OK to be type unstable here; this way we avoid having to compile
+    # a custom Dict
+    pending = Dict()
+    for nc in ncs
+        nc isa Pair || continue
+        target = last(last(nc))
+        @assert target isa Symbol
+        haskey(pending, target) &&
+            throw(ArgumentError("duplicate target transformed or renamed column names passed"))
+        pending[target] = nc
+    end
+    for nc in ncs
+        if !(nc isa Union{Int, AbstractVector{Int}})
+            select_transform!(nc, df, result, pending, copycols)
+            continue
+        end
+        allunique(nc) || throw(ArgumentError("duplicate column names selected"))
+        for i in nc
+            name = _names(df)[i]
+            # a column that is already present is not added again
+            hasproperty(result, name) && continue
+            if haskey(pending, name)
+                nct = pending[name]
+                @assert !isnothing(nct)
+                select_transform!(nct, df, result, pending, copycols)
+            else
+                result[!, name] = copycols ? df[:, i] : df[!, i]
+            end
+        end
+    end
+    return result
+end
+
+# A single column of a view is handled as a one-element vector selection.
+function select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true)
+    return select(dfv, [ind], copycols=copycols)
+end
+
+# Plain column selectors: index with `dfv[:, inds]` when `copycols` is true,
+# otherwise return a view.
+function select(dfv::SubDataFrame,
+                inds::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
+                            Colon, All, Not, Between, Regex};
+                copycols::Bool=true)
+    if copycols
+        return dfv[:, inds]
+    end
+    return view(dfv, :, inds)
+end
+
+# General `select` for a `SubDataFrame`.
+#
+# With `copycols=true` only the columns that are actually used are materialized
+# (`dfv[:, usedcols]`) and the result is assembled by `_select` from that narrowed
+# data frame. The normalized selections hold column positions relative to `dfv`, so
+# they are translated to positions in the narrowed data frame before being passed
+# on; without this the wrong columns would be read or a `BoundsError` thrown.
+#
+# With `copycols=false` a view is returned; renaming and transformations are
+# rejected with an `ArgumentError` in that case.
+function select(dfv::SubDataFrame, inds...; copycols::Bool=true)
+    if copycols
+        newinds = [normalize_selection(index(dfv), c) for c in inds]
+        usedcols = Int[]
+        for ni in newinds
+            # ni is guaranteed to be a Pair with first being an index or an index
+            append!(usedcols, ni isa Pair ? first(ni) : ni)
+        end
+        unique!(usedcols)
+        # position in `dfv` => position in the narrowed data frame
+        newpos = Dict{Int, Int}(old => new for (new, old) in enumerate(usedcols))
+        remap(c) = c isa Int ? newpos[c] : Int[newpos[i] for i in c]
+        remapped = [ni isa Pair ? remap(first(ni)) => last(ni) : remap(ni)
+                    for ni in newinds]
+        # NOTE(review): columns are not copied again here, so a source column that is
+        # selected more than once presumably ends up aliased in the result - confirm
+        return _select(dfv[:, usedcols], remapped, false)
+    else
+        # we do not support transformations here
+        # newinds should not be large so making it Vector{Any} should be OK
+        newinds = []
+        for ind in inds
+            newind = normalize_selection(index(dfv), ind)
+            if newind isa Pair
+                throw(ArgumentError("transforming and renaming columns of a " *
+                                    "`SubDataFrame` is not allowed when `copycols=false`"))
+            end
+            push!(newinds, newind)
+        end
+        return view(dfv, :, All(newinds...))
+    end
+end