From 4223da5da88af9cc9be534e595684ca4243bec1d Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 11 Oct 2020 12:55:52 +0200 Subject: [PATCH 1/6] mapcols!(): exit early if no columns (#2475) --- src/abstractdataframe/iteration.jl | 2 ++ test/iteration.jl | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 3805a0f4f2..4e6b0c6ec3 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -411,6 +411,8 @@ julia> df """ function mapcols!(f::Union{Function,Type}, df::DataFrame) # note: `f` must return a consistent length + ncol(df) == 0 && return df # skip if no columns + vs = AbstractVector[] seenscalar = false seenvector = false diff --git a/test/iteration.jl b/test/iteration.jl index 42921b6667..6a346711fd 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -71,8 +71,11 @@ end end @testset "mapcols!" begin + df_empty = DataFrame() + @test mapcols!(sum, df_empty) === df_empty + df_mapcols = DataFrame(a=1:10, b=11:20) - mapcols!(sum, df_mapcols) + @test mapcols!(sum, df_mapcols) === df_mapcols @test df_mapcols == DataFrame(a=55, b=155) df_mapcols = DataFrame(a=1:10, b=11:20) From fdb3ece371acf2ff808557c54e9a9f3ce9b3df57 Mon Sep 17 00:00:00 2001 From: Wolf Thomsen Date: Sun, 11 Oct 2020 12:57:14 +0200 Subject: [PATCH 2/6] add favicon to documentation assets and make.jl (#2478) --- docs/make.jl | 5 ++++- docs/src/assets/favicon.ico | Bin 0 -> 1150 bytes 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 docs/src/assets/favicon.ico diff --git a/docs/make.jl b/docs/make.jl index 8f55966777..e6ac334768 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,7 +14,10 @@ makedocs( doctest = false, clean = false, sitename = "DataFrames.jl", - format = Documenter.HTML(canonical = "https://juliadata.github.io/DataFrames.jl/stable/"), + format = Documenter.HTML( + canonical = "https://juliadata.github.io/DataFrames.jl/stable/", 
+ assets = ["assets/favicon.ico"] + ), pages = Any[ "Introduction" => "index.md", "User Guide" => Any[ diff --git a/docs/src/assets/favicon.ico b/docs/src/assets/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..eaa4da2c7a72982278e57c7bb9ff7508b5ad62a0 GIT binary patch literal 1150 zcmb`GOK%ck6o!wKx~L|Sv|;51Z;6*EG*+fiXv(d0pw$BE!nCGfW33dsXx-GRY)!f_ zH8D-Oq-CHu(ssIB0S4X3@K>GW7=7R-pS zT+h2IdiYNbwo+cCfLK7U6nqtYMgGoe5cj4)0pvxBuf{?*9PWw^{u4N9I2n;FViNul z(SHCZ2{ICL$VVi8-DNS0^>5UDte^QCAiImt*1w4PSqVAxX!uzFBX}VhwIBNtMij$!uf^@<11}%{cFD0Q-Z&^Ljd{3Zqs`LGSJGM{U$doOttI7 z9+PH?zp9Ry-Rg+RrHPo_+;aal>*A$D-_6CA&C#V+5n25`VzDRgpx(sb(H_CEpoy9? zATLE7>FWkkr70JE75vBWQeqF*rv_Z^Z@wYnWb^-*zgeGKkN-4(ZRoHe?g(n4<`*DY zrxQ8+EaTa#18-~3OXV2}u3`f*@)exm*hK!SRv literal 0 HcmV?d00001 From 9a292ff0ae7b82054048d5e40a1692b4605905d3 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 11 Oct 2020 18:34:03 +0200 Subject: [PATCH 3/6] [BREAKING] Use DataAPI.refpool for optimized grouping (#2442) Generalize existing optimized `row_group_slots` method for `CategoricalArray` and `PooledArray` so that it can be used for other array types for which `DataAPI.refpool` returns an `AbstractVector`. This allows dropping the dependency on CategoricalArrays in this part of the code. Also refactor the method to be faster when not sorting. In that case, we do not need to build a map between reference codes and groups (indexing into it is slow when the number of groups is very large). `CategoricalArray` is no longer special cased: when `sort=false`, levels are still sorted, but `missing` appears first. Add more tests to cover weird combinations. 
--- Project.toml | 2 +- src/dataframerow/utils.jl | 157 ++++++++++++++++------ src/groupeddataframe/splitapplycombine.jl | 3 +- test/grouping.jl | 127 +++++++++++++++-- 4 files changed, 233 insertions(+), 56 deletions(-) diff --git a/Project.toml b/Project.toml index a36eeb95c0..f737def79f 100644 --- a/Project.toml +++ b/Project.toml @@ -35,7 +35,7 @@ test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"] [compat] julia = "1" -CategoricalArrays = "0.8" +CategoricalArrays = "0.8.3" Compat = "3.17" DataAPI = "1.2" InvertedIndices = "1" diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index 00b98dae90..3c503c9048 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -94,10 +94,20 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int, # 4) whether groups are already sorted # Optional `groups` vector is set to the group indices of each row (starting at 1) # With skipmissing=true, rows with missing values are attributed index 0. +row_group_slots(cols::Tuple{Vararg{AbstractVector}}, + hash::Val = Val(true), + groups::Union{Vector{Int}, Nothing} = nothing, + skipmissing::Bool = false, + sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} = + row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort) + +# Generic fallback method based on open adressing hash table function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, + refpools::Any, hash::Val = Val(true), groups::Union{Vector{Int}, Nothing} = nothing, - skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} + skipmissing::Bool = false, + sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} @assert groups === nothing || length(groups) == length(cols[1]) rhashes, missings = hashrows(cols, skipmissing) # inspired by Dict code from base cf. 
https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 @@ -140,70 +150,132 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}}, return ngroups, rhashes, gslots, false end -nlevels(x::PooledArray) = length(x.pool) -nlevels(x) = length(levels(x)) - -function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}}, +# Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector +function row_group_slots(cols::NTuple{N,<:AbstractVector}, + refpools::NTuple{N,<:AbstractVector}, hash::Val{false}, groups::Union{Vector{Int}, Nothing} = nothing, - skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N + skipmissing::Bool = false, + sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N # Computing neither hashes nor groups isn't very useful, # and this method needs to allocate a groups vector anyway @assert groups !== nothing && all(col -> length(col) == length(groups), cols) + refs = map(DataAPI.refarray, cols) + missinginds = map(refpools) do refpool + eltype(refpool) >: Missing ? + something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1 + end + # If skipmissing=true, rows with missings all go to group 0, # which will be removed by functions down the stream - ngroupstup = map(cols) do c - nlevels(c) + (!skipmissing && eltype(c) >: Missing) + ngroupstup = map(refpools, missinginds) do refpool, missingind + len = length(refpool) + if skipmissing && missingind <= lastindex(refpool) + return len - 1 + else + return len + end end ngroups = prod(ngroupstup) - # Fall back to hashing if there would be too many empty combinations. + # Fall back to hashing if there would be too many empty combinations + # or if the pool does not contain only unique values # The first check ensures the computation of ngroups did not overflow. 
# The rationale for the 2 threshold is that while the fallback method is always slower, # it allocates a hash table of size length(groups) instead of the remap vector # of size ngroups (i.e. the number of possible combinations) in this method: # so it makes sense to allocate more memory for better performance, # but it needs to remain reasonable compared with the size of the data frame. - if prod(Int128.(ngroupstup)) > typemax(Int) || ngroups > 2 * length(groups) + anydups = !all(allunique, refpools) + if prod(big.(ngroupstup)) > typemax(Int) || + ngroups > 2 * length(groups) || + anydups + # In the simplest case, we can work directly with the reference codes + newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) || + sort || + anydups ? cols : refs return invoke(row_group_slots, - Tuple{Tuple{Vararg{AbstractVector}}, Val, - Union{Vector{Int}, Nothing}, Bool}, - cols, hash, groups, skipmissing) + Tuple{Tuple{Vararg{AbstractVector}}, Any, Val, + Union{Vector{Int}, Nothing}, Bool, Bool}, + newcols, refpools, hash, groups, skipmissing, sort) end seen = fill(false, ngroups) - # Compute vector mapping missing to -1 if skipmissing=true - refmaps = map(cols) do col - nlevs = nlevels(col) - refmap = collect(-1:(nlevs-1)) - # First value in refmap is only used by CategoricalArray - # (corresponds to ref 0, i.e. missing values) - refmap[1] = skipmissing ? -1 : nlevs - if col isa PooledArray{>: Missing} && skipmissing - missingind = get(col.invpool, missing, 0) - if missingind > 0 - refmap[missingind+1] = -1 - refmap[missingind+2:end] .-= 1 - end - end - refmap - end strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N,Int} - @inbounds for i in eachindex(groups) - local refs - let i=i # Workaround for julia#15276 - refs = map(c -> c.refs[i], cols) + firstinds = map(firstindex, refpools) + if sort + nminds = map(refpools, missinginds) do refpool, missingind + missingind > lastindex(refpool) ? 
+ eachindex(refpool) : setdiff(eachindex(refpool), missingind) end - vals = map((m, r, s) -> m[r+1] * s, refmaps, refs, strides) - j = sum(vals) + 1 - # x < 0 happens with -1 in refmap, which corresponds to missing - if skipmissing && any(x -> x < 0, vals) - j = 0 + if skipmissing + sorted = all(issorted(view(refpool, nmind)) + for (refpool, nmind) in zip(refpools, nminds)) else - seen[j] = true + sorted = all(issorted, refpools) + end + else + sorted = false + end + if sort && !sorted + # Compute vector mapping missing to -1 if skipmissing=true + refmaps = map(cols, refpools, missinginds, nminds) do col, refpool, missingind, nmind + refmap = collect(0:length(refpool)-1) + if skipmissing + fi = firstindex(refpool) + if missingind <= lastindex(refpool) + refmap[missingind-fi+1] = -1 + refmap[missingind-fi+2:end] .-= 1 + end + if sort + perm = sortperm(view(refpool, nmind)) + invpermute!(view(refmap, nmind .- fi .+ 1), perm) + end + elseif sort + # collect is needed for CategoricalRefPool + invpermute!(refmap, sortperm(collect(refpool))) + end + refmap + end + @inbounds for i in eachindex(groups) + local refs_i + let i=i # Workaround for julia#15276 + refs_i = map(c -> c[i], refs) + end + vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds) + j = sum(vals) + 1 + # x < 0 happens with -1 in refmap, which corresponds to missing + if skipmissing && any(x -> x < 0, vals) + j = 0 + else + seen[j] = true + end + groups[i] = j + end + else + @inbounds for i in eachindex(groups) + local refs_i + let i=i # Workaround for julia#15276 + refs_i = map(refs, missinginds) do ref, missingind + r = Int(ref[i]) + if skipmissing + return r == missingind ? -1 : (r > missingind ? 
r-1 : r) + else + return r + end + end + end + vals = map((r, s, fi) -> (r-fi) * s, refs_i, strides, firstinds) + j = sum(vals) + 1 + # x < 0 happens with -1, which corresponds to missing + if skipmissing && any(x -> x < 0, vals) + j = 0 + else + seen[j] = true + end + groups[i] = j end - groups[i] = j end if !all(seen) # Compress group indices to remove unused ones oldngroups = ngroups @@ -220,8 +292,7 @@ function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}} # To catch potential bugs inducing unnecessary computations @assert oldngroups != ngroups end - sorted = all(col -> col isa CategoricalVector, cols) - return ngroups, UInt[], Int[], sorted + return ngroups, UInt[], Int[], sort end @@ -267,7 +338,7 @@ end function group_rows(df::AbstractDataFrame) groups = Vector{Int}(undef, nrow(df)) ngroups, rhashes, gslots, sorted = - row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false) + row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false, false) rperm, starts, stops = compute_indices(groups, ngroups) return RowGroupDict(df, rhashes, gslots, groups, rperm, starts, stops) end diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index b6f0595019..6f9da55f1c 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -160,7 +160,8 @@ function groupby(df::AbstractDataFrame, cols; groups = Vector{Int}(undef, nrow(df)) ngroups, rhashes, gslots, sorted = - row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), groups, skipmissing) + row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), + groups, skipmissing, sort) gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing, Threads.ReentrantLock()) diff --git a/test/grouping.jl b/test/grouping.jl index 26b81ad058..ecf16cce70 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -594,6 +594,122 @@ end end end 
+@testset "grouping arrays that allow missing without missings" begin + xv = ["A", "B", "B", "B", "A", "B", "A", "A"] + yv = ["B", "A", "A", "B", "A", "B", "A", "A"] + xvars = (xv, + categorical(xv), + levels!(categorical(xv), ["A", "B", "X"]), + levels!(categorical(xv), ["X", "B", "A"]), + _levels!(PooledArray(xv), ["A", "B"]), + _levels!(PooledArray(xv), ["B", "A", "X"]), + _levels!(PooledArray(xv), ["X", "A", "B"])) + yvars = (yv, + categorical(yv), + levels!(categorical(yv), ["A", "B", "X"]), + levels!(categorical(yv), ["B", "X", "A"]), + _levels!(PooledArray(yv), ["A", "B"]), + _levels!(PooledArray(yv), ["A", "B", "X"]), + _levels!(PooledArray(yv), ["B", "A", "X"])) + for x in xvars, y in yvars, + fx in (identity, allowmissing), + fy in (identity, allowmissing) + df = DataFrame(Key1 = fx(x), Key2 = fy(y), Value = 1:8) + + @testset "sort=false, skipmissing=false" begin + gd = groupby_checked(df, :Key1) + @test length(gd) == 2 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]), + DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]), + ]) + + gd = groupby_checked(df, [:Key1, :Key2]) + @test length(gd) == 4 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]), + DataFrame(Key1="A", Key2="B", Value=1), + DataFrame(Key1="B", Key2="A", Value=[2, 3]), + DataFrame(Key1="B", Key2="B", Value=[4, 6]) + ]) + end + + @testset "sort=false, skipmissing=true" begin + gd = groupby_checked(df, :Key1, skipmissing=true) + @test length(gd) == 2 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]), + DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]) + ]) + + gd = groupby_checked(df, [:Key1, :Key2], skipmissing=true) + @test length(gd) == 4 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]), + DataFrame(Key1="A", Key2="B", Value=1), + DataFrame(Key1="B", Key2="A", Value=[2, 3]), + DataFrame(Key1="B", 
Key2="B", Value=[4, 6]) + ]) + end + + @testset "sort=true, skipmissing=false" begin + gd = groupby_checked(df, :Key1, sort=true) + @test length(gd) == 2 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]), + DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]), + ]) + @test issorted(vcat(gd...), :Key1) + + gd = groupby_checked(df, [:Key1, :Key2], sort=true) + @test length(gd) == 4 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]), + DataFrame(Key1="A", Key2="B", Value=1), + DataFrame(Key1="B", Key2="A", Value=[2, 3]), + DataFrame(Key1="B", Key2="B", Value=[4, 6]), + ]) + @test issorted(vcat(gd...), [:Key1, :Key2]) + end + + @testset "sort=true, skipmissing=true" begin + gd = groupby_checked(df, :Key1, sort=true, skipmissing=true) + @test length(gd) == 2 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]), + DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]) + ]) + @test issorted(vcat(gd...), :Key1) + + gd = groupby_checked(df, [:Key1, :Key2], sort=true, skipmissing=true) + @test length(gd) == 4 + @test isequal_unordered(gd, [ + DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]), + DataFrame(Key1="A", Key2="B", Value=1), + DataFrame(Key1="B", Key2="A", Value=[2, 3]), + DataFrame(Key1="B", Key2="B", Value=[4, 6]) + ]) + @test issorted(vcat(gd...), [:Key1, :Key2]) + end + end +end + +@testset "grouping refarray with fallback" begin + # The high number of categories compared to the number of rows triggers the use + # of the fallback grouping method + for x in ([3, 1, 2], [3, 1, missing]) + df = DataFrame(x=categorical(x, levels=10000:-1:1), + x2=categorical(x, levels=3:-1:1), + y=[1, 2, 3]) + for skipmissing in (true, false) + @test groupby(df, :x, sort=true, skipmissing=skipmissing) ≅ + groupby(df, :x, sort=true, skipmissing=skipmissing) + @test isequal_unordered(groupby(df, :x, skipmissing=skipmissing), + 
collect(AbstractDataFrame, groupby(df, :x, skipmissing=skipmissing))) + end + end +end + @testset "grouping with three keys" begin # We need many rows so that optimized CategoricalArray method is used xv = rand(["A", "B", missing], 100) @@ -632,17 +748,6 @@ end dfs = [groupby_checked(dfb, [:Key1, :Key2, :Key3], sort=true, skipmissing=true)...] @test isequal_unordered(gd, dfs) @test issorted(vcat(gd...), [:Key1, :Key2, :Key3]) - - # This is an implementation detail but it allows checking - # that the optimized method is used - if df.Key1 isa CategoricalVector && - df.Key2 isa CategoricalVector && - df.Key3 isa CategoricalVector - @test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true) ≅ - groupby_checked(df, [:Key1, :Key2, :Key3], sort=false) - @test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true, skipmissing=true) ≅ - groupby_checked(df, [:Key1, :Key2, :Key3], sort=false, skipmissing=true) - end end end From b11fe976138e12ce288eca44d2c06874010d6404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 12 Oct 2020 10:54:23 +0200 Subject: [PATCH 4/6] allow passing empty sets of columns to ByRow and filter (#2476) --- NEWS.md | 2 + src/abstractdataframe/abstractdataframe.jl | 28 ++++---- src/abstractdataframe/selection.jl | 76 ++++++++++++---------- src/groupeddataframe/splitapplycombine.jl | 14 +++- test/data.jl | 42 ++++++++++-- test/grouping.jl | 41 ++++++++++++ 6 files changed, 151 insertions(+), 52 deletions(-) diff --git a/NEWS.md b/NEWS.md index 81a9d67825..504d2386b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,6 +74,8 @@ which if set to `true` makes them retun a `SubDataFrame` view into the passed data frame. 
* add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449)) +* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine` + with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476)) ## Deprecated diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 46aee96257..c92d84cb6b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -994,9 +994,10 @@ end @inline function Base.filter((cols, f)::Pair, df::AbstractDataFrame; view::Bool=false) int_cols = index(df)[cols] # it will be AbstractVector{Int} or Int if length(int_cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f() for _ in axes(df, 1)] + else + rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...) end - rowidxs = _filter_helper(f, (df[!, i] for i in int_cols)...) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1006,9 +1007,10 @@ end AbstractVector{<:Symbol}}}, df::AbstractDataFrame; view::Bool=false) if length(cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f() for _ in axes(df, 1)] + else + rowidxs = _filter_helper(f, (df[!, i] for i in cols)...) end - rowidxs = _filter_helper(f, (df[!, i] for i in cols)...) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1018,9 +1020,10 @@ _filter_helper(f, cols...)::BitVector = ((x...) -> f(x...)::Bool).(cols...) view::Bool=false) df_tmp = select(df, cols.cols, copycols=false) if ncol(df_tmp) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = [f(NamedTuple()) for _ in axes(df, 1)] + else + rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp)) end - rowidxs = _filter_helper_astable(f, Tables.namedtupleiterator(df_tmp)) return view ? 
Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -1101,7 +1104,7 @@ julia> filter!(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df) │ 3 │ 1 │ b │ ``` """ -Base.filter!(f, df::AbstractDataFrame) = _filter!_helper(df, f, eachrow(df)) +Base.filter!(f, df::AbstractDataFrame) = delete!(df, findall(!f, eachrow(df))) Base.filter!((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) = _filter!_helper(df, f, df[!, col]) Base.filter!((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) = @@ -1115,17 +1118,20 @@ Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) = function _filter!_helper(df::AbstractDataFrame, f, cols...) if length(cols) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + rowidxs = findall(x -> !f(), axes(df, 1)) + else + rowidxs = findall(((x...) -> !(f(x...)::Bool)).(cols...)) end - return delete!(df, findall(((x...) -> !(f(x...)::Bool)).(cols...))) + return delete!(df, rowidxs) end function Base.filter!((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame) dff = select(df, cols.cols, copycols=false) if ncol(dff) == 0 - throw(ArgumentError("At least one column must be passed to filter on")) + return delete!(df, findall(x -> !f(NamedTuple()), axes(df, 1))) + else + return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f) end - return _filter!_helper_astable(df, Tables.namedtupleiterator(dff), f) end _filter!_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) = diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 76393ccd85..88500bd77d 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -67,10 +67,20 @@ normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractStrin normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool) function normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable, - <:Union{Symbol, AbstractString, 
DataType, - AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}}}, + sel::Pair{<:ColumnIndex, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractString}}}, + renamecols::Bool) + src, (fun, dst) = sel + return idx[src] => fun => Symbol(dst) +end + +function normalize_selection(idx::AbstractIndex, + sel::Pair{<:Any, + <:Pair{<:Base.Callable, + <:Union{Symbol, AbstractString, DataType, + AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}}}, renamecols::Bool) lls = last(last(sel)) if lls isa DataType @@ -170,31 +180,29 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => fun => newcol end -function _transformation_helper(df::AbstractDataFrame, - col_idx::Union{Nothing, Int, AbstractVector{Int}, AsTable}, - @nospecialize(fun)) - if col_idx === nothing - return fun(df) - elseif col_idx isa Int - return fun(df[!, col_idx]) - elseif col_idx isa AsTable - tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) - if isempty(tbl) && fun isa ByRow - return [fun.fun(NamedTuple()) for _ in 1:nrow(df)] - else - return fun(tbl) - end +_transformation_helper(df::AbstractDataFrame, col_idx::Nothing, fun) = fun(df) +_transformation_helper(df::AbstractDataFrame, col_idx::Int, fun) = fun(df[!, col_idx]) + +_empty_astable_helper(fun, len) = [fun(NamedTuple()) for _ in 1:len] + +function _transformation_helper(df::AbstractDataFrame, col_idx::AsTable, fun) + tbl = Tables.columntable(select(df, col_idx.cols, copycols=false)) + if isempty(tbl) && fun isa ByRow + return _empty_astable_helper(fun.fun, nrow(df)) else - # it should be fast enough here as we do not expect to do it millions of times - @assert col_idx isa AbstractVector{Int} - if isempty(col_idx) && fun isa ByRow - return [fun.fun() for _ in 1:nrow(df)] - else - cdf = eachcol(df) - return fun(map(c -> cdf[c], col_idx)...) 
- end + return fun(tbl) + end +end + +_empty_selector_helper(fun, len) = [fun() for _ in 1:len] + +function _transformation_helper(df::AbstractDataFrame, col_idx::AbstractVector{Int}, fun) + if isempty(col_idx) && fun isa ByRow + return _empty_selector_helper(fun.fun, nrow(df)) + else + cdf = eachcol(df) + return fun(map(c -> cdf[c], col_idx)...) end - throw(ErrorException("unreachable reached")) end function _gen_colnames(@nospecialize(res), newname::Union{AbstractVector{Symbol}, @@ -656,7 +664,7 @@ julia> select!(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat ``` """ -select!(df::DataFrame, args...; renamecols::Bool=true) = +select!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) function select!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -676,7 +684,7 @@ Equivalent to `select!(df, :, args...)`. See [`select!`](@ref) for detailed rules regarding accepted values for `args`. """ -transform!(df::DataFrame, args...; renamecols::Bool=true) = +transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) = select!(df, :, args..., renamecols=renamecols) function transform!(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -810,7 +818,7 @@ julia> select(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stats ``` """ -select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = +select(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) = manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols) function select(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -831,7 +839,7 @@ Equivalent to `select(df, :, args..., copycols=copycols)`. See [`select`](@ref) for detailed rules regarding accepted values for `args`. 
""" -transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = +transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) = select(df, :, args..., copycols=copycols, renamecols=renamecols) function transform(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -935,7 +943,7 @@ julia> combine(df, AsTable(:) => ByRow(x -> (mean=mean(x), std=std(x))) => :stat │ 3 │ (mean = 6.0, std = 3.0) │ 6.0 │ 3.0 │ ``` """ -combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = +combine(df::AbstractDataFrame, @nospecialize(args...); renamecols::Bool=true) = manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols) function combine(arg::Base.Callable, df::AbstractDataFrame; renamecols::Bool=true) @@ -964,7 +972,7 @@ manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool, renamecols::Bool) = manipulate(df, [c], copycols=copycols, keeprows=keeprows, renamecols=renamecols) -function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool) +function manipulate(df::DataFrame, @nospecialize(cs...); copycols::Bool, keeprows::Bool, renamecols::Bool) cs_vec = [] for v in cs if v isa AbstractVecOrMat{<:Pair} @@ -1061,7 +1069,7 @@ function manipulate(dfv::SubDataFrame, args::MultiColumnIndex; end end -function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool, +function manipulate(dfv::SubDataFrame, @nospecialize(args...); copycols::Bool, keeprows::Bool, renamecols::Bool) if copycols cs_vec = [] diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 6f9da55f1c..5664e7449e 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -712,7 +712,11 @@ end function do_call(f::Any, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::Tuple{}, 
i::Integer) - f() + if f isa ByRow + return [f.fun() for _ in 1:(ends[i] - starts[i] + 1)] + else + return f() + end end function do_call(f::Any, idx::AbstractVector{<:Integer}, @@ -754,8 +758,12 @@ end function do_call(f::Any, idx::AbstractVector{<:Integer}, starts::AbstractVector{<:Integer}, ends::AbstractVector{<:Integer}, gd::GroupedDataFrame, incols::NamedTuple, i::Integer) - idx = idx[starts[i]:ends[i]] - return f(map(c -> view(c, idx), incols)) + if f isa ByRow && isempty(incols) + return [f.fun(NamedTuple()) for _ in 1:(ends[i] - starts[i] + 1)] + else + idx = idx[starts[i]:ends[i]] + return f(map(c -> view(c, idx), incols)) + end end function do_call(f::Any, idx::AbstractVector{<:Integer}, diff --git a/test/data.jl b/test/data.jl index cd9f167865..fff7a74cec 100644 --- a/test/data.jl +++ b/test/data.jl @@ -453,11 +453,45 @@ end @test filter(AsTable("x") => testfun, df) == DataFrame(x=[3, 2], y=["b", "a"]) filter!(AsTable("x") => testfun, df) @test df == DataFrame(x=[3, 2], y=["b", "a"]) +end + +@testset "empty arg to filter and filter!" 
begin + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + + @test filter([] => () -> true, df) == df + @test filter(AsTable(r"z") => x -> true, df) == df + @test filter!([] => () -> true, copy(df)) == df + @test filter!(AsTable(r"z") => x -> true, copy(df)) == df + + flipflop0 = let + state = false + () -> (state = !state) + end + + flipflop1 = let + state = false + x -> (state = !state) + end - @test_throws ArgumentError filter([] => () -> true, df) - @test_throws ArgumentError filter(AsTable(r"z") => () -> true, df) - @test_throws ArgumentError filter!([] => () -> true, df) - @test_throws ArgumentError filter!(AsTable(r"z") => () -> true, df) + @test filter([] => flipflop0, df) == df[[1,3], :] + @test filter(Int[] => flipflop0, df) == df[[1,3], :] + @test filter(String[] => flipflop0, df) == df[[1,3], :] + @test filter(Symbol[] => flipflop0, df) == df[[1,3], :] + @test filter(r"z" => flipflop0, df) == df[[1,3], :] + @test filter(Not(All()) => flipflop0, df) == df[[1,3], :] + @test filter(AsTable(r"z") => flipflop1, df) == df[[1,3], :] + @test filter(AsTable([]) => flipflop1, df) == df[[1,3], :] + @test filter!([] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Int[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(String[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Symbol[] => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(r"z" => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(Not(All()) => flipflop0, copy(df)) == df[[1,3], :] + @test filter!(AsTable(r"z") => flipflop1, copy(df)) == df[[1,3], :] + @test filter!(AsTable([]) => flipflop1, copy(df)) == df[[1,3], :] + + @test_throws MethodError filter([] => flipflop1, df) + @test_throws MethodError filter(AsTable([]) => flipflop0, df) end @testset "names with cols" begin diff --git a/test/grouping.jl b/test/grouping.jl index ecf16cce70..d5f1934506 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -2979,4 +2979,45 @@ end @test df == DataFrame(a=1:3, b=4:6, 
c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) end +@testset "empty ByRow" begin + inc0 = let + state = 0 + () -> (state += 1) + end + + inc1 = let + state = 0 + x -> (state += 1) + end + + df = DataFrame(a=[1,1,1,2,2,3,4,4,5,5,5,5], b=1:12) + gdf = groupby_checked(df, :a) + + @test select(gdf, [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a, bin=1:12) + @test combine(gdf, [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a, bin=13:24) + @test select(gdf, AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a, bin=1:12) + @test combine(gdf, AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a, bin=13:24) + @test combine(gdf[Not(2)], [] => ByRow(inc0) => :bin) == + DataFrame(a=df.a[Not(4:5)], bin=25:34) + @test combine(gdf[Not(2)], AsTable([]) => ByRow(inc1) => :bin) == + DataFrame(a=df.a[Not(4:5)], bin=25:34) + + # note that type inference in a comprehension does not always work + @test isequal_coltyped(combine(gdf[[]], [] => ByRow(inc0) => :bin), + DataFrame(a=Int[], bin=Any[])) + @test isequal_coltyped(combine(gdf[[]], [] => ByRow(rand) => :bin), + DataFrame(a=Int[], bin=Float64[])) + @test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(inc1) => :bin), + DataFrame(a=Int[], bin=Any[])) + @test isequal_coltyped(combine(gdf[[]], AsTable([]) => ByRow(x -> rand()) => :bin), + DataFrame(a=Int[], bin=Float64[])) + + @test_throws MethodError select(gdf, [] => ByRow(inc1) => :bin) + @test_throws MethodError select(gdf, AsTable([]) => ByRow(inc0) => :bin) +end + end # module From f507944731e9ddf983f79eb495308b82544a22f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 13 Oct 2020 17:41:02 +0200 Subject: [PATCH 5/6] remove dependency on CategoricalArrays.jl in legacy show (#2427) --- src/abstractdataframe/show.jl | 42 +++++++++-------------------------- test/show.jl | 8 +++---- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index 7e23975f2c..ccca7f9bdc 
100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -28,47 +28,25 @@ end """ DataFrames.ourshow(io::IO, x::Any, truncstring::Int) -Render a value to an `IO` object compactly and omitting type information, by -calling 3-argument `show`, or 2-argument `show` if the former contains line breaks. -Unlike `show`, render strings without surrounding quote marks. +Render a value to an `IO` object compactly using print. `truncstring` indicates the approximate number of text characters width to truncate the output (if it is a non-positive value then no truncation is applied). """ function ourshow(io::IO, x::Any, truncstring::Int; styled::Bool=false) io_ctx = IOContext(io, :compact=>get(io, :compact, true), :typeinfo=>typeof(x)) - - # This mirrors the behavior of Base.print_matrix_row - # First try 3-arg show - sx = sprint(show, "text/plain", x, context=io_ctx) - - # If the output contains line breaks, try 2-arg show instead. - if occursin('\n', sx) - sx = sprint(show, x, context=io_ctx) - end - - # strings should have " stripped here - if x isa AbstractString - @assert sx[1] == sx[end] == '"' - sx = escape_string(chop(sx, head=1, tail=1), "") - end - + sx = sprint(print, x, context=io_ctx) + sx = escape_string(sx, ()) # do not escape " sx = truncatestring(sx, truncstring) - - if styled - printstyled(io_ctx, sx, color=:light_black) - else - print(io_ctx, sx) - end + styled ? 
printstyled(io_ctx, sx, color=:light_black) : print(io_ctx, sx) end const SHOW_TABULAR_TYPES = Union{AbstractDataFrame, DataFrameRow, DataFrameRows, DataFrameColumns, GroupedDataFrame} -ourshow(io::IO, x::AbstractString, truncstring::Int) = - escape_string(io, truncatestring(x, truncstring), "") -ourshow(io::IO, x::CategoricalValue{<:AbstractString}, truncstring::Int) = - ourshow(io, get(x), truncstring) -ourshow(io::IO, x::Symbol, truncstring::Int) = ourshow(io, string(x), truncstring) +# workaround Julia 1.0 for Char +ourshow(io::IO, x::Char, truncstring::Int; styled::Bool=false) = + ourshow(io, string(x), styled=styled, truncstring) + ourshow(io::IO, x::Nothing, truncstring::Int; styled::Bool=false) = ourshow(io, "", styled=styled, truncstring) ourshow(io::IO, x::SHOW_TABULAR_TYPES, truncstring::Int; styled::Bool=false) = @@ -113,7 +91,9 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true) maxwidth -= 1 # we will add "…" at the end - if T <: CategoricalValue + # This is only type display shortening so we + # are OK with any T whose name starts with CategoricalValue here + if startswith(sT, "CategoricalValue") || startswith(sT, "CategoricalArrays.CategoricalValue") sT = string(nameof(T)) if textwidth(sT) ≤ maxwidth return sT * "…" * suffix diff --git a/test/show.jl b/test/show.jl index be6d563833..94eb6b47bf 100644 --- a/test/show.jl +++ b/test/show.jl @@ -31,10 +31,10 @@ end │ Row │ A │ B │ C │ D │ │ │ Int64 │ String │ Float32 │ Char │ ├─────┼───────┼─────────────┼─────────┼──────┤ - │ 1 │ 1 │ x" │ 1.0 │ '\\'' │ - │ 2 │ 2 │ ∀ε>0: x+ε>x │ 2.0 │ '∀' │ - │ 3 │ 3 │ z\$ │ 3.0 │ '\$' │ - │ 4 │ 4 │ A\\nC │ 4.0 │ '\\n' │""" + │ 1 │ 1 │ x" │ 1.0 │ ' │ + │ 2 │ 2 │ ∀ε>0: x+ε>x │ 2.0 │ ∀ │ + │ 3 │ 3 │ z\$ │ 3.0 │ \$ │ + │ 4 │ 4 │ A\\nC │ 4.0 │ \\n │""" for allrows in [true, false], allcols in [true, false] io = IOBuffer() From e07b08da25ebbebb83117e35a3df8ae20fe0b7ad Mon Sep 17 00:00:00 2001 From: Kevin Bonham Date: Fri, 16 Oct 2020 11:54:02 -0400 Subject: 
[PATCH 6/6] Implement permutedims (#2447) --- NEWS.md | 1 + docs/src/lib/functions.md | 1 + docs/src/man/reshaping_and_pivoting.md | 50 ++++++++++++ src/abstractdataframe/reshape.jl | 104 +++++++++++++++++++++++++ test/reshape.jl | 104 ++++++++++++++++++++----- 5 files changed, 241 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 504d2386b3..fff1f8c405 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,7 @@ * add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449)) * passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine` with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476)) +* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447)) ## Deprecated diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index a3e154d2c1..128f0ac9e7 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -57,6 +57,7 @@ vcat ```@docs stack unstack +permutedims ``` ## Sorting diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index bb8ca54dea..7107f7184f 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -380,3 +380,53 @@ julia> first(unstack(x, :Species, :vsum), 6) │ 4 │ PetalWidth │ 0.244 │ 1.326 │ 2.026 │ │ 5 │ id │ 25.5 │ 75.5 │ 125.5 │ ``` + +To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref). 
+
+```jldoctest reshape
+julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a │ b │ c │ d │
+│ │ String │ Float64 │ Int64 │ Bool │
+├─────┼────────┼─────────┼───────┼──────┤
+│ 1 │ x │ 1.0 │ 3 │ 1 │
+│ 2 │ y │ 2.0 │ 4 │ 0 │
+
+julia> permutedims(df1, 1)
+3×3 DataFrame
+│ Row │ a │ x │ y │
+│ │ String │ Float64 │ Float64 │
+├─────┼────────┼─────────┼─────────┤
+│ 1 │ b │ 1.0 │ 2.0 │
+│ 2 │ c │ 3.0 │ 4.0 │
+│ 3 │ d │ 1.0 │ 0.0 │
+```
+
+Note that the column indexed by `src_namescol` in the original `df`
+becomes the column names in the permuted result,
+and the column names of the original become a new column.
+Typically, this would be used on columns with homogeneous element types,
+since the element types of the other columns
+are the result of `promote_type` on _all_ the permuted columns.
+Note also that, by default, the new column created from the column names
+of the original `df` has the same name as `src_namescol`.
+An optional positional argument `dest_namescol` can alter this:
+
+```jldoctest reshape
+julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a │ b │ c │ d │
+│ │ String │ Any │ Int64 │ Bool │
+├─────┼────────┼─────┼───────┼──────┤
+│ 1 │ x │ 1 │ 3 │ 1 │
+│ 2 │ y │ two │ 4 │ 0 │
+
+julia> permutedims(df2, 1, "different_name")
+3×3 DataFrame
+│ Row │ different_name │ x │ y │
+│ │ String │ Any │ Any │
+├─────┼────────────────┼─────┼─────┤
+│ 1 │ b │ 1 │ two │
+│ 2 │ c │ 3 │ 4 │
+│ 3 │ d │ 1 │ 0 │
+```
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 7db56ff20a..4f5cca7d1d 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -399,3 +399,107 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) res end + + +Base.transpose(::AbstractDataFrame, args...; kwargs...) 
= + MethodError("`transpose` not defined for `AbstractDataFrame`s. Try `permutedims` instead") + +""" + permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString}, + [dest_namescol::Union{Symbol, AbstractString}]; + makeunique::Bool=false) + +Turn `df` on its side such that rows become columns +and values in the column indexed by `src_namescol` become the names of new columns. +In the resulting `DataFrame`, column names of `df` will become the first column +with name specified by `dest_namescol`. + +# Arguments +- `df` : the `AbstractDataFrame` +- `src_namescol` : the column that will become the new header. + This column's element type must be `AbstractString` or `Symbol`. +- `dest_namescol` : the name of the first column in the returned `DataFrame`. + Defaults to the same name as `src_namescol`. +- `makeunique` : if `false` (the default), an error will be raised + if duplicate names are found; if `true`, duplicate names will be suffixed + with `_i` (`i` starting at 1 for the first duplicate). + +Note: The element types of columns in resulting `DataFrame` +(other than the first column, which always has element type `String`) +will depend on the element types of _all_ input columns +based on the result of `promote_type`. +That is, if the source data frame contains `Int` and `Float64` columns, +resulting columns will have element type `Float64`. If the source has +`Int` and `String` columns, resulting columns will have element type `Any`. 
+ +# Examples + +```jldoctest +julia> df1 = DataFrame(a=["x", "y"], b=[1., 2.], c=[3, 4], d=[true,false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Float64 │ Int64 │ Bool │ +├─────┼────────┼─────────┼───────┼──────┤ +│ 1 │ x │ 1.0 │ 3 │ 1 │ +│ 2 │ y │ 2.0 │ 4 │ 0 │ + +julia> permutedims(df1, 1) # note the column types +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Float64 │ Float64 │ +├─────┼────────┼─────────┼─────────┤ +│ 1 │ b │ 1.0 │ 2.0 │ +│ 2 │ c │ 3.0 │ 4.0 │ +│ 3 │ d │ 1.0 │ 0.0 │ + +julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Any │ Int64 │ Bool │ +├─────┼────────┼─────┼───────┼──────┤ +│ 1 │ x │ 1 │ 3 │ 1 │ +│ 2 │ y │ two │ 4 │ 0 │ + +julia> permutedims(df2, 1, "different_name") +3×3 DataFrame +│ Row │ different_name │ x │ y │ +│ │ String │ Any │ Any │ +├─────┼────────────────┼─────┼─────┤ +│ 1 │ b │ 1 │ two │ +│ 2 │ c │ 3 │ 4 │ +│ 3 │ d │ 1 │ 0 │ +``` +""" +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, + dest_namescol::Union{Symbol, AbstractString}; + makeunique::Bool=false) + + if src_namescol isa Integer + 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) + end + eltype(df[!, src_namescol]) <: SymbolOrString || + throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) + + df_notsrc = df[!, Not(src_namescol)] + df_permuted = DataFrame(dest_namescol => names(df_notsrc)) + + if ncol(df_notsrc) == 0 + df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], df[!, src_namescol], + makeunique=makeunique, copycols=false) + else + m = permutedims(Matrix(df_notsrc)) + df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) + end + return hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false) +end + +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; + makeunique::Bool=false) + if src_namescol isa Integer + 
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) + dest_namescol = _names(df)[src_namescol] + else + dest_namescol = src_namescol + end + return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) +end diff --git a/test/reshape.jl b/test/reshape.jl index 6e215e7e92..6aeca3dc04 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -25,10 +25,10 @@ const ≅ = isequal # first column stays as CategoricalArray in df3 @test df3 == df4 #Make sure unstack works with missing values at the start of the value column - df[1,:Value] = missing + df[1, :Value] = missing df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result - df4[1,:Mass] = missing + df4[1, :Mass] = missing @test df2 ≅ df4 df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]), @@ -62,11 +62,11 @@ const ≅ = isequal @test df3 == df4 #Make sure unstack works with missing values at the start of the value column allowmissing!(df, :Value) - df[1,:Value] = missing + df[1, :Value] = missing df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result allowmissing!(df4, :Mass) - df4[2,:Mass] = missing + df4[2, :Mass] = missing @test df2 ≅ df4 df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], @@ -89,9 +89,9 @@ const ≅ = isequal @test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol) # test missing value in grouping variable - mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4) - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] + mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4) + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] + @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @test 
unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @@ -158,7 +158,7 @@ end b = unstack(df, :variable, :value) @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4]) - df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1]) + df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1]) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value) end @@ -225,14 +225,14 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a)) + d1m = stack(d1[:, [1, 3, 4]], Not(:a)) @test propertynames(d1m) == [:a, :variable, :value] # Test naming of measure/value columns d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval) @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval] - d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval) + d1m_named = stack(d1[:, [1, 3, 4]], Not(:a), variable_name=:letter, value_name=:someval) @test propertynames(d1m_named) == [:a, :letter, :someval] # test empty measures or ids @@ -270,21 +270,21 @@ end @test d1s[!, 5] isa DataFrames.StackedVector @test ndims(d1s[!, 5]) == 1 @test ndims(typeof(d1s[!, 2])) == 1 - @test d1s[!, 4][[1,24]] == ["a", "b"] - @test d1s[!, 5][[1,24]] == [1, 4] + @test d1s[!, 4][[1, 24]] == ["a", "b"] + @test d1s[!, 5][[1, 24]] == [1, 4] @test_throws ArgumentError d1s[!, 4][true] @test_throws ArgumentError d1s[!, 5][true] @test_throws ArgumentError d1s[!, 4][1.0] @test_throws ArgumentError d1s[!, 5][1.0] d1ss = stack(d1, [:a, :b], view=true) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa 
DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol) - @test d1ss[!, 4][[1,24]] == [:a, :b] + @test d1ss[!, 4][[1, 24]] == [:a, :b] @test d1ss[!, 4] isa DataFrames.RepeatedVector # Those tests check indexing RepeatedVector/StackedVector by a vector @@ -307,7 +307,7 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a), view=true) + d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true) @test propertynames(d1m) == [:a, :variable, :value] d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true) @@ -329,13 +329,13 @@ end @test d1us3 == unstack(d1s2) # test unstack with exactly one key column that is not passed - df1 = stack(DataFrame(rand(10,10))) + df1 = stack(DataFrame(rand(10, 10))) df1[!, :id] = 1:100 @test size(unstack(df1, :variable, :value)) == (100, 11) @test unstack(df1, :variable, :value) ≅ unstack(df1) # test empty keycol - @test_throws ArgumentError unstack(stack(DataFrame(rand(3,2))), :variable, :value) + @test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value) end @testset "column names duplicates" begin @@ -494,7 +494,7 @@ end end @testset "test stack eltype" begin - df = DataFrame(rand(4,5)) + df = DataFrame(rand(4, 5)) sdf = stack(df) @test eltype(sdf.variable) === String @test eltype(typeof(sdf.variable)) === String @@ -507,4 +507,70 @@ end @test eltype(typeof(sdf2.value)) === Float64 end +@testset "permutedims" begin + df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2)) + + @test_throws MethodError transpose(df1) + @test_throws ArgumentError permutedims(df1, :bar) + + df1_pd = permutedims(df1, 1) + @test size(df1_pd, 1) == ncol(df1) - 1 + @test size(df1_pd, 2) == nrow(df1) + 
1 + @test names(df1_pd) == ["a", "x", "y"] + @test df1_pd == permutedims(df1, :a) == permutedims(df1, 1) + @test names(permutedims(df1, :a, :foo)) == ["foo", "x", "y"] + + orignames1 = names(df1)[2:end] + for (i, row) in enumerate(eachrow(df1_pd)) + @test Vector(row) == [orignames1[i]; df1[!, orignames1[i]]] + end + + # All columns should be promoted + @test eltype(df1_pd.x) == Float64 + @test eltype(df1_pd.y) == Float64 + + df2 = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2)) + + df2_pd = permutedims(df2, :a) + @test size(df2_pd, 1) == ncol(df2) - 1 + @test size(df2_pd, 2) == nrow(df2) + 1 + @test names(df2_pd) == ["a", "x", "y"] + + orignames2 = names(df2)[2:end] + for (i, row) in enumerate(eachrow(df2_pd)) + @test Vector(row) == [orignames2[i]; df2[!, orignames2[i]]] + end + @test Any == eltype(df2_pd.x) + @test Any == eltype(df2_pd.y) + + df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10)) + + d3pd_names = ["a", "x", ("x_$i" for i in 1:9)...] 
+ @test_throws ArgumentError permutedims(df3, 1) + @test names(permutedims(df3, 1, makeunique=true)) == d3pd_names + @test_throws ArgumentError permutedims(df3[!, [:a]], 1) # single column branch + @test names(permutedims(df3[!, [:a]], 1, makeunique=true)) == d3pd_names + + df4 = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing], + e=["x", "y"], f=[:x, :y], # valid src + g=[missing, "y"], h=Union{Missing, String}["x", "y"] # invalid src + ) + + @test permutedims(df4[!, [:a, :b, :c, :e]], :e) == + permutedims(df4[!, [:e, :a, :b, :c]], 1) == + permutedims(df4[!, [:a, :b, :c, :f]], :f, :e) + # Can permute single-column + @test permutedims(df4[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[]) + # Can't index float Column + @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]], 1) + @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1) + # Can't index columns that allow for missing + @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]], 1) + @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]], 1) + # Can't permute empty `df` ... + @test_throws BoundsError permutedims(DataFrame(), 1) + # ... but can permute zero-row df + @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"]) +end + end # module