From 1e5d04a26f688ec1b1fa2c8a4fe5d816f5fc793a Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Mon, 21 Sep 2020 14:46:22 -0400 Subject: [PATCH 01/25] naive transpose implementations --- src/abstractdataframe/reshape.jl | 17 +++++++++++++++++ test/reshape.jl | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 7db56ff20a..f241dd3b14 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -399,3 +399,20 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) res end + + +transpose1(df::AbstractDataFrame, indexcol=1) = unstack(stack(df,Not(indexcol)), :variable, indexcol,:value) + +function transpose2(df::AbstractDataFrame, indexcol=1) + m = permutedims(Matrix(df[!, Not(indexcol)])) + df2 = DataFrame(variable=names(df[!, Not(indexcol)])) + hcat(df2, DataFrame(m, df[!, indexcol])) +end + +function transpose3(df::AbstractDataFrame, indexcol=1) + df2 = DataFrame(variable=names(df[!, Not(indexcol)])) + for row in eachrow(df) + df2[!,row[indexcol]] = Vector(row[Not(indexcol)]) + end + return df2 +end \ No newline at end of file diff --git a/test/reshape.jl b/test/reshape.jl index 6e215e7e92..cc31fbff88 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -507,4 +507,8 @@ end @test eltype(typeof(sdf2.value)) === Float64 end +@testset "transpose" begin + # todo +end + end # module From 2255c725a53d7740bb7224550eb6d8fbd9c5be97 Mon Sep 17 00:00:00 2001 From: Kevin Bonham Date: Fri, 2 Oct 2020 10:47:50 -0400 Subject: [PATCH 02/25] add copycols Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/reshape.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index f241dd3b14..db4091cf47 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -406,7 +406,7 @@ 
transpose1(df::AbstractDataFrame, indexcol=1) = unstack(stack(df,Not(indexcol)), function transpose2(df::AbstractDataFrame, indexcol=1) m = permutedims(Matrix(df[!, Not(indexcol)])) df2 = DataFrame(variable=names(df[!, Not(indexcol)])) - hcat(df2, DataFrame(m, df[!, indexcol])) + hcat(df2, DataFrame(m, df[!, indexcol], copycols=false), copycols=false) end function transpose3(df::AbstractDataFrame, indexcol=1) @@ -415,4 +415,4 @@ function transpose3(df::AbstractDataFrame, indexcol=1) df2[!,row[indexcol]] = Vector(row[Not(indexcol)]) end return df2 -end \ No newline at end of file +end From f02eaae8e18591e9a9608e7011d62d9bfec8e312 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Fri, 2 Oct 2020 11:50:17 -0400 Subject: [PATCH 03/25] add dest_namecol arg --- src/abstractdataframe/reshape.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index db4091cf47..e906543e90 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -401,18 +401,20 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) end -transpose1(df::AbstractDataFrame, indexcol=1) = unstack(stack(df,Not(indexcol)), :variable, indexcol,:value) +function transpose1(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) + unstack(stack(df,Not(src_namescol), variable_name=dest_namescol), dest_namescol, src_namescol, :value) +end -function transpose2(df::AbstractDataFrame, indexcol=1) - m = permutedims(Matrix(df[!, Not(indexcol)])) - df2 = DataFrame(variable=names(df[!, Not(indexcol)])) - hcat(df2, DataFrame(m, df[!, indexcol], copycols=false), copycols=false) +function transpose2(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) + m = permutedims((Matrix(df[!, Not(src_namescol)]))) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) + hcat(df2, DataFrame(m, df[!, src_namescol]), copycols=false) end -function 
transpose3(df::AbstractDataFrame, indexcol=1) - df2 = DataFrame(variable=names(df[!, Not(indexcol)])) +function transpose3(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) for row in eachrow(df) - df2[!,row[indexcol]] = Vector(row[Not(indexcol)]) + df2[!,row[src_namescol]] = Vector(row[Not(src_namescol)]) end return df2 end From 97ae507758266960813465cd3fe2f3256ba1040e Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Fri, 2 Oct 2020 12:29:30 -0400 Subject: [PATCH 04/25] check comprehensions --- src/abstractdataframe/reshape.jl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index e906543e90..e6ac4f822f 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -405,10 +405,16 @@ function transpose1(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variab unstack(stack(df,Not(src_namescol), variable_name=dest_namescol), dest_namescol, src_namescol, :value) end -function transpose2(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) - m = permutedims((Matrix(df[!, Not(src_namescol)]))) +function transpose2(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols=false) + m = permutedims(Matrix(df[!, Not(src_namescol)])) df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - hcat(df2, DataFrame(m, df[!, src_namescol]), copycols=false) + hcat(df2, DataFrame(m, df[!, src_namescol]), copycols=copycols) +end + +function transpose2_comprehension(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols=false) + m = permutedims(Matrix(df[!, Not(src_namescol)])) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) + hcat(df2, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol]), copycols=copycols) end function transpose3(df::AbstractDataFrame, 
src_namescol=1, dest_namescol=:variable) @@ -418,3 +424,11 @@ function transpose3(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variab end return df2 end + +function transpose3_comprehension(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) + for row in eachrow(df) + df2[!,row[src_namescol]] = [x for x in row[Not(src_namescol)]] + end + return df2 +end From 33459e57fad0ce62166bff4858801bf7eff184d3 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Fri, 2 Oct 2020 14:43:18 -0400 Subject: [PATCH 05/25] fix type signature --- src/abstractdataframe/reshape.jl | 41 +++++++++++--------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index e6ac4f822f..6eaf0ce5bf 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -401,34 +401,21 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) end -function transpose1(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) - unstack(stack(df,Not(src_namescol), variable_name=dest_namescol), dest_namescol, src_namescol, :value) -end - -function transpose2(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols=false) - m = permutedims(Matrix(df[!, Not(src_namescol)])) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - hcat(df2, DataFrame(m, df[!, src_namescol]), copycols=copycols) -end - -function transpose2_comprehension(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols=false) - m = permutedims(Matrix(df[!, Not(src_namescol)])) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - hcat(df2, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol]), copycols=copycols) -end +""" + transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols::Bool=false, makeunique=false, 
promote_type::Bool=true) -function transpose3(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - for row in eachrow(df) - df2[!,row[src_namescol]] = Vector(row[Not(src_namescol)]) - end - return df2 -end +Transpose a `DataFrame`, such that rows become columns, +and the column indexed by `src_namescol` becomes a header. -function transpose3_comprehension(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - for row in eachrow(df) - df2[!,row[src_namescol]] = [x for x in row[Not(src_namescol)]] +""" +function Base.transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) + if promote_type + m = permutedims(Matrix(df[!, Not(src_namescol)])) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) + return hcat(df2, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=copycols) + else + m = permutedims(Matrix{Any}(df[!, Not(src_namescol)])) + df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) + hcat(df2, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol], makeunique=makeunique), copycols=copycols) end - return df2 end From 18be9f3501a61040382141af7ea853cda7810570 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Fri, 2 Oct 2020 15:07:12 -0400 Subject: [PATCH 06/25] add some docs --- src/abstractdataframe/reshape.jl | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 6eaf0ce5bf..98b2d4d516 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -402,11 +402,28 @@ end """ - transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=:variable; copycols::Bool=false, makeunique=false, 
promote_type::Bool=true) + transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) Transpose a `DataFrame`, such that rows become columns, and the column indexed by `src_namescol` becomes a header. +By default, the type of resulting columns will depend on the promoted type of all input columns. +Pass `promote_type=false` to make resulting column types depend on the elements of the originating rows, +though note that this may be substantially slower. + + +# Examples + +```julia +df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) # all types can be promoted to Float64 +df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)) + +transpose(df1) +transpose(df1, promote_type=false) + +transpose(df2) +transpose(df2, promote_type=false) +```` """ function Base.transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) if promote_type From fade33292ff3315b06e34528f0c6ab60fd9ae6ff Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Fri, 2 Oct 2020 15:41:36 -0400 Subject: [PATCH 07/25] add tests --- test/reshape.jl | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/test/reshape.jl b/test/reshape.jl index cc31fbff88..bc296e6efa 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -508,7 +508,44 @@ end end @testset "transpose" begin - # todo + df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) + df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) + df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) + + df1_t = transpose(df1) + @test size(df1_t,1) == ncol(df1) - 1 + @test size(df1_t, 2) == nrow(df1) + 1 + @test names(df1_t) == ["a", "x", "y"] + @test eltype(df1_t.x) <: AbstractFloat + @test eltype(df1_t.y) <: AbstractFloat + + df1_tp = 
transpose(df1, promote_type=false) + @test size(df1_tp,1) == ncol(df1) - 1 + @test size(df1_tp, 2) == nrow(df1) + 1 + @test names(df1_tp) == ["a", "x", "y"] + @test Bool <: eltype(df1_tp.x) + @test Int <: eltype(df1_tp.x) + @test AbstractFloat <: eltype(df1_tp.x) + + df2_t = transpose(df2) + @test size(df2_t,1) == ncol(df2) - 1 + @test size(df2_t, 2) == nrow(df2) + 1 + @test names(df2_t) == ["a", "x", "y"] + @test Any <: eltype(df2_t.x) + @test Any <: eltype(df2_t.y) + + + df2_tp = transpose(df2, promote_type=false) + @test size(df2_tp,1) == ncol(df2) - 1 + @test size(df2_tp, 2) == nrow(df2) + 1 + @test names(df2_tp) == ["a", "x", "y"] + @test Bool <: eltype(df2_tp.x) + @test Int <: eltype(df2_tp.x) + @test AbstractFloat <: eltype(df2_tp.x) + @test Any <: eltype(df2_tp.y) + + @test_throws ArgumentError transpose(df3) + @test names(transpose(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] end end # module From 4d915923b5ab365b45fbf80483bcaa780b630aa9 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Mon, 5 Oct 2020 18:08:22 -0400 Subject: [PATCH 08/25] switch to permute dims, update args --- src/abstractdataframe/reshape.jl | 35 ++++++++----- test/reshape.jl | 86 +++++++++++++++++++------------- 2 files changed, 73 insertions(+), 48 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 98b2d4d516..889793619d 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -401,8 +401,10 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) end +Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. 
Try `permutedims` instead") + """ - transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) + permutedims(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) Transpose a `DataFrame`, such that rows become columns, and the column indexed by `src_namescol` becomes a header. @@ -418,21 +420,28 @@ though note that this may be substantially slower. df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) # all types can be promoted to Float64 df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)) -transpose(df1) -transpose(df1, promote_type=false) +permutedims(df1) +permutedims(df1, promote_type=false) -transpose(df2) -transpose(df2, promote_type=false) +permutedims(df2) +permutedims(df2, promote_type=false) ```` """ -function Base.transpose(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) - if promote_type - m = permutedims(Matrix(df[!, Not(src_namescol)])) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - return hcat(df2, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=copycols) +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1, + dest_namescol::Union{Symbol, AbstractString}=src_namescol isa Integer ? 
+ _names(df)[src_namescol] : + src_namescol; + makeunique::Bool=false, promote::Symbol=:all) + + df_notsrc = df[!, Not(src_namescol)] + df_permuted = DataFrame([names(df_notsrc)], [dest_namescol]) + if promote == :all || ((promote == :none ) && (all(col-> eltype(col) == eltype(first(eachcol(df_notsrc))), eachcol(df_notsrc)))) + m = permutedims(Matrix(df_notsrc)) + hcat!(df_permuted, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=false) + elseif promote == :none + m = permutedims(Matrix{Any}(df_notsrc)) + hcat!(df_permuted, DataFrame(collect.(eachcol(m)), df[!, src_namescol], makeunique=make_unique), copycols=false) else - m = permutedims(Matrix{Any}(df[!, Not(src_namescol)])) - df2 = DataFrame([names(df[!, Not(src_namescol)])], [dest_namescol]) - hcat(df2, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol], makeunique=makeunique), copycols=copycols) + throw(ArgumentError("Value '$promote' for `promote` not supported")) end end diff --git a/test/reshape.jl b/test/reshape.jl index bc296e6efa..c612be740f 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -507,45 +507,61 @@ end @test eltype(typeof(sdf2.value)) === Float64 end -@testset "transpose" begin +@testset "permute dims" begin df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) + df4 = DataFrame(a=rand(2), b=rand(2), c=rand(2), d=["x", "y"], e=[:x, :y], f=[missing, "y"], g=[1,2]) + + @test_throws MethodError transpose(df1) + @test_throws ArgumentError permutedims(df1, promote=:foo) + + df1_pd = permutedims(df1) + @test df1_pd + @test size(df1_pd, 1) == ncol(df1) - 1 + @test size(df1_pd, 2) == nrow(df1) + 1 + @test names(df1_pd) == ["a", "x", "y"] + @test eltype(df1_pd.x) <: AbstractFloat + @test eltype(df1_pd.y) <: AbstractFloat + + df1_pdn = permutedims(df1, promote=:none) + @test size(df1_pdn, 1) == 
ncol(df1) - 1 + @test size(df1_pdn, 2) == nrow(df1) + 1 + @test names(df1_pdn) == ["a", "x", "y"] + @test Bool <: eltype(df1_pdn.x) + @test Int <: eltype(df1_pdn.x) + @test AbstractFloat <: eltype(df1_pdn.x) + + df2_pd = permutedims(df2) + @test size(df2_pd, 1) == ncol(df2) - 1 + @test size(df2_pd, 2) == nrow(df2) + 1 + @test names(df2_pd) == ["a", "x", "y"] + @test Any <: eltype(df2_pd.x) + @test Any <: eltype(df2_pd.y) + + + df2_pdn = permutedims(df2, promote=:none) + @test size(df2_pdn, 1) == ncol(df2) - 1 + @test size(df2_pdn, 2) == nrow(df2) + 1 + @test names(df2_pdn) == ["a", "x", "y"] + @test Bool <: eltype(df2_pdn.x) + @test Int <: eltype(df2_pdn.x) + @test AbstractFloat <: eltype(df2_pdn.x) + @test Any <: eltype(df2_pdn.y) + + @test_throws ArgumentError permutedims(df3) + @test names(permutedims(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] + + #= + Needs other tests, TODO: https://github.com/JuliaData/DataFrames.jl/pull/2447#discussion_r499123081 + + please check: + + 1. DataFrame() + 2. DataFrame(a=Int[], b=Float64[]) + 3. and cases that should error (e.g. missing column, column with new names that is not Symbol or string etc.) 
+ =# - df1_t = transpose(df1) - @test size(df1_t,1) == ncol(df1) - 1 - @test size(df1_t, 2) == nrow(df1) + 1 - @test names(df1_t) == ["a", "x", "y"] - @test eltype(df1_t.x) <: AbstractFloat - @test eltype(df1_t.y) <: AbstractFloat - - df1_tp = transpose(df1, promote_type=false) - @test size(df1_tp,1) == ncol(df1) - 1 - @test size(df1_tp, 2) == nrow(df1) + 1 - @test names(df1_tp) == ["a", "x", "y"] - @test Bool <: eltype(df1_tp.x) - @test Int <: eltype(df1_tp.x) - @test AbstractFloat <: eltype(df1_tp.x) - - df2_t = transpose(df2) - @test size(df2_t,1) == ncol(df2) - 1 - @test size(df2_t, 2) == nrow(df2) + 1 - @test names(df2_t) == ["a", "x", "y"] - @test Any <: eltype(df2_t.x) - @test Any <: eltype(df2_t.y) - - - df2_tp = transpose(df2, promote_type=false) - @test size(df2_tp,1) == ncol(df2) - 1 - @test size(df2_tp, 2) == nrow(df2) + 1 - @test names(df2_tp) == ["a", "x", "y"] - @test Bool <: eltype(df2_tp.x) - @test Int <: eltype(df2_tp.x) - @test AbstractFloat <: eltype(df2_tp.x) - @test Any <: eltype(df2_tp.y) - - @test_throws ArgumentError transpose(df3) - @test names(transpose(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] 
end end # module From 302360bdcd854622ef013adc6cf070ab5ffc5e26 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Mon, 5 Oct 2020 19:11:10 -0400 Subject: [PATCH 09/25] add some tests --- src/abstractdataframe/reshape.jl | 4 +++- test/reshape.jl | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 889793619d..a7dce0654d 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -433,6 +433,8 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1, src_namescol; makeunique::Bool=false, promote::Symbol=:all) + nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for tables with 0 rows")) + df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame([names(df_notsrc)], [dest_namescol]) if promote == :all || ((promote == :none ) && (all(col-> eltype(col) == eltype(first(eachcol(df_notsrc))), eachcol(df_notsrc)))) @@ -440,7 +442,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1, hcat!(df_permuted, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=false) elseif promote == :none m = permutedims(Matrix{Any}(df_notsrc)) - hcat!(df_permuted, DataFrame(collect.(eachcol(m)), df[!, src_namescol], makeunique=make_unique), copycols=false) + hcat!(df_permuted, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol], makeunique=makeunique), copycols=false) else throw(ArgumentError("Value '$promote' for `promote` not supported")) end diff --git a/test/reshape.jl b/test/reshape.jl index c612be740f..aa2bc3d029 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -511,7 +511,7 @@ end df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) - df4 = DataFrame(a=rand(2), b=rand(2), c=rand(2), d=["x", "y"], 
e=[:x, :y], f=[missing, "y"], g=[1,2]) + df4 = DataFrame(a=rand(2), b=rand(2), c=rand(2), d=["x", "y"], e=[:x, :y], f=[missing, "y"], g=[1,2], h=[1., missing]) @test_throws MethodError transpose(df1) @test_throws ArgumentError permutedims(df1, promote=:foo) @@ -561,7 +561,9 @@ end 2. DataFrame(a=Int[], b=Float64[]) 3. and cases that should error (e.g. missing column, column with new names that is not Symbol or string etc.) =# - + @test_throws MethodError permutedims(df4[!, Not([:d, :e, :f, :g, :h])]) + @test permutedims(df4[!, Not([:e, :f, :g, :h])], :d) == permutedims(df4[!, Not([:e, :f, :g, :h])], :d, promote=:none) + @test_throws ArgumentError permutedims(DataFrame()) end end # module From 8fe80987b508379918f1a550f318704ff20ce876 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Tue, 6 Oct 2020 12:22:24 -0400 Subject: [PATCH 10/25] address part of nalimilan's review --- src/abstractdataframe/reshape.jl | 15 +++++++++------ test/reshape.jl | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index a7dce0654d..066906a5af 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -428,16 +428,19 @@ permutedims(df2, promote_type=false) ```` """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1, - dest_namescol::Union{Symbol, AbstractString}=src_namescol isa Integer ? - _names(df)[src_namescol] : - src_namescol; - makeunique::Bool=false, promote::Symbol=:all) + dest_namescol::Union{Symbol, AbstractString}=src_namescol isa Integer ? 
+ _names(df)[src_namescol] : + src_namescol; + makeunique::Bool=false, promote::Symbol=:all) + + nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for data frame with 0 rows")) - nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for tables with 0 rows")) df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame([names(df_notsrc)], [dest_namescol]) - if promote == :all || ((promote == :none ) && (all(col-> eltype(col) == eltype(first(eachcol(df_notsrc))), eachcol(df_notsrc)))) + + an_eltype = eltype(first(eachcol(df_notsrc))) + if promote == :all || ((promote == :none ) && (all(col-> eltype(col) == an_eltype, eachcol(df_notsrc)))) m = permutedims(Matrix(df_notsrc)) hcat!(df_permuted, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=false) elseif promote == :none diff --git a/test/reshape.jl b/test/reshape.jl index aa2bc3d029..7994ed54c6 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -507,7 +507,7 @@ end @test eltype(typeof(sdf2.value)) === Float64 end -@testset "permute dims" begin +@testset "permutedims" begin df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) @@ -515,12 +515,19 @@ end @test_throws MethodError transpose(df1) @test_throws ArgumentError permutedims(df1, promote=:foo) + @test_throws ArgumentError permutedims(df1, :bar) df1_pd = permutedims(df1) @test df1_pd @test size(df1_pd, 1) == ncol(df1) - 1 @test size(df1_pd, 2) == nrow(df1) + 1 @test names(df1_pd) == ["a", "x", "y"] + + orignames = names(df1)[2:end] + for (i, row) in ennumerate(eachrow(df1_pd)) + @test Vector(df[i, :]) == [orignames[i]; df1[!, orignames[i]]] + end + @test eltype(df1_pd.x) <: AbstractFloat @test eltype(df1_pd.y) <: AbstractFloat @@ -539,7 +546,6 @@ end @test Any <: eltype(df2_pd.x) @test Any <: eltype(df2_pd.y) - df2_pdn = permutedims(df2, promote=:none) @test 
size(df2_pdn, 1) == ncol(df2) - 1 @test size(df2_pdn, 2) == nrow(df2) + 1 @@ -558,12 +564,17 @@ end please check: 1. DataFrame() - 2. DataFrame(a=Int[], b=Float64[]) - 3. and cases that should error (e.g. missing column, column with new names that is not Symbol or string etc.) =# + + # Can't index Float Column @test_throws MethodError permutedims(df4[!, Not([:d, :e, :f, :g, :h])]) + # promote=:all and =:none are the same if all columns are same eltype @test permutedims(df4[!, Not([:e, :f, :g, :h])], :d) == permutedims(df4[!, Not([:e, :f, :g, :h])], :d, promote=:none) - @test_throws ArgumentError permutedims(DataFrame()) + + # can't permute dfs with 0 rows + @test_throws ArgumentError permutedims(DataFrame()) # not working + @test_throws ArgumentError permutedims(DataFrame(a=Int[], b=Float64[])) + end end # module From 4c6a2e72ec094e516101a3d274a647079bf4dfa3 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Tue, 6 Oct 2020 14:33:17 -0400 Subject: [PATCH 11/25] fix and add tests --- src/abstractdataframe/reshape.jl | 73 ++++++++++++++++++++++---------- test/reshape.jl | 64 +++++++++++----------------- 2 files changed, 75 insertions(+), 62 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 066906a5af..4011cea4d8 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -401,23 +401,44 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector) end -Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. Try `permutedims` instead") +Base.transpose(::AbstractDataFrame, args...; kwargs...) = + MethodError("`transpose` not defined for `AbstractDataFrame`s. Try `permutedims` instead") + # ↑ is 94 char, is that OK? 
(there are a couple other lines in this file that go over) """ - permutedims(df::AbstractDataFrame, src_namescol=1, dest_namescol=names(df)[src_namescol]; copycols::Bool=false, makeunique=false, promote_type::Bool=true) + permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, + dest_namescol::Union{Symbol,Int}; + makeunique::Bool=false) + permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) + permutedims(df::AbstractDataFrame; makeunique::Bool=false) -Transpose a `DataFrame`, such that rows become columns, +Turn a `DataFrame` on its side such that rows become columns and the column indexed by `src_namescol` becomes a header. +In the resulting `DataFrame`, +The header of `df` will become the first column +with name specified by `dest_namescol` -By default, the type of resulting columns will depend on the promoted type of all input columns. -Pass `promote_type=false` to make resulting column types depend on the elements of the originating rows, -though note that this may be substantially slower. - +# Arguments +- `df` : the `AbstractDataFrame` +- `src_namescol` : the column that will become the new header. + Defaults to first column. +- `dest_namescol` : the name of the first column in the returned `DataFrame`. + Defaults to the same name as `src_namescol`. +- `makeunique` : if `false` (the default), an error will be raised + if duplicate names are found; if `true`, duplicate names will be suffixed + with `_i` (`i` starting at 1 for the first duplicate). + +Note: The eltypes of columns in resulting `DataFrame` will depend +on the eltypes of _all_ input columns. +That is, if the source `DataFrame` contains `String` and `Int` columns, +all resulting columns will have eltype `Any`. +If the source has a mix of numeric types (eg. `Float64` and `Int`), +all columns in resulting `DataFrame` will be promoted to `Float64`. 
# Examples ```julia -df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) # all types can be promoted to Float64 +df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)) permutedims(df1) @@ -427,26 +448,34 @@ permutedims(df2) permutedims(df2, promote_type=false) ```` """ -function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1, - dest_namescol::Union{Symbol, AbstractString}=src_namescol isa Integer ? - _names(df)[src_namescol] : - src_namescol; - makeunique::Bool=false, promote::Symbol=:all) +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, + dest_namescol::Union{Symbol, AbstractString}; + makeunique::Bool=false) nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for data frame with 0 rows")) - + eltype(df[!, src_namescol]) <: SymbolOrString || throw( + ArgumentError("src_namescol must have eltype `Symbol` or `AbstractString`")) df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame([names(df_notsrc)], [dest_namescol]) - an_eltype = eltype(first(eachcol(df_notsrc))) - if promote == :all || ((promote == :none ) && (all(col-> eltype(col) == an_eltype, eachcol(df_notsrc)))) - m = permutedims(Matrix(df_notsrc)) - hcat!(df_permuted, DataFrame(m, df[!, src_namescol], makeunique=makeunique), copycols=false) - elseif promote == :none - m = permutedims(Matrix{Any}(df_notsrc)) - hcat!(df_permuted, DataFrame([[x for x in col] for col in eachcol(m)], df[!, src_namescol], makeunique=makeunique), copycols=false) + m = permutedims(Matrix(df_notsrc)) + df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) + hcat!(df_permuted, df_tmp, copycols=false) +end + +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; + makeunique::Bool=false) + nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for data frame with 0 rows")) + if src_namescol isa Integer + 1 <= 
src_namescol <= ncol(df) || throw(ArgumentError("`src_namescol` doesn't exist")) + dest_namescol = _names(df)[src_namescol] else - throw(ArgumentError("Value '$promote' for `promote` not supported")) + dest_namescol = src_namescol end + permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) +end + +function Base.permutedims(df::AbstractDataFrame; makeunique::Bool=false) + permutedims(df, 1; makeunique=makeunique) end diff --git a/test/reshape.jl b/test/reshape.jl index 7994ed54c6..b367935eac 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -511,70 +511,54 @@ end df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) - df4 = DataFrame(a=rand(2), b=rand(2), c=rand(2), d=["x", "y"], e=[:x, :y], f=[missing, "y"], g=[1,2], h=[1., missing]) + df4 = DataFrame(a=rand(2), b=rand(2), c=[1,2], d=[1., missing], + e=["x", "y"], f=[:x, :y], # valid src + g=[missing, "y"], h=Union{Missing,String}["x","y"]) @test_throws MethodError transpose(df1) - @test_throws ArgumentError permutedims(df1, promote=:foo) @test_throws ArgumentError permutedims(df1, :bar) df1_pd = permutedims(df1) - @test df1_pd @test size(df1_pd, 1) == ncol(df1) - 1 @test size(df1_pd, 2) == nrow(df1) + 1 @test names(df1_pd) == ["a", "x", "y"] + @test df1_pd == permutedims(df1, :a) == permutedims(df1, 1) + @test names(permutedims(df1, :a, :foo)) == ["foo", "x", "y"] - orignames = names(df1)[2:end] - for (i, row) in ennumerate(eachrow(df1_pd)) - @test Vector(df[i, :]) == [orignames[i]; df1[!, orignames[i]]] + orignames1 = names(df1)[2:end] + for (i, row) in enumerate(eachrow(df1_pd)) + @test Vector(row) == [orignames1[i]; df1[!, orignames1[i]]] end - @test eltype(df1_pd.x) <: AbstractFloat - @test eltype(df1_pd.y) <: AbstractFloat - - df1_pdn = permutedims(df1, promote=:none) - @test size(df1_pdn, 1) == ncol(df1) - 1 - @test size(df1_pdn, 2) 
== nrow(df1) + 1 - @test names(df1_pdn) == ["a", "x", "y"] - @test Bool <: eltype(df1_pdn.x) - @test Int <: eltype(df1_pdn.x) - @test AbstractFloat <: eltype(df1_pdn.x) + @test eltype(df1_pd.x) <: Float64 + @test eltype(df1_pd.y) <: Float64 df2_pd = permutedims(df2) @test size(df2_pd, 1) == ncol(df2) - 1 @test size(df2_pd, 2) == nrow(df2) + 1 @test names(df2_pd) == ["a", "x", "y"] + + orignames2 = names(df2)[2:end] + for (i, row) in enumerate(eachrow(df2_pd)) + @test Vector(row) == [orignames2[i]; df2[!, orignames2[i]]] + end @test Any <: eltype(df2_pd.x) @test Any <: eltype(df2_pd.y) - df2_pdn = permutedims(df2, promote=:none) - @test size(df2_pdn, 1) == ncol(df2) - 1 - @test size(df2_pdn, 2) == nrow(df2) + 1 - @test names(df2_pdn) == ["a", "x", "y"] - @test Bool <: eltype(df2_pdn.x) - @test Int <: eltype(df2_pdn.x) - @test AbstractFloat <: eltype(df2_pdn.x) - @test Any <: eltype(df2_pdn.y) - @test_throws ArgumentError permutedims(df3) @test names(permutedims(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] - #= - Needs other tests, TODO: https://github.com/JuliaData/DataFrames.jl/pull/2447#discussion_r499123081 - - please check: - - 1. 
DataFrame() - =# - + @test permutedims(df4[!, [:a,:b,:c,:e]], :e) == + permutedims(df4[!, [:e,:a,:b,:c]]) == + permutedims(df4[!, [:a,:b,:c,:f]], :f, :e) # Can't index Float Column - @test_throws MethodError permutedims(df4[!, Not([:d, :e, :f, :g, :h])]) - # promote=:all and =:none are the same if all columns are same eltype - @test permutedims(df4[!, Not([:e, :f, :g, :h])], :d) == permutedims(df4[!, Not([:e, :f, :g, :h])], :d, promote=:none) - + @test_throws ArgumentError permutedims(df4[!, [:a,:b,:c]]) + # Can't index in the presence of missing + @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]]) + @test_throws ArgumentError permutedims(df4[!, [:h,:a,:b]]) # can't permute dfs with 0 rows - @test_throws ArgumentError permutedims(DataFrame()) # not working - @test_throws ArgumentError permutedims(DataFrame(a=Int[], b=Float64[])) - + @test_throws ArgumentError permutedims(DataFrame()) + @test_throws ArgumentError permutedims(DataFrame(a=String[], b=Float64[])) end end # module From a5806239efcf881cb33715bb000cf84efc742c18 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Tue, 6 Oct 2020 14:40:07 -0400 Subject: [PATCH 12/25] add doctests --- src/abstractdataframe/reshape.jl | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 4011cea4d8..1da857b008 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -437,15 +437,28 @@ all columns in resulting `DataFrame` will be promoted to `Float64`. 
# Examples -```julia -df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) -df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)) - -permutedims(df1) -permutedims(df1, promote_type=false) - -permutedims(df2) -permutedims(df2, promote_type=false) +```jldoctest +julia> df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)); + +julia> df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)); + +julia> permutedims(df1) # note the column type +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Float64 │ Float64 │ +├─────┼────────┼──────────┼──────────┤ +│ 1 │ b │ 0.982197 │ 0.263357 │ +│ 2 │ c │ 1.0 │ 2.0 │ +│ 3 │ d │ 0.0 │ 1.0 │ + +julia> permutedims(df2) +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Any │ Any │ +├─────┼────────┼─────┼─────┤ +│ 1 │ b │ 1 │ str │ +│ 2 │ c │ 1 │ 2 │ +│ 3 │ d │ 0 │ 0 │ ```` """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, From ea627b1ee5302293b906bb015b46c36021e0a832 Mon Sep 17 00:00:00 2001 From: Kevin Bonham Date: Tue, 6 Oct 2020 16:50:32 -0400 Subject: [PATCH 13/25] bkamins review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bogumił Kamiński --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 1da857b008..d6e858ffa6 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -407,7 +407,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) 
= """ permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, - dest_namescol::Union{Symbol,Int}; + dest_namescol::Union{Symbol,AbstractString}; makeunique::Bool=false) permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) permutedims(df::AbstractDataFrame; makeunique::Bool=false) From 6cb1772968a8396abda6a24cadbb89b70d07ac93 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Tue, 6 Oct 2020 18:42:05 -0400 Subject: [PATCH 14/25] most of bkamins review --- src/abstractdataframe/reshape.jl | 75 +++++++++++++++++++------------- test/reshape.jl | 64 ++++++++++++++------------- 2 files changed, 78 insertions(+), 61 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index d6e858ffa6..c6d3b80ddb 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -403,7 +403,6 @@ end Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. Try `permutedims` instead") - # ↑ is 94 char, is that OK? (there are a couple other lines in this file that go over) """ permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, @@ -412,15 +411,16 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) permutedims(df::AbstractDataFrame; makeunique::Bool=false) -Turn a `DataFrame` on its side such that rows become columns -and the column indexed by `src_namescol` becomes a header. +Turn `df` on its side such that rows become columns +and the column indexed by `src_namescol` becomes the names of new columns. In the resulting `DataFrame`, The header of `df` will become the first column -with name specified by `dest_namescol` +with name specified by `dest_namescol`. # Arguments - `df` : the `AbstractDataFrame` - `src_namescol` : the column that will become the new header. + This column eltype must be `<: Union{String, Symbol}`. 
Defaults to first column. - `dest_namescol` : the name of the first column in the returned `DataFrame`. Defaults to the same name as `src_namescol`. @@ -428,67 +428,80 @@ with name specified by `dest_namescol` if duplicate names are found; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). -Note: The eltypes of columns in resulting `DataFrame` will depend -on the eltypes of _all_ input columns. +Note: The eltypes of columns in resulting `DataFrame` +(other than the first column, which always has eltype `String`) +will depend on the eltypes of _all_ input columns +based on the results of `promote_type`. That is, if the source `DataFrame` contains `String` and `Int` columns, -all resulting columns will have eltype `Any`. +resulting columns will have eltype `Any`. If the source has a mix of numeric types (eg. `Float64` and `Int`), -all columns in resulting `DataFrame` will be promoted to `Float64`.
# Examples ```jldoctest -julia> df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)); - -julia> df2 = DataFrame(a=["x", "y"], b=[1, "str"], c=[1,2], d=rand(Bool,2)); - -julia> permutedims(df1) # note the column type +julia> df1 = DataFrame(a=["x", "y"], b=[1.,2.], c=[3,4], d=[true,false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Float64 │ Int64 │ Bool │ +├─────┼────────┼─────────┼───────┼──────┤ +│ 1 │ x │ 1.0 │ 3 │ 1 │ +│ 2 │ y │ 2.0 │ 4 │ 0 │ + +julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3,4], d=[true,false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Any │ Int64 │ Bool │ +├─────┼────────┼─────┼───────┼──────┤ +│ 1 │ x │ 1 │ 3 │ 1 │ +│ 2 │ y │ two │ 4 │ 0 │ + +julia> permutedims(df1) # note the column types 3×3 DataFrame -│ Row │ a │ x │ y │ -│ │ String │ Float64 │ Float64 │ -├─────┼────────┼──────────┼──────────┤ -│ 1 │ b │ 0.982197 │ 0.263357 │ -│ 2 │ c │ 1.0 │ 2.0 │ -│ 3 │ d │ 0.0 │ 1.0 │ +│ Row │ a │ x │ y │ +│ │ String │ Float64 │ Float64 │ +├─────┼────────┼─────────┼─────────┤ +│ 1 │ b │ 1.0 │ 2.0 │ +│ 2 │ c │ 3.0 │ 4.0 │ +│ 3 │ d │ 1.0 │ 0.0 │ julia> permutedims(df2) 3×3 DataFrame │ Row │ a │ x │ y │ │ │ String │ Any │ Any │ ├─────┼────────┼─────┼─────┤ -│ 1 │ b │ 1 │ str │ -│ 2 │ c │ 1 │ 2 │ -│ 3 │ d │ 0 │ 0 │ -```` +│ 1 │ b │ 1 │ two │ +│ 2 │ c │ 3 │ 4 │ +│ 3 │ d │ 1 │ 0 │ +``` """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; makeunique::Bool=false) - nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for data frame with 0 rows")) + nrow(df) > 0 || throw( + ArgumentError("`permutedims` not defined for data frame with 0 rows")) eltype(df[!, src_namescol]) <: SymbolOrString || throw( - ArgumentError("src_namescol must have eltype `Symbol` or `AbstractString`")) + ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) df_notsrc = df[!, Not(src_namescol)] - df_permuted = DataFrame([names(df_notsrc)], 
[dest_namescol]) + df_permuted = DataFrame(dest_namescol => names(df_notsrc)) m = permutedims(Matrix(df_notsrc)) df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) - hcat!(df_permuted, df_tmp, copycols=false) + return hcat!(df_permuted, df_tmp, copycols=false) end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) - nrow(df) > 0 || throw(ArgumentError("`permutedims` not defined for data frame with 0 rows")) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(ArgumentError("`src_namescol` doesn't exist")) dest_namescol = _names(df)[src_namescol] else dest_namescol = src_namescol end - permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) + return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) end -function Base.permutedims(df::AbstractDataFrame; makeunique::Bool=false) +Base.permutedims(df::AbstractDataFrame; makeunique::Bool=false) = permutedims(df, 1; makeunique=makeunique) -end diff --git a/test/reshape.jl b/test/reshape.jl index b367935eac..2803ed84d7 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -25,10 +25,10 @@ const ≅ = isequal # first column stays as CategoricalArray in df3 @test df3 == df4 #Make sure unstack works with missing values at the start of the value column - df[1,:Value] = missing + df[1, :Value] = missing df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result - df4[1,:Mass] = missing + df4[1, :Mass] = missing @test df2 ≅ df4 df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]), @@ -62,11 +62,11 @@ const ≅ = isequal @test df3 == df4 #Make sure unstack works with missing values at the start of the value column allowmissing!(df, :Value) - df[1,:Value] = missing + df[1, :Value] = missing df2 = unstack(df, :Fish, :Key, :Value) #This changes the expected result allowmissing!(df4, :Mass) - df4[2,:Mass] = missing + df4[2, :Mass] = missing @test df2 
≅ df4 df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], @@ -89,9 +89,9 @@ const ≅ = isequal @test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol) # test missing value in grouping variable - mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4) - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] + mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4) + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] + @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @@ -158,7 +158,7 @@ end b = unstack(df, :variable, :value) @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4]) - df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1]) + df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1]) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value) end @@ -225,14 +225,14 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a)) + d1m = stack(d1[:, [1, 3, 4]], Not(:a)) @test propertynames(d1m) == [:a, :variable, :value] # Test naming of measure/value columns d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval) @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval] - d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval) + d1m_named = 
stack(d1[:, [1, 3, 4]], Not(:a), variable_name=:letter, value_name=:someval) @test propertynames(d1m_named) == [:a, :letter, :someval] # test empty measures or ids @@ -270,21 +270,21 @@ end @test d1s[!, 5] isa DataFrames.StackedVector @test ndims(d1s[!, 5]) == 1 @test ndims(typeof(d1s[!, 2])) == 1 - @test d1s[!, 4][[1,24]] == ["a", "b"] - @test d1s[!, 5][[1,24]] == [1, 4] + @test d1s[!, 4][[1, 24]] == ["a", "b"] + @test d1s[!, 5][[1, 24]] == [1, 4] @test_throws ArgumentError d1s[!, 4][true] @test_throws ArgumentError d1s[!, 5][true] @test_throws ArgumentError d1s[!, 4][1.0] @test_throws ArgumentError d1s[!, 5][1.0] d1ss = stack(d1, [:a, :b], view=true) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol) - @test d1ss[!, 4][[1,24]] == [:a, :b] + @test d1ss[!, 4][[1, 24]] == [:a, :b] @test d1ss[!, 4] isa DataFrames.RepeatedVector # Those tests check indexing RepeatedVector/StackedVector by a vector @@ -307,7 +307,7 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a), view=true) + d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true) @test propertynames(d1m) == [:a, :variable, :value] d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true) @@ -329,13 +329,13 @@ end @test d1us3 == unstack(d1s2) # test unstack with exactly one key column that is not passed - df1 = stack(DataFrame(rand(10,10))) + df1 = stack(DataFrame(rand(10, 10))) df1[!, :id] = 1:100 @test size(unstack(df1, :variable, :value)) == (100, 11) @test unstack(df1, :variable, :value) ≅ unstack(df1) # test empty keycol - @test_throws ArgumentError 
unstack(stack(DataFrame(rand(3,2))), :variable, :value) + @test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value) end @testset "column names duplicates" begin @@ -494,7 +494,7 @@ end end @testset "test stack eltype" begin - df = DataFrame(rand(4,5)) + df = DataFrame(rand(4, 5)) sdf = stack(df) @test eltype(sdf.variable) === String @test eltype(typeof(sdf.variable)) === String @@ -508,12 +508,7 @@ end end @testset "permutedims" begin - df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1,2], d=rand(Bool,2)) - df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1,2], d=rand(Bool,2)) - df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool,10)) - df4 = DataFrame(a=rand(2), b=rand(2), c=[1,2], d=[1., missing], - e=["x", "y"], f=[:x, :y], # valid src - g=[missing, "y"], h=Union{Missing,String}["x","y"]) + df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2)) @test_throws MethodError transpose(df1) @test_throws ArgumentError permutedims(df1, :bar) @@ -533,6 +528,8 @@ end @test eltype(df1_pd.x) <: Float64 @test eltype(df1_pd.y) <: Float64 + df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1, 2], d=rand(Bool, 2)) + df2_pd = permutedims(df2) @test size(df2_pd, 1) == ncol(df2) - 1 @test size(df2_pd, 2) == nrow(df2) + 1 @@ -545,17 +542,24 @@ end @test Any <: eltype(df2_pd.x) @test Any <: eltype(df2_pd.y) + df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10)) + @test_throws ArgumentError permutedims(df3) @test names(permutedims(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] 
- @test permutedims(df4[!, [:a,:b,:c,:e]], :e) == - permutedims(df4[!, [:e,:a,:b,:c]]) == - permutedims(df4[!, [:a,:b,:c,:f]], :f, :e) + df4 = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing], + e=["x", "y"], f=[:x, :y], # valid src + g=[missing, "y"], h=Union{Missing, String}["x", "y"] # invalid src + ) + + @test permutedims(df4[!, [:a, :b, :c, :e]], :e) == + permutedims(df4[!, [:e, :a, :b, :c]]) == + permutedims(df4[!, [:a, :b, :c, :f]], :f, :e) # Can't index Float Column - @test_throws ArgumentError permutedims(df4[!, [:a,:b,:c]]) + @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]]) # Can't index in the presence of missing @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]]) - @test_throws ArgumentError permutedims(df4[!, [:h,:a,:b]]) + @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]]) # can't permute dfs with 0 rows @test_throws ArgumentError permutedims(DataFrame()) @test_throws ArgumentError permutedims(DataFrame(a=String[], b=Float64[])) From 4313c37e1142197f82cc0433031ffc6fbd4bafbc Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Tue, 6 Oct 2020 18:51:27 -0400 Subject: [PATCH 15/25] add docs to manual --- docs/src/man/reshaping_and_pivoting.md | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index bb8ca54dea..fc5e3b688a 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -380,3 +380,49 @@ julia> first(unstack(x, :Species, :vsum), 6) │ 4 │ PetalWidth │ 0.244 │ 1.326 │ 2.026 │ │ 5 │ id │ 25.5 │ 75.5 │ 125.5 │ ``` + +To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref). 
+ +```jldoctest reshape +julia> df1 = DataFrame(a=["x", "y"], b=[1.,2.], c=[3,4], d=[true,false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Float64 │ Int64 │ Bool │ +├─────┼────────┼─────────┼───────┼──────┤ +│ 1 │ x │ 1.0 │ 3 │ 1 │ +│ 2 │ y │ 2.0 │ 4 │ 0 │ + +julia> permutedims(df1) +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Float64 │ Float64 │ +├─────┼────────┼─────────┼─────────┤ +│ 1 │ b │ 1.0 │ 2.0 │ +│ 2 │ c │ 3.0 │ 4.0 │ +│ 3 │ d │ 1.0 │ 0.0 │ +``` + +Note that the first column (by default) of the original `df` +becomes the column names in the permuted result, +and the column names of the original become a new column. +Note also that the types of the other columns +are the result of `promote_type` on _all_ the permuted columns. + +```jldoctest reshape +julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3,4], d=[true,false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Any │ Int64 │ Bool │ +├─────┼────────┼─────┼───────┼──────┤ +│ 1 │ x │ 1 │ 3 │ 1 │ +│ 2 │ y │ two │ 4 │ 0 │ + +julia> permutedims(df2) +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Any │ Any │ +├─────┼────────┼─────┼─────┤ +│ 1 │ b │ 1 │ two │ +│ 2 │ c │ 3 │ 4 │ +│ 3 │ d │ 1 │ 0 │ +``` From 5ba618f8e6c4e3ab97e669d2ee2c2836a64dd676 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Wed, 7 Oct 2020 10:04:59 -0400 Subject: [PATCH 16/25] address more comments --- docs/src/lib/functions.md | 1 + docs/src/man/reshaping_and_pivoting.md | 2 +- src/abstractdataframe/reshape.jl | 7 +++---- test/reshape.jl | 12 ++++++------ 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index a3e154d2c1..128f0ac9e7 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -57,6 +57,7 @@ vcat ```@docs stack unstack +permutedims ``` ## Sorting diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index fc5e3b688a..7d18034392 100644 --- 
a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -405,7 +405,7 @@ julia> permutedims(df1) Note that the first column (by default) of the original `df` becomes the column names in the permuted result, and the column names of the original become a new column. -Note also that the types of the other columns +Note also that the element types of the other columns are the result of `promote_type` on _all_ the permuted columns. ```jldoctest reshape diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index c6d3b80ddb..f4c8974649 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -413,8 +413,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = Turn `df` on its side such that rows become columns and the column indexed by `src_namescol` becomes the names of new columns. -In the resulting `DataFrame`, -The header of `df` will become the first column +In the resulting `DataFrame`, column names of `df` will become the first column with name specified by `dest_namescol`. 
# Arguments @@ -482,7 +481,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, nrow(df) > 0 || throw( ArgumentError("`permutedims` not defined for data frame with 0 rows")) eltype(df[!, src_namescol]) <: SymbolOrString || throw( - ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) + ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame(dest_namescol => names(df_notsrc)) @@ -495,7 +494,7 @@ end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) if src_namescol isa Integer - 1 <= src_namescol <= ncol(df) || throw(ArgumentError("`src_namescol` doesn't exist")) + 1 <= src_namescol <= ncol(df) || throw(BoundsError("`src_namescol` doesn't exist")) dest_namescol = _names(df)[src_namescol] else dest_namescol = src_namescol diff --git a/test/reshape.jl b/test/reshape.jl index 2803ed84d7..18546166ae 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -525,8 +525,8 @@ end @test Vector(row) == [orignames1[i]; df1[!, orignames1[i]]] end - @test eltype(df1_pd.x) <: Float64 - @test eltype(df1_pd.y) <: Float64 + @test eltype(df1_pd.x) == Float64 + @test eltype(df1_pd.y) == Float64 df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1, 2], d=rand(Bool, 2)) @@ -539,8 +539,8 @@ end for (i, row) in enumerate(eachrow(df2_pd)) @test Vector(row) == [orignames2[i]; df2[!, orignames2[i]]] end - @test Any <: eltype(df2_pd.x) - @test Any <: eltype(df2_pd.y) + @test Any == eltype(df2_pd.x) + @test Any == eltype(df2_pd.y) df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10)) @@ -555,9 +555,9 @@ end @test permutedims(df4[!, [:a, :b, :c, :e]], :e) == permutedims(df4[!, [:e, :a, :b, :c]]) == permutedims(df4[!, [:a, :b, :c, :f]], :f, :e) - # Can't index Float Column + # Can't index float Column @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]]) - # Can't index in the presence 
of missing + # Can't index columns that allow for missing @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]]) @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]]) # can't permute dfs with 0 rows From 82dda05c1bc63f35fb154aaac0f7b1377382a383 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Wed, 7 Oct 2020 11:00:57 -0400 Subject: [PATCH 17/25] more comma spaces and float formatting --- docs/src/man/reshaping_and_pivoting.md | 4 ++-- src/abstractdataframe/reshape.jl | 2 +- test/reshape.jl | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 7d18034392..2de4db8547 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -384,7 +384,7 @@ julia> first(unstack(x, :Species, :vsum), 6) To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref). ```jldoctest reshape -julia> df1 = DataFrame(a=["x", "y"], b=[1.,2.], c=[3,4], d=[true,false]) +julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true,false]) 2×4 DataFrame │ Row │ a │ b │ c │ d │ │ │ String │ Float64 │ Int64 │ Bool │ @@ -409,7 +409,7 @@ Note also that the element types of the other columns are the result of `promote_type` on _all_ the permuted columns. 
```jldoctest reshape -julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3,4], d=[true,false]) +julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false]) 2×4 DataFrame │ Row │ a │ b │ c │ d │ │ │ String │ Any │ Int64 │ Bool │ diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index f4c8974649..e224f45764 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -475,7 +475,7 @@ julia> permutedims(df2) ``` """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, - dest_namescol::Union{Symbol, AbstractString}; + dest_namescol::Union{Symbol, <:AbstractString}; makeunique::Bool=false) nrow(df) > 0 || throw( diff --git a/test/reshape.jl b/test/reshape.jl index 18546166ae..cf31d1e3f1 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -528,7 +528,7 @@ end @test eltype(df1_pd.x) == Float64 @test eltype(df1_pd.y) == Float64 - df2 = DataFrame(a=["x", "y"], b=[1., "str"], c=[1, 2], d=rand(Bool, 2)) + df2 = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2)) df2_pd = permutedims(df2) @test size(df2_pd, 1) == ncol(df2) - 1 From 0d51503bc0da3be870eac6d5752764c091d513ff Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Wed, 7 Oct 2020 12:04:57 -0400 Subject: [PATCH 18/25] allow zero-row and 1 column permutation --- src/abstractdataframe/reshape.jl | 13 +++++++++---- test/reshape.jl | 11 ++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index e224f45764..edd4891f9d 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -478,16 +478,21 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, <:AbstractString}; makeunique::Bool=false) - nrow(df) > 0 || throw( - ArgumentError("`permutedims` not defined for data frame with 0 rows")) + if src_namescol isa Integer + 1 <= src_namescol 
<= ncol(df) || throw(BoundsError("`src_namescol` doesn't exist")) + end eltype(df[!, src_namescol]) <: SymbolOrString || throw( ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame(dest_namescol => names(df_notsrc)) - m = permutedims(Matrix(df_notsrc)) - df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) + if ncol(df_notsrc) == 0 + df_tmp = DataFrame(Dict(n=>[] for n in df[!, src_namescol])) + else + m = permutedims(Matrix(df_notsrc)) + df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) + end return hcat!(df_permuted, df_tmp, copycols=false) end diff --git a/test/reshape.jl b/test/reshape.jl index cf31d1e3f1..6ed5c0091c 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -525,6 +525,7 @@ end @test Vector(row) == [orignames1[i]; df1[!, orignames1[i]]] end + # All columns should be promoted @test eltype(df1_pd.x) == Float64 @test eltype(df1_pd.y) == Float64 @@ -555,14 +556,18 @@ end @test permutedims(df4[!, [:a, :b, :c, :e]], :e) == permutedims(df4[!, [:e, :a, :b, :c]]) == permutedims(df4[!, [:a, :b, :c, :f]], :f, :e) + # Can permute single-column + @test permutedims(df4[!, [:e]]) == DataFrame(e=String[], x=[], y=[]) # Can't index float Column @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]]) + @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[])) # Can't index columns that allow for missing @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]]) @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]]) - # can't permute dfs with 0 rows - @test_throws ArgumentError permutedims(DataFrame()) - @test_throws ArgumentError permutedims(DataFrame(a=String[], b=Float64[])) + # Can't permute empty `df` ... + @test_throws BoundsError permutedims(DataFrame()) + # ... 
but can permute zero-row df + @test permutedims(DataFrame(a=String[], b=Float64[])) == DataFrame(a=["b"]) end end # module From 7635e1aaa5c47459eb021cb9b8b4219e48fbfd04 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Wed, 7 Oct 2020 12:13:50 -0400 Subject: [PATCH 19/25] remove ColumnIndex from docstring --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index edd4891f9d..a7cd93a218 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -405,7 +405,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. Try `permutedims` instead") """ - permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, + permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, <:AbstractString}, dest_namescol::Union{Symbol,AbstractString}; makeunique::Bool=false) permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) From 097eaa86b88d61f22d68d8b309afa7c2c34cc3e8 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Wed, 7 Oct 2020 13:30:06 -0400 Subject: [PATCH 20/25] slight clarification to docs --- docs/src/man/reshaping_and_pivoting.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 2de4db8547..1c60348bf2 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -405,7 +405,8 @@ julia> permutedims(df1) Note that the first column (by default) of the original `df` becomes the column names in the permuted result, and the column names of the original become a new column. 
-Note also that the element types of the other columns +Typically, this would be used on columns with homogeneous element types, +since the element types of the other columns are the result of `promote_type` on _all_ the permuted columns. ```jldoctest reshape From 348cad7f483260ebd8d162ef27fbbfc52054ed24 Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Thu, 8 Oct 2020 12:32:50 -0400 Subject: [PATCH 21/25] more review comments, fix bounds error --- src/abstractdataframe/reshape.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index a7cd93a218..fb7f5f59b4 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -479,16 +479,16 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, makeunique::Bool=false) if src_namescol isa Integer - 1 <= src_namescol <= ncol(df) || throw(BoundsError("`src_namescol` doesn't exist")) + 1 <= src_namescol <= ncol(df) || throw(BoundsError(df, src_namescol)) end - eltype(df[!, src_namescol]) <: SymbolOrString || throw( - ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) + eltype(df[!, src_namescol]) <: SymbolOrString || + throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) df_notsrc = df[!, Not(src_namescol)] df_permuted = DataFrame(dest_namescol => names(df_notsrc)) if ncol(df_notsrc) == 0 - df_tmp = DataFrame(Dict(n=>[] for n in df[!, src_namescol])) + df_tmp = DataFrame((n=>[] for n in df[!, src_namescol])...)
else m = permutedims(Matrix(df_notsrc)) df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) @@ -499,7 +499,7 @@ end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) if src_namescol isa Integer - 1 <= src_namescol <= ncol(df) || throw(BoundsError("`src_namescol` doesn't exist")) + 1 <= src_namescol <= ncol(df) || throw(BoundsError(df, src_namescol)) dest_namescol = _names(df)[src_namescol] else dest_namescol = src_namescol From 94fc79255d2ec3149a78fa2febc92f44a8b6396b Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Thu, 8 Oct 2020 15:18:48 -0400 Subject: [PATCH 22/25] add news entry --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 504d2386b3..fff1f8c405 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,7 @@ * add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449)) * passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine` with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476)) +* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447)) ## Deprecated From cd7a4a2d6f482d823e66bccd45708865a714f2ab Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Thu, 8 Oct 2020 15:39:22 -0400 Subject: [PATCH 23/25] really fix bounds error, and comma --- docs/src/man/reshaping_and_pivoting.md | 2 +- src/abstractdataframe/reshape.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 1c60348bf2..0914707f02 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -384,7 +384,7 @@ julia> first(unstack(x, :Species, :vsum), 6) To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref). 
```jldoctest reshape -julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true,false]) +julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false]) 2×4 DataFrame │ Row │ a │ b │ c │ d │ │ │ String │ Float64 │ Int64 │ Bool │ diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index fb7f5f59b4..9eae092cb6 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -479,7 +479,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, makeunique::Bool=false) if src_namescol isa Integer - 1 <= src_namescol <= ncol(df) || throw(BoundsError(df, src_namescol)) + 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) end eltype(df[!, src_namescol]) <: SymbolOrString || throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`")) @@ -499,7 +499,7 @@ end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) if src_namescol isa Integer - 1 <= src_namescol <= ncol(df) || throw(BoundsError(df, src_namescol)) + 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] else dest_namescol = src_namescol From 2f3e7b0f1b79bb227555d4fe9b482dc0c102395e Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Thu, 8 Oct 2020 16:39:10 -0400 Subject: [PATCH 24/25] fix function signature --- src/abstractdataframe/reshape.jl | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 9eae092cb6..43ce17eb3e 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -405,11 +405,9 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = MethodError("`transpose` not defined for `AbstractDataFrame`s. 
Try `permutedims` instead") """ - permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, <:AbstractString}, - dest_namescol::Union{Symbol,AbstractString}; - makeunique::Bool=false) - permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; makeunique::Bool=false) - permutedims(df::AbstractDataFrame; makeunique::Bool=false) + permutedims(df::AbstractDataFrame [, src_namescol::Union{Int, Symbol, <:AbstractString} + [, dest_namescol::Union{Symbol, AbstractString} ]]; + makeunique::Bool=false) Turn `df` on its side such that rows become columns and the column indexed by `src_namescol` becomes the names of new columns. @@ -427,14 +425,13 @@ with name specified by `dest_namescol`. if duplicate names are found; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). -Note: The eltypes of columns in resulting `DataFrame` -(other than the first column, which always has eltype `String`) -will depend on the eltypes of _all_ input columns -based on the results of `prote_type`. -That is, if the source `DataFrame` contains `String` and `Int` columns, -resulting columns will have eltype `Any`. -If the source has a mix of numeric types (eg. `Float64` and `Int`), -columns in resulting `DataFrame` will be promoted to `Float64`. +Note: The element types of columns in resulting `DataFrame` +(other than the first column, which always has element type `String`) +will depend on the element types of _all_ input columns +based on the result of `promote_type`. +That is, if the source data frame contains `Int` and `Float64` columns, +resulting columns will have element type `Float64`. If the source has +`Int` and `String` columns, resulting columns will have element type `Any`. 
# Examples @@ -496,7 +493,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, return hcat!(df_permuted, df_tmp, copycols=false) end -function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; +function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1; makeunique::Bool=false) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) @@ -506,6 +503,3 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; end return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique) end - -Base.permutedims(df::AbstractDataFrame; makeunique::Bool=false) = - permutedims(df, 1; makeunique=makeunique) From 040d8561d831f1a9c0ad2ce89afab4bf5fd98bcf Mon Sep 17 00:00:00 2001 From: "Kevin Bonham, PhD" Date: Mon, 12 Oct 2020 12:04:46 -0400 Subject: [PATCH 25/25] missing makeunique + tests --- src/abstractdataframe/reshape.jl | 6 +++--- test/reshape.jl | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 43ce17eb3e..a0618c26d6 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -426,7 +426,7 @@ with name specified by `dest_namescol`. with `_i` (`i` starting at 1 for the first duplicate). Note: The element types of columns in resulting `DataFrame` -(other than the first column, which always has element type `String`) +(other than the first column, which always has element type `String`) will depend on the element types of _all_ input columns based on the result of `promote_type`. That is, if the source data frame contains `Int` and `Float64` columns, @@ -485,12 +485,12 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, df_permuted = DataFrame(dest_namescol => names(df_notsrc)) if ncol(df_notsrc) == 0 - df_tmp = DataFrame((n=>[] for n in df[!, src_namescol])...)
+ df_tmp = DataFrame((n=>[] for n in df[!, src_namescol])..., makeunique=makeunique) else m = permutedims(Matrix(df_notsrc)) df_tmp = rename!(DataFrame(Tables.table(m)), df[!, src_namescol], makeunique=makeunique) end - return hcat!(df_permuted, df_tmp, copycols=false) + return hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false) end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex=1; diff --git a/test/reshape.jl b/test/reshape.jl index 6ed5c0091c..7d6e364c3e 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -545,8 +545,11 @@ end df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10)) + d3pd_names = ["a", "x", ("x_$i" for i in 1:9)...] @test_throws ArgumentError permutedims(df3) - @test names(permutedims(df3, makeunique=true)) == ["a", "x", ("x_$i" for i in 1:9)...] + @test names(permutedims(df3, makeunique=true)) == d3pd_names + @test_throws ArgumentError permutedims(df3[!, [:a]]) # single column branch + @test names(permutedims(df3[!, [:a]], makeunique=true)) == d3pd_names df4 = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing], e=["x", "y"], f=[:x, :y], # valid src