diff --git a/NEWS.md b/NEWS.md index 504d2386b3..fff1f8c405 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,7 @@ * add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449)) * passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine` with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476)) +* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447)) ## Deprecated diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index a3e154d2c1..128f0ac9e7 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -57,6 +57,7 @@ vcat ```@docs stack unstack +permutedims ``` ## Sorting diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index bb8ca54dea..7107f7184f 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -380,3 +380,53 @@ julia> first(unstack(x, :Species, :vsum), 6) │ 4 │ PetalWidth │ 0.244 │ 1.326 │ 2.026 │ │ 5 │ id │ 25.5 │ 75.5 │ 125.5 │ ``` + +To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref). + +```jldoctest reshape +julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false]) +2×4 DataFrame +│ Row │ a │ b │ c │ d │ +│ │ String │ Float64 │ Int64 │ Bool │ +├─────┼────────┼─────────┼───────┼──────┤ +│ 1 │ x │ 1.0 │ 3 │ 1 │ +│ 2 │ y │ 2.0 │ 4 │ 0 │ + +julia> permutedims(df1, 1) +3×3 DataFrame +│ Row │ a │ x │ y │ +│ │ String │ Float64 │ Float64 │ +├─────┼────────┼─────────┼─────────┤ +│ 1 │ b │ 1.0 │ 2.0 │ +│ 2 │ c │ 3.0 │ 4.0 │ +│ 3 │ d │ 1.0 │ 0.0 │ +``` + +Note that the column indexed by `src_namescol` in the original `df` +becomes the column names in the permuted result, +and the column names of the original become a new column.
+Typically, this would be used on columns with homogeneous element types,
+since the element types of the other columns
+are the result of `promote_type` on _all_ the permuted columns.
+Note also that, by default, the new column created from the column names
+of the original `df` has the same name as `src_namescol`.
+An optional positional argument `dest_namescol` can alter this:
+
+```jldoctest reshape
+julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a      │ b   │ c     │ d    │
+│     │ String │ Any │ Int64 │ Bool │
+├─────┼────────┼─────┼───────┼──────┤
+│ 1   │ x      │ 1   │ 3     │ 1    │
+│ 2   │ y      │ two │ 4     │ 0    │
+
+julia> permutedims(df2, 1, "different_name")
+3×3 DataFrame
+│ Row │ different_name │ x   │ y   │
+│     │ String         │ Any │ Any │
+├─────┼────────────────┼─────┼─────┤
+│ 1   │ b              │ 1   │ two │
+│ 2   │ c              │ 3   │ 4   │
+│ 3   │ d              │ 1   │ 0   │
+```
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
index 7db56ff20a..13b4043ff1 100644
--- a/src/abstractdataframe/reshape.jl
+++ b/src/abstractdataframe/reshape.jl
@@ -399,3 +399,121 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector)
     res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
     res
 end
+
+
+# `transpose` is deliberately unsupported for data frames; use `permutedims`.
+# Throw a genuine `MethodError`: `MethodError(msg)` has no one-argument
+# constructor, so the previous definition only failed by accident.
+# NOTE(review): a `MethodError` cannot carry the "try `permutedims`" hint. An
+# `ArgumentError` could, but test/reshape.jl expects `MethodError` here.
+Base.transpose(df::AbstractDataFrame, args...; kwargs...) =
+    throw(MethodError(transpose, (df, args...)))
+
+"""
+    permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString}
+                [, dest_namescol::Union{Symbol, AbstractString}];
+                makeunique::Bool=false)
+
+Turn `df` on its side such that rows become columns
+and the column indexed by `src_namescol` becomes the names of new columns.
+In the resulting `DataFrame`, column names of `df` will become the first column
+with name specified by `dest_namescol`.
+
+# Arguments
+- `df` : the `AbstractDataFrame`
+- `src_namescol` : the column that will become the new header.
+  This column's element type must be `AbstractString` or `Symbol`.
+- `dest_namescol` : the name of the first column in the returned `DataFrame`.
+  Defaults to the same name as `src_namescol`.
+- `makeunique` : if `false` (the default), an error will be raised
+  if duplicate names are found; if `true`, duplicate names will be suffixed
+  with `_i` (`i` starting at 1 for the first duplicate).
+
+Note: The element types of columns in resulting `DataFrame`
+(other than the first column, which always has element type `String`)
+will depend on the element types of _all_ input columns
+based on the result of `promote_type`.
+That is, if the source data frame contains `Int` and `Float64` columns,
+resulting columns will have element type `Float64`. If the source has
+`Int` and `String` columns, resulting columns will have element type `Any`.
+
+A `BoundsError` is thrown if an integer `src_namescol` is outside `1:ncol(df)`,
+and an `ArgumentError` is thrown if the element type of the `src_namescol` column
+is not `Symbol` or a subtype of `AbstractString`.
+
+# Examples
+
+```jldoctest
+julia> df1 = DataFrame(a=["x", "y"], b=[1., 2.], c=[3, 4], d=[true,false])
+2×4 DataFrame
+│ Row │ a      │ b       │ c     │ d    │
+│     │ String │ Float64 │ Int64 │ Bool │
+├─────┼────────┼─────────┼───────┼──────┤
+│ 1   │ x      │ 1.0     │ 3     │ 1    │
+│ 2   │ y      │ 2.0     │ 4     │ 0    │
+
+julia> permutedims(df1, 1) # note the column types
+3×3 DataFrame
+│ Row │ a      │ x       │ y       │
+│     │ String │ Float64 │ Float64 │
+├─────┼────────┼─────────┼─────────┤
+│ 1   │ b      │ 1.0     │ 2.0     │
+│ 2   │ c      │ 3.0     │ 4.0     │
+│ 3   │ d      │ 1.0     │ 0.0     │
+
+julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
+2×4 DataFrame
+│ Row │ a      │ b   │ c     │ d    │
+│     │ String │ Any │ Int64 │ Bool │
+├─────┼────────┼─────┼───────┼──────┤
+│ 1   │ x      │ 1   │ 3     │ 1    │
+│ 2   │ y      │ two │ 4     │ 0    │
+
+julia> permutedims(df2, 1, "different_name")
+3×3 DataFrame
+│ Row │ different_name │ x   │ y   │
+│     │ String         │ Any │ Any │
+├─────┼────────────────┼─────┼─────┤
+│ 1   │ b              │ 1   │ two │
+│ 2   │ c              │ 3   │ 4   │
+│ 3   │ d              │ 1   │ 0   │
+```
+"""
+function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
+                          dest_namescol::Union{Symbol, AbstractString};
+                          makeunique::Bool=false)
+
+    if src_namescol isa Integer
+        1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
+    end
+    # Fetch the names column once; its values become the new column names.
+    src_names = df[!, src_namescol]
+    eltype(src_names) <: SymbolOrString ||
+        throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`"))
+
+    df_notsrc = df[!, Not(src_namescol)]
+    # First output column: the names of all columns other than `src_namescol`.
+    df_permuted = DataFrame(dest_namescol => names(df_notsrc))
+
+    if ncol(df_notsrc) == 0
+        # Only the names column exists: create one empty column per name.
+        df_tmp = DataFrame((n=>[] for n in src_names)..., makeunique=makeunique)
+    else
+        # `Matrix` promotes all value columns to a common element type.
+        m = permutedims(Matrix(df_notsrc))
+        df_tmp = rename!(DataFrame(Tables.table(m)), src_names, makeunique=makeunique)
+    end
+    return hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false)
+end
+
+# Two-argument form: the first column of the result reuses the name of `src_namescol`.
+function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
+                          makeunique::Bool=false)
+    if src_namescol isa Integer
+        1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
+        dest_namescol = _names(df)[src_namescol]
+    else
+        dest_namescol = src_namescol
+    end
+    return permutedims(df, src_namescol, dest_namescol; makeunique=makeunique)
+end
diff --git a/test/reshape.jl b/test/reshape.jl
index 6e215e7e92..6aeca3dc04 100644
--- a/test/reshape.jl
+++ b/test/reshape.jl
@@ -25,10 +25,10 @@ const ≅ = isequal
     # first column stays as CategoricalArray in df3
     @test df3 == df4
     #Make sure unstack works with missing values at the start of the value column
-    df[1,:Value] = missing
+    df[1, :Value] = missing
     df2 = unstack(df, :Fish, :Key, :Value)
     #This changes the expected result
-    df4[1,:Mass] = missing
+    df4[1, :Mass] = missing
     @test df2 ≅ df4
 
     df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
@@ -62,11 +62,11 @@ const ≅ = isequal
     @test df3 == df4
     #Make sure unstack works with missing values at the start of the value column
     allowmissing!(df, :Value)
-    df[1,:Value] = missing
+    df[1, :Value] = missing
     df2 = unstack(df, :Fish, :Key, :Value)
     #This changes the expected result
     allowmissing!(df4, :Mass)
-    df4[2,:Mass] = missing
+    df4[2, :Mass] = missing
     @test df2 ≅ df4
 
     df =
DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"], @@ -89,9 +89,9 @@ const ≅ = isequal @test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol) # test missing value in grouping variable - mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4) - @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] - @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:] + mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4) + @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] + @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :] @test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3] @@ -158,7 +158,7 @@ end b = unstack(df, :variable, :value) @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4]) - df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1]) + df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1]) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value) @test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value) end @@ -225,14 +225,14 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a)) + d1m = stack(d1[:, [1, 3, 4]], Not(:a)) @test propertynames(d1m) == [:a, :variable, :value] # Test naming of measure/value columns d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval) @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval] - d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval) + d1m_named = stack(d1[:, [1, 
3, 4]], Not(:a), variable_name=:letter, value_name=:someval) @test propertynames(d1m_named) == [:a, :letter, :someval] # test empty measures or ids @@ -270,21 +270,21 @@ end @test d1s[!, 5] isa DataFrames.StackedVector @test ndims(d1s[!, 5]) == 1 @test ndims(typeof(d1s[!, 2])) == 1 - @test d1s[!, 4][[1,24]] == ["a", "b"] - @test d1s[!, 5][[1,24]] == [1, 4] + @test d1s[!, 4][[1, 24]] == ["a", "b"] + @test d1s[!, 5][[1, 24]] == [1, 4] @test_throws ArgumentError d1s[!, 4][true] @test_throws ArgumentError d1s[!, 5][true] @test_throws ArgumentError d1s[!, 4][1.0] @test_throws ArgumentError d1s[!, 5][1.0] d1ss = stack(d1, [:a, :b], view=true) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String) - @test d1ss[!, 4][[1,24]] == ["a", "b"] + @test d1ss[!, 4][[1, 24]] == ["a", "b"] @test d1ss[!, 4] isa DataFrames.RepeatedVector d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol) - @test d1ss[!, 4][[1,24]] == [:a, :b] + @test d1ss[!, 4][[1, 24]] == [:a, :b] @test d1ss[!, 4] isa DataFrames.RepeatedVector # Those tests check indexing RepeatedVector/StackedVector by a vector @@ -307,7 +307,7 @@ end @test d1s2 == d1s3 @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m - d1m = stack(d1[:, [1,3,4]], Not(:a), view=true) + d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true) @test propertynames(d1m) == [:a, :variable, :value] d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true) @@ -329,13 +329,13 @@ end @test d1us3 == unstack(d1s2) # test unstack with exactly one key column that is not passed - df1 = stack(DataFrame(rand(10,10))) + df1 = stack(DataFrame(rand(10, 10))) df1[!, :id] = 1:100 @test size(unstack(df1, :variable, :value)) == (100, 11) @test unstack(df1, :variable, :value) ≅ unstack(df1) # test empty keycol - @test_throws ArgumentError unstack(stack(DataFrame(rand(3,2))), 
:variable, :value) + @test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value) end @testset "column names duplicates" begin @@ -494,7 +494,7 @@ end end @testset "test stack eltype" begin - df = DataFrame(rand(4,5)) + df = DataFrame(rand(4, 5)) sdf = stack(df) @test eltype(sdf.variable) === String @test eltype(typeof(sdf.variable)) === String @@ -507,4 +507,70 @@ end @test eltype(typeof(sdf2.value)) === Float64 end +@testset "permutedims" begin + df1 = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2)) + + @test_throws MethodError transpose(df1) + @test_throws ArgumentError permutedims(df1, :bar) + + df1_pd = permutedims(df1, 1) + @test size(df1_pd, 1) == ncol(df1) - 1 + @test size(df1_pd, 2) == nrow(df1) + 1 + @test names(df1_pd) == ["a", "x", "y"] + @test df1_pd == permutedims(df1, :a) == permutedims(df1, 1) + @test names(permutedims(df1, :a, :foo)) == ["foo", "x", "y"] + + orignames1 = names(df1)[2:end] + for (i, row) in enumerate(eachrow(df1_pd)) + @test Vector(row) == [orignames1[i]; df1[!, orignames1[i]]] + end + + # All permuted value columns are promoted to a common eltype (Float64 here) + @test eltype(df1_pd.x) == Float64 + @test eltype(df1_pd.y) == Float64 + # A column mixing a number and a string (`b`) makes the permuted columns `Any` + df2 = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2)) + + df2_pd = permutedims(df2, :a) + @test size(df2_pd, 1) == ncol(df2) - 1 + @test size(df2_pd, 2) == nrow(df2) + 1 + @test names(df2_pd) == ["a", "x", "y"] + + orignames2 = names(df2)[2:end] + for (i, row) in enumerate(eachrow(df2_pd)) + @test Vector(row) == [orignames2[i]; df2[!, orignames2[i]]] + end + @test Any == eltype(df2_pd.x) + @test Any == eltype(df2_pd.y) + # A constant names column yields duplicate names: an error unless `makeunique=true` + df3 = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10)) + + d3pd_names = ["a", "x", ("x_$i" for i in 1:9)...] 
+ @test_throws ArgumentError permutedims(df3, 1) + @test names(permutedims(df3, 1, makeunique=true)) == d3pd_names + @test_throws ArgumentError permutedims(df3[!, [:a]], 1) # single-column branch: no value columns to permute + @test names(permutedims(df3[!, [:a]], 1, makeunique=true)) == d3pd_names + + df4 = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing], + e=["x", "y"], f=[:x, :y], # valid `src_namescol` columns (String / Symbol) + g=[missing, "y"], h=Union{Missing, String}["x", "y"] # invalid `src_namescol` columns (eltype allows missing) + ) + + @test permutedims(df4[!, [:a, :b, :c, :e]], :e) == + permutedims(df4[!, [:e, :a, :b, :c]], 1) == + permutedims(df4[!, [:a, :b, :c, :f]], :f, :e) + # A data frame holding only the names column can be permuted + @test permutedims(df4[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[]) + # A Float64 column cannot serve as the names column + @test_throws ArgumentError permutedims(df4[!, [:a, :b, :c]], 1) + @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1) + # Columns whose eltype allows `missing` cannot serve as the names column + @test_throws ArgumentError permutedims(df4[!, [:g, :a, :b, :c]], 1) + @test_throws ArgumentError permutedims(df4[!, [:h, :a, :b]], 1) + # Can't permute an empty (zero-column) `df` ... + @test_throws BoundsError permutedims(DataFrame(), 1) + # ... but can permute a zero-row `df` + @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"]) +end + end # module