Skip to content

Commit

Permalink
Merge 114023e into f507944
Browse files Browse the repository at this point in the history
  • Loading branch information
kescobo committed Oct 15, 2020
2 parents f507944 + 114023e commit 448ad3b
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 19 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -76,6 +76,7 @@
* add `only` method for `AbstractDataFrame` ([#2449](https://github.com/JuliaData/DataFrames.jl/pull/2449))
* passing empty sets of columns in `filter`/`filter!` and in `select`/`transform`/`combine`
with `ByRow` is now accepted ([#2476](https://github.com/JuliaData/DataFrames.jl/pull/2476))
* add `permutedims` method for `AbstractDataFrame` ([#2447](https://github.com/JuliaData/DataFrames.jl/pull/2447))

## Deprecated

Expand Down
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Expand Up @@ -57,6 +57,7 @@ vcat
```@docs
stack
unstack
permutedims
```

## Sorting
Expand Down
50 changes: 50 additions & 0 deletions docs/src/man/reshaping_and_pivoting.md
Expand Up @@ -380,3 +380,53 @@ julia> first(unstack(x, :Species, :vsum), 6)
│ 4 │ PetalWidth │ 0.244 │ 1.326 │ 2.026 │
│ 5 │ id │ 25.5 │ 75.5 │ 125.5 │
```

To turn an `AbstractDataFrame` on its side, use [`permutedims`](@ref).

```jldoctest reshape
julia> df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
2×4 DataFrame
│ Row │ a      │ b       │ c     │ d    │
│     │ String │ Float64 │ Int64 │ Bool │
├─────┼────────┼─────────┼───────┼──────┤
│ 1   │ x      │ 1.0     │ 3     │ 1    │
│ 2   │ y      │ 2.0     │ 4     │ 0    │

julia> permutedims(df1, 1)
3×3 DataFrame
│ Row │ a      │ x       │ y       │
│     │ String │ Float64 │ Float64 │
├─────┼────────┼─────────┼─────────┤
│ 1   │ b      │ 1.0     │ 2.0     │
│ 2   │ c      │ 3.0     │ 4.0     │
│ 3   │ d      │ 1.0     │ 0.0     │
```

Note that the column indexed by `src_namescol` in the original `df`
becomes the column names in the permuted result,
and the column names of the original become a new column.
Typically, this would be used on columns with homogeneous element types,
since the element types of the other columns
are the result of `promote_type` on _all_ the permuted columns.
Note also that, by default, the new column created from the column names
of the original `df` has the same name as `src_namescol`.
An optional positional argument `dest_namescol` can alter this:

```jldoctest reshape
julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
2×4 DataFrame
│ Row │ a      │ b   │ c     │ d    │
│     │ String │ Any │ Int64 │ Bool │
├─────┼────────┼─────┼───────┼──────┤
│ 1   │ x      │ 1   │ 3     │ 1    │
│ 2   │ y      │ two │ 4     │ 0    │

julia> permutedims(df2, 1, "different_name")
3×3 DataFrame
│ Row │ different_name │ x   │ y   │
│     │ String         │ Any │ Any │
├─────┼────────────────┼─────┼─────┤
│ 1   │ b              │ 1   │ two │
│ 2   │ c              │ 3   │ 4   │
│ 3   │ d              │ 1   │ 0   │
```
103 changes: 103 additions & 0 deletions src/abstractdataframe/reshape.jl
Expand Up @@ -399,3 +399,106 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector)
res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
res
end


# `transpose` is intentionally unsupported for data frames; `permutedims` is the
# operation to use instead.
#
# Always throws a `MethodError` describing the attempted `transpose` call.
# The error is constructed from the function and its positional arguments and is
# thrown explicitly: calling `MethodError` with a lone message string is not a
# valid constructor call and only raised an (unrelated) `MethodError` about the
# `MethodError` constructor itself, without ever surfacing the message.
function Base.transpose(df::AbstractDataFrame, args...; kwargs...)
    throw(MethodError(transpose, (df, args...)))
end

"""
    permutedims(df::AbstractDataFrame, src_namescol::Union{Int, Symbol, AbstractString},
                [dest_namescol::Union{Symbol, AbstractString}];
                makeunique::Bool=false)

Turn `df` on its side such that rows become columns
and the values of the column indexed by `src_namescol` become the names of new columns.
In the resulting `DataFrame`, column names of `df` will become the first column
with name specified by `dest_namescol`.

# Arguments
- `df` : the `AbstractDataFrame`
- `src_namescol` : the column that will become the new header.
  This column's element type must be `AbstractString` or `Symbol`.
- `dest_namescol` : the name of the first column in the returned `DataFrame`.
  Defaults to the same name as `src_namescol`.
- `makeunique` : if `false` (the default), an error will be raised
  if duplicate names are found; if `true`, duplicate names will be suffixed
  with `_i` (`i` starting at 1 for the first duplicate).

Note: The element types of columns in resulting `DataFrame`
(other than the first column, which always has element type `String`)
will depend on the element types of _all_ input columns
based on the result of `promote_type`.
That is, if the source data frame contains `Int` and `Float64` columns,
resulting columns will have element type `Float64`. If the source has
`Int` and `String` columns, resulting columns will have element type `Any`.

# Examples

```jldoctest
julia> df1 = DataFrame(a=["x", "y"], b=[1., 2.], c=[3, 4], d=[true,false])
2×4 DataFrame
│ Row │ a      │ b       │ c     │ d    │
│     │ String │ Float64 │ Int64 │ Bool │
├─────┼────────┼─────────┼───────┼──────┤
│ 1   │ x      │ 1.0     │ 3     │ 1    │
│ 2   │ y      │ 2.0     │ 4     │ 0    │

julia> permutedims(df1, 1) # note the column types
3×3 DataFrame
│ Row │ a      │ x       │ y       │
│     │ String │ Float64 │ Float64 │
├─────┼────────┼─────────┼─────────┤
│ 1   │ b      │ 1.0     │ 2.0     │
│ 2   │ c      │ 3.0     │ 4.0     │
│ 3   │ d      │ 1.0     │ 0.0     │

julia> df2 = DataFrame(a=["x", "y"], b=[1, "two"], c=[3, 4], d=[true, false])
2×4 DataFrame
│ Row │ a      │ b   │ c     │ d    │
│     │ String │ Any │ Int64 │ Bool │
├─────┼────────┼─────┼───────┼──────┤
│ 1   │ x      │ 1   │ 3     │ 1    │
│ 2   │ y      │ two │ 4     │ 0    │

julia> permutedims(df2, 1, "different_name")
3×3 DataFrame
│ Row │ different_name │ x   │ y   │
│     │ String         │ Any │ Any │
├─────┼────────────────┼─────┼─────┤
│ 1   │ b              │ 1   │ two │
│ 2   │ c              │ 3   │ 4   │
│ 3   │ d              │ 1   │ 0   │
```
"""
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
                          dest_namescol::Union{Symbol, AbstractString};
                          makeunique::Bool=false)
    # Integer positions are range-checked first so that an out-of-range position
    # is reported as a `BoundsError` before the column is looked up.
    if src_namescol isa Integer && !(1 <= src_namescol <= ncol(df))
        throw(BoundsError(index(df), src_namescol))
    end

    # The values of this column become the column names of the result.
    header = df[!, src_namescol]
    if !(eltype(header) <: SymbolOrString)
        throw(ArgumentError("src_namescol must have eltype `Symbol` or `<:AbstractString`"))
    end

    # Everything except the header column gets permuted; the names of those
    # columns form the first column of the result.
    body = df[!, Not(src_namescol)]
    result = DataFrame(dest_namescol => names(body))

    permuted = if ncol(body) == 0
        # Nothing to permute: create one empty column per header value.
        DataFrame((colname => [] for colname in header)..., makeunique=makeunique)
    else
        flipped = permutedims(Matrix(body))
        rename!(DataFrame(Tables.table(flipped)), header, makeunique=makeunique)
    end

    return hcat!(result, permuted, makeunique=makeunique, copycols=false)
end

# Two-argument form: the first column of the result is named after the source
# names column itself, then the work is delegated to the three-argument method.
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
                          makeunique::Bool=false)
    # A name-based index already is the destination name.
    if !(src_namescol isa Integer)
        return permutedims(df, src_namescol, src_namescol; makeunique=makeunique)
    end

    # A positional index must be valid before it can be translated to a name.
    if !(1 <= src_namescol <= ncol(df))
        throw(BoundsError(index(df), src_namescol))
    end
    return permutedims(df, src_namescol, _names(df)[src_namescol];
                       makeunique=makeunique)
end
104 changes: 85 additions & 19 deletions test/reshape.jl
Expand Up @@ -25,10 +25,10 @@ const ≅ = isequal
# first column stays as CategoricalArray in df3
@test df3 == df4
#Make sure unstack works with missing values at the start of the value column
df[1,:Value] = missing
df[1, :Value] = missing
df2 = unstack(df, :Fish, :Key, :Value)
#This changes the expected result
df4[1,:Mass] = missing
df4[1, :Mass] = missing
@test df2 df4

df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
Expand Down Expand Up @@ -62,11 +62,11 @@ const ≅ = isequal
@test df3 == df4
#Make sure unstack works with missing values at the start of the value column
allowmissing!(df, :Value)
df[1,:Value] = missing
df[1, :Value] = missing
df2 = unstack(df, :Fish, :Key, :Value)
#This changes the expected result
allowmissing!(df4, :Mass)
df4[2,:Mass] = missing
df4[2, :Mass] = missing
@test df2 df4

df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
Expand All @@ -89,9 +89,9 @@ const ≅ = isequal
@test_throws TypeError unstack(df, :Key, :Value, renamecols=Symbol)

# test missing value in grouping variable
mdf = DataFrame(id=[missing,1,2,3], a=1:4, b=1:4)
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3,:] == sort(mdf)[1:3,:]
mdf = DataFrame(id=[missing, 1, 2, 3], a=1:4, b=1:4)
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[1:3, :] == sort(mdf)[1:3, :]
@test unstack(stack(mdf, Not(:id)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]
@test unstack(stack(mdf, Not(1)), :id, :variable, :value)[:, 2:3] == sort(mdf)[:, 2:3]

Expand Down Expand Up @@ -158,7 +158,7 @@ end
b = unstack(df, :variable, :value)
@test a b DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])

df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1])
@test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :variable, :value)
@test_logs (:warn, "Duplicate entries in unstack at row 2 for key 1 and variable x.") unstack(df, :id, :variable, :value)
end
Expand Down Expand Up @@ -225,14 +225,14 @@ end
@test d1s2 == d1s3
@test propertynames(d1s) == [:c, :d, :e, :variable, :value]
@test d1s == d1m
d1m = stack(d1[:, [1,3,4]], Not(:a))
d1m = stack(d1[:, [1, 3, 4]], Not(:a))
@test propertynames(d1m) == [:a, :variable, :value]

# Test naming of measure/value columns
d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval)
@test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval)
@test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval]
d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval)
d1m_named = stack(d1[:, [1, 3, 4]], Not(:a), variable_name=:letter, value_name=:someval)
@test propertynames(d1m_named) == [:a, :letter, :someval]

# test empty measures or ids
Expand Down Expand Up @@ -270,21 +270,21 @@ end
@test d1s[!, 5] isa DataFrames.StackedVector
@test ndims(d1s[!, 5]) == 1
@test ndims(typeof(d1s[!, 2])) == 1
@test d1s[!, 4][[1,24]] == ["a", "b"]
@test d1s[!, 5][[1,24]] == [1, 4]
@test d1s[!, 4][[1, 24]] == ["a", "b"]
@test d1s[!, 5][[1, 24]] == [1, 4]
@test_throws ArgumentError d1s[!, 4][true]
@test_throws ArgumentError d1s[!, 5][true]
@test_throws ArgumentError d1s[!, 4][1.0]
@test_throws ArgumentError d1s[!, 5][1.0]

d1ss = stack(d1, [:a, :b], view=true)
@test d1ss[!, 4][[1,24]] == ["a", "b"]
@test d1ss[!, 4][[1, 24]] == ["a", "b"]
@test d1ss[!, 4] isa DataFrames.RepeatedVector
d1ss = stack(d1, [:a, :b], view=true, variable_eltype=String)
@test d1ss[!, 4][[1,24]] == ["a", "b"]
@test d1ss[!, 4][[1, 24]] == ["a", "b"]
@test d1ss[!, 4] isa DataFrames.RepeatedVector
d1ss = stack(d1, [:a, :b], view=true, variable_eltype=Symbol)
@test d1ss[!, 4][[1,24]] == [:a, :b]
@test d1ss[!, 4][[1, 24]] == [:a, :b]
@test d1ss[!, 4] isa DataFrames.RepeatedVector

# Those tests check indexing RepeatedVector/StackedVector by a vector
Expand All @@ -307,7 +307,7 @@ end
@test d1s2 == d1s3
@test propertynames(d1s) == [:c, :d, :e, :variable, :value]
@test d1s == d1m
d1m = stack(d1[:, [1,3,4]], Not(:a), view=true)
d1m = stack(d1[:, [1, 3, 4]], Not(:a), view=true)
@test propertynames(d1m) == [:a, :variable, :value]

d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true)
Expand All @@ -329,13 +329,13 @@ end
@test d1us3 == unstack(d1s2)

# test unstack with exactly one key column that is not passed
df1 = stack(DataFrame(rand(10,10)))
df1 = stack(DataFrame(rand(10, 10)))
df1[!, :id] = 1:100
@test size(unstack(df1, :variable, :value)) == (100, 11)
@test unstack(df1, :variable, :value) unstack(df1)

# test empty keycol
@test_throws ArgumentError unstack(stack(DataFrame(rand(3,2))), :variable, :value)
@test_throws ArgumentError unstack(stack(DataFrame(rand(3, 2))), :variable, :value)
end

@testset "column names duplicates" begin
Expand Down Expand Up @@ -494,7 +494,7 @@ end
end

@testset "test stack eltype" begin
df = DataFrame(rand(4,5))
df = DataFrame(rand(4, 5))
sdf = stack(df)
@test eltype(sdf.variable) === String
@test eltype(typeof(sdf.variable)) === String
Expand All @@ -507,4 +507,70 @@ end
@test eltype(typeof(sdf2.value)) === Float64
end

@testset "permutedims" begin
    # Float, Int and Bool columns next to a String column used as the header
    numdf = DataFrame(a=["x", "y"], b=rand(2), c=[1, 2], d=rand(Bool, 2))

    @test_throws MethodError transpose(numdf)
    @test_throws ArgumentError permutedims(numdf, :bar)

    numdf_pd = permutedims(numdf, 1)
    @test size(numdf_pd, 1) == ncol(numdf) - 1
    @test size(numdf_pd, 2) == nrow(numdf) + 1
    @test names(numdf_pd) == ["a", "x", "y"]
    @test numdf_pd == permutedims(numdf, :a) == permutedims(numdf, 1)
    @test names(permutedims(numdf, :a, :foo)) == ["foo", "x", "y"]

    # Each row of the result is an original column name followed by that column
    numnames = names(numdf)[2:end]
    for i in 1:nrow(numdf_pd)
        colname = numnames[i]
        @test Vector(numdf_pd[i, :]) == [colname; numdf[!, colname]]
    end

    # All columns should be promoted
    @test eltype(numdf_pd.x) == Float64
    @test eltype(numdf_pd.y) == Float64

    # A column mixing a number and a string forces `Any` in the result
    anydf = DataFrame(a=["x", "y"], b=[1.0, "str"], c=[1, 2], d=rand(Bool, 2))

    anydf_pd = permutedims(anydf, :a)
    @test size(anydf_pd, 1) == ncol(anydf) - 1
    @test size(anydf_pd, 2) == nrow(anydf) + 1
    @test names(anydf_pd) == ["a", "x", "y"]

    anynames = names(anydf)[2:end]
    for i in 1:nrow(anydf_pd)
        colname = anynames[i]
        @test Vector(anydf_pd[i, :]) == [colname; anydf[!, colname]]
    end
    @test eltype(anydf_pd.x) == Any
    @test eltype(anydf_pd.y) == Any

    # Repeated header values need `makeunique=true`
    dupdf = DataFrame(a=fill("x", 10), b=rand(10), c=rand(Int, 10), d=rand(Bool, 10))

    dup_names = vcat(["a", "x"], ["x_$i" for i in 1:9])
    @test_throws ArgumentError permutedims(dupdf, 1)
    @test names(permutedims(dupdf, 1, makeunique=true)) == dup_names
    @test_throws ArgumentError permutedims(dupdf[!, [:a]], 1) # single column branch
    @test names(permutedims(dupdf[!, [:a]], 1, makeunique=true)) == dup_names

    srcdf = DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1., missing],
                      e=["x", "y"], f=[:x, :y], # valid src
                      g=[missing, "y"], h=Union{Missing, String}["x", "y"] # invalid src
                      )

    @test permutedims(srcdf[!, [:a, :b, :c, :e]], :e) ==
          permutedims(srcdf[!, [:e, :a, :b, :c]], 1) ==
          permutedims(srcdf[!, [:a, :b, :c, :f]], :f, :e)
    # A data frame holding only the header column can be permuted
    @test permutedims(srcdf[!, [:e]], 1) == DataFrame(e=String[], x=[], y=[])
    # A Float64 column cannot serve as the header
    @test_throws ArgumentError permutedims(srcdf[!, [:a, :b, :c]], 1)
    @test_throws ArgumentError permutedims(DataFrame(a=Float64[], b=Float64[]), 1)
    # Neither can columns whose element type allows `missing`
    @test_throws ArgumentError permutedims(srcdf[!, [:g, :a, :b, :c]], 1)
    @test_throws ArgumentError permutedims(srcdf[!, [:h, :a, :b]], 1)
    # A data frame without columns cannot be permuted ...
    @test_throws BoundsError permutedims(DataFrame(), 1)
    # ... but one with columns and zero rows can
    @test permutedims(DataFrame(a=String[], b=Float64[]), 1) == DataFrame(a=["b"])
end

end # module

0 comments on commit 448ad3b

Please sign in to comment.