Skip to content

Commit

Permalink
make unstack always use order of first appearance
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Oct 30, 2020
1 parent 58cc578 commit 35b1f0a
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 31 deletions.
32 changes: 23 additions & 9 deletions src/abstractdataframe/reshape.jl
Expand Up @@ -15,7 +15,6 @@ If `view=true` then return a stacked view of a data frame (long format).
The result is a view because the columns are special `AbstractVectors`
that return views into the original data frame.
# Arguments
- `df` : the AbstractDataFrame to be stacked
- `measure_vars` : the columns to be stacked (the measurement variables),
Expand Down Expand Up @@ -207,10 +206,7 @@ end
Unstack data frame `df`, i.e. convert it from long to wide format.
Row and column keys will be ordered in the order of their first appearance except
when they are stored in an `AbstractVector` which supports `DataAPI.refpool`
(two most common cases are `CategoricalVector` and `PooledVector`),
in which case the order follows the order of values in this pool.
Row and column keys will be ordered in the order of their first appearance.
# Positional arguments
- `df` : the AbstractDataFrame to be unstacked
Expand Down Expand Up @@ -380,7 +376,7 @@ function find_group_row(gdf::GroupedDataFrame)
end
i += 1
end
return rows # return row index of first occurrence of each group in gdf
return rows # return row index of first occurrence of each group in gdf.groups
end

function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
Expand All @@ -389,13 +385,14 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
renamecols::Function,
allowmissing::Bool, allowduplicates::Bool)
rowref = g_rowkey.groups
df1 = df[find_group_row(g_rowkey), g_rowkey.cols]
row_group_row_idxs = find_group_row(g_rowkey)
Nrow = length(g_rowkey)

@assert groupcols(g_colkey) == _names(df)[colkey:colkey]
colref = g_colkey.groups
Ncol = length(g_colkey)
colref_map = df[find_group_row(g_colkey), colkey]
col_group_row_idxs = find_group_row(g_colkey)
colref_map = df[col_group_row_idxs, colkey]

if any(ismissing, colref_map) && !allowmissing
throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." *
Expand All @@ -415,10 +412,27 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
unstacked_val[col_id][row_id] = val
mask_filled[row_id, col_id] = true
end

# note that Symbol.(renamecols.(colref_map)) must produce unique column names
# and names between df1 and df2 must be unique
df1 = df[row_group_row_idxs, g_rowkey.cols]
df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false)
hcat(df1, df2, copycols=false)

@assert length(col_group_row_idxs) == ncol(df2)
# avoid reordering when col_group_row_idxs was already ordered
if !issorted(col_group_row_idxs)
df2 = df2[!, sortperm(col_group_row_idxs)]
end

res_df = hcat(df1, df2, copycols=false)

@assert length(row_group_row_idxs) == nrow(res_df)
# avoid reordering when row_group_row_idxs was already ordered
if !issorted(row_group_row_idxs)
res_df = res_df[sortperm(row_group_row_idxs), :]
end

return res_df
end

"""
Expand Down
43 changes: 21 additions & 22 deletions test/reshape.jl
Expand Up @@ -16,10 +16,10 @@ const ≅ = isequal
@test levels(df[!, 2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[!, 2] levels
#Unstack without specifying a row column
df3 = unstack(df, :Key, :Value)
#The expected output, XXX level should be dropped as it has no rows with this key
#The expected output is in order of appearance
df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"],
Color = Union{String, Missing}["Red", "Grey"],
Mass = Union{String, Missing}["12 g", "18 g"])
Mass = Union{String, Missing}["12 g", "18 g"],
Color = Union{String, Missing}["Red", "Grey"])
@test df2 df4
@test typeof(df2[!, :Fish]) <: CategoricalVector{Union{String, Missing}}
# first column stays as CategoricalArray in df3
Expand All @@ -39,8 +39,8 @@ const ≅ = isequal
df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"],
_COLOR_ = Union{String, Missing}["Red", "Grey"],
_MASS_ = Union{String, Missing}["12 g", "18 g"])
_MASS_ = Union{String, Missing}["12 g", "18 g"],
_COLOR_ = Union{String, Missing}["Red", "Grey"])
@test df2 == df4
@test df3 == df4

Expand Down Expand Up @@ -498,36 +498,35 @@ end
@test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)

# an exercise on current unstack invariants
# make sure we always use order of appearance
Random.seed!(1234)
for i in 1:16
df = df[Random.shuffle(1:9), :]
@test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
wide1 = unstack(df, :id, :var, :val)
wide2 = unstack(df, [:id, :id2], :var, :val)
wide3 = unstack(df, :var, :val)
@test wide1[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val)
@test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
@test wide2[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test wide2 == wide3

df2 = copy(df)
df2.id = PooledArray(df.id)
df2.var = PooledArray(df.var)
@test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val)
@test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, :id, :var, :val) == wide1
@test unstack(df2, [:id, :id2], :var, :val) == wide2
@test unstack(df2, :var, :val) == wide3

df2 = categorical(df, 1:3)
@test unstack(df2, :id, :var, :val) ==
DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
@test unstack(df2, :id, :var, :val) == wide1
@test unstack(df2, [:id, :id2], :var, :val) == wide2
@test unstack(df2, :var, :val) == wide3
levels!(df2.id, [10, 2, 11, 3, 1, 12])
levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z'])
@test unstack(df2, :id, :var, :val) ==
DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
@test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
@test unstack(df2, :id, :var, :val) == wide1
@test unstack(df2, [:id, :id2], :var, :val) == wide2
@test unstack(df2, :var, :val) == wide3
end

df = DataFrame(id=repeat(1:3, inner=3),
Expand Down

0 comments on commit 35b1f0a

Please sign in to comment.