make unstack always use order of first appearance

JuliaData · Oct 30, 2020 · 35b1f0a · 35b1f0a
1 parent 58cc578
commit 35b1f0a
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 31 deletions.
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -15,7 +15,6 @@ If `view=true` then return a stacked view of a data frame (long format).
 The result is a view because the columns are special `AbstractVectors`
 that return views into the original data frame.
 
-
 # Arguments
 - `df` : the AbstractDataFrame to be stacked
 - `measure_vars` : the columns to be stacked (the measurement variables),
@@ -207,10 +206,7 @@ end
 
 Unstack data frame `df`, i.e. convert it from long to wide format.
 
-Row and column keys will be ordered in the order of their first appearance except
-when they are stored in an `AbstractVector` which supports `DataAPI.refpool`
-(two most common cases are `CategoricalVector` and `PooledVector`),
-in which case the odrer follows the order of values in this pool.
+Row and column keys will be ordered in the order of their first appearance.
 
 # Positional arguments
 - `df` : the AbstractDataFrame to be unstacked
@@ -380,7 +376,7 @@ function find_group_row(gdf::GroupedDataFrame)
         end
         i += 1
     end
-    return rows # return row index of first occurrence of each group in gdf
+    return rows # return row index of first occurrence of each group in gdf.groups
 end
 
 function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
@@ -389,13 +385,14 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                   renamecols::Function,
                   allowmissing::Bool, allowduplicates::Bool)
     rowref = g_rowkey.groups
-    df1 = df[find_group_row(g_rowkey), g_rowkey.cols]
+    row_group_row_idxs = find_group_row(g_rowkey)
     Nrow = length(g_rowkey)
 
     @assert groupcols(g_colkey) == _names(df)[colkey:colkey]
     colref = g_colkey.groups
     Ncol = length(g_colkey)
-    colref_map = df[find_group_row(g_colkey), colkey]
+    col_group_row_idxs = find_group_row(g_colkey)
+    colref_map = df[col_group_row_idxs, colkey]
 
     if any(ismissing, colref_map) && !allowmissing
         throw(ArgumentError("Missing value in variable :$(_names(df)[colkey])." *
@@ -415,10 +412,27 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
         unstacked_val[col_id][row_id] = val
         mask_filled[row_id, col_id] = true
     end
+
     # note that Symbol.(renamecols.(colref_map)) must produce unique column names
     # and names between df1 and df2 must be unique
+    df1 = df[row_group_row_idxs, g_rowkey.cols]
     df2 = DataFrame(unstacked_val, Symbol.(renamecols.(colref_map)), copycols=false)
-    hcat(df1, df2, copycols=false)
+
+    @assert length(col_group_row_idxs) == ncol(df2)
+    # avoid reordering when col_group_row_idxs was already ordered
+    if !issorted(col_group_row_idxs)
+        df2 = df2[!, sortperm(col_group_row_idxs)]
+    end
+
+    res_df = hcat(df1, df2, copycols=false)
+
+    @assert length(row_group_row_idxs) == nrow(res_df)
+    # avoid reordering when row_group_row_idxs was already ordered
+    if !issorted(row_group_row_idxs)
+        res_df = res_df[sortperm(row_group_row_idxs), :]
+    end
+
+    return res_df
 end
 
 """

diff --git a/test/reshape.jl b/test/reshape.jl
@@ -16,10 +16,10 @@ const ≅ = isequal
     @test levels(df[!, 2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[!, 2] levels
     #Unstack without specifying a row column
     df3 = unstack(df, :Key, :Value)
-    #The expected output, XXX level should be dropped as it has no rows with this key
+    #The expected output is in order of appearance
     df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"],
-                    Color = Union{String, Missing}["Red", "Grey"],
-                    Mass = Union{String, Missing}["12 g", "18 g"])
+                    Mass = Union{String, Missing}["12 g", "18 g"],
+                    Color = Union{String, Missing}["Red", "Grey"])
     @test df2 ≅ df4
     @test typeof(df2[!, :Fish]) <: CategoricalVector{Union{String, Missing}}
     # first column stays as CategoricalArray in df3
@@ -39,8 +39,8 @@ const ≅ = isequal
     df2 = unstack(df, :Fish, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
     df3 = unstack(df, :Key, :Value, renamecols=x->string("_", uppercase(x), "_"))
     df4 = DataFrame(Fish = Union{String, Missing}["Bob", "Batman"],
-                    _COLOR_ = Union{String, Missing}["Red", "Grey"],
-                    _MASS_ = Union{String, Missing}["12 g", "18 g"])
+                    _MASS_ = Union{String, Missing}["12 g", "18 g"],
+                    _COLOR_ = Union{String, Missing}["Red", "Grey"])
     @test df2 == df4
     @test df3 == df4
 
@@ -498,36 +498,35 @@ end
     @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val) ==
           DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
 
-    # an exercise on current unstack invariants
+    # make sure we always use order of appearance
     Random.seed!(1234)
     for i in 1:16
         df = df[Random.shuffle(1:9), :]
-        @test unstack(df, :id, :var, :val)[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
+        wide1 = unstack(df, :id, :var, :val)
+        wide2 = unstack(df, [:id, :id2], :var, :val)
+        wide3 = unstack(df, :var, :val)
+        @test wide1[sortperm(unique(df.id)), [1; 1 .+ sortperm(unique(df.var))]] ==
               DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
-        @test unstack(df, [:id, :id2], :var, :val) == unstack(df, :var, :val)
-        @test unstack(df, :var, :val)[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
+        @test wide2[sortperm(unique(df.id)), [1:2; 2 .+ sortperm(unique(df.var))]] ==
               DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test wide2 == wide3
 
         df2 = copy(df)
         df2.id = PooledArray(df.id)
         df2.var = PooledArray(df.var)
-        @test unstack(df2, :id, :var, :val)[sortperm(df2.id.pool), [1; 1 .+ sortperm(df2.var.pool)]] ==
-              DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
-        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val)
-        @test unstack(df2, :var, :val)[sortperm(df2.id.pool), [1:2; 2 .+ sortperm(df2.var.pool)]] ==
-              DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test unstack(df2, :id, :var, :val) == wide1
+        @test unstack(df2, [:id, :id2], :var, :val) == wide2
+        @test unstack(df2, :var, :val) == wide3
 
         df2 = categorical(df, 1:3)
-        @test unstack(df2, :id, :var, :val) ==
-              DataFrame(id=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
-        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
-              DataFrame(id=1:3, id2=1:3, a=1:3:7, b=2:3:8, c=3:3:9)
+        @test unstack(df2, :id, :var, :val) == wide1
+        @test unstack(df2, [:id, :id2], :var, :val) == wide2
+        @test unstack(df2, :var, :val) == wide3
         levels!(df2.id, [10, 2, 11, 3, 1, 12])
         levels!(df2.var, ['x', 'b', 'y', 'c', 'a', 'z'])
-        @test unstack(df2, :id, :var, :val) ==
-              DataFrame(id=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
-        @test unstack(df2, [:id, :id2], :var, :val) == unstack(df2, :var, :val) ==
-              DataFrame(id=1:3, id2=1:3, b=2:3:8, c=3:3:9, a=1:3:7)[[2,3,1], :]
+        @test unstack(df2, :id, :var, :val) == wide1
+        @test unstack(df2, [:id, :id2], :var, :val) == wide2
+        @test unstack(df2, :var, :val) == wide3
     end
 
     df = DataFrame(id=repeat(1:3, inner=3),