Merge 933d33c into 6d8f924

JuliaData · Feb 28, 2018 · 8f23758 · 8f23758
2 parents 6d8f924 + 933d33c
commit 8f23758
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 61 deletions.
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -784,8 +784,11 @@ end
 """
     vcat(dfs::AbstractDataFrame...)
 
-Vertically concatenate `AbstractDataFrames` that have the same column names in
-the same order.
+Vertically concatenate `AbstractDataFrames`.
+
+Column names in all passed data frames must be the same, but they can be in a
+different order. In such cases the column order of the first passed
+data frame is used.
 
 # Example
 ```jldoctest
@@ -801,55 +804,60 @@ julia> vcat(df1, df2)
 │ 4   │ 4 │ 4 │
 │ 5   │ 5 │ 5 │
 │ 6   │ 6 │ 6 │
+
+
+julia> df1 = DataFrame(A=1:3, B=1:3);
+julia> df2 = DataFrame(B=4:6, A=4:6);
+julia> vcat(df1, df2)
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1   │ 1 │ 1 │
+│ 2   │ 2 │ 2 │
+│ 3   │ 3 │ 3 │
+│ 4   │ 4 │ 4 │
+│ 5   │ 5 │ 5 │
+│ 6   │ 6 │ 6 │
 ```
 """
 Base.vcat(df::AbstractDataFrame) = df
 Base.vcat(dfs::AbstractDataFrame...) = _vcat(collect(dfs))
 function _vcat(dfs::AbstractVector{<:AbstractDataFrame})
     isempty(dfs) && return DataFrame()
     allheaders = map(names, dfs)
-    if all(h -> length(h) == 0, allheaders)
-        return DataFrame()
-    end
     uniqueheaders = unique(allheaders)
-    if length(uniqueheaders) > 1
-        unionunique = union(uniqueheaders...)
-        coldiff = setdiff(unionunique, intersect(uniqueheaders...))
-        if !isempty(coldiff)
-            # if any DataFrames are a full superset of names, skip them
-            filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
-            estrings = Vector{String}(length(uniqueheaders))
-            for (i, u) in enumerate(uniqueheaders)
-                matching = find(h -> u == h, allheaders)
-                headerdiff = setdiff(coldiff, u)
-                cols = join(headerdiff, ", ", " and ")
-                args = join(matching, ", ", " and ")
-                estrings[i] = "column(s) $cols are missing from argument(s) $args"
-            end
-            throw(ArgumentError(join(estrings, ", ", ", and ")))
-        else
-            estrings = Vector{String}(length(uniqueheaders))
-            for (i, u) in enumerate(uniqueheaders)
-                indices = find(a -> a == u, allheaders)
-                estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))"
-            end
-            throw(ArgumentError(join(estrings, " != ")))
+    unionunique = union(uniqueheaders...)
+    intersectunique = intersect(uniqueheaders...)
+    coldiff = setdiff(unionunique, intersectunique)
+
+    if !isempty(coldiff)
+        # if any DataFrames are a full superset of names, skip them
+        filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
+        estrings = Vector{String}(length(uniqueheaders))
+        for (i, u) in enumerate(uniqueheaders)
+            matching = find(h -> u == h, allheaders)
+            headerdiff = setdiff(coldiff, u)
+            cols = join(headerdiff, ", ", " and ")
+            args = join(matching, ", ", " and ")
+            estrings[i] = "column(s) $cols are missing from argument(s) $args"
         end
-    else
-        header = uniqueheaders[1]
-        cols = Vector{Any}(length(header))
-        for i in 1:length(cols)
-            data = [df[i] for df in dfs]
-            lens = map(length, data)
-            cols[i] = promote_col_type(data...)(sum(lens))
-            offset = 1
-            for j in 1:length(data)
-                copy!(cols[i], offset, data[j])
-                offset += lens[j]
-            end
+        throw(ArgumentError(join(estrings, ", ", ", and ")))
+    end
+
+    header = allheaders[1]
+    length(header) == 0 && return DataFrame()
+    cols = Vector{Any}(length(header))
+    for (i, name) in enumerate(header)
+        data = [df[name] for df in dfs]
+        lens = map(length, data)
+        cols[i] = promote_col_type(data...)(sum(lens))
+        offset = 1
+        for j in 1:length(data)
+            copy!(cols[i], offset, data[j])
+            offset += lens[j]
         end
-        return DataFrame(cols, header)
     end
+    return DataFrame(cols, header)
 end
 
 ##############################################################################

diff --git a/test/cat.jl b/test/cat.jl
@@ -123,11 +123,15 @@ module TestCat
     @test size(vcat(df, df, df)) == (size(df, 1) * 3, size(df, 2))
 
     alt_df = deepcopy(df)
-    vcat(df, alt_df)
+    @test vcat(df, alt_df) == DataFrame([[3.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0],
+                                         [2.0,2.0,1.0,3.0,2.0,2.0,1.0,3.0],
+                                         [2,2,2,3,2,2,2,3]])
 
     # Don't fail on non-matching types
     df[1] = zeros(Int, nrow(df))
-    vcat(df, alt_df)
+    @test vcat(df, alt_df) == DataFrame([[0.0,0.0,0.0,0.0,3.0,2.0,3.0,3.0],
+                                         [2.0,2.0,1.0,3.0,2.0,2.0,1.0,3.0],
+                                         [2,2,2,3,2,2,2,3]])
 
     dfr = vcat(df4, df4)
     @test size(dfr, 1) == 8
@@ -204,6 +208,22 @@ module TestCat
         @test typeof.(df.columns) == [Vector{Bool}]
     end
 
+    @testset "vcat out of order" begin
+        df1 = DataFrame(A = 1:3, B = 1:3, C = 1:3)
+        df2 = df1[reverse(names(df1))]
+        @test size(vcat(df1, df2)) == (6, 3)
+        @test size(vcat(df1, df1, df2)) == (9, 3)
+        @test size(vcat(df1, df2, df2)) == (9, 3)
+        @test size(vcat(df2, df1, df2)) == (9, 3)
+        @test size(vcat(df1, df1, df1, df2, df2, df2)) == (18, 3)
+        df3 = df1[[1, 3, 2]]
+        @test size(vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df3)) == (30, 3)
+        df1 = DataFrame(A = 1, B = 2)
+        df2 = DataFrame(B = 12, A = 11)
+        df3 = DataFrame(A = [1, 11], B = [2, 12])
+        @test [df1; df2] == df3
+    end
+
     @testset "vcat errors" begin
         err = @test_throws ArgumentError vcat(DataFrame(), DataFrame(), DataFrame(x=[]))
         @test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
@@ -227,25 +247,7 @@ module TestCat
         # >1 arguments missing >1 columns
         err = @test_throws ArgumentError vcat(df1, df2, df2, df2, df2)
         @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5"
-        # out of order
-        df2 = df1[reverse(names(df1))]
-        err = @test_throws ArgumentError vcat(df1, df2)
-        @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2"
-        # first group >1 arguments
-        err = @test_throws ArgumentError vcat(df1, df1, df2)
-        @test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3"
-        # second group >1 arguments
-        err = @test_throws ArgumentError vcat(df1, df2, df2)
-        @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3"
-        # first and second groups >1 argument
-        err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2)
-        @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6"
-        # >2 groups out of order
-        srand(1)
-        df3 = df1[shuffle(names(df1))]
-        err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df3)
-        @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10"
-        # missing columns throws error before out of order columns
+        # missing columns throw an error
         df1 = DataFrame(A = 1, B = 1)
         df2 = DataFrame(A = 1)
         df3 = DataFrame(B = 1, A = 1)