Skip to content

Commit

Permalink
Merge 933d33c into 6d8f924
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Feb 28, 2018
2 parents 6d8f924 + 933d33c commit 8f23758
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 61 deletions.
88 changes: 48 additions & 40 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -784,8 +784,11 @@ end
"""
vcat(dfs::AbstractDataFrame...)
Vertically concatenate `AbstractDataFrames` that have the same column names in
the same order.
Vertically concatenate `AbstractDataFrames`.
Column names in all passed data frames must be the same, but they can be in a
different order. In such cases the order of names in the first passed
`DataFrame` is used.
# Example
```jldoctest
Expand All @@ -801,55 +804,60 @@ julia> vcat(df1, df2)
│ 4 │ 4 │ 4 │
│ 5 │ 5 │ 5 │
│ 6 │ 6 │ 6 │
julia> df1 = DataFrame(A=1:3, B=1:3);
julia> df2 = DataFrame(B=4:6, A=4:6);
julia> vcat(df1, df2)
6×2 DataFrames.DataFrame
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
│ 4 │ 4 │ 4 │
│ 5 │ 5 │ 5 │
│ 6 │ 6 │ 6 │
```
"""
Base.vcat(df::AbstractDataFrame) = df
Base.vcat(dfs::AbstractDataFrame...) = _vcat(collect(dfs))
function _vcat(dfs::AbstractVector{<:AbstractDataFrame})
isempty(dfs) && return DataFrame()
allheaders = map(names, dfs)
if all(h -> length(h) == 0, allheaders)
return DataFrame()
end
uniqueheaders = unique(allheaders)
if length(uniqueheaders) > 1
unionunique = union(uniqueheaders...)
coldiff = setdiff(unionunique, intersect(uniqueheaders...))
if !isempty(coldiff)
# if any DataFrames are a full superset of names, skip them
filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
matching = find(h -> u == h, allheaders)
headerdiff = setdiff(coldiff, u)
cols = join(headerdiff, ", ", " and ")
args = join(matching, ", ", " and ")
estrings[i] = "column(s) $cols are missing from argument(s) $args"
end
throw(ArgumentError(join(estrings, ", ", ", and ")))
else
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
indices = find(a -> a == u, allheaders)
estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))"
end
throw(ArgumentError(join(estrings, " != ")))
unionunique = union(uniqueheaders...)
intersectunique = intersect(uniqueheaders...)
coldiff = setdiff(unionunique, intersectunique)

if !isempty(coldiff)
# if any DataFrames are a full superset of names, skip them
filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
matching = find(h -> u == h, allheaders)
headerdiff = setdiff(coldiff, u)
cols = join(headerdiff, ", ", " and ")
args = join(matching, ", ", " and ")
estrings[i] = "column(s) $cols are missing from argument(s) $args"
end
else
header = uniqueheaders[1]
cols = Vector{Any}(length(header))
for i in 1:length(cols)
data = [df[i] for df in dfs]
lens = map(length, data)
cols[i] = promote_col_type(data...)(sum(lens))
offset = 1
for j in 1:length(data)
copy!(cols[i], offset, data[j])
offset += lens[j]
end
throw(ArgumentError(join(estrings, ", ", ", and ")))
end

header = allheaders[1]
length(header) == 0 && return DataFrame()
cols = Vector{Any}(length(header))
for (i, name) in enumerate(header)
data = [df[name] for df in dfs]
lens = map(length, data)
cols[i] = promote_col_type(data...)(sum(lens))
offset = 1
for j in 1:length(data)
copy!(cols[i], offset, data[j])
offset += lens[j]
end
return DataFrame(cols, header)
end
return DataFrame(cols, header)
end

##############################################################################
Expand Down
44 changes: 23 additions & 21 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,15 @@ module TestCat
@test size(vcat(df, df, df)) == (size(df, 1) * 3, size(df, 2))

alt_df = deepcopy(df)
vcat(df, alt_df)
@test vcat(df, alt_df) == DataFrame([[3.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0],
[2.0,2.0,1.0,3.0,2.0,2.0,1.0,3.0],
[2,2,2,3,2,2,2,3]])

# Don't fail on non-matching types
df[1] = zeros(Int, nrow(df))
vcat(df, alt_df)
@test vcat(df, alt_df) == DataFrame([[0.0,0.0,0.0,0.0,3.0,2.0,3.0,3.0],
[2.0,2.0,1.0,3.0,2.0,2.0,1.0,3.0],
[2,2,2,3,2,2,2,3]])

dfr = vcat(df4, df4)
@test size(dfr, 1) == 8
Expand Down Expand Up @@ -204,6 +208,22 @@ module TestCat
@test typeof.(df.columns) == [Vector{Bool}]
end

# Check that vcat accepts data frames whose column names match but are ordered differently.
@testset "vcat out of order" begin
df1 = DataFrame(A = 1:3, B = 1:3, C = 1:3)
# df2 is df1 indexed by its own names in reverse order
df2 = df1[reverse(names(df1))]
# these cases only check the size of the result for several argument mixes
@test size(vcat(df1, df2)) == (6, 3)
@test size(vcat(df1, df1, df2)) == (9, 3)
@test size(vcat(df1, df2, df2)) == (9, 3)
@test size(vcat(df2, df1, df2)) == (9, 3)
@test size(vcat(df1, df1, df1, df2, df2, df2)) == (18, 3)
# df3 is a third ordering of df1's columns (positions 1, 3, 2)
df3 = df1[[1, 3, 2]]
@test size(vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df3)) == (30, 3)
# value check: df2 lists B before A, and the expected result df3 keeps
# the A, B order of the first argument with values matched by column name
df1 = DataFrame(A = 1, B = 2)
df2 = DataFrame(B = 12, A = 11)
df3 = DataFrame(A = [1, 11], B = [2, 12])
@test [df1; df2] == df3
end

@testset "vcat errors" begin
err = @test_throws ArgumentError vcat(DataFrame(), DataFrame(), DataFrame(x=[]))
@test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
Expand All @@ -227,25 +247,7 @@ module TestCat
# >1 arguments missing >1 columns
err = @test_throws ArgumentError vcat(df1, df2, df2, df2, df2)
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5"
# out of order
df2 = df1[reverse(names(df1))]
err = @test_throws ArgumentError vcat(df1, df2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2"
# first group >1 arguments
err = @test_throws ArgumentError vcat(df1, df1, df2)
@test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3"
# second group >1 arguments
err = @test_throws ArgumentError vcat(df1, df2, df2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3"
# first and second groups >1 argument
err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6"
# >2 groups out of order
srand(1)
df3 = df1[shuffle(names(df1))]
err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df3)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10"
# missing columns throws error before out of order columns
# missing columns throws error
df1 = DataFrame(A = 1, B = 1)
df2 = DataFrame(A = 1)
df3 = DataFrame(B = 1, A = 1)
Expand Down

0 comments on commit 8f23758

Please sign in to comment.