-
Notifications
You must be signed in to change notification settings - Fork 11
Stop auto-promoting column-types #30
Changes from 1 commit
8db2821
4a939fe
f5a53a1
2c95f13
412ceaa
f142df5
cc95658
06dc914
c4e218e
e954226
ed8a515
1636a0c
91233d3
7462612
9b65533
b643ff8
4c68452
7310681
de280ba
88b20ca
be1cacd
19ffb58
3f2cd63
9c3ad21
1e7d26e
e39ba63
04cb9ee
5d70685
7859132
6496acf
f47810f
259ceef
26e87ac
e0f7982
d65385e
b0c29b4
95a6f31
7df712f
27da644
5fa8fa0
a1d58f9
db87443
9c66a1e
887346b
00c08cc
020c88e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -663,15 +663,15 @@ unique!(dt) # modifies dt | |
""" | ||
(unique, unique!) | ||
|
||
function nonuniquekey(dt::AbstractDataTable) | ||
# Here's another (probably a lot faster) way to do `nonunique` | ||
# by grouping on all columns. It will fail if columns cannot be | ||
# made into CategoricalVector's. | ||
gd = groupby(dt, _names(dt)) | ||
idx = [1:length(gd.idx)][gd.idx][gd.starts] | ||
res = fill(true, nrow(dt)) | ||
res[idx] = false | ||
res | ||
function nonuniquekey(dt::AbstractDataTable) | ||
# Here's another (probably a lot faster) way to do `nonunique` | ||
# by grouping on all columns. It will fail if columns cannot be | ||
# made into CategoricalVector's. | ||
gd = groupby(dt, _names(dt)) | ||
idx = [1:length(gd.idx)][gd.idx][gd.starts] | ||
res = fill(true, nrow(dt)) | ||
res[idx] = false | ||
res | ||
end | ||
|
||
# Count the number of missing values in every column of an AbstractDataTable. | ||
|
@@ -748,18 +748,33 @@ function Base.vcat(dts::AbstractDataTable...) | |
if length(uniqueheaders) == 0 | ||
return DataTable() | ||
end | ||
coldiff = setdiff(union(uniqueheaders...), intersect(uniqueheaders...)) | ||
if length(uniqueheaders) > 1 | ||
unionunique = union(uniqueheaders...) | ||
coldiff = setdiff(unionunique, intersect(uniqueheaders...)) | ||
if !isempty(coldiff) | ||
headerlengths = length.(allheaders) | ||
minheaderloci = find(headerlengths .== minimum(headerlengths)) | ||
throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))")) | ||
# if any datatables are a full superset of names, skip them | ||
filter!(u -> Set(u) != Set(unionunique), uniqueheaders) | ||
estrings = Vector{String}(length(uniqueheaders)) | ||
for (i, u) in enumerate(uniqueheaders) | ||
matchingloci = find(h -> u == h, allheaders) | ||
headerdiff = filter(x -> !in(x, u), coldiff) | ||
headerdiff = length(headerdiff) > 1 ? | ||
join(string.(headerdiff[1:end-1]), ", ") * " and " * string(headerdiff[end]) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather align everything on Anyway, no need for this length check nor to handle the last element manually: as I said, just use |
||
string(headerdiff[end]) | ||
matchingloci = length(matchingloci) > 1 ? | ||
join(string.(matchingloci[1:end-1]), ", ") * " and " * string(matchingloci[end]) : | ||
string(matchingloci[end]) | ||
estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci" | ||
end | ||
throw(ArgumentError(join(estrings, ", and "))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also use |
||
else | ||
estrings = Vector{String}(length(uniqueheaders)) | ||
for (i, u) in enumerate(uniqueheaders) | ||
indices = find(a -> a == u, allheaders) | ||
indices = join(string.(indices), ", ") | ||
estrings[i] = "column order of argument(s) ($indices)" | ||
indices = length(indices) > 1 ? | ||
join(string.(indices[1:end-1]), ", ") * " and " * string(indices[end]) : | ||
string(indices[end]) | ||
estrings[i] = "column order of argument(s) $indices" | ||
end | ||
throw(ArgumentError(join(estrings, " != "))) | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -122,24 +122,69 @@ module TestCat | |
@testset "vcat errors" begin | ||
dt1 = DataTable(A = 1:3, B = 1:3) | ||
dt2 = DataTable(A = 1:3) | ||
@test_throws ArgumentError vcat(dt1, dt2) | ||
dt2 = DataTable(A = 1:3, C = 1:3) | ||
@test_throws ArgumentError vcat(dt1, dt2) | ||
# right missing 1 column | ||
err = @test_throws ArgumentError vcat(dt1, dt2) | ||
@test err.value.msg == "column(s) B are missing from argument(s) 2" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I really like that this is possible (capturing the thrown error in a variable so the message string can be tested). Thanks for the suggestion here. |
||
# left missing 1 column | ||
err = @test_throws ArgumentError vcat(dt2, dt1) | ||
@test err.value.msg == "column(s) B are missing from argument(s) 1" | ||
# multiple missing 1 column | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2) | ||
@test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" | ||
# argument missing >1columns | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing space between ">1" and "columns" in the comment above. |
||
dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) | ||
err = @test_throws ArgumentError vcat(dt1, dt2) | ||
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2" | ||
# >1 arguments missing >1 columns | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2) | ||
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5" | ||
# out of order | ||
dt2 = dt1[reverse(names(dt1))] | ||
err = @test_throws ArgumentError vcat(dt1, dt2) | ||
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2" | ||
# left >1 | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt2) | ||
@test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3" | ||
# right >1 | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt2) | ||
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3" | ||
# left and right >1 | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2) | ||
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6" | ||
# >2 groups out of order | ||
srand(1) | ||
dt3 = dt1[shuffle(names(dt1))] | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3) | ||
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10" | ||
# missing columns throws error before out of order columns | ||
dt1 = DataTable(A = 1, B = 1) | ||
dt2 = DataTable(B = 1, A = 1) | ||
@test_throws ArgumentError vcat(dt1, dt2) | ||
@test_throws ArgumentError vcat(dt1, dt1, dt1, dt1, dt2, dt2, dt2, dt2) | ||
dt3 = DataTable(A = 1, B = 1, C = 1) | ||
@test_throws ArgumentError vcat(dt1, dt3) | ||
@test_throws ArgumentError vcat(dt1, dt1, dt3, dt3) | ||
@test_throws ArgumentError vcat(dt2, dt3) | ||
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1) | ||
@test_throws ArgumentError vcat(dt1, dt4) | ||
@test_throws ArgumentError vcat(dt2, dt4) | ||
@test_throws ArgumentError vcat(dt3, dt4) | ||
dt5 = hcat(dt4, dt4, dt4, dt4) | ||
@test_throws ArgumentError vcat(dt3, dt5) | ||
dt5r = names!(copy(dt5), reverse(names(dt5))) | ||
@test_throws ArgumentError vcat(dt5, dt5r) | ||
dt2 = DataTable(A = 1) | ||
dt3 = DataTable(B = 1, A = 1) | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt3) | ||
@test err.value.msg == "column(s) B are missing from argument(s) 2" | ||
# unique columns for both sides | ||
dt1 = DataTable(A = 1, B = 1, C = 1, D = 1) | ||
dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1) | ||
err = @test_throws ArgumentError vcat(dt1, dt2) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" | ||
dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt3) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" | ||
# dt4 is a superset of names found in all other datatables and won't be shown in error | ||
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" | ||
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" | ||
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) | ||
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, and column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" | ||
end | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For each unique set of column names, I'm throwing an error that tells which columns are missing from that set. If any of the inputs to vcat has all of the column names, then nothing is missing from it and there is nothing to show, so those inputs are dropped from the error output.