Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

Stop auto-promoting column-types #30

Closed
wants to merge 46 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
8db2821
add changes
cjprybol Mar 10, 2017
4a939fe
make vcat error more informative
cjprybol Mar 13, 2017
f5a53a1
add docstring for vcat
cjprybol Mar 13, 2017
2c95f13
incorporate edits suggested during review
cjprybol Mar 13, 2017
412ceaa
_unsafe_get -> NullableArrays.unsafe_get
cjprybol Mar 13, 2017
f142df5
Merge branch 'master' into cjp/rebaseretaintype
cjprybol Mar 13, 2017
cc95658
fix new tests from master
cjprybol Mar 13, 2017
06dc914
remove RepeatedVector, StackedVector, unstackdt, meltdt
cjprybol Mar 14, 2017
c4e218e
DataFrames doensn't reshape 2d Arrays -> Vectors so don't do it here
cjprybol Mar 14, 2017
e954226
minor cleanup
cjprybol Mar 14, 2017
ed8a515
change (de)nullify back to copy and cleanup docstrings
cjprybol Mar 15, 2017
1636a0c
NullableArrays.unsafe_get -> compat(unsafe_get)
cjprybol Mar 15, 2017
91233d3
default to NullableArray for joins that may introduce missing data
cjprybol Mar 15, 2017
7462612
align comments
cjprybol Mar 15, 2017
9b65533
lots of edits
cjprybol Mar 15, 2017
b643ff8
tests and no need for compat
cjprybol Mar 15, 2017
4c68452
spacing mistakes
cjprybol Mar 15, 2017
7310681
throw errors on 1-d matrices and change confusing variable name
cjprybol Mar 15, 2017
de280ba
add back check to differentiate scalars from AbstractArrays
cjprybol Mar 15, 2017
88b20ca
save work
cjprybol Mar 16, 2017
be1cacd
save progress, switch to test master
cjprybol Mar 16, 2017
19ffb58
join is ready and tests in place. right join still broken
cjprybol Mar 16, 2017
3f2cd63
fix right join
cjprybol Mar 17, 2017
9c3ad21
update join help message and add note about temp fix
cjprybol Mar 17, 2017
1e7d26e
indentation
cjprybol Mar 17, 2017
e39ba63
changes
cjprybol Mar 17, 2017
04cb9ee
spacing
cjprybol Mar 17, 2017
5d70685
put old unstack back and stabilize types, ordering
cjprybol Mar 18, 2017
7859132
fix bad copy and paste spacing and condense scalar recycling code
cjprybol Mar 18, 2017
6496acf
update vcat error
cjprybol Mar 18, 2017
f47810f
unused function, another test, remove unused variable
cjprybol Mar 18, 2017
259ceef
revert function removal to appease new code failures?
cjprybol Mar 18, 2017
26e87ac
fix v0.5 issue
cjprybol Mar 18, 2017
e0f7982
update vcat testing and change similar_nullable constructor call
cjprybol Mar 18, 2017
d65385e
:Merge branch 'cjp/retaintype' of github.com:cjprybol/DataTables.jl i…
cjprybol Mar 18, 2017
b0c29b4
remove old error message from docstring
cjprybol Mar 18, 2017
95a6f31
and change docstring to doctest
cjprybol Mar 18, 2017
7df712f
change similar_nullable back and fix unrelated copy paste space removal
cjprybol Mar 18, 2017
27da644
add missing rightperm reordering and properly unify hcat! functions
cjprybol Mar 18, 2017
5fa8fa0
accidental spacing changes
cjprybol Mar 18, 2017
a1d58f9
forgot one spacing change
cjprybol Mar 18, 2017
db87443
change deprecations
cjprybol Mar 18, 2017
9c66a1e
add back extra spaces
cjprybol Mar 18, 2017
887346b
bump catarrays version, remove manual resetting of levels in unstack
cjprybol Mar 20, 2017
00c08cc
Merge branch 'master' into cjp/retaintype
cjprybol Mar 24, 2017
020c88e
only use "and" when joining the last estring
cjprybol Mar 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 30 additions & 15 deletions src/abstractdatatable/abstractdatatable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -663,15 +663,15 @@ unique!(dt) # modifies dt
"""
(unique, unique!)

function nonuniquekey(dt::AbstractDataTable)
# Here's another (probably a lot faster) way to do `nonunique`
# by grouping on all columns. It will fail if columns cannot be
# made into CategoricalVector's.
gd = groupby(dt, _names(dt))
idx = [1:length(gd.idx)][gd.idx][gd.starts]
res = fill(true, nrow(dt))
res[idx] = false
res
function nonuniquekey(dt::AbstractDataTable)
# Here's another (probably a lot faster) way to do `nonunique`
# by grouping on all columns. It will fail if columns cannot be
# made into CategoricalVector's.
gd = groupby(dt, _names(dt))
idx = [1:length(gd.idx)][gd.idx][gd.starts]
res = fill(true, nrow(dt))
res[idx] = false
res
end

# Count the number of missing values in every column of an AbstractDataTable.
Expand Down Expand Up @@ -748,18 +748,33 @@ function Base.vcat(dts::AbstractDataTable...)
if length(uniqueheaders) == 0
return DataTable()
end
coldiff = setdiff(union(uniqueheaders...), intersect(uniqueheaders...))
if length(uniqueheaders) > 1
unionunique = union(uniqueheaders...)
coldiff = setdiff(unionunique, intersect(uniqueheaders...))
if !isempty(coldiff)
headerlengths = length.(allheaders)
minheaderloci = find(headerlengths .== minimum(headerlengths))
throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))"))
# if any datatables are a full superset of names, skip them
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for each unique set of column names I'm throwing the error to tell which columns are missing from each of the sets. If any of the inputs to vcat have all of the column names then we can't show which are missing, so they're dropped from the error output

filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
matchingloci = find(h -> u == h, allheaders)
headerdiff = filter(x -> !in(x, u), coldiff)
headerdiff = length(headerdiff) > 1 ?
join(string.(headerdiff[1:end-1]), ", ") * " and " * string(headerdiff[end]) :
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather align everything on length, without further indentation. Anyway if you keep the indentation, it should be four spaces. Same below.

Anyway, no need for this length check nor to handle the last element manually: as I said, just use join's last argument.

string(headerdiff[end])
matchingloci = length(matchingloci) > 1 ?
join(string.(matchingloci[1:end-1]), ", ") * " and " * string(matchingloci[end]) :
string(matchingloci[end])
estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci"
end
throw(ArgumentError(join(estrings, ", and ")))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also use last here for "and".

else
estrings = Vector{String}(length(uniqueheaders))
for (i, u) in enumerate(uniqueheaders)
indices = find(a -> a == u, allheaders)
indices = join(string.(indices), ", ")
estrings[i] = "column order of argument(s) ($indices)"
indices = length(indices) > 1 ?
join(string.(indices[1:end-1]), ", ") * " and " * string(indices[end]) :
string(indices[end])
estrings[i] = "column order of argument(s) $indices"
end
throw(ArgumentError(join(estrings, " != ")))
end
Expand Down
5 changes: 3 additions & 2 deletions src/abstractdatatable/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,9 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int)
payload[j][i] = valuecol[k]
end
end
col = typeof(similar_nullable(dt[rowkey], 1))(levels(refkeycol))
insert!(payload, 1, col, _names(dt)[rowkey])
levs = levels(refkeycol)
col = similar_nullable(dt[rowkey], length(levs))
insert!(payload, 1, copy!(col, levs), _names(dt)[rowkey])
end
unstack(dt::AbstractDataTable, rowkey, colkey, value) =
unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value])
Expand Down
81 changes: 63 additions & 18 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,24 +122,69 @@ module TestCat
@testset "vcat errors" begin
dt1 = DataTable(A = 1:3, B = 1:3)
dt2 = DataTable(A = 1:3)
@test_throws ArgumentError vcat(dt1, dt2)
dt2 = DataTable(A = 1:3, C = 1:3)
@test_throws ArgumentError vcat(dt1, dt2)
# right missing 1 column
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) B are missing from argument(s) 2"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like that this is possible (catching the error output in a variable to test the string). Thanks for the suggestion here

# left missing 1 column
err = @test_throws ArgumentError vcat(dt2, dt1)
@test err.value.msg == "column(s) B are missing from argument(s) 1"
# multiple missing 1 column
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2)
@test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6"
# argument missing >1columns
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space.

dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3)
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2"
# >1 arguments missing >1 columns
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2)
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5"
# out of order
dt2 = dt1[reverse(names(dt1))]
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2"
# left >1
err = @test_throws ArgumentError vcat(dt1, dt1, dt2)
@test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3"
# right >1
err = @test_throws ArgumentError vcat(dt1, dt2, dt2)
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3"
# left and right >1
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6"
# >2 groups out of order
srand(1)
dt3 = dt1[shuffle(names(dt1))]
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3)
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10"
# missing columns throws error before out of order columns
dt1 = DataTable(A = 1, B = 1)
dt2 = DataTable(B = 1, A = 1)
@test_throws ArgumentError vcat(dt1, dt2)
@test_throws ArgumentError vcat(dt1, dt1, dt1, dt1, dt2, dt2, dt2, dt2)
dt3 = DataTable(A = 1, B = 1, C = 1)
@test_throws ArgumentError vcat(dt1, dt3)
@test_throws ArgumentError vcat(dt1, dt1, dt3, dt3)
@test_throws ArgumentError vcat(dt2, dt3)
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1)
@test_throws ArgumentError vcat(dt1, dt4)
@test_throws ArgumentError vcat(dt2, dt4)
@test_throws ArgumentError vcat(dt3, dt4)
dt5 = hcat(dt4, dt4, dt4, dt4)
@test_throws ArgumentError vcat(dt3, dt5)
dt5r = names!(copy(dt5), reverse(names(dt5)))
@test_throws ArgumentError vcat(dt5, dt5r)
dt2 = DataTable(A = 1)
dt3 = DataTable(B = 1, A = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
@test err.value.msg == "column(s) B are missing from argument(s) 2"
# unique columns for both sides
dt1 = DataTable(A = 1, B = 1, C = 1, D = 1)
dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1)
err = @test_throws ArgumentError vcat(dt1, dt2)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4"
dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
# dt4 is a superset of names found in all other datatables and won't be shown in error
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1)
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, and column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11"
end
end