-
Notifications
You must be signed in to change notification settings - Fork 11
Stop auto-promoting column-types #30
Changes from 33 commits
8db2821
4a939fe
f5a53a1
2c95f13
412ceaa
f142df5
cc95658
06dc914
c4e218e
e954226
ed8a515
1636a0c
91233d3
7462612
9b65533
b643ff8
4c68452
7310681
de280ba
88b20ca
be1cacd
19ffb58
3f2cd63
9c3ad21
1e7d26e
e39ba63
04cb9ee
5d70685
7859132
6496acf
f47810f
259ceef
26e87ac
e0f7982
d65385e
b0c29b4
95a6f31
7df712f
27da644
5fa8fa0
a1d58f9
db87443
9c66a1e
887346b
00c08cc
020c88e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,13 +61,13 @@ d = stackdt(iris) | |
|
||
This saves memory. To create the view, several AbstractVectors are defined: | ||
|
||
`:variable` column -- `EachRepeatedVector` | ||
`:variable` column -- `EachRepeatedVector` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There aren't any |
||
This repeats the variables N times where N is the number of rows of the original AbstractDataTable. | ||
|
||
`:value` column -- `StackedVector` | ||
`:value` column -- `StackedVector` | ||
This provides a view of the original columns stacked together. | ||
|
||
Id columns -- `RepeatedVector` | ||
Id columns -- `RepeatedVector` | ||
This repeats the original columns N times where N is the number of columns stacked. | ||
|
||
For more details on the storage representation, see: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,10 @@ The following are normally implemented for AbstractDataTables: | |
* [`nonunique`](@ref) : indexes of duplicate rows | ||
* [`unique!`](@ref) : remove duplicate rows | ||
* `similar` : a DataTable with similar columns as `d` | ||
* `denullify` : unwrap `Nullable` columns | ||
* `denullify!` : unwrap `Nullable` columns in-place | ||
* `nullify` : convert all columns to NullableArrays | ||
* `nullify!` : convert all columns to NullableArrays in-place | ||
|
||
**Indexing** | ||
|
||
|
@@ -706,83 +710,79 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2) | |
Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...) | ||
Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...) | ||
|
||
# vcat only accepts DataTables. Finds union of columns, maintaining order | ||
# of first dt. Missing data become null values. | ||
""" | ||
vcat(dts::AbstractDataTable...) | ||
|
||
Base.vcat(dt::AbstractDataTable) = dt | ||
Vertically concatenate `AbstractDataTables` that have the same column names in | ||
the same order. | ||
|
||
Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...]) | ||
```julia | ||
julia> dt1 = DataTable(A=1:3, B=1:3); | ||
|
||
function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) | ||
isempty(dts) && return DataTable() | ||
coltyps, colnams, similars = _colinfo(dts) | ||
|
||
res = DataTable() | ||
Nrow = sum(nrow, dts) | ||
for j in 1:length(colnams) | ||
colnam = colnams[j] | ||
col = similar(similars[j], coltyps[j], Nrow) | ||
|
||
i = 1 | ||
for dt in dts | ||
if haskey(dt, colnam) | ||
copy!(col, i, dt[colnam]) | ||
end | ||
i += size(dt, 1) | ||
end | ||
julia> dt2 = DataTable(A=4:6, B=4:6); | ||
|
||
res[colnam] = col | ||
end | ||
res | ||
end | ||
julia> dt3 = DataTable(A=7:9, B=7:9, C=7:9); | ||
|
||
_isnullable{T}(::AbstractArray{T}) = T <: Nullable | ||
const EMPTY_DATA = NullableArray(Void, 0) | ||
|
||
function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) | ||
dt1 = dts[1] | ||
colindex = copy(index(dt1)) | ||
coltyps = eltypes(dt1) | ||
similars = collect(columns(dt1)) | ||
nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] | ||
|
||
for i in 2:length(dts) | ||
dt = dts[i] | ||
for j in 1:size(dt, 2) | ||
col = dt[j] | ||
cn, ct = _names(dt)[j], eltype(col) | ||
if haskey(colindex, cn) | ||
idx = colindex[cn] | ||
|
||
oldtyp = coltyps[idx] | ||
if !(ct <: oldtyp) | ||
coltyps[idx] = promote_type(oldtyp, ct) | ||
# Needed on Julia 0.4 since e.g. | ||
# promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, | ||
# which is not a usable type: fall back to Nullable{Any} | ||
if VERSION < v"0.5.0-dev" && | ||
coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) | ||
coltyps[idx] = Nullable{Any} | ||
end | ||
end | ||
nonnull_ct[idx] += !_isnullable(col) | ||
else # new column | ||
push!(colindex, cn) | ||
push!(coltyps, ct) | ||
push!(similars, col) | ||
push!(nonnull_ct, !_isnullable(col)) | ||
end | ||
end | ||
end | ||
julia> vcat(dt1, dt2) | ||
6×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
│ 4 │ 4 │ 4 │ | ||
│ 5 │ 5 │ 5 │ | ||
│ 6 │ 6 │ 6 │ | ||
|
||
for j in 1:length(colindex) | ||
if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) | ||
similars[j] = EMPTY_DATA | ||
julia> vcat(dt1, dt2, dt3) | ||
ERROR: ArgumentError: columns (A, B) of input(s) (1, 2) != columns (A, B, C) of input(s) (3) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, the error message could be confusing: it seems to mean that the problem is that columns are different. Probably clearer: "column names of input(s) X != column names of input(s) Y: (A, B) != (A, B, C)". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, you asked me to change this before too and I'm still struggling to think of a better way. This will handle all conditions for any number of inputs. Ideally, we would present the differences in a format like a git diff where only differences are shown and (bonus feature:) they would be shown colorized (red for missing, green for extra columns). julia> dt1 = DataTable(A = 1, B = 1)
1×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
julia> dt2 = DataTable(B = 1, A = 1)
1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
julia> dt3 = DataTable(B = 1, A = 1, C = 1)
1×3 DataTables.DataTable
│ Row │ B │ A │ C │
├─────┼───┼───┼───┤
│ 1 │ 1 │ 1 │ 1 │
julia> vcat(dt1, dt2)
ERROR: ArgumentError: columns (A, B) of input(s) (1) != columns (B, A) of input(s) (2)
Stacktrace:
[1] vcat(::DataTables.DataTable, ::DataTables.DataTable) at /Users/Cameron/.julia/v0.6/DataTables/src/abstractdatatable/abstractdatatable.jl:756
julia> vcat(dt2, dt3)
ERROR: ArgumentError: columns (B, A) of input(s) (1) != columns (B, A, C) of input(s) (2)
Stacktrace:
[1] vcat(::DataTables.DataTable, ::DataTables.DataTable) at /Users/Cameron/.julia/v0.6/DataTables/src/abstractdatatable/abstractdatatable.jl:756
julia> vcat(dt1, dt2, dt3)
ERROR: ArgumentError: columns (A, B) of input(s) (1) != columns (B, A) of input(s) (2) != columns (B, A, C) of input(s) (3)
Stacktrace:
[1] vcat(::DataTables.DataTable, ::DataTables.DataTable, ::DataTables.DataTable, ::Vararg{DataTables.DataTable,N} where N) at /Users/Cameron/.julia/v0.6/DataTables/src/abstractdatatable/abstractdatatable.jl:756 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It stays compact as long as the inputs are mostly correct because it will just start to extend the list of inputs that match the column condition julia> dt3, dt4, dt5, dt6, dt7, dt8 = dt2, dt2, dt2, dt2, dt2, dt2
(1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │, 1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │, 1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │, 1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │, 1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │, 1×2 DataTables.DataTable
│ Row │ B │ A │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │)
julia> vcat(dt1, dt2, dt3, dt4, dt5, dt6, dt7, dt8)
ERROR: ArgumentError: columns (A, B) of input(s) (1) != columns (B, A) of input(s) (2, 3, 4, 5, 6, 7, 8) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In a previous comment I had proposed a solution to make this simpler: first check that the number of columns match, and if not just print the names of columns which are missing somewhere. That way you don't need to care about the order at that point. Then, if the number of columns is the same but the names are different, print the non matching names and their position. Finally, if names are the same but not in the same order, just say so, possibly giving the number of the first problematic column and the corresponding names. What really matters here is not flooding the output with 500 variables for large datasets. |
||
``` | ||
""" | ||
Base.vcat(dt::AbstractDataTable) = dt | ||
function Base.vcat(dts::AbstractDataTable...) | ||
isempty(dts) && return DataTable() | ||
allheaders = map(names, dts) | ||
# don't vcat empty DataTables | ||
notempty = find(x -> length(x) > 0, allheaders) | ||
uniqueheaders = unique(allheaders[notempty]) | ||
if length(uniqueheaders) == 0 | ||
return DataTable() | ||
end | ||
if length(uniqueheaders) > 1 | ||
unionunique = union(uniqueheaders...) | ||
coldiff = setdiff(unionunique, intersect(uniqueheaders...)) | ||
if !isempty(coldiff) | ||
# if any datatables are a full superset of names, skip them | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for each unique set of column names I'm throwing the error to tell which columns are missing from each of the sets. If any of the inputs to vcat have all of the column names then we can't show which are missing, so they're dropped from the error output |
||
filter!(u -> Set(u) != Set(unionunique), uniqueheaders) | ||
estrings = Vector{String}(length(uniqueheaders)) | ||
for (i, u) in enumerate(uniqueheaders) | ||
matchingloci = find(h -> u == h, allheaders) | ||
headerdiff = filter(x -> !in(x, u), coldiff) | ||
headerdiff = length(headerdiff) > 1 ? | ||
join(string.(headerdiff[1:end-1]), ", ") * " and " * string(headerdiff[end]) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather align everything on Anyway, no need for this length check nor to handle the last element manually: as I said, just use |
||
string(headerdiff[end]) | ||
matchingloci = length(matchingloci) > 1 ? | ||
join(string.(matchingloci[1:end-1]), ", ") * " and " * string(matchingloci[end]) : | ||
string(matchingloci[end]) | ||
estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci" | ||
end | ||
throw(ArgumentError(join(estrings, ", and "))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also use |
||
else | ||
estrings = Vector{String}(length(uniqueheaders)) | ||
for (i, u) in enumerate(uniqueheaders) | ||
indices = find(a -> a == u, allheaders) | ||
indices = length(indices) > 1 ? | ||
join(string.(indices[1:end-1]), ", ") * " and " * string(indices[end]) : | ||
string(indices[end]) | ||
estrings[i] = "column order of argument(s) $indices" | ||
end | ||
throw(ArgumentError(join(estrings, " != "))) | ||
end | ||
else | ||
header = uniqueheaders[1] | ||
dts_to_vcat = dts[notempty] | ||
return DataTable(Any[vcat(map(dt -> dt[col], dts_to_vcat)...) for col in header], header) | ||
end | ||
colnams = _names(colindex) | ||
|
||
coltyps, colnams, similars | ||
end | ||
|
||
############################################################################## | ||
|
@@ -801,6 +801,180 @@ function Base.hash(dt::AbstractDataTable) | |
return @compat UInt(h) | ||
end | ||
|
||
""" | ||
denullify!(dt::AbstractDataTable) | ||
|
||
Convert columns with a `Nullable` element type without any null values | ||
to a non-`Nullable` equivalent array type. The table `dt` is modified in place. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mention that new columns may alias the old ones, even when they were converted. Same for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should julia> using DataTables
julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> ddt = denullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> dt[:A] === ddt[:A]
true
julia> ddt[:A] = 1
1
julia> ddt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 1 │ 2 │
│ 3 │ 1 │ 3 │
julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 1 │ 2 │
│ 3 │ 1 │ 3 │ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and nullify... julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> ndt = nullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> dt[:B] === ndt[:B]
true
julia> ndt[:B] = 3
3
julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 3 │
│ 3 │ 3 │ 3 │
julia> ndt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 3 │
│ 3 │ 3 │ 3 │ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure, we would need concrete use cases to decide. In both cases people can easily make a copy manually. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've changed these back to using copy as you suggested and added a note for |
||
|
||
Columns in the returned `AbstractDataTable` may alias the columns of the | ||
input `dt`. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(denullify!(dt)) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
``` | ||
|
||
See also [`denullify`](@ref) and [`nullify!`](@ref). | ||
""" | ||
function denullify!(dt::AbstractDataTable) | ||
for i in 1:size(dt,2) | ||
if !anynull(dt[i]) | ||
dt[i] = dropnull!(dt[i]) | ||
end | ||
end | ||
dt | ||
end | ||
|
||
""" | ||
denullify(dt::AbstractDataTable) | ||
|
||
Return a copy of `dt` where columns with a `Nullable` element type without any | ||
null values have been converted to a non-`Nullable` equivalent array type. | ||
|
||
Columns in the returned `AbstractDataTable` may alias the columns of the | ||
input `dt`. If no aliasing is desired, use `denullify!(deepcopy(dt))`. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(denullify(dt)) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
``` | ||
|
||
See also [`denullify!`](@ref) and [`nullify`](@ref). | ||
""" | ||
denullify(dt::AbstractDataTable) = denullify!(copy(dt)) | ||
|
||
""" | ||
nullify!(dt::AbstractDataTable) | ||
|
||
Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place. | ||
|
||
Columns in the returned `AbstractDataTable` may alias the columns of the | ||
input `dt`. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = 1:3, B = 1:3) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(nullify!(dt)) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
``` | ||
|
||
See also [`nullify`](@ref) and [`denullify!`](@ref). | ||
""" | ||
function nullify!(dt::AbstractDataTable) | ||
for i in 1:size(dt,2) | ||
dt[i] = nullify(dt[i]) | ||
end | ||
dt | ||
end | ||
|
||
nullify(x::AbstractArray) = convert(NullableArray, x) | ||
nullify(x::AbstractCategoricalArray) = convert(NullableCategoricalArray, x) | ||
|
||
""" | ||
nullify(dt::AbstractDataTable) | ||
|
||
Return a copy of `dt` with all columns converted to nullable arrays. | ||
|
||
Columns in the returned `AbstractDataTable` may alias the columns of the | ||
input `dt`. If no aliasing is desired, use `nullify!(deepcopy(dt))`. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = 1:3, B = 1:3) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(nullify(dt)) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
``` | ||
|
||
See also [`nullify!`](@ref) and [`denullify`](@ref). | ||
""" | ||
function nullify(dt::AbstractDataTable) | ||
nullify!(copy(dt)) | ||
end | ||
|
||
## Documentation for methods defined elsewhere | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, these probably are important. 2 spaces == newline. I forgot this is markdown interpreter dependent.