-
Notifications
You must be signed in to change notification settings - Fork 11
Stop auto-promoting column-types #30
Changes from 1 commit
8db2821
4a939fe
f5a53a1
2c95f13
412ceaa
f142df5
cc95658
06dc914
c4e218e
e954226
ed8a515
1636a0c
91233d3
7462612
9b65533
b643ff8
4c68452
7310681
de280ba
88b20ca
be1cacd
19ffb58
3f2cd63
9c3ad21
1e7d26e
e39ba63
04cb9ee
5d70685
7859132
6496acf
f47810f
259ceef
26e87ac
e0f7982
d65385e
b0c29b4
95a6f31
7df712f
27da644
5fa8fa0
a1d58f9
db87443
9c66a1e
887346b
00c08cc
020c88e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,10 @@ The following are normally implemented for AbstractDataTables: | |
* [`nonunique`](@ref) : indexes of duplicate rows | ||
* [`unique!`](@ref) : remove duplicate rows | ||
* `similar` : a DataTable with similar columns as `d` | ||
* `denullify` : unwrap `Nullable` columns that contain no null values | ||
* `denullify!` : unwrap `Nullable` columns that contain no null values, in-place | ||
* `nullify` : convert all columns to NullableArrays | ||
* `nullify!` : convert all columns to NullableArrays in-place | ||
|
||
**Indexing** | ||
|
||
|
@@ -711,78 +715,23 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable | |
|
||
Base.vcat(dt::AbstractDataTable) = dt | ||
|
||
Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...]) | ||
|
||
function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) | ||
function Base.vcat(dts::AbstractDataTable...) | ||
isempty(dts) && return DataTable() | ||
coltyps, colnams, similars = _colinfo(dts) | ||
|
||
res = DataTable() | ||
Nrow = sum(nrow, dts) | ||
for j in 1:length(colnams) | ||
colnam = colnams[j] | ||
col = similar(similars[j], coltyps[j], Nrow) | ||
|
||
i = 1 | ||
for dt in dts | ||
if haskey(dt, colnam) | ||
copy!(col, i, dt[colnam]) | ||
end | ||
i += size(dt, 1) | ||
end | ||
|
||
res[colnam] = col | ||
end | ||
res | ||
end | ||
|
||
_isnullable{T}(::AbstractArray{T}) = T <: Nullable | ||
const EMPTY_DATA = NullableArray(Void, 0) | ||
|
||
function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) | ||
dt1 = dts[1] | ||
colindex = copy(index(dt1)) | ||
coltyps = eltypes(dt1) | ||
similars = collect(columns(dt1)) | ||
nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] | ||
|
||
for i in 2:length(dts) | ||
dt = dts[i] | ||
for j in 1:size(dt, 2) | ||
col = dt[j] | ||
cn, ct = _names(dt)[j], eltype(col) | ||
if haskey(colindex, cn) | ||
idx = colindex[cn] | ||
|
||
oldtyp = coltyps[idx] | ||
if !(ct <: oldtyp) | ||
coltyps[idx] = promote_type(oldtyp, ct) | ||
# Needed on Julia 0.4 since e.g. | ||
# promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, | ||
# which is not a usable type: fall back to Nullable{Any} | ||
if VERSION < v"0.5.0-dev" && | ||
coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) | ||
coltyps[idx] = Nullable{Any} | ||
end | ||
end | ||
nonnull_ct[idx] += !_isnullable(col) | ||
else # new column | ||
push!(colindex, cn) | ||
push!(coltyps, ct) | ||
push!(similars, col) | ||
push!(nonnull_ct, !_isnullable(col)) | ||
end | ||
end | ||
end | ||
|
||
for j in 1:length(colindex) | ||
if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) | ||
similars[j] = EMPTY_DATA | ||
end | ||
allheaders = map(names, dts) | ||
# don't vcat empty DataTables | ||
notempty = find(x -> length(x) > 0, allheaders) | ||
uniqueheaders = unique(allheaders[notempty]) | ||
if length(uniqueheaders) == 0 | ||
return DataTable() | ||
elseif length(unique(map(length, uniqueheaders))) > 1 | ||
throw(ArgumentError("not all DataTables have the same number of columns. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) | ||
elseif length(uniqueheaders) > 1 | ||
throw(ArgumentError("Column names do not match. Use `rename!` or `names!` to adjust columns names. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think error messages should start with a lowercase letter. |
||
else | ||
header = uniqueheaders[1] | ||
dts_to_vcat = dts[notempty] | ||
return DataTable(Any[vcat(map(dt -> dt[col], dts_to_vcat)...) for col in header], header) | ||
end | ||
colnams = _names(colindex) | ||
|
||
coltyps, colnams, similars | ||
end | ||
|
||
############################################################################## | ||
|
@@ -801,6 +750,165 @@ function Base.hash(dt::AbstractDataTable) | |
return @compat UInt(h) | ||
end | ||
|
||
""" | ||
denullify!(dt::AbstractDataTable) | ||
|
||
Convert columns with a `Nullable` element type without any null values | ||
to a non-`Nullable` equivalent array type. The table `dt` is modified in place. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mention that new columns may alias the old ones, even when they were converted. Same for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should julia> using DataTables
julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> ddt = denullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> dt[:A] === ddt[:A]
true
julia> ddt[:A] = 1
1
julia> ddt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 1 │ 2 │
│ 3 │ 1 │ 3 │
julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 1 │ 2 │
│ 3 │ 1 │ 3 │ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and nullify... julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> ndt = nullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │
julia> dt[:B] === ndt[:B]
true
julia> ndt[:B] = 3
3
julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 3 │
│ 3 │ 3 │ 3 │
julia> ndt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 3 │
│ 3 │ 3 │ 3 │ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure, we would need concrete use cases to decide. In both cases people can easily make a copy manually. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've changed these back to using copy as you suggested and added a note for |
||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(denullify!(dt)) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
``` | ||
|
||
See also [`denullify`](@ref) & [`nullify!`](@ref). | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to use "and" than "&". |
||
""" | ||
function denullify!(dt::AbstractDataTable) | ||
for i in 1:size(dt,2) | ||
if !anynull(dt[i]) | ||
dt[i] = dropnull(dt[i]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
end | ||
end | ||
dt | ||
end | ||
|
||
""" | ||
denullify(dt::AbstractDataTable) | ||
|
||
Return a copy of `dt` where columns with a `Nullable` element type without any | ||
null values have been converted to a non-`Nullable` equivalent array type. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(denullify(dt)) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
``` | ||
|
||
See also [`denullify!`](@ref) & [`nullify`](@ref). | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use the literal word "and" (as mentioned before). |
||
""" | ||
denullify(dt::AbstractDataTable) = denullify!(copy(dt)) | ||
|
||
""" | ||
nullify!(dt::AbstractDataTable) | ||
|
||
Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = 1:3, B = 1:3) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(nullify!(dt)) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
``` | ||
|
||
See also [`nullify`](@ref) & [`denullify!`](@ref). | ||
""" | ||
function nullify!(dt::AbstractDataTable) | ||
for i in 1:size(dt,2) | ||
dt[i] = NullableArray(dt[i]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather define |
||
end | ||
dt | ||
end | ||
|
||
""" | ||
nullify(dt::AbstractDataTable) | ||
|
||
Return a copy of `dt` with all columns converted to nullable arrays. | ||
|
||
# Examples | ||
|
||
```jldoctest | ||
julia> dt = DataTable(A = 1:3, B = 1:3) | ||
3×2 DataTables.DataTable | ||
│ Row │ A │ B │ | ||
├─────┼───┼───┤ | ||
│ 1 │ 1 │ 1 │ | ||
│ 2 │ 2 │ 2 │ | ||
│ 3 │ 3 │ 3 │ | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
|
||
julia> eltypes(nullify(dt)) | ||
2-element Array{Type,1}: | ||
Nullable{Int64} | ||
Nullable{Int64} | ||
|
||
julia> eltypes(dt) | ||
2-element Array{Type,1}: | ||
Int64 | ||
Int64 | ||
``` | ||
|
||
See also [`nullify!`](@ref) & [`denullify`](@ref). | ||
""" | ||
function nullify(dt::AbstractDataTable) | ||
nullify!(copy(dt)) | ||
end | ||
|
||
## Documentation for methods defined elsewhere | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,19 +42,20 @@ function printtable(io::IO, | |
quotestr = string(quotemark) | ||
for i in 1:n | ||
for j in 1:p | ||
if !isnull(dt[j],i) | ||
if !isnull(dt[j][i]) | ||
if ! (etypes[j] <: Real) | ||
print(io, quotemark) | ||
escapedprint(io, get(dt[i, j]), quotestr) | ||
print(io, quotemark) | ||
print(io, quotemark) | ||
x = isa(dt[i, j], Nullable) ? get(dt[i, j]) : dt[i, j] | ||
escapedprint(io, x, quotestr) | ||
print(io, quotemark) | ||
else | ||
print(io, dt[i, j]) | ||
print(io, dt[i, j]) | ||
end | ||
else | ||
print(io, nastring) | ||
print(io, nastring) | ||
end | ||
if j < p | ||
print(io, separator) | ||
print(io, separator) | ||
else | ||
print(io, '\n') | ||
end | ||
|
@@ -167,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) | |
write(io, " & ") | ||
cell = dt[row,col] | ||
if !isnull(cell) | ||
content = get(cell) | ||
content = isa(cell, Nullable) ? get(cell) : cell | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should be able to use |
||
if mimewritable(MIME("text/latex"), content) | ||
show(io, MIME("text/latex"), content) | ||
else | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,19 +2,6 @@ | |
## Join / merge | ||
## | ||
|
||
# Like similar, but returns a nullable array | ||
similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = | ||
NullableArray(T, dims) | ||
|
||
similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = | ||
NullableArray(eltype(T), dims) | ||
|
||
similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = | ||
NullableCategoricalArray(T, dims) | ||
|
||
similar_nullable(dt::AbstractDataTable, dims::Int) = | ||
DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) | ||
|
||
# helper structure for DataTables joining | ||
immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} | ||
dtl::DT1 | ||
|
@@ -76,9 +63,12 @@ function compose_joined_table(joiner::DataTableJoiner, | |
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] | ||
end | ||
all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) | ||
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] | ||
for col in columns(dtr_noon)], | ||
names(dtr_noon)) | ||
resizelen = length(all_orig_right_ixs)+length(leftonly_ixs) | ||
rightcols = Any[length(col[all_orig_right_ixs]) >= resizelen ? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
resize!(col[all_orig_right_ixs], resizelen)[right_perm] : | ||
NullableArray(vcat(col[all_orig_right_ixs], fill(Nullable(), resizelen - length(col[all_orig_right_ixs]))))[right_perm] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be much better to allocate a Finally, what happens with |
||
for col in columns(dtr_noon)] | ||
right_dt = DataTable(rightcols, names(dtr_noon)) | ||
# merge left and right parts of the joined table | ||
res = hcat!(left_dt, right_dt) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Resolve" might not be completely clear. Maybe "unmatched" (sorry, not a native speaker)?
Also it would still be interesting to print the number of columns of each table, since that's what the message says.