Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

Stop auto-promoting column-types #30

Closed
wants to merge 46 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
8db2821
add changes
cjprybol Mar 10, 2017
4a939fe
make vcat error more informative
cjprybol Mar 13, 2017
f5a53a1
add docstring for vcat
cjprybol Mar 13, 2017
2c95f13
incorporate edits suggested during review
cjprybol Mar 13, 2017
412ceaa
_unsafe_get -> NullableArrays.unsafe_get
cjprybol Mar 13, 2017
f142df5
Merge branch 'master' into cjp/rebaseretaintype
cjprybol Mar 13, 2017
cc95658
fix new tests from master
cjprybol Mar 13, 2017
06dc914
remove RepeatedVector, StackedVector, unstackdt, meltdt
cjprybol Mar 14, 2017
c4e218e
DataFrames doensn't reshape 2d Arrays -> Vectors so don't do it here
cjprybol Mar 14, 2017
e954226
minor cleanup
cjprybol Mar 14, 2017
ed8a515
change (de)nullify back to copy and cleanup docstrings
cjprybol Mar 15, 2017
1636a0c
NullableArrays.unsafe_get -> compat(unsafe_get)
cjprybol Mar 15, 2017
91233d3
default to NullableArray for joins that may introduce missing data
cjprybol Mar 15, 2017
7462612
align comments
cjprybol Mar 15, 2017
9b65533
lots of edits
cjprybol Mar 15, 2017
b643ff8
tests and no need for compat
cjprybol Mar 15, 2017
4c68452
spacing mistakes
cjprybol Mar 15, 2017
7310681
throw errors on 1-d matrices and change confusing variable name
cjprybol Mar 15, 2017
de280ba
add back check to differentiate scalars from AbstractArrays
cjprybol Mar 15, 2017
88b20ca
save work
cjprybol Mar 16, 2017
be1cacd
save progress, switch to test master
cjprybol Mar 16, 2017
19ffb58
join is ready and tests in place. right join still broken
cjprybol Mar 16, 2017
3f2cd63
fix right join
cjprybol Mar 17, 2017
9c3ad21
update join help message and add note about temp fix
cjprybol Mar 17, 2017
1e7d26e
indentation
cjprybol Mar 17, 2017
e39ba63
changes
cjprybol Mar 17, 2017
04cb9ee
spacing
cjprybol Mar 17, 2017
5d70685
put old unstack back and stabilize types, ordering
cjprybol Mar 18, 2017
7859132
fix bad copy and paste spacing and condense scalar recycling code
cjprybol Mar 18, 2017
6496acf
update vcat error
cjprybol Mar 18, 2017
f47810f
unused function, another test, remove unused variable
cjprybol Mar 18, 2017
259ceef
revert function removal to appease new code failures?
cjprybol Mar 18, 2017
26e87ac
fix v0.5 issue
cjprybol Mar 18, 2017
e0f7982
update vcat testing and change similar_nullable constructor call
cjprybol Mar 18, 2017
d65385e
:Merge branch 'cjp/retaintype' of github.com:cjprybol/DataTables.jl i…
cjprybol Mar 18, 2017
b0c29b4
remove old error message from docstring
cjprybol Mar 18, 2017
95a6f31
and change docstring to doctest
cjprybol Mar 18, 2017
7df712f
change similar_nullable back and fix unrelated copy paste space removal
cjprybol Mar 18, 2017
27da644
add missing rightperm reordering and properly unify hcat! functions
cjprybol Mar 18, 2017
5fa8fa0
accidental spacing changes
cjprybol Mar 18, 2017
a1d58f9
forgot one spacing change
cjprybol Mar 18, 2017
db87443
change deprecations
cjprybol Mar 18, 2017
9c66a1e
add back extra spaces
cjprybol Mar 18, 2017
887346b
bump catarrays version, remove manual resetting of levels in unstack
cjprybol Mar 20, 2017
00c08cc
Merge branch 'master' into cjp/retaintype
cjprybol Mar 24, 2017
020c88e
only use "and" when joining the last estring
cjprybol Mar 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/DataTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ export @~,
combine,
completecases,
deleterows!,
denullify!,
denullify,
describe,
dropnull,
dropnull!,
Expand All @@ -61,6 +63,8 @@ export @~,
nonunique,
nrow,
nullable!,
nullify!,
nullify,
order,
printtable,
rename!,
Expand Down
248 changes: 178 additions & 70 deletions src/abstractdatatable/abstractdatatable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ The following are normally implemented for AbstractDataTables:
* [`nonunique`](@ref) : indexes of duplicate rows
* [`unique!`](@ref) : remove duplicate rows
* `similar` : a DataTable with similar columns as `d`
* `denullify` : unwrap `Nullable` columns
* `denullify!` : unwrap `Nullable` columns in-place
* `nullify` : convert all columns to NullableArrays
* `nullify!` : convert all columns to NullableArrays in-place

**Indexing**

Expand Down Expand Up @@ -711,78 +715,23 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable

Base.vcat(dt::AbstractDataTable) = dt

Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...])

function Base.vcat{T<:AbstractDataTable}(dts::Vector{T})
function Base.vcat(dts::AbstractDataTable...)
isempty(dts) && return DataTable()
coltyps, colnams, similars = _colinfo(dts)

res = DataTable()
Nrow = sum(nrow, dts)
for j in 1:length(colnams)
colnam = colnams[j]
col = similar(similars[j], coltyps[j], Nrow)

i = 1
for dt in dts
if haskey(dt, colnam)
copy!(col, i, dt[colnam])
end
i += size(dt, 1)
end

res[colnam] = col
end
res
end

_isnullable{T}(::AbstractArray{T}) = T <: Nullable
const EMPTY_DATA = NullableArray(Void, 0)

function _colinfo{T<:AbstractDataTable}(dts::Vector{T})
dt1 = dts[1]
colindex = copy(index(dt1))
coltyps = eltypes(dt1)
similars = collect(columns(dt1))
nonnull_ct = Int[_isnullable(c) for c in columns(dt1)]

for i in 2:length(dts)
dt = dts[i]
for j in 1:size(dt, 2)
col = dt[j]
cn, ct = _names(dt)[j], eltype(col)
if haskey(colindex, cn)
idx = colindex[cn]

oldtyp = coltyps[idx]
if !(ct <: oldtyp)
coltyps[idx] = promote_type(oldtyp, ct)
# Needed on Julia 0.4 since e.g.
# promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T},
# which is not a usable type: fall back to Nullable{Any}
if VERSION < v"0.5.0-dev" &&
coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType)
coltyps[idx] = Nullable{Any}
end
end
nonnull_ct[idx] += !_isnullable(col)
else # new column
push!(colindex, cn)
push!(coltyps, ct)
push!(similars, col)
push!(nonnull_ct, !_isnullable(col))
end
end
end

for j in 1:length(colindex)
if nonnull_ct[j] < length(dts) && !_isnullable(similars[j])
similars[j] = EMPTY_DATA
end
allheaders = map(names, dts)
# don't vcat empty DataTables
notempty = find(x -> length(x) > 0, allheaders)
uniqueheaders = unique(allheaders[notempty])
if length(uniqueheaders) == 0
return DataTable()
elseif length(unique(map(length, uniqueheaders))) > 1
throw(ArgumentError("not all DataTables have the same number of columns. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Resolve" might not be completely clear. Maybe "unmatched" (sorry, not a native speaker)?

Also it would still be interesting to print the number of columns of each table, since that's what the message says.

elseif length(uniqueheaders) > 1
throw(ArgumentError("Column names do not match. Use `rename!` or `names!` to adjust columns names. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think errors should start with a lowercase letter.

else
header = uniqueheaders[1]
dts_to_vcat = dts[notempty]
return DataTable(Any[vcat(map(dt -> dt[col], dts_to_vcat)...) for col in header], header)
end
colnams = _names(colindex)

coltyps, colnams, similars
end

##############################################################################
Expand All @@ -801,6 +750,165 @@ function Base.hash(dt::AbstractDataTable)
return @compat UInt(h)
end

"""
denullify!(dt::AbstractDataTable)

Convert columns with a `Nullable` element type without any null values
to a non-`Nullable` equivalent array type. The table `dt` is modified in place.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mention that new columns may alias the old ones, even when they were converted. Same for denullify.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should denullify switch to deepcopy? I'm not sure I like this alias behavior for denullify.

julia> using DataTables

julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 222 │
│ 333 │

julia> ddt = denullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 222 │
│ 333 │

julia> dt[:A] === ddt[:A]
true

julia> ddt[:A] = 1
1

julia> ddt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 212 │
│ 313 │

julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 212 │
│ 313

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and nullify...

julia> dt = DataTable(A = 1:3, B = NullableArray(1:3))
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 222 │
│ 333 │

julia> ndt = nullify(dt)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 111 │
│ 222 │
│ 333 │

julia> dt[:B] === ndt[:B]
true

julia> ndt[:B] = 3
3

julia> dt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 113 │
│ 223 │
│ 333 │

julia> ndt
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 113 │
│ 223 │
│ 333

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure, we would need concrete use cases to decide. In both cases people can easily make a copy manually.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed these back to using copy as you suggested and added a note for nullify and denullify that if users want fully alias-free copies, they should use nullify!(deepcopy(dt)) and denullify!(deepcopy(dt)). Hopefully if anyone hits this issue and doesn't know why columns are changed across multiple tables, they'll open the help docstrings and see the note.


# Examples

```jldoctest
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3])
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │

julia> eltypes(dt)
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}

julia> eltypes(denullify!(dt))
2-element Array{Type,1}:
Int64
Int64

julia> eltypes(dt)
2-element Array{Type,1}:
Int64
Int64
```

See also [`denullify`](@ref) & [`nullify!`](@ref).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better use "and" than "&".

"""
function denullify!(dt::AbstractDataTable)
for i in 1:size(dt,2)
if !anynull(dt[i])
dt[i] = dropnull(dt[i])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use dropnull!? That will avoid a copy for NullableArray, and it won't make a difference for other arrays. Since there a no nulls, the original arrays won't be modified.

end
end
dt
end

"""
denullify(dt::AbstractDataTable)

Return a copy of `dt` where columns with a `Nullable` element type without any
null values have been converted to a non-`Nullable` equivalent array type.

# Examples

```jldoctest
julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3])
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │

julia> eltypes(dt)
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}

julia> eltypes(denullify(dt))
2-element Array{Type,1}:
Int64
Int64

julia> eltypes(dt)
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}
```

See also [`denullify!`] & [`nullify`](@ref).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use literal "and" (as mentioned before).

"""
denullify(dt::AbstractDataTable) = denullify!(copy(dt))

"""
nullify!(dt::AbstractDataTable)

Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place.

# Examples

```jldoctest
julia> dt = DataTable(A = 1:3, B = 1:3)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │

julia> eltypes(dt)
2-element Array{Type,1}:
Int64
Int64

julia> eltypes(nullify!(dt))
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}

julia> eltypes(dt)
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}
```

See also [`nullify`](@ref) & [`denullify!`](@ref).
"""
function nullify!(dt::AbstractDataTable)
for i in 1:size(dt,2)
dt[i] = NullableArray(dt[i])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather define nullify(x::AbstractArray) = convert(NullableArray, x) and nullify(x::AbstractCategoricalArray) = convert(NullableCategoricalArray, x) and call it. That's actually what similar_nullable did. (Using convert can theoretically avoid unnecessary copies.)

end
dt
end

"""
nullify(dt::AbstractDataTable)

Return a copy of `dt` with all columns converted to nullable arrays.

# Examples

```jldoctest
julia> dt = DataTable(A = 1:3, B = 1:3)
3×2 DataTables.DataTable
│ Row │ A │ B │
├─────┼───┼───┤
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 2 │
│ 3 │ 3 │ 3 │

julia> eltypes(dt)
2-element Array{Type,1}:
Int64
Int64

julia> eltypes(nullify(dt))
2-element Array{Type,1}:
Nullable{Int64}
Nullable{Int64}

julia> eltypes(dt)
2-element Array{Type,1}:
Int64
Int64
```

See also [`nullify!`](@ref) & [`denullify`](@ref).
"""
function nullify(dt::AbstractDataTable)
nullify!(copy(dt))
end

## Documentation for methods defined elsewhere

Expand Down
17 changes: 9 additions & 8 deletions src/abstractdatatable/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,20 @@ function printtable(io::IO,
quotestr = string(quotemark)
for i in 1:n
for j in 1:p
if !isnull(dt[j],i)
if !isnull(dt[j][i])
if ! (etypes[j] <: Real)
print(io, quotemark)
escapedprint(io, get(dt[i, j]), quotestr)
print(io, quotemark)
print(io, quotemark)
x = isa(dt[i, j], Nullable) ? get(dt[i, j]) : dt[i, j]
escapedprint(io, x, quotestr)
print(io, quotemark)
else
print(io, dt[i, j])
print(io, dt[i, j])
end
else
print(io, nastring)
print(io, nastring)
end
if j < p
print(io, separator)
print(io, separator)
else
print(io, '\n')
end
Expand Down Expand Up @@ -167,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable)
write(io, " & ")
cell = dt[row,col]
if !isnull(cell)
content = get(cell)
content = isa(cell, Nullable) ? get(cell) : cell
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should be able to use _unsafe_get, which works for non-nullable.

if mimewritable(MIME("text/latex"), content)
show(io, MIME("text/latex"), content)
else
Expand Down
22 changes: 6 additions & 16 deletions src/abstractdatatable/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,6 @@
## Join / merge
##

# Like similar, but returns a nullable array
similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
NullableArray(T, dims)

similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
NullableArray(eltype(T), dims)

similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
NullableCategoricalArray(T, dims)

similar_nullable(dt::AbstractDataTable, dims::Int) =
DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt)))

# helper structure for DataTables joining
immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable}
dtl::DT1
Expand Down Expand Up @@ -76,9 +63,12 @@ function compose_joined_table(joiner::DataTableJoiner,
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil]
end
all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm]
for col in columns(dtr_noon)],
names(dtr_noon))
resizelen = length(all_orig_right_ixs)+length(leftonly_ixs)
rightcols = Any[length(col[all_orig_right_ixs]) >= resizelen ?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

length(col[all_orig_right_ixs]) could simply be length(all_orig_right_ixs).

resize!(col[all_orig_right_ixs], resizelen)[right_perm] :
NullableArray(vcat(col[all_orig_right_ixs], fill(Nullable(), resizelen - length(col[all_orig_right_ixs]))))[right_perm]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be much better to allocate a NullableArray of the correct length, and then fill it. That would also fix the issue we discussed in the other PR about the fact that the [right_perm] indexing makes copies.

Finally, what happens with CategoricalArray columns? Maybe we need to call something like similar_nullable?

for col in columns(dtr_noon)]
right_dt = DataTable(rightcols, names(dtr_noon))
# merge left and right parts of the joined table
res = hcat!(left_dt, right_dt)

Expand Down
Loading