Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid Union{Missing,T} in columns after DropMissing #35

Merged
merged 10 commits into from
Apr 9, 2022
53 changes: 41 additions & 12 deletions src/transforms/filter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ function revert(::Filter, newtable, cache)
rows |> Tables.materializer(newtable)
end

# DropMissing

const VecOrTuple{T} = Union{Vector{T}, NTuple{N, T}} where {T, N}

"""
DropMissing()
DropMissing(:)
Expand Down Expand Up @@ -69,22 +65,55 @@ DropMissing(cols::T...) where {T<:ColSelector} =

# DropMissing advertises itself as revertible; the matching `revert`
# method for this transform is defined further down in this file.
function isrevertible(::Type{<:DropMissing})
  return true
end

_ftrans(::DropMissing{Colon}, table) =
# ftrans
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
_ftrans(::DropMissing{Colon}, cols) =
Filter(row -> all(!ismissing, row))

function _ftrans(transform::DropMissing, table)
allcols = Tables.columnnames(table)
cols = _filter(transform.colspec, allcols)
_ftrans(::DropMissing, cols) =
Filter(row -> all(!ismissing, getindex.(Ref(row), cols)))

# nonmissing
# Element type of the form Union{Missing,T}: copy the column into a
# container whose element type is the narrower T.
function _nonmissing(::Type{Union{Missing,T}}, c) where {T}
  return collect(T, c)
end

# Any other element type: hand the column back untouched (no copy).
function _nonmissing(::Type{T}, c) where {T}
  return c
end
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

"""
    _nonmissing(columns, col)

Fetch column `col` from `columns` with `Tables.getcolumn` and forward it,
together with its element type, to the type-dispatched `_nonmissing` methods.
"""
function _nonmissing(columns, col)
  column = Tables.getcolumn(columns, col)
  return _nonmissing(eltype(column), column)
end

function _nonmissing(table, cols, allcols)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
columns = Tables.columns(table)
newcols = [col ∈ cols ? _nonmissing(columns, col) : Tables.getcolumn(columns, col)
for col in allcols]
juliohm marked this conversation as resolved.
Show resolved Hide resolved
𝒯 = (; zip(allcols, newcols)...)
𝒯 |> Tables.materializer(table)
end

# reverttypes
"""
    _reverttypes(table, types)

Rebuild `table` with every column collected into the element type found at
the same position in `types`, then materialize the result with the
materializer of `table`.

Types and columns are paired positionally via `zip`, so `types` is expected
to follow the column order of `table`.
"""
function _reverttypes(table, types)
  columns = Tables.columns(table)
  # Query the names on the `Tables.columns` result rather than on the raw
  # table: the Tables.jl interface defines `columnnames` for column-accessible
  # objects (and rows), so calling it on a table that is not column-accessible
  # itself would hit the generic fallback instead of the real column names.
  allcols = Tables.columnnames(columns)
  newcols = [collect(T, Tables.getcolumn(columns, col))
             for (T, col) in zip(types, allcols)]
  𝒯 = (; zip(allcols, newcols)...)
  𝒯 |> Tables.materializer(table)
end

function apply(transform::DropMissing, table)
ftrans = _ftrans(transform, table)
allcols = Tables.columnnames(table)
cols = _filter(transform.colspec, allcols)
ftrans = _ftrans(transform, cols)
newtable, fcache = apply(ftrans, table)
newtable, (ftrans, fcache)
# post-processing
types = Tables.schema(table).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
ptable = _nonmissing(newtable, cols, allcols)
ptable, (ftrans, fcache, types)
end

function revert(::DropMissing, newtable, cache)
ftrans, fcache = cache
revert(ftrans, newtable, fcache)
ftrans, fcache, types = cache
# pre-processing
ptable = _reverttypes(newtable, types)
revert(ftrans, ptable, fcache)
end
39 changes: 39 additions & 0 deletions test/transforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,45 @@
@test isequalmissing(n.e, [missing, 5, 6, 5])
@test isequalmissing(n.f, [4, missing, 4, 5])

# column eltype
ttypes = Tables.schema(t).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

T = DropMissing()
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Int
@test ntypes[2] == Int
@test ntypes[3] == Int
@test ntypes[4] == Int
@test ntypes[5] == Int
@test ntypes[6] == Int
tₒ = revert(T, n, c)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
@test ttypes == Tables.schema(tₒ).types

T = DropMissing([:a, :c, :d])
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Int
@test ntypes[2] == Union{Missing,Int}
@test ntypes[3] == Int
@test ntypes[4] == Int
@test ntypes[5] == Union{Missing,Int}
@test ntypes[6] == Union{Missing,Int}
tₒ = revert(T, n, c)
@test ttypes == Tables.schema(tₒ).types

T = DropMissing([:b, :e, :f])
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Union{Missing,Int}
@test ntypes[2] == Int
@test ntypes[3] == Union{Missing,Int}
@test ntypes[4] == Union{Missing,Int}
@test ntypes[5] == Int
@test ntypes[6] == Int
tₒ = revert(T, n, c)
@test ttypes == Tables.schema(tₒ).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

# reapply test
T = DropMissing()
n1, c1 = apply(T, t)
Expand Down