Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid Union{Missing,T} in columns after DropMissing #35

Merged
merged 10 commits into from
Apr 9, 2022
45 changes: 32 additions & 13 deletions src/transforms/filter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ function revert(::Filter, newtable, cache)
rows |> Tables.materializer(newtable)
end

# DropMissing

const VecOrTuple{T} = Union{Vector{T}, NTuple{N, T}} where {T, N}

"""
DropMissing()
DropMissing(:)
Expand Down Expand Up @@ -69,22 +65,45 @@ DropMissing(cols::T...) where {T<:ColSelector} =

isrevertible(::Type{<:DropMissing}) = true

# NOTE(review): this listing looks like a rendered diff without +/- markers, so
# superseded and current lines (and review-UI text) appear side by side.
# The signature below, taking `table`, is presumably the superseded one — confirm.
_ftrans(::DropMissing{Colon}, table) =
# ftrans: build the `Filter` used by `DropMissing` from the selected columns
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
# All columns (`Colon`): the predicate is true for rows in which no value is missing.
_ftrans(::DropMissing{Colon}, cols) =
Filter(row -> all(!ismissing, row))

# NOTE(review): the function header and the two assignments below presumably
# belong to the superseded method that resolved the columns itself — confirm.
function _ftrans(transform::DropMissing, table)
allcols = Tables.columnnames(table)
cols = _filter(transform.colspec, allcols)
# Column subset: the predicate is true for rows whose values at `cols` are all
# non-missing; `Ref(row)` keeps the row itself from being broadcast over.
_ftrans(::DropMissing, cols) =
Filter(row -> all(!ismissing, getindex.(Ref(row), cols)))
end

# nonmissing
# Generic fallback: hand the column back unchanged (same object, no copy).
function _nonmissing(::Type{T}, c) where {T}
  return c
end

# Element type `Union{Missing,T}`: copy the column into a new array whose
# element type is `T`.
function _nonmissing(::Type{Union{Missing,T}}, c) where {T}
  return collect(T, c)
end
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

# Convenience entry point: dispatch on the column's own element type.
function _nonmissing(col)
  T = eltype(col)
  return _nonmissing(T, col)
end

# Apply `DropMissing` to `table`: build a `Filter` from the selected columns,
# apply it, then pass the selected columns through `_nonmissing`.
# Returns the processed table together with the cache `(ftrans, fcache, types)`.
# NOTE(review): this listing looks like a rendered diff without +/- markers, so
# superseded and current statements (and review-UI text) appear side by side.
function apply(transform::DropMissing, table)
# NOTE(review): presumably the superseded call (passes `table` rather than the
# selected column names) — confirm against the repository.
ftrans = _ftrans(transform, table)
colnames = Tables.columnnames(table)
# column names picked by the transform's column specification
select = _filter(transform.colspec, colnames)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
ftrans = _ftrans(transform, select)
newtable, fcache = apply(ftrans, table)
# NOTE(review): presumably the superseded two-element result — confirm.
newtable, (ftrans, fcache)

# post-processing: selected columns go through `_nonmissing`; the others are
# kept as they are
coltable = Tables.columntable(newtable)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
pcolumns = [nm ∈ select ? _nonmissing(col) : col for (nm, col) in pairs(coltable)]
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
# rebuild a named tuple of columns and materialize it like `newtable`
𝒯 = (; zip(colnames, pcolumns)...)
ptable = 𝒯 |> Tables.materializer(newtable)

# column types of the input table, cached so that `revert` can convert back
types = Tables.schema(table).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
ptable, (ftrans, fcache, types)
end

# Revert `DropMissing`: convert every column of `newtable` to the element type
# cached in `types`, then delegate to `revert` of the cached filter transform.
# NOTE(review): this listing looks like a rendered diff without +/- markers, so
# superseded and current statements (and review-UI text) appear side by side.
function revert(::DropMissing, newtable, cache)
# NOTE(review): the next two lines presumably belong to the superseded version
# (two-element cache, direct revert) — confirm against the repository.
ftrans, fcache = cache
revert(ftrans, newtable, fcache)
ftrans, fcache, types = cache

# pre-processing: convert the columns to the cached element types before
# reverting the filter
colnames = Tables.columnnames(newtable)
coltable = Tables.columntable(newtable)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
# `types` and the columns are paired positionally
pcolumns = [collect(T, col) for (T, col) in zip(types, coltable)]
𝒯 = (; zip(colnames, pcolumns)...)
ptable = 𝒯 |> Tables.materializer(newtable)

revert(ftrans, ptable, fcache)
end
39 changes: 39 additions & 0 deletions test/transforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,45 @@
@test isequalmissing(n.e, [missing, 5, 6, 5])
@test isequalmissing(n.f, [4, missing, 4, 5])

# column eltype
ttypes = Tables.schema(t).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

T = DropMissing()
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Int
@test ntypes[2] == Int
@test ntypes[3] == Int
@test ntypes[4] == Int
@test ntypes[5] == Int
@test ntypes[6] == Int
tₒ = revert(T, n, c)
eliascarv marked this conversation as resolved.
Show resolved Hide resolved
@test ttypes == Tables.schema(tₒ).types

T = DropMissing([:a, :c, :d])
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Int
@test ntypes[2] == Union{Missing,Int}
@test ntypes[3] == Int
@test ntypes[4] == Int
@test ntypes[5] == Union{Missing,Int}
@test ntypes[6] == Union{Missing,Int}
tₒ = revert(T, n, c)
@test ttypes == Tables.schema(tₒ).types

T = DropMissing([:b, :e, :f])
n, c = apply(T, t)
ntypes = Tables.schema(n).types
@test ntypes[1] == Union{Missing,Int}
@test ntypes[2] == Int
@test ntypes[3] == Union{Missing,Int}
@test ntypes[4] == Union{Missing,Int}
@test ntypes[5] == Int
@test ntypes[6] == Int
tₒ = revert(T, n, c)
@test ttypes == Tables.schema(tₒ).types
eliascarv marked this conversation as resolved.
Show resolved Hide resolved

# reapply test
T = DropMissing()
n1, c1 = apply(T, t)
Expand Down