Skip to content

Commit

Permalink
Merge 99ae870 into c5b35a7
Browse files Browse the repository at this point in the history
  • Loading branch information
nalimilan committed Nov 5, 2020
2 parents c5b35a7 + 99ae870 commit bb7b968
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 242 deletions.
74 changes: 45 additions & 29 deletions src/array.jl
Expand Up @@ -19,15 +19,41 @@ function reftype(sz::Int)
end
end

# Validate that `T` is usable as the element type of a categorical array.
# Type signatures would reject unsupported types anyway with a `TypeError`;
# this check exists so that a readable `ArgumentError` is thrown first.
# `U` is only interpolated into the error message as the reported element type.
# Returns `true` when `T` is supported.
function check_supported_eltype(::Type{T}, ::Type{U}) where {T, U}
    if T === Symbol
        msg = "CategoricalArray no longer supports Symbol as element type " *
              "as that forces recompiling too many Julia Base methods: " *
              "use strings instead, e.g. via categorical(string.(x))"
        throw(ArgumentError(msg))
    end
    if !(T <: Union{SupportedTypes, Missing})
        msg = "CategoricalArray only supports " *
              "AbstractString, AbstractChar and Number element types " *
              "(got element type $U)"
        throw(ArgumentError(msg))
    end
    return true
end

# Replace `AbstractString` and any `SubString` type with `String`;
# every other type is returned unchanged.
function fixstringtype(T::Type)
    if T === AbstractString || T <: SubString
        return String
    end
    return T
end

# Apply the replacement to both sides of a `Union` (recursing into nested unions).
function fixstringtype(T::Union)
    left = fixstringtype(T.a)
    right = fixstringtype(T.b)
    return Union{left, right}
end

# `Union{} <: SubString` holds, so the bottom type needs its own method
# to be returned as is rather than turned into `String`.
function fixstringtype(::Type{Union{}})
    return Union{}
end

# Find a narrow type that is supported to hold all elements if possible.
#
# When the declared element type `T` of `A` is already supported, it is used
# (after `fixstringtype`). Otherwise the types of the actual elements are joined
# with `Base.promote_typejoin` and the result is validated.
#
# Throws an `ArgumentError` (via `check_supported_eltype`) when no supported
# type can be found, including when `A` is empty and `T` is unsupported.
function fixtype(A::AbstractArray{T}) where T
    if T <: Union{SupportedTypes, Missing}
        return fixstringtype(T)
    end
    if isempty(A)
        # There are no elements to infer a narrower type from, and `mapreduce`
        # without `init` would fail with an unrelated "empty collection" error.
        # `T` is known to be unsupported in this branch, so this call throws
        # the user-friendly error about the declared element type.
        check_supported_eltype(T, T)
    end
    U = fixstringtype(mapreduce(typeof, Base.promote_typejoin, A))
    # Report the original element type `T` if the narrowed type is still unsupported
    check_supported_eltype(U, T)
    return U
end

"""
CategoricalArray{T}(undef, dims::Dims; levels=nothing, ordered=false)
CategoricalArray{T}(undef, dims::Int...; levels=nothing, ordered=false)
Construct an uninitialized `CategoricalArray` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalArray` with levels of type
`T <: $SupportedTypes` and dimensions `dims`.
The `levels` keyword argument can be a vector specifying possible values for the data
(this is equivalent to but more efficient than calling [`levels!`](@ref)
on the resulting array).
Expand All @@ -52,8 +78,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).
CategoricalArray(A::CategoricalArray; levels=nothing, ordered=false)
If `A` is already a `CategoricalArray`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -62,7 +86,8 @@ function CategoricalArray end
"""
CategoricalVector{T}(undef, m::Int; levels=nothing, ordered=false)
Construct an uninitialized `CategoricalVector` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalVector` with levels of type
`T <: $SupportedTypes` and length `m`.
The `levels` keyword argument can be a vector specifying possible values for the data
(this is equivalent to but more efficient than calling [`levels!`](@ref)
Expand All @@ -87,8 +112,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).
CategoricalVector(A::CategoricalVector; levels=nothing, ordered=false)
If `A` is already a `CategoricalVector`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -97,7 +120,8 @@ function CategoricalVector end
"""
CategoricalMatrix{T}(undef, m::Int, n::Int; levels=nothing, ordered=false)
Construct an uninitialized `CategoricalMatrix` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalMatrix` with levels of type
`T <: $SupportedTypes` and dimensions `m` × `n`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).
Expand All @@ -118,8 +142,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).
CategoricalMatrix(A::CategoricalMatrix; levels=nothing, ordered=isordered(A))
If `A` is already a `CategoricalMatrix`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -137,6 +159,7 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int};
ordered::Bool=false) where {T, N, R}
U = leveltype(nonmissingtype(T))
S = T >: Missing ? Union{U, Missing} : U
check_supported_eltype(S, T)
V = CategoricalValue{U, R}
levs = levels === nothing ? U[] : collect(U, levels)
CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered))
Expand Down Expand Up @@ -231,6 +254,7 @@ function CategoricalArray{T, N, R}(A::AbstractArray;
end

# From AbstractArray

CategoricalArray{T, N}(A::AbstractArray{S, N};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {S, T, N} =
Expand All @@ -242,17 +266,17 @@ CategoricalArray{T}(A::AbstractArray{S, N};
CategoricalArray(A::AbstractArray{T, N};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T, N} =
CategoricalArray{fixstringtype(T), N}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), N}(A, levels=levels, ordered=ordered)

CategoricalVector(A::AbstractVector{T};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T} =
CategoricalArray{fixstringtype(T), 1}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), 1}(A, levels=levels, ordered=ordered)

CategoricalMatrix(A::AbstractMatrix{T};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T} =
CategoricalArray{fixstringtype(T), 2}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), 2}(A, levels=levels, ordered=ordered)

# From CategoricalArray (preserve R)
CategoricalArray{T, N}(A::CategoricalArray{S, N, R};
Expand Down Expand Up @@ -286,12 +310,12 @@ convert(::Type{CategoricalArray{T, N}}, A::AbstractArray{S, N}) where {S, T, N}
convert(::Type{CategoricalArray{T}}, A::AbstractArray{S, N}) where {S, T, N} =
convert(CategoricalArray{T, N}, A)
convert(::Type{CategoricalArray}, A::AbstractArray{T, N}) where {T, N} =
convert(CategoricalArray{T, N}, A)
convert(CategoricalArray{fixtype(A), N}, A)

convert(::Type{CategoricalVector{T}}, A::AbstractVector) where {T} =
convert(CategoricalVector{T, DefaultRefType}, A)
convert(::Type{CategoricalVector}, A::AbstractVector{T}) where {T} =
convert(CategoricalVector{T}, A)
convert(CategoricalVector{fixtype(A)}, A)
convert(::Type{CategoricalVector{T}},
A::CategoricalVector{S, R}) where {S, T, R <: Integer} =
convert(CategoricalVector{T, R}, A)
Expand All @@ -301,7 +325,7 @@ convert(::Type{CategoricalVector}, A::CategoricalVector) = A
convert(::Type{CategoricalMatrix{T}}, A::AbstractMatrix) where {T} =
convert(CategoricalMatrix{T, DefaultRefType}, A)
convert(::Type{CategoricalMatrix}, A::AbstractMatrix{T}) where {T} =
convert(CategoricalMatrix{T}, A)
convert(CategoricalMatrix{fixtype(A)}, A)
convert(::Type{CategoricalMatrix{T}},
A::CategoricalMatrix{S, R}) where {S, T, R <: Integer} =
convert(CategoricalMatrix{T, R}, A)
Expand All @@ -313,6 +337,8 @@ convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}) where {S, T,

function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N};
levels::Union{AbstractVector, Nothing}=nothing) where {S, T, N, R}
check_supported_eltype(T, T)

res = CategoricalArray{T, N, R}(undef, size(A), levels=levels)
copyto!(res, A)

Expand Down Expand Up @@ -698,7 +724,7 @@ function vcat(A::CategoricalArray...)
[x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)}
end

T = Base.promote_eltype(A...) >: Missing ?
T = cat_promote_eltype(A...) >: Missing ?
Union{eltype(newlevels), Missing} : eltype(newlevels)
refs = DefaultRefType[refsvec...;]
pool = CategoricalPool(newlevels, ordered)
Expand Down Expand Up @@ -912,15 +938,15 @@ are preserved unless explicitly overriden.
compress::Bool=false) where {T, N}
# @inline is needed so that return type is inferred when compress is not provided
RefType = compress ? reftype(length(unique(A))) : DefaultRefType
CategoricalArray{fixstringtype(T), N, RefType}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), N, RefType}(A, levels=levels, ordered=ordered)
end
@inline function categorical(A::CategoricalArray{T, N, R};
levels::Union{AbstractVector, Nothing}=nothing,
ordered=_isordered(A),
compress::Bool=false) where {T, N, R}
# @inline is needed so that return type is inferred when compress is not provided
RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R
CategoricalArray{fixstringtype(T), N, RefType}(A, levels=levels, ordered=ordered)
CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered)
end

function in(x::Any, y::CategoricalArray{T, N, R}) where {T, N, R}
Expand Down Expand Up @@ -1046,21 +1072,11 @@ end

StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}},
A::AbstractVector) where {T} =
categoricalmissing(T, A)
CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing))
StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}},
A::Vector) where {T} =
categoricalmissing(T, A)
categoricalmissing(T, A::AbstractVector) =
CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing))

StructTypes.construct(::Type{<:CategoricalArray{Union{Nothing, T}}},
A::AbstractVector) where {T} =
categoricalnothing(T, A)
StructTypes.construct(::Type{<:CategoricalArray{Union{Nothing, T}}},
A::Vector) where {T} =
categoricalnothing(T, A)
categoricalnothing(T, A::AbstractVector) = CategoricalArray{Union{Nothing, T}}(A)

# DataAPI refarray/refvalue/refpool support
struct CategoricalRefPool{T, P} <: AbstractVector{T}
pool::P
Expand Down
2 changes: 1 addition & 1 deletion src/pool.jl
Expand Up @@ -62,7 +62,7 @@ avoid doing a dict lookup twice
end

function mergelevels(ordered, levels...)
T = Base.promote_eltype(levels...)
T = cat_promote_eltype(levels...)
res = Vector{T}(undef, 0)

nonempty_lv = findfirst(!isempty, levels)
Expand Down
19 changes: 10 additions & 9 deletions src/recode.jl
Expand Up @@ -274,8 +274,9 @@ recode!(a::AbstractArray, default::Any, pairs::Pair...) =
recode!(a, a, default, pairs...)
recode!(a::AbstractArray, pairs::Pair...) = recode!(a, a, nothing, pairs...)

promote_valuetype(x::Pair{K, V}) where {K, V} = V
promote_valuetype(x::Pair{K, V}, y::Pair...) where {K, V} = promote_type(V, promote_valuetype(y...))
# Value type of a single pair.
function cat_promote_valuetype(x::Pair{K, V}) where {K, V}
    return V
end

# Combine the value types of several pairs with `cat_promote_type`,
# folding from the last pair towards the first.
function cat_promote_valuetype(x::Pair{K, V}, y::Pair...) where {K, V}
    rest = cat_promote_valuetype(y...)
    return cat_promote_type(V, rest)
end

# Whether the key type of the pair is exactly `Missing`.
function keytype_hasmissing(x::Pair{K}) where {K}
    return K === Missing
end

# Whether any of the pairs has `Missing` as its key type
# (stops at the first match).
function keytype_hasmissing(x::Pair{K}, y::Pair...) where {K}
    if K === Missing
        return true
    end
    return keytype_hasmissing(y...)
end
Expand Down Expand Up @@ -350,11 +351,11 @@ recode(a::AbstractArray, pairs::Pair...) = recode(a, nothing, pairs...)
recode(a::CategoricalArray, pairs::Pair...) = recode(a, nothing, pairs...)

function recode(a::AbstractArray, default::Any, pairs::Pair...)
V = promote_valuetype(pairs...)
V = cat_promote_valuetype(pairs...)
# T cannot take into account eltype(src), since we can't know
# whether it matters at compile time (all levels recoded or not)
# and using a wider type than necessary would be annoying
T = default isa Nothing ? V : promote_type(typeof(default), V)
T = default isa Nothing ? V : cat_promote_type(typeof(default), V)
# Exception 1: if T === Missing and default not missing,
# assume the caller wants to recode only some values to missing,
# but accept original values
Expand All @@ -371,11 +372,11 @@ function recode(a::AbstractArray, default::Any, pairs::Pair...)
end

function recode(a::CategoricalArray{S, N, R}, default::Any, pairs::Pair...) where {S, N, R}
V = promote_valuetype(pairs...)
V = cat_promote_valuetype(pairs...)
# T cannot take into account eltype(src), since we can't know
# whether it matters at compile time (all levels recoded or not)
# and using a wider type than necessary would be annoying
T = default isa Nothing ? V : promote_type(typeof(default), V)
T = default isa Nothing ? V : cat_promote_type(typeof(default), V)
# Exception 1: if T === Missing and default not missing,
# assume the caller wants to recode only some values to missing,
# but accept original values
Expand All @@ -396,13 +397,13 @@ end
function Base.replace(a::CategoricalArray{S, N, R}, pairs::Pair...) where {S, N, R}
# Base.replace(a::Array, pairs::Pair...) uses a wider type promotion than
# recode. It promotes the source type S with the replaced types T.
T = promote_valuetype(pairs...)
T = cat_promote_valuetype(pairs...)
# Exception: replacing missings
# Example: replace(categorical([missing,1.5]), missing=>0)
if keytype_hasmissing(pairs...)
dest = CategoricalArray{promote_type(nonmissingtype(S), T), N, R}(undef, size(a))
dest = CategoricalArray{cat_promote_type(nonmissingtype(S), T), N, R}(undef, size(a))
else
dest = CategoricalArray{promote_type(S, T), N, R}(undef, size(a))
dest = CategoricalArray{cat_promote_type(S, T), N, R}(undef, size(a))
end
recode!(dest, a, nothing, pairs...)
end
Expand Down
14 changes: 7 additions & 7 deletions src/typedefs.jl
@@ -1,12 +1,13 @@
const DefaultRefType = UInt32
const SupportedTypes = Union{AbstractString, AbstractChar, Number}

## Pools

# Type params:
# * `T` type of categorized values
# * `R` integer type for referencing category levels
# * `V` categorical value type
mutable struct CategoricalPool{T, R <: Integer, V}
mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
levels::Vector{T} # category levels ordered by their reference codes
invindex::Dict{T, R} # map from category levels to their reference codes
valindex::Vector{V} # "category value" objects 1-to-1 matching `index`
Expand Down Expand Up @@ -42,9 +43,6 @@ mutable struct CategoricalPool{T, R <: Integer, V}
function CategoricalPool{T, R, V}(levels::Vector{T},
invindex::Dict{T, R},
ordered::Bool) where {T, R, V}
if T <: CategoricalValue && T !== Union{}
throw(ArgumentError("Level type $T cannot be a categorical value type"))
end
if !(V <: CategoricalValue)
throw(ArgumentError("Type $V is not a categorical value type"))
end
Expand All @@ -70,7 +68,7 @@ end
## Values

"""
CategoricalValue{T, R <: Integer}
CategoricalValue{T <: $SupportedTypes, R <: Integer}
A wrapper around a value of type `T` corresponding to a level
in a `CategoricalPool`.
Expand All @@ -82,7 +80,7 @@ if [`isordered`](@ref) is `true` for the value's pool, and in that case
the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the standard
ordering of values of type `T`.
"""
struct CategoricalValue{T, R <: Integer}
struct CategoricalValue{T <: SupportedTypes, R <: Integer}
level::R
pool::CategoricalPool{T, R, CategoricalValue{T, R}}
end
Expand All @@ -96,7 +94,9 @@ end
# * `V` original type of elements (excluding Missing) before categorization
# * `C` categorical value type
# * `U` type of missing value, `Union{}` if missing values are not accepted
abstract type AbstractCategoricalArray{T, N, R, V, C, U} <: AbstractArray{Union{C, U}, N} end
abstract type AbstractCategoricalArray{T <: Union{CategoricalValue, SupportedTypes, Missing}, N,
R <: Integer, V, C <: CategoricalValue, U} <:
AbstractArray{Union{C, U}, N} end
const AbstractCategoricalVector{T, R, V, C, U} = AbstractCategoricalArray{T, 1, R, V, C, U}
const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2, R, V, C, U}

Expand Down

0 comments on commit bb7b968

Please sign in to comment.