Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CategoricalArrays = "0.10"
MLJModelInterface = "1.10"
MLJModelInterface = "1.11"
ScientificTypes = "3.0"
StatsBase = "0.34"
TableOperations = "1.2"
Expand All @@ -30,9 +30,9 @@ julia = "1.6.7"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"

Expand Down
4 changes: 4 additions & 0 deletions src/MLJTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ export FrequencyEncoder
include("transformers/cardinality_reducer/cardinality_reducer.jl")
include("transformers/cardinality_reducer/interface_mlj.jl")
export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
export CardinalityReducer
include("encoders/missingness_encoding/missingness_encoding.jl")
include("encoders/missingness_encoding/interface_mlj.jl")
export MissingnessEncoder

# Contrast encoder
include("encoders/contrast_encoder/contrast_encoder.jl")
Expand Down
9 changes: 9 additions & 0 deletions src/encoders/missingness_encoding/errors.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Error text for a column whose raw element type `col_type` is not one of the supported types.
function UNSUPPORTED_COL_TYPE_ME(col_type)
    return string(
        "In MissingnessEncoder, elements have type ",
        col_type,
        ". The supported types are `Char`, `AbstractString`, and `Number`",
    )
end
# Error text for a `label_for_missing` key (`possible_col_type`) that is not a supported type.
function VALID_TYPES_NEW_VAL_ME(possible_col_type)
    return string(
        "In MissingnessEncoder, label_for_missing keys have type ",
        possible_col_type,
        ". The supported types are `Char`, `AbstractString`, and `Number`",
    )
end
# Error text for a replacement level `value` (taken from `label_for_missing`) that is
# already present among the levels of a feature. The wording says "level" rather than
# "feature name" because the collision is between level values, not column names.
COLLISION_NEW_VAL_ME(value) =
    "In MissingnessEncoder, label_for_missing specifies new level $(value). However, this level already exists in one of the features. Please respecify label_for_missing."
# Error text for a column type that has no matching key in `label_for_missing`;
# the keys that are present are listed in the message.
function UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)
    available_keys = keys(label_for_missing)
    return string(
        "In MissingnessEncoder, ",
        col_type,
        " does not appear in label_for_missing which only has keys ",
        available_keys,
    )
end

161 changes: 161 additions & 0 deletions src/encoders/missingness_encoding/interface_mlj.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
### MissingnessEncoder with MLJ Interface

# 1. Interface Struct
# Hyper-parameter container for the missingness encoder (an `Unsupervised` model).
# NOTE(review): presumably `mutable` so hyper-parameters can be reassigned after
# construction — confirm against the MLJ model interface requirements.
mutable struct MissingnessEncoder{
AS <: AbstractVector{Symbol},
T <: Type,
A <: Any,
} <: Unsupervised
# Names of the categorical features to exclude from, or include in, encoding (see `ignore`).
features::AS
# Whether `features` lists the columns to exclude (`true`) or to include (`false`).
ignore::Bool
# Whether `OrderedFactor` columns are encoded as well, or left alone.
ordered_factor::Bool
# Maps a raw super type (`Char`, `AbstractString`, `Number`) to the level that replaces `missing`.
label_for_missing::Dict{T, A}
end;

# 2. Constructor
# Keyword constructor: every hyper-parameter is optional and falls back to the default
# shown here before being forwarded, in field order, to the positional constructor.
function MissingnessEncoder(;
    features = Symbol[],
    ignore = true,
    ordered_factor = false,
    label_for_missing = Dict(
        AbstractString => "missing",
        Char => 'm',
    ),
)
    model = MissingnessEncoder(features, ignore, ordered_factor, label_for_missing)
    return model
end


# 4. Fitted parameters (for user access)
# Expose the fit result to users under the name `label_for_missing_given_feature`.
function MMI.fitted_params(::MissingnessEncoder, fitresult)
    return (label_for_missing_given_feature = fitresult,)
end

# 5. Fit method
# Fit the encoder on table `X` by delegating to `missingness_encoder_fit` with the
# model's hyper-parameters. Returns `(fitresult, cache, report)` where `fitresult` is the
# per-feature mapping for `missing`, `cache` is `nothing`, and `report` lists the
# encoded features. `verbosity` is accepted but not used here.
function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
    fit_output = missingness_encoder_fit(
        X,
        transformer.features;
        ignore = transformer.ignore,
        ordered_factor = transformer.ordered_factor,
        label_for_missing = transformer.label_for_missing,
    )

    # The report only carries the list of encoded features.
    report = (encoded_features = fit_output[:encoded_features],)

    return fit_output[:label_for_missing_given_feature], nothing, report
end


# 6. Transform method
# Transform `Xnew` with a fitted encoder: re-wrap `fitresult` in the dictionary layout
# that `missingness_encoder_transform` expects and return the transformed table.
function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
    return missingness_encoder_transform(
        Xnew,
        Dict(:label_for_missing_given_feature => fitresult),
    )
end

# 8. Extra metadata
# Package-level metadata for `MissingnessEncoder`: the name, UUID and URL of the
# package that provides it, and a flag stating the implementation is pure Julia.
MMI.metadata_pkg(
MissingnessEncoder,
package_name = "MLJTransforms",
package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
is_pure_julia = true,
)

# Model-level metadata: both input and output are declared with scitype `Table`, and
# `load_path` gives the qualified name of the model type.
# NOTE(review): presumably `load_path` is what the MLJ model registry uses to locate
# the type — confirm against the MLJModelInterface documentation.
MMI.metadata_model(
MissingnessEncoder,
input_scitype = Table,
output_scitype = Table,
load_path = "MLJTransforms.MissingnessEncoder",
)



"""
$(MMI.doc_header(MissingnessEncoder))

`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing").
By this, missingness will be treated as a new
level by any subsequent model. This assumes that the categorical features have raw
types that are in `Char`, `AbstractString`, and `Number`.

# Training data

In MLJ (or MLJBase) bind an instance unsupervised `model` to data with

mach = machine(model, X)

Here:

- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.

# Operations

- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
are always left unchanged.

# Fitted parameters

The fields of `fitted_params(mach)` are:

- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`

# Report

The fields of `report(mach)` are:

- `encoded_features`: The subset of the categorical features of X that were encoded

# Examples

```julia
import StatsBase.proportionmap
using MLJ

# Define a table with missing values
Xm = (
A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]),
B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing],
C= categorical([7, 5, missing, missing, 10, 0, missing]),
D = [23, 23, 44, 66, 14, 23, 11],
E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p'])
)

encoder = MissingnessEncoder()
mach = fit!(machine(encoder, Xm))
Xnew = transform(mach, Xm)

julia> Xnew
(A = ["Ben", "John", "missing", "missing", "Mary", "John", "missing"],
B = Union{Missing, Float64}[1.85, 1.67, missing, missing, 1.5, 1.67, missing],
C = [7, 5, -1, -1, 10, 0, -1],
D = [23, 23, 44, 66, 14, 23, 11],
E = ['m', 'g', 'r', 'm', 'r', 'g', 'p'],)

```

See also
[`CardinalityReducer`](@ref)
"""
MissingnessEncoder
121 changes: 121 additions & 0 deletions src/encoders/missingness_encoding/missingness_encoding.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
include("errors.jl")

"""
**Private method.**

Fit a transformer that maps any missing value into a new level (e.g., "Missing"). By this, missingness will be treated as a new
level by any subsequent model. This assumes that the categorical features have raw
types that are in `Char`, `AbstractString`, and `Number`.

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.

# Returns (in a dict)

- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
- `encoded_features`: The subset of the categorical features of X that were encoded
"""
function missingness_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    ignore::Bool = true,
    ordered_factor::Bool = false,
    label_for_missing::Dict{<:Type, <:Any} = Dict(
        AbstractString => "missing",
        Char => 'm',
    ),
)
    # Overview: validate `label_for_missing`, build a per-column mapper that decides which
    # value replaces `missing`, hand that mapper to `generic_fit`, and return a dict with
    # the keys `:label_for_missing_given_feature` and `:encoded_features`.
    #
    # Throws `ArgumentError` when a key of `label_for_missing` is not a supported type,
    # when a column's raw type is unsupported, when a replacement value already occurs
    # among a column's levels, or when no replacement is specified for a column's type.
    supportedtypes = Union{Char, AbstractString, Number}
    # Computed once and reused below.
    # NOTE(review): presumably the member types of the union — confirm `union_types`.
    allowed_types = union_types(supportedtypes)

    # The keys of `label_for_missing` do not depend on any column, so they are validated
    # once here rather than once per encoded feature.
    for possible_col_type in keys(label_for_missing)
        if !(possible_col_type in allowed_types)
            throw(ArgumentError(VALID_TYPES_NEW_VAL_ME(possible_col_type)))
        end
    end

    # 1. Define feature mapper
    function feature_mapper(col, name)
        # Raw (unwrapped) type of the column, read from the first type parameter of the
        # non-missing element type.
        # NOTE(review): assumes elements are `CategoricalValue{T, R}` — confirm.
        col_type = nonmissingtype(eltype(col)).parameters[1]
        feat_levels = levels(col; skipmissing=true)

        # Ensure the column's raw type is supported.
        if !(col_type <: supportedtypes)
            throw(ArgumentError(UNSUPPORTED_COL_TYPE_ME(col_type)))
        end

        # No replacement value may coincide with an existing level of this column.
        for value in values(label_for_missing)
            if !ismissing(value) && value in feat_levels
                throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
            end
        end

        # Find the supported super type this column's raw type falls under.
        elgrandtype = nothing
        for allowed_type in allowed_types
            if col_type <: allowed_type
                elgrandtype = allowed_type
                break
            end
        end

        # Only `missing` is mapped; non-missing levels remain as they are.
        label_for_missing_given_feature = Dict{Missing, col_type}()

        if elgrandtype in keys(label_for_missing)
            label_for_missing_given_feature[missing] = label_for_missing[elgrandtype]
        elseif elgrandtype === Number
            # Default for numeric columns: one below the smallest existing level.
            label_for_missing_given_feature[missing] = minimum(feat_levels) - 1
        else
            throw(ArgumentError(UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)))
        end

        return label_for_missing_given_feature::Dict{Missing, col_type}
    end

    # 2. Pass it to generic_fit
    label_for_missing_given_feature, encoded_features = generic_fit(
        X, features; ignore = ignore, ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )
    cache = Dict(
        :label_for_missing_given_feature => label_for_missing_given_feature,
        :encoded_features => encoded_features,
    )
    return cache
end

"""
**Private method.**

Apply a fitted missingness encoder to a table given the output of `missingness_encoder_fit`

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
- `cache`: The output of `missingness_encoder_fit`

# Returns

- `X_tr`: The table with selected features after the selected features are transformed by missingness encoder
"""
function missingness_encoder_transform(X, cache::Dict)
    # Pull the per-feature mapping out of the fit cache and let `generic_transform`
    # apply it, with `ignore_unknown = true`.
    return generic_transform(
        X,
        cache[:label_for_missing_given_feature];
        ignore_unknown = true,
    )
end

4 changes: 2 additions & 2 deletions src/transformers/cardinality_reducer/cardinality_reducer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ include("errors.jl")
Fit a transformer that maps any level of a categorical feature that occurs with
frequency < `min_frequency` into a new level (e.g., "Other"). This is useful when some categorical features have
high cardinality and many levels are infrequent. This assumes that the categorical features have raw
types that are in `Union{Char, AbstractString, Number}`.
types that are in `Char`, `AbstractString`, and `Number`.

# Arguments

Expand All @@ -19,7 +19,7 @@ types that are in `Union{Char, AbstractString, Number}`.
- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be
an integer or a float which decides whether raw counts or normalized frequencies are used.
- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies
the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'`
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/cardinality_reducer/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Train the machine using `fit!(mach, rows=...)`.
- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be
an integer or a float which decides whether raw counts or normalized frequencies are used.
- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies
the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'`
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
Expand Down
Loading