From 9fb4c27495061762b952d8c79d7ab45348192e80 Mon Sep 17 00:00:00 2001 From: Essam Date: Wed, 24 Jul 2024 13:15:36 -0500 Subject: [PATCH 1/6] =?UTF-8?q?=F0=9F=A6=99=20Missingness=20encoder=20is?= =?UTF-8?q?=20here?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Project.toml | 8 +- src/MLJTransforms.jl | 7 + src/encoders/missingness_encoding/errors.jl | 9 + .../missingness_encoding/interface_mlj.jl | 161 ++++++++++++++++++ .../missingness_encoding.jl | 121 +++++++++++++ test/encoders/missingness_encoding.jl | 156 +++++++++++++++++ test/runtests.jl | 1 + 7 files changed, 460 insertions(+), 3 deletions(-) create mode 100644 src/encoders/missingness_encoding/errors.jl create mode 100644 src/encoders/missingness_encoding/interface_mlj.jl create mode 100644 src/encoders/missingness_encoding/missingness_encoding.jl create mode 100644 test/encoders/missingness_encoding.jl diff --git a/Project.toml b/Project.toml index b4a3ebc..0f64d51 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" @@ -20,7 +21,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] CategoricalArrays = "0.10" -MLJModelInterface = "1.10" +MLJModelInterface = "1.11" ScientificTypes = "3.0" StatsBase = "0.34" TableOperations = "1.2" @@ -29,10 +30,11 @@ julia = "1.6.7" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StatsModels = 
"3eaba693-59b7-5ba5-a881-562e759f1c8d" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" [targets] -test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs"] +test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"] diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl index 311e237..7a8d2ec 100644 --- a/src/MLJTransforms.jl +++ b/src/MLJTransforms.jl @@ -24,20 +24,27 @@ include("encoders/target_encoding/errors.jl") include("encoders/target_encoding/target_encoding.jl") include("encoders/target_encoding/interface_mlj.jl") export target_encoder_fit, target_encoder_transform, TargetEncoder +export TargetEncoder # Ordinal encoding include("encoders/ordinal_encoding/ordinal_encoding.jl") include("encoders/ordinal_encoding/interface_mlj.jl") export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder +export OrdinalEncoder # Frequency encoding include("encoders/frequency_encoding/frequency_encoding.jl") include("encoders/frequency_encoding/interface_mlj.jl") export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder +export FrequencyEncoder # Cardinality reduction include("transformers/cardinality_reducer/cardinality_reducer.jl") include("transformers/cardinality_reducer/interface_mlj.jl") export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer +export CardinalityReducer +include("encoders/missingness_encoding/missingness_encoding.jl") +include("encoders/missingness_encoding/interface_mlj.jl") +export MissingnessEncoder end \ No newline at end of file diff --git a/src/encoders/missingness_encoding/errors.jl b/src/encoders/missingness_encoding/errors.jl new file mode 100644 index 0000000..784418f --- /dev/null +++ b/src/encoders/missingness_encoding/errors.jl @@ -0,0 +1,9 @@ +UNSUPPORTED_COL_TYPE_ME(col_type) = + "In MissingnessEncoder, elements have type $(col_type). 
The supported types are `Union{Char, AbstractString, Number}`" +VALID_TYPES_NEW_VAL_ME(possible_col_type) = + "In MissingnessEncoder, label_for_missing keys have type $(possible_col_type). The supported types are `Union{Char, AbstractString, Number}`" +COLLISION_NEW_VAL_ME(value) = + "In MissingnessEncoder, label_for_missing specifies new feature name $(value). However, this name already exists in one of the features. Please respecify label_for_missing." +UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing) = + "In MissingnessEncoder, $(col_type) does not appear in label_for_missing which only has keys $(keys(label_for_missing))" + diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl new file mode 100644 index 0000000..081b1c9 --- /dev/null +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -0,0 +1,161 @@ +### MissingnessEncoder with MLJ Interface + +# 1. Interface Struct +mutable struct MissingnessEncoder{ + AS <: AbstractVector{Symbol}, + T <: Type, + A <: Any, +} <: Unsupervised + features::AS + ignore::Bool + ordered_factor::Bool + label_for_missing::Dict{T, A} +end; + +# 2. Constructor +function MissingnessEncoder(; + features = Symbol[], + ignore = true, + ordered_factor = false, + label_for_missing = Dict( + AbstractString => "missing", + Char => 'm', + ), +) + return MissingnessEncoder(features, ignore, ordered_factor, label_for_missing) +end; + + +# 4. Fitted parameters (for user access) +MMI.fitted_params(::MissingnessEncoder, fitresult) = ( + new_cat_given_col_val = fitresult, +) + +# 5. 
Fit method +function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X) + generic_cache = missingness_encoder_fit( + X, + transformer.features; + ignore = transformer.ignore, + ordered_factor = transformer.ordered_factor, + label_for_missing = transformer.label_for_missing, + ) + fitresult = generic_cache[:new_cat_given_col_val] + + report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features + cache = nothing + return fitresult, cache, report +end; + + +# 6. Transform method +function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew) + generic_cache = Dict( + :new_cat_given_col_val => + fitresult, + ) + Xnew_transf = missingness_encoder_transform(Xnew, generic_cache) + return Xnew_transf +end + +# 8. Extra metadata +MMI.metadata_pkg( + MissingnessEncoder, + package_name = "MLJTransforms", + package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6", + package_url = "https://github.com/JuliaAI/MLJTransforms.jl", + is_pure_julia = true, +) + +MMI.metadata_model( + MissingnessEncoder, + input_scitype = Table, + output_scitype = Table, + load_path = "MLJTransforms.MissingnessEncoder", +) + + + +""" +$(MMI.doc_header(MissingnessEncoder)) + +`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing"). +By this, missingness will be treated as a new +level by any subsequent model. This assumes that the categorical features have raw +types that are in `Union{Char, AbstractString, Number}`. + +# Training data + +In MLJ (or MLJBase) bind an instance unsupervised `model` to data with + + mach = machine(model, X) + +Here: + +- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must + have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to + check scitypes. + +Train the machine using `fit!(mach, rows=...)`. 
+ +# Hyper-parameters + +- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding +- `ignore=true`: Whether to exclude or includes the features given in `features` +- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them +- `label_for_missing=Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A +dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value +signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` +then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` +and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. + +# Operations + +- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and + return the new table. Features that are neither `Multiclass` nor `OrderedFactor` + are always left unchanged.
+ +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` + +# Report + +The fields of `report(mach)` are: + +- `encoded_features`: The subset of the categorical features of X that were encoded + +# Examples + +```julia +import StatsBase.proportionmap +using MLJ + +# Define a table with missing values +Xm = ( + A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]), + B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing], + C= categorical([7, 5, missing, missing, 10, 0, missing]), + D = [23, 23, 44, 66, 14, 23, 11], + E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p']) +) + +encoder = MissingnessEncoder(ordered_factor = false) +mach = fit!(machine(encoder, Xm)) +Xnew = transform(mach, Xm) + +julia> Xnew +(A = ["Ben", "John", "missing", "missing", "Mary", "John", "missing"], + B = Union{Missing, Float64}[1.85, 1.67, missing, missing, 1.5, 1.67, missing], + C = [7, 5, -1, -1, 10, 0, -1], + D = [23, 23, 44, 66, 14, 23, 11], + E = ['m', 'g', 'r', 'm', 'r', 'g', 'p'],) + +``` + +See also +[`CardinalityReducer`](@ref) +""" +MissingnessEncoder \ No newline at end of file diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl new file mode 100644 index 0000000..38f1905 --- /dev/null +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -0,0 +1,121 @@ +include("errors.jl") + +""" +**Private method.** + +Fit a transformer that maps any missing value into a new level (e.g., "Missing"). By this, missingness will be treated as a new +level by any subsequent model. This assumes that the categorical features have raw +types that are in `Union{Char, AbstractString, Number}`. 
+ +# Arguments + + - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) + `Multiclass` or `OrderedFactor` + - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding + - `ignore=true`: Whether to exclude or includes the features given in `features` + - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them + - `label_for_missing=Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A + dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value + signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` + then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` + and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. + +# Returns (in a dict) + + - `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` + - `encoded_features`: The subset of the categorical features of X that were encoded +""" +function missingness_encoder_fit( + X, + features::AbstractVector{Symbol} = Symbol[]; + ignore::Bool = true, + ordered_factor::Bool = false, + label_for_missing::Dict{<:Type, <:Any} = Dict( + AbstractString => "missing", + Char => 'm', + ), +) + supportedtypes = Union{Char, AbstractString, Number} + + # 1. Define feature mapper + function feature_mapper(col, name) + col_type = nonmissingtype(eltype(col)).parameters[1] + feat_levels = levels(col; skipmissing=true) + + # Ensure column type is valid (can't test because never occurs) + # Converting array elements to strings before wrapping in a `CategoricalArray`, as... 
+ if !(col_type <: supportedtypes) + throw(ArgumentError(UNSUPPORTED_COL_TYPE_ME(col_type))) + end + + # Ensure label_for_missing keys are valid types + for possible_col_type in keys(label_for_missing) + if !(possible_col_type in union_types(supportedtypes)) + throw(ArgumentError(VALID_TYPES_NEW_VAL_ME(possible_col_type))) + end + end + + # Check no collision between keys(label_for_missing) and feat_levels + for value in values(label_for_missing) + if !ismissing(value) + if value in feat_levels + throw(ArgumentError(COLLISION_NEW_VAL_ME(value))) + end + end + end + + # Get ancestor type of column + elgrandtype = nothing + for allowed_type in union_types(supportedtypes) + if col_type <: allowed_type + elgrandtype = allowed_type + break + end + end + + # Nonmissing levels remain as is + new_cat_given_col_val = Dict{Missing, col_type}() + + # Missing levels are mapped + if elgrandtype in keys(label_for_missing) + new_cat_given_col_val[missing] = label_for_missing[elgrandtype] + elseif elgrandtype == Number + new_cat_given_col_val[missing] = minimum(feat_levels) - 1 + else + throw(ArgumentError(UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing))) + end + + return new_cat_given_col_val::Dict{Missing, col_type} + end + + # 2. 
Pass it to generic_fit + new_cat_given_col_val, encoded_features = generic_fit( + X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, + ) + cache = Dict( + :new_cat_given_col_val => new_cat_given_col_val, + :encoded_features => encoded_features, + ) + return cache +end + +""" +**Private method.** + +Apply a fitted missingness encoder to a table given the output of `missingness_encoder_fit` + +# Arguments + + - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) + `Multiclass` or `OrderedFactor` + - `cache`: The output of `missingness_encoder_fit` + +# Returns + + - `X_tr`: The table with selected features after the selected features are transformed by missingness encoder +""" +function missingness_encoder_transform(X, cache::Dict) + new_cat_given_col_val = cache[:new_cat_given_col_val] + return generic_transform(X, new_cat_given_col_val; ignore_unknown = true) +end + diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl new file mode 100644 index 0000000..a47e259 --- /dev/null +++ b/test/encoders/missingness_encoding.jl @@ -0,0 +1,156 @@ +using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform + +@testset "Throws errors when needed" begin + @test_throws ArgumentError begin + X = generate_X_with_missingness(;john_name="missing") + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "missing", Char => 'm'), + ) + end + @test_throws ArgumentError begin + X = generate_X_with_missingness() + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "Other", Bool => 'X'), + ) + end + @test_throws ArgumentError begin + X = generate_X_with_missingness() + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "X"), + ) + end +end + + +@testset "Default for Numbers Set Correctly" begin + X = 
generate_X_with_missingness() + cache = missingness_encoder_fit(X) + new_cat_given_col_val = cache[:new_cat_given_col_val] + @test new_cat_given_col_val[:C][missing] == minimum(levels(X.C)) - 1 +end + + +@testset "End-to-end test" begin + X = generate_X_with_missingness() + + cache = missingness_encoder_fit(X; label_for_missing = Dict(AbstractString => "missing-item", Char => 'i', Number => -99)) + X_tr = missingness_encoder_transform(X, cache) + + for col in [:A, :B, :C, :D, :E] + @test issubset(levels(X[col]), levels(X_tr[col])) + end + + @test Set(push!(levels(X[:A]), "missing-item")) == Set(levels(X_tr[:A])) + @test Set(push!(levels(X[:C]), -99)) == Set(levels(X_tr[:C])) + @test Set(push!(levels(X[:E]), 'i')) == Set(levels(X_tr[:E])) + @test levels(X[:B]) == levels(X_tr[:B]) + @test levels(X[:D]) == levels(X_tr[:D]) +end + + +@testset "Missingness Encoder Fit" begin + X = generate_X_with_missingness() + + result = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + )[:new_cat_given_col_val] + + true_output = Dict{Symbol, Dict{Any, Any}}( + :A => Dict([(missing, "MissingOne")]), + :C => Dict([(missing, -90)]), + :E => Dict([(missing, 'X')]), + ) + @test result == true_output +end + +# Redundant because it must work if generic transform work which has been tested before +@testset "Missingness Encoder Transform" begin + X = generate_X_with_missingness() + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + ) + + enc_char = (col, level) -> ismissing(level) ? 'X' : level + enc_num = (col, level) -> ismissing(level) ? -90 : level + enc_str = (col, level) -> ismissing(level) ? 
"MissingOne" : level + enc_idn = (col, level) -> level + + X_tr = missingness_encoder_transform(X, cache) + + target = ( + A = [ + enc_str(X[:A], X[:A][i]) for i in 1:7 + ], + B = [ + enc_idn(X[:B], X[:B][i]) for i in 1:7 + ], + C = [ + enc_num(X[:C], X[:C][i]) for i in 1:7 + ], + D = [ + enc_idn(X[:D], X[:D][i]) for i in 1:7 + ], + E = [ + enc_char(X[:E], X[:E][i]) for i in 1:7 + ] + ) + + @test isequal(target, X_tr) +end + +@testset "Schema doesn't change after transform" begin + X = generate_X_with_missingness() + + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + ) + + X_tr = missingness_encoder_transform(X, cache) + + @test elscitype(X_tr[:A]) <: Multiclass + @test elscitype(X_tr[:B]) <: Union{Missing, Continuous} + @test elscitype(X_tr[:C]) <: Multiclass + @test elscitype(X_tr[:D]) <: Count + @test elscitype(X_tr[:E]) <: Multiclass +end + +@testset "Adding new levels" begin + X = generate_X_with_missingness() + levels!(Tables.getcolumn(X, :A), ["Ben", "John", "Mary", "Max"]) + + cache = missingness_encoder_fit( + X; + label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), + ) + X_tr = missingness_encoder_transform(X, cache) + + @test issubset(levels(X[:A]), levels(X_tr[:A])) # Will have "MissingOne" added +end + +@testset "MLJ Interface Missingness Encoder" begin + X = generate_X_with_missingness() + # functional api + generic_cache = missingness_encoder_fit(X; ignore = true, ordered_factor = false) + X_transf = missingness_encoder_transform(X, generic_cache) + # mlj api + encoder = MissingnessEncoder(ignore = true, ordered_factor = false) + mach = machine(encoder, X) + fit!(mach) + Xnew_transf = MMI.transform(mach, X) + + # same output + @test isequal(X_transf, Xnew_transf) + + # fitted parameters is correct + new_cat_given_col_val = fitted_params(mach).new_cat_given_col_val + @test new_cat_given_col_val == generic_cache[:new_cat_given_col_val] + + # 
Test report + @test report(mach) == (encoded_features = generic_cache[:encoded_features],) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 966ae53..9366562 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,3 +25,4 @@ include("encoders/target_encoding.jl") include("encoders/ordinal_encoding.jl") include("encoders/frequency_encoder.jl") include("transformers/cardinality_reducer.jl") +include("encoders/missingness_encoding.jl") From 77f374048b72fc344673e0c3866b7d5f6c05b860 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 30 Jul 2024 20:09:42 -0500 Subject: [PATCH 2/6] Update src/encoders/missingness_encoding/interface_mlj.jl Co-authored-by: Anthony Blaom, PhD --- src/encoders/missingness_encoding/interface_mlj.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index 081b1c9..1b85424 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -103,7 +103,7 @@ Train the machine using `fit!(mach, rows=...)`. - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - `ignore=true`: Whether to exclude or includes the features given in `features` - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them -- `label_for_missing=Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A +- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value signifies the new level to map into given a column raw super type. 
By default, if the raw type of the column subtypes `AbstractString` then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` From 75418d42cc682605687b1436a1bcfd201cf08e97 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 30 Jul 2024 20:09:49 -0500 Subject: [PATCH 3/6] Update src/encoders/missingness_encoding/missingness_encoding.jl Co-authored-by: Anthony Blaom, PhD --- src/encoders/missingness_encoding/missingness_encoding.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl index 38f1905..e75ab35 100644 --- a/src/encoders/missingness_encoding/missingness_encoding.jl +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -14,7 +14,7 @@ types that are in `Union{Char, AbstractString, Number}`. - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding - `ignore=true`: Whether to exclude or includes the features given in `features` - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them - - `label_for_missing=Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A + - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value signifies the new level to map into given a column raw super type. 
By default, if the raw type of the column subtypes `AbstractString` then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` From c17b5a5706aa43109474fb4283e88ced56cf3d26 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 30 Jul 2024 20:42:47 -0500 Subject: [PATCH 4/6] =?UTF-8?q?=E2=9C=8D=F0=9F=8F=BB=20Modify=20type=20doc?= =?UTF-8?q?string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/encoders/missingness_encoding/errors.jl | 4 ++-- src/encoders/missingness_encoding/interface_mlj.jl | 4 ++-- src/encoders/missingness_encoding/missingness_encoding.jl | 4 ++-- src/transformers/cardinality_reducer/cardinality_reducer.jl | 4 ++-- src/transformers/cardinality_reducer/interface_mlj.jl | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/encoders/missingness_encoding/errors.jl b/src/encoders/missingness_encoding/errors.jl index 784418f..dd5c126 100644 --- a/src/encoders/missingness_encoding/errors.jl +++ b/src/encoders/missingness_encoding/errors.jl @@ -1,7 +1,7 @@ UNSUPPORTED_COL_TYPE_ME(col_type) = - "In MissingnessEncoder, elements have type $(col_type). The supported types are `Union{Char, AbstractString, Number}`" + "In MissingnessEncoder, elements have type $(col_type). The supported types are `Char`, `AbstractString`, and `Number`" VALID_TYPES_NEW_VAL_ME(possible_col_type) = - "In MissingnessEncoder, label_for_missing keys have type $(possible_col_type). The supported types are `Union{Char, AbstractString, Number}`" + "In MissingnessEncoder, label_for_missing keys have type $(possible_col_type). The supported types are `Char`, `AbstractString`, and `Number`" COLLISION_NEW_VAL_ME(value) = "In MissingnessEncoder, label_for_missing specifies new feature name $(value). However, this name already exists in one of the features. Please respecify label_for_missing." 
UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing) = diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index 1b85424..ffeb15f 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -82,7 +82,7 @@ $(MMI.doc_header(MissingnessEncoder)) `MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing"). By this, missingness will be treated as a new level by any subsequent model. This assumes that the categorical features have raw -types that are in `Union{Char, AbstractString, Number}`. +types that are in `Char`, `AbstractString`, and `Number`. # Training data @@ -104,7 +104,7 @@ Train the machine using `fit!(mach, rows=...)`. - `ignore=true`: Whether to exclude or includes the features given in `features` - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A -dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value +dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. 
diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl index e75ab35..c1f2698 100644 --- a/src/encoders/missingness_encoding/missingness_encoding.jl +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -5,7 +5,7 @@ include("errors.jl") Fit a transformer that maps any missing value into a new level (e.g., "Missing"). By this, missingness will be treated as a new level by any subsequent model. This assumes that the categorical features have raw -types that are in `Union{Char, AbstractString, Number}`. +types that are in `Char`, `AbstractString`, and `Number`. # Arguments @@ -15,7 +15,7 @@ types that are in `Union{Char, AbstractString, Number}`. - `ignore=true`: Whether to exclude or includes the features given in `features` - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A - dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value + dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. 
diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl index 4bdf905..1d8f531 100644 --- a/src/transformers/cardinality_reducer/cardinality_reducer.jl +++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl @@ -7,7 +7,7 @@ include("errors.jl") Fit a transformer that maps any level of a categorical feature that occurs with frequency < `min_frequency` into a new level (e.g., "Other"). This is useful when some categorical features have high cardinality and many levels are infrequent. This assumes that the categorical features have raw -types that are in `Union{Char, AbstractString, Number}`. +types that are in `Char`, `AbstractString`, and `Number`. # Arguments @@ -19,7 +19,7 @@ types that are in `Union{Char, AbstractString, Number}`. - `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be an integer or a float which decides whether raw counts or normalized frequencies are used. - `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A - dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies + dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. 
diff --git a/src/transformers/cardinality_reducer/interface_mlj.jl b/src/transformers/cardinality_reducer/interface_mlj.jl index 11537df..31af464 100644 --- a/src/transformers/cardinality_reducer/interface_mlj.jl +++ b/src/transformers/cardinality_reducer/interface_mlj.jl @@ -112,7 +112,7 @@ Train the machine using `fit!(mach, rows=...)`. - `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be an integer or a float which decides whether raw counts or normalized frequencies are used. - `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A -dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies +dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'` and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. 
From 8c0b05a0afd06b712e8e2855c6ce0f8080a6b9d8 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 30 Jul 2024 20:45:53 -0500 Subject: [PATCH 5/6] =?UTF-8?q?=E2=9C=8D=F0=9F=8F=BB=20Rename=20report=20v?= =?UTF-8?q?ariable=20to=20better?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../missingness_encoding/interface_mlj.jl | 8 ++++---- .../missingness_encoding.jl | 18 +++++++++--------- test/encoders/missingness_encoding.jl | 10 +++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index ffeb15f..54cd534 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -28,7 +28,7 @@ end; # 4. Fitted parameters (for user access) MMI.fitted_params(::MissingnessEncoder, fitresult) = ( - new_cat_given_col_val = fitresult, + label_for_missing_given_feature = fitresult, ) # 5. Fit method @@ -40,7 +40,7 @@ function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X) ordered_factor = transformer.ordered_factor, label_for_missing = transformer.label_for_missing, ) - fitresult = generic_cache[:new_cat_given_col_val] + fitresult = generic_cache[:label_for_missing_given_feature] report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features cache = nothing @@ -51,7 +51,7 @@ end; # 6. 
Transform method function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew) generic_cache = Dict( - :new_cat_given_col_val => + :label_for_missing_given_feature => fitresult, ) Xnew_transf = missingness_encoder_transform(Xnew, generic_cache) @@ -119,7 +119,7 @@ and if the raw type subtypes `Number` then the new value is the lowest value in The fields of `fitted_params(mach)` are: -- `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` +- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` # Report diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl index c1f2698..848916c 100644 --- a/src/encoders/missingness_encoding/missingness_encoding.jl +++ b/src/encoders/missingness_encoding/missingness_encoding.jl @@ -22,7 +22,7 @@ types that are in `Char`, `AbstractString`, and `Number`. 
# Returns (in a dict) - - `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` + - `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` - `encoded_features`: The subset of the categorical features of X that were encoded """ function missingness_encoder_fit( @@ -74,26 +74,26 @@ function missingness_encoder_fit( end # Nonmissing levels remain as is - new_cat_given_col_val = Dict{Missing, col_type}() + label_for_missing_given_feature = Dict{Missing, col_type}() # Missing levels are mapped if elgrandtype in keys(label_for_missing) - new_cat_given_col_val[missing] = label_for_missing[elgrandtype] + label_for_missing_given_feature[missing] = label_for_missing[elgrandtype] elseif elgrandtype == Number - new_cat_given_col_val[missing] = minimum(feat_levels) - 1 + label_for_missing_given_feature[missing] = minimum(feat_levels) - 1 else throw(ArgumentError(UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing))) end - return new_cat_given_col_val::Dict{Missing, col_type} + return label_for_missing_given_feature::Dict{Missing, col_type} end # 2. 
Pass it to generic_fit - new_cat_given_col_val, encoded_features = generic_fit( + label_for_missing_given_feature, encoded_features = generic_fit( X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper, ) cache = Dict( - :new_cat_given_col_val => new_cat_given_col_val, + :label_for_missing_given_feature => label_for_missing_given_feature, :encoded_features => encoded_features, ) return cache @@ -115,7 +115,7 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e - `X_tr`: The table with selected features after the selected features are transformed by missingness encoder """ function missingness_encoder_transform(X, cache::Dict) - new_cat_given_col_val = cache[:new_cat_given_col_val] - return generic_transform(X, new_cat_given_col_val; ignore_unknown = true) + label_for_missing_given_feature = cache[:label_for_missing_given_feature] + return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true) end diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl index a47e259..4bcc306 100644 --- a/test/encoders/missingness_encoding.jl +++ b/test/encoders/missingness_encoding.jl @@ -28,8 +28,8 @@ end @testset "Default for Numbers Set Correctly" begin X = generate_X_with_missingness() cache = missingness_encoder_fit(X) - new_cat_given_col_val = cache[:new_cat_given_col_val] - @test new_cat_given_col_val[:C][missing] == minimum(levels(X.C)) - 1 + label_for_missing_given_feature = cache[:label_for_missing_given_feature] + @test label_for_missing_given_feature[:C][missing] == minimum(levels(X.C)) - 1 end @@ -57,7 +57,7 @@ end result = missingness_encoder_fit( X; label_for_missing = Dict(AbstractString => "MissingOne", Char => 'X', Number => -90), - )[:new_cat_given_col_val] + )[:label_for_missing_given_feature] true_output = Dict{Symbol, Dict{Any, Any}}( :A => Dict([(missing, "MissingOne")]), @@ -148,8 +148,8 @@ end @test isequal(X_transf, Xnew_transf) # 
fitted parameters is correct - new_cat_given_col_val = fitted_params(mach).new_cat_given_col_val - @test new_cat_given_col_val == generic_cache[:new_cat_given_col_val] + label_for_missing_given_feature = fitted_params(mach).label_for_missing_given_feature + @test label_for_missing_given_feature == generic_cache[:label_for_missing_given_feature] # Test report @test report(mach) == (encoded_features = generic_cache[:encoded_features],) From c30e7d7d3abd1acbcff7a7a6aeea1d53f769efa3 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 30 Jul 2024 20:47:59 -0500 Subject: [PATCH 6/6] =?UTF-8?q?=E2=9C=85=20Fix=20MLJ=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/encoders/missingness_encoding/interface_mlj.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index 54cd534..d39228e 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -142,7 +142,7 @@ Xm = ( E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p']) ) -encoder = MissingnessEncoder(ordered_factor = false) +encoder = MissingnessEncoder() mach = fit!(machine(encoder, Xm)) Xnew = transform(mach, Xm)