diff --git a/.gitignore b/.gitignore
index 9df13d3..8d91ae8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl
diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
index 39eee4f..928f45a 100644
--- a/src/encoders/frequency_encoding/frequency_encoding.jl
+++ b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -24,11 +24,15 @@ function frequency_encoder_fit(
     ignore::Bool = true,
     ordered_factor::Bool = false,
     normalize::Bool = false,
+    output_type::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+        feat_levels = levels(col)
+        statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
+            level => frequency_map[level] for level in feat_levels
+        )
         return statistic_given_feat_val
     end
diff --git a/src/encoders/frequency_encoding/interface_mlj.jl b/src/encoders/frequency_encoding/interface_mlj.jl
index 89bd88b..1e477b2 100644
--- a/src/encoders/frequency_encoding/interface_mlj.jl
+++ b/src/encoders/frequency_encoding/interface_mlj.jl
@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
     ignore::Bool
     ordered_factor::Bool
     normalize::Bool
+    output_type::Type
 end;

 # 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
     ignore = true,
     ordered_factor = false,
     normalize = false,
+    output_type = Float32,
 )
-    return FrequencyEncoder(features, ignore, ordered_factor, normalize)
+    return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
 end;

@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
         ignore = transformer.ignore,
         ordered_factor = transformer.ordered_factor,
         normalize = transformer.normalize,
+        output_type = transformer.output_type,
     )
     fitresult = generic_cache[:statistic_given_feat_val]

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
 - `ignore=true`: Whether to exclude or include the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
 - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+- `output_type=Float32`: The numerical type of the encoded values; any type that can hold the frequency values (e.g., `Float64`) may be used.

 # Operations
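Usage sketch (not part of the diff): the new `output_type` hyper-parameter on `FrequencyEncoder`, exercised through the MLJ machine workflow used in the test suite. The toy data and the `Float64` choice are illustrative, not from the PR.

```julia
using MLJBase, CategoricalArrays, MLJTransforms

X = (color = categorical(["red", "blue", "red", "green"]),)

# Store the frequencies as Float64 instead of the Float32 default
encoder = FrequencyEncoder(normalize = true, output_type = Float64)
mach = fit!(machine(encoder, X))
Xnew = MLJBase.transform(mach, X)
eltype(Xnew.color)  # expected: Float64
```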
diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl
index 4d3bf2e..0e4e39a 100644
--- a/src/encoders/missingness_encoding/missingness_encoding.jl
+++ b/src/encoders/missingness_encoding/missingness_encoding.jl
@@ -30,7 +30,7 @@ function missingness_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    label_for_missing::Dict{<:Type, <:Any} = Dict( 
+    label_for_missing::Dict{<:Type, <:Any} = Dict(
         AbstractString => "missing",
         Char => 'm',
     ),
@@ -40,8 +40,8 @@
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        col_type = nonmissingtype(eltype(col)).parameters[1]
-        feat_levels = levels(col; skipmissing=true)
+        feat_levels = levels(col; skipmissing = true)
+        col_type = nonmissingtype(eltype(feat_levels))

         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@
         # Check no collision between keys(label_for_missing) and feat_levels
         for value in values(label_for_missing)
-            if !ismissing(value) 
+            if !ismissing(value)
                 if value in feat_levels
                     throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
                 end
@@ -73,7 +73,7 @@
                 break
             end
         end
-        
+

         # Nonmissing levels remain as is
         label_for_missing_given_feature = Dict{Missing, col_type}()
@@ -91,7 +91,8 @@
     # 2. Pass it to generic_fit
     label_for_missing_given_feature, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
     )
     cache = Dict(
         :label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
 """
 function missingness_encoder_transform(X, cache::Dict)
     label_for_missing_given_feature = cache[:label_for_missing_given_feature]
-    return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
+    return generic_transform(
+        X,
+        label_for_missing_given_feature;
+        ignore_unknown = true,
+        ensure_categorical = true,
+    )
 end
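Behaviour sketch (illustrative, using the internal fit/transform pair exercised in the tests): with `ensure_categorical = true`, the missingness encoder opts out of the new `unwrap` path in `generic_transform`, so a categorical input column stays categorical; `missing` entries are replaced by the documented `"missing"` label for string columns.

```julia
using CategoricalArrays
using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform

Xm = (city = categorical(["Rome", missing, "Cairo"]),)

cache = missingness_encoder_fit(Xm)
Xt = missingness_encoder_transform(Xm, cache)

Xt.city isa CategoricalArray  # expected: true, thanks to ensure_categorical
levels(Xt.city)               # expected to include the "missing" label
```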
diff --git a/src/encoders/ordinal_encoding/interface_mlj.jl b/src/encoders/ordinal_encoding/interface_mlj.jl
index c6b32cf..86549d5 100644
--- a/src/encoders/ordinal_encoding/interface_mlj.jl
+++ b/src/encoders/ordinal_encoding/interface_mlj.jl
@@ -5,6 +5,7 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
     features::AS
     ignore::Bool
     ordered_factor::Bool
+    output_type::Type
 end;

 # 2. Constructor
@@ -12,8 +13,9 @@ function OrdinalEncoder(;
     features = Symbol[],
     ignore = true,
     ordered_factor = false,
+    output_type = Float32,
 )
-    return OrdinalEncoder(features, ignore, ordered_factor)
+    return OrdinalEncoder(features, ignore, ordered_factor, output_type)
 end;

@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
         transformer.features;
         ignore = transformer.ignore,
         ordered_factor = transformer.ordered_factor,
+        output_type = transformer.output_type,
     )
     fitresult = generic_cache[:index_given_feat_level]

@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
 - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
 - `ignore=true`: Whether to exclude or includes the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+- `output_type=Float32`: The numerical concrete type of the encoded features.

 # Operations
diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl
index 4afff9d..c3c7d0a 100644
--- a/src/encoders/ordinal_encoding/ordinal_encoding.jl
+++ b/src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
 - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
 - `ignore=true`: Whether to exclude or includes the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-
+- `output_type=Float32`: The numerical concrete type of the encoded features.

 # Returns (in a dict)
 - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
+    output_type::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         feat_levels = levels(col)
         index_given_feat_val =
-            Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
+            Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
         return index_given_feat_val
     end
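Sketch of the `output_type` knob on `OrdinalEncoder` (toy data; mirrors the new tests). With `Int32`, level indices materialize as concrete integers and the encoded column's scitype becomes `Count` rather than `Continuous`:

```julia
using MLJBase, CategoricalArrays, MLJTransforms

X = (size = categorical(["S", "M", "L", "M"]),)

encoder = OrdinalEncoder(output_type = Int32)
mach = fit!(machine(encoder, X))
Xnew = MLJBase.transform(mach, X)
eltype(Xnew.size)  # expected: Int32, one index per level
```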
diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl
index e7fa859..6dae479 100644
--- a/src/encoders/target_encoding/target_encoding.jl
+++ b/src/encoders/target_encoding/target_encoding.jl
@@ -166,8 +166,9 @@ function target_encoder_fit(

     # 3. Define function to compute the new value(s) for each level given a column
     function feature_mapper(col, name)
+        feat_levels = levels(col)
         y_stat_given_feat_level_for_col =
-            Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
+            Dict{eltype(feat_levels), Any}()
         for level in levels(col)
             # Get the targets of an example that belong to this level
             targets_for_level = y[col.==level]
diff --git a/src/generic.jl b/src/generic.jl
index 3acc398..80b2d24 100644
--- a/src/generic.jl
+++ b/src/generic.jl
@@ -49,11 +49,13 @@ function generic_fit(X,
         feat_col = Tables.getcolumn(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
-            feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+            feat_type <: Union{Missing, Multiclass} ||
+            (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
         if feat_has_allowed_type    # then should be encoded
             push!(encoded_features, feat_name)
             # Compute the dict using the given feature_mapper function
-            mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+            mapping_per_feat_level[feat_name] =
+                feature_mapper(feat_col, feat_name, args...; kwargs...)
         end
     end
     return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)
     new_column_names = []
     while conflict
-        suffix = repeat("_", count) 
+        suffix = repeat("_", count)
         new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
         conflict = any(name -> name in existing_names, new_column_names)
         count += 1
@@ -85,22 +87,29 @@ end
 """
 **Private method.**

-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in 
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
 a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-  into a scalar (single_feat=true)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a scalar (single_feat=true)

-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-  into a set of k features where k is the length of the vector (single_feat=false)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a set of k features where k is the length of the vector (single_feat=false)

  - In both cases it attempts to preserve the type of the table.
  - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-   assumption is necessary because any column in X must correspond to a constant number of features 
+   assumption is necessary because any column in X must correspond to a constant number of features
    in the output table (which is equal to k).
  - Features not in the dictionary are mapped to themselves (i.e., not changed).
- - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+ - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
+ - If `ensure_categorical` is true, any input categorical column will remain categorical.
"""
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+    X,
+    mapping_per_feat_level;
+    single_feat = true,
+    ignore_unknown = false,
+    ensure_categorical = false,
+)
     feat_names = Tables.schema(X).names
     new_feat_names = Symbol[]
     new_cols = []
@@ -115,18 +124,25 @@
         if !issubset(test_levels, train_levels)
             # get the levels in test that are not in train
             lost_levels = setdiff(test_levels, train_levels)
-            error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+            error(
+                "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+            )
         end
     end
-    
+

     if single_feat
         level2scalar = mapping_per_feat_level[feat_name]
-        new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+        if ensure_categorical
+            new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+        else
+            new_col = !isempty(level2scalar) ?
+                unwrap.(recode(col, level2scalar...)) : col
+        end
+
         push!(new_cols, new_col)
         push!(new_feat_names, feat_name)
     else
         level2vector = mapping_per_feat_level[feat_name]
-        new_multi_col = map(x->get(level2vector, x, x), col)
+        new_multi_col = map(x -> get(level2vector, x, x), col)
         new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
         push!(new_cols, new_multi_col...)
@@ -144,8 +160,8 @@
         end
     end

-    transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
     # Attempt to preserve table type
     transformed_X = Tables.materializer(X)(transformed_X)
     return transformed_X
-end
\ No newline at end of file
+end
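The `ensure_categorical` branch above hinges on `recode` semantics: given a `CategoricalArray`, `recode` returns another `CategoricalArray`, so numeric encoders need the extra `unwrap.` broadcast to obtain a plain vector. A standalone illustration with a toy mapping:

```julia
using CategoricalArrays

col = categorical(["a", "b", "a"])
mapping = Dict("a" => 1.0f0, "b" => 2.0f0)

recode(col, mapping...)           # CategoricalArray with Float32 values
unwrap.(recode(col, mapping...))  # plain Vector{Float32}
```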
diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl
index 18ca84c..0ad9d5c 100644
--- a/src/transformers/cardinality_reducer/cardinality_reducer.jl
+++ b/src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -35,20 +35,20 @@ function cardinality_reducer_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    min_frequency::Real = 3, 
-    label_for_infrequent::Dict{<:Type, <:Any} = Dict( 
+    min_frequency::Real = 3,
+    label_for_infrequent::Dict{<:Type, <:Any} = Dict(
         AbstractString => "Other",
         Char => 'O',
     ),
-) 
+)
     supportedtypes_list = [Char, AbstractString, Number]
     supportedtypes = Union{supportedtypes_list...}

     # 1. Define feature mapper
     function feature_mapper(col, name)
         val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-        col_type = eltype(col).parameters[1]
         feat_levels = levels(col)
+        col_type = eltype(feat_levels)

         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -88,7 +88,11 @@
             elseif elgrandtype == Number
                 new_cat_given_col_val[level] = minimum(feat_levels) - 1
             else
-                throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
+                throw(
+                    ArgumentError(
+                        UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
+                    ),
+                )
             end
         end
@@ -98,7 +102,8 @@
     # 2. Pass it to generic_fit
     new_cat_given_col_val, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
     )
     cache = Dict(
         :new_cat_given_col_val => new_cat_given_col_val,
@@ -125,5 +130,5 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r
 """
 function cardinality_reducer_transform(X, cache::Dict)
     new_cat_given_col_val = cache[:new_cat_given_col_val]
-    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
+    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
 end
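Behaviour sketch for the reducer after this change (illustrative data): levels seen fewer than `min_frequency` times collapse into the default `label_for_infrequent` entry for strings, and `ensure_categorical = true` keeps the output column categorical.

```julia
using MLJBase, CategoricalArrays, MLJTransforms

X = (tag = categorical([fill("common", 10); "rare1"; "rare2"]),)

reducer = CardinalityReducer(min_frequency = 3)
mach = fit!(machine(reducer, X))
Xnew = MLJBase.transform(mach, X)

Xnew.tag isa CategoricalArray  # expected: true
levels(Xnew.tag)               # "rare1" and "rare2" collapse into "Other"
```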
diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
index fa110c7..d58f76c 100644
--- a/test/encoders/contrast_encoder.jl
+++ b/test/encoders/contrast_encoder.jl
@@ -51,9 +51,7 @@
     cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
     k = length(levels(X.name))
     contrast_matrix = get_dummy_contrast(k)
-    print()
     for (i, level) in enumerate(levels(X.name))
-        println(cache[:vector_given_value_given_feature])
         @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
     end
 end
@@ -289,4 +287,40 @@

     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
-end
\ No newline at end of file
+end
+
+
+@testset "Test Contrast Encoder Output Types" begin
+    X = (
+        name = categorical(["Ben", "John", "Mary", "John"]),
+        height = [1.85, 1.67, 1.5, 1.67],
+        favnum = categorical([7, 5, 10, 1]),
+        age = [23, 23, 14, 23],
+    )
+
+    methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+    matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+
+    for (i, method) in enumerate(methods)
+        encoder = ContrastEncoder(
+            features = [:name, :favnum],
+            ignore = false,
+            mode = method,
+            buildmatrix = matrix_func[i],
+        )
+        mach = fit!(machine(encoder, X))
+        Xnew = MMI.transform(mach, X)

+        # Test consistency of the types
+        scs = schema(Xnew).scitypes
+        ts = schema(Xnew).types
+
+        # Check scitypes for previously continuous or categorical features
+        @test all(scs[1:end-1] .== Continuous)
+        @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+        # Check scitype and type for the previously Count feature
+        last_type, last_sctype = ts[end], scs[end]
+        @test last_type <: Integer && isconcretetype(last_type)
+        @test last_sctype <: Count
+    end
+end
\ No newline at end of file
diff --git a/test/encoders/frequency_encoder.jl b/test/encoders/frequency_encoder.jl
index 555b9f1..b9ba90f 100644
--- a/test/encoders/frequency_encoder.jl
+++ b/test/encoders/frequency_encoder.jl
@@ -9,7 +9,8 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
     for norm in normalize
         result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
         enc =
-            (col, level) -> ((norm) ? sum(col .== level) / length(col) : sum(col .== level))
+            (col, level) ->
+                Float32((norm) ? sum(col .== level) / length(col) : sum(col .== level))
         true_output = Dict{Symbol, Dict{Any, Any}}(
             :F => Dict(
                 "m" => enc(F_col, "m"),
@@ -44,7 +45,7 @@
         X_tr = frequency_encoder_transform(X, cache)
         enc =
             (col, level) ->
-                ((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
+                Float32((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
         target = (
             A = [enc(:A, X[:A][i]) for i in 1:10],
@@ -81,4 +82,39 @@
     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
     end
-end
\ No newline at end of file
+end
+
+@testset "Test Frequency Encoding Output Types" begin
+    # Define categorical features
+    A = ["g", "b", "g", "r", "r"]
+    B = [1.0, 2.0, 3.0, 4.0, 5.0]
+    C = ["f", "f", "f", "m", "f"]
+    D = [true, false, true, false, true]
+    E = [1, 2, 3, 4, 5]
+
+    # Combine into a named tuple
+    X = (A = A, B = B, C = C, D = D, E = E)
+
+    # Coerce A, C, D to Multiclass, B to Continuous and E to OrderedFactor
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Continuous,
+        :C => Multiclass,
+        :D => Multiclass,
+        :E => OrderedFactor,
+    )
+
+    encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitype correctness
+    @test all(scs[1:end-1] .== Continuous)
+    @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+    # The ordered factor column should be left intact
+    @test scs[end] === schema(X).scitypes[end]
+    @test ts[end] == schema(X).types[end]
+end
diff --git a/test/encoders/missingness_encoding.jl b/test/encoders/missingness_encoding.jl
index 4bcc306..d01d90f 100644
--- a/test/encoders/missingness_encoding.jl
+++ b/test/encoders/missingness_encoding.jl
@@ -153,4 +153,30 @@

     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
-end
\ No newline at end of file
+end
+
+
+
+@testset "Test Missingness Encoder Output Types" begin
+    # Define a table with missing values
+    Xm = (
+        A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]),
+        B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing],
+        C = categorical([7, 5, missing, missing, 10, 0, missing]),
+        D = categorical([23, 23, 44, 66, 14, 23, missing], ordered = true),
+        E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p']),
+    )
+
+    encoder = MissingnessEncoder()
+    mach = fit!(machine(encoder, Xm))
+    Xnew = MMI.transform(mach, Xm)
+
+    scs = schema(Xnew).scitypes
+    for (i, type) in enumerate(schema(Xm).scitypes)
+        if nonmissingtype(type) <: Multiclass
+            @test scs[i] <: Multiclass
+        else
+            @test scs[i] == type
+        end
+    end
+end
diff --git a/test/encoders/ordinal_encoding.jl b/test/encoders/ordinal_encoding.jl
index a631830..e5a5d26 100644
--- a/test/encoders/ordinal_encoding.jl
+++ b/test/encoders/ordinal_encoding.jl
@@ -82,4 +82,52 @@
     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
     end
-end
\ No newline at end of file
+end
+
+
+@testset "Test Ordinal Encoding Types" begin
+    # Define categorical features
+    A = ["g", "b", "g", "r", "r"]
+    B = [1.0, 2.0, 3.0, 4.0, 5.0]
+    C = ["f", "f", "f", "m", "f"]
+    D = [true, false, true, false, true]
+    E = [1, 2, 3, 4, 5]
+
+    # Combine into a named tuple
+    X = (A = A, B = B, C = C, D = D, E = E)
+
+    # Coerce A, B, C to Multiclass, D to Continuous and E to OrderedFactor
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Multiclass,
+        :C => Multiclass,
+        :D => Continuous,
+        :E => OrderedFactor,
+    )
+
+    encoder = OrdinalEncoder(ordered_factor = false)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitypes for previously continuous or categorical features
+    @test all(scs[1:end-1] .== Continuous)
+    @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+    # Check that the last column did not change
+    @test scs[end] === schema(X).scitypes[end]
+
+    ## Int32 case
+    encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitypes for previously categorical features
+    @test all(scs[1:end-2] .== Count)
+    @test all(t -> (t <: Integer) && isconcretetype(t), ts[1:end-2])
+    # Check the rest of the types
+    @test scs[end-1:end] == schema(X).scitypes[end-1:end]
+    @test ts[end-1:end] == schema(X).types[end-1:end]
+end
diff --git a/test/encoders/target_encoding.jl b/test/encoders/target_encoding.jl
index 4e6b0be..946379e 100644
--- a/test/encoders/target_encoding.jl
+++ b/test/encoders/target_encoding.jl
@@ -345,4 +345,44 @@
     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
     end
-end
\ No newline at end of file
+end
+
+
+
+@testset "Test Target Encoding Types" begin
+    # Define categorical features
+    A = ["g", "b", "g", "r", "r"]
+    B = [1.0, 2.0, 3.0, 4.0, 5.0]
+    C = ["f", "f", "f", "m", "f"]
+    D = [true, false, true, false, true]
+    E = [1, 2, 3, 4, 5]
+
+    # Define the target variable
+    y = ["c1", "c2", "c3", "c1", "c2"]
+
+    # Combine into a named tuple
+    X = (A = A, B = B, C = C, D = D, E = E)
+
+    # Coerce A, C, D to Multiclass, B to Continuous and E to OrderedFactor
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Continuous,
+        :C => Multiclass,
+        :D => Multiclass,
+        :E => OrderedFactor,
+    )
+    y = coerce(y, Multiclass)
+
+    encoder = TargetEncoder(ordered_factor = false, lambda = 1.0, m = 0)
+    mach = fit!(machine(encoder, X, y))
+    Xnew = MMI.transform(mach, X)
+
+    scs = schema(Xnew).scitypes
+    ts = schema(Xnew).types
+    # Check scitypes for previously continuous or categorical features
+    @test all(scs[1:end-1] .== Continuous)
+    @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+    @test scs[end] === schema(X).scitypes[end]
+    @test ts[end] == schema(X).types[end]
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index b867435..d8b0f5a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,7 +15,7 @@ using StatsModels

 # Other transformers
 using Tables, CategoricalArrays
-using ScientificTypes: scitype
+using ScientificTypes: scitype, schema
 using Statistics
 using StableRNGs
 stable_rng = StableRNGs.StableRNG(123)
diff --git a/test/transformers/cardinality_reducer.jl b/test/transformers/cardinality_reducer.jl
index 4763683..e0d0dc3 100644
--- a/test/transformers/cardinality_reducer.jl
+++ b/test/transformers/cardinality_reducer.jl
@@ -190,5 +190,28 @@
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
 end

+
+@testset "Test Cardinality Reducer Output Types" begin
+    # Define categorical features
+    A = [["a" for i in 1:100]..., "b", "b", "b", "c", "d"]
+    B = [[0 for i in 1:100]..., 1, 2, 3, 4, 4]
+
+    # Combine into a named tuple
+    X = (A = A, B = B)
+
+    # Coerce both A and B to Multiclass
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Multiclass,
+    )
+
+    encoder = CardinalityReducer(ordered_factor = false, min_frequency = 3)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+    @test schema(X).types == schema(Xnew).types
+    @test all(s -> (s <: Multiclass), schema(Xnew).scitypes)
+end
+
+
 # Look into MLJModelInterfaceTest
 # Add tests to ensure categorical feature properties are as expected
\ No newline at end of file
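The type guarantees tested above are what make the transformers composable. A hedged end-to-end sketch (assuming MLJBase's `|>` pipeline syntax applies to these models as to any other unsupervised models):

```julia
using MLJBase, CategoricalArrays, MLJTransforms

X = (tag = categorical([fill("common", 10); "rare1"; "rare2"]),)

# CardinalityReducer keeps its output categorical (ensure_categorical = true),
# so OrdinalEncoder can consume it directly and emit concrete Int32 codes
pipe = CardinalityReducer(min_frequency = 3) |> OrdinalEncoder(output_type = Int32)
mach = fit!(machine(pipe, X))
Xnew = MLJBase.transform(mach, X)
eltype(Xnew.tag)  # expected: Int32 (scitype Count)
```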