diff --git a/src/common_docs.jl b/src/common_docs.jl index 46bff23..1a3a401 100644 --- a/src/common_docs.jl +++ b/src/common_docs.jl @@ -1,16 +1,18 @@ const X_doc = """ -- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) - `Multiclass` or `OrderedFactor` +- X: A table where the elements of the categorical features have + [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or + `OrderedFactor` """ const X_doc_mlj = """ - `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must - have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to - check scitypes. + have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to + check scitypes. """ const features_doc = """ -- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding, - according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol), - or a callable that returns true for features to be included/excluded +- features=[]: A list of names of categorical features given as symbols to exclude or + include from encoding, according to the value of `ignore`, or a single symbol (which is + treated as a vector with one symbol), or a callable that returns true for features to be + included/excluded. 
""" const ignore_doc = """ - ignore=true: Whether to exclude or include the features given in `features` @@ -24,4 +26,3 @@ const encoded_features_doc = """ const cache_doc = """ - `cache`: The output of `contrast_encoder_fit` """ - diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl index a42cd00..7fd5a0c 100644 --- a/src/encoders/contrast_encoder/interface_mlj.jl +++ b/src/encoders/contrast_encoder/interface_mlj.jl @@ -73,10 +73,10 @@ MMI.metadata_model( """ $(MMI.doc_header(ContrastEncoder)) -`ContrastEncoder` implements the following contrast encoding methods for -categorical features: dummy, sum, backward/forward difference, and Helmert coding. -More generally, users can specify a custom contrast or hypothesis matrix, and each feature -can be encoded using a different method. +`ContrastEncoder` implements the following contrast encoding methods for categorical +features: dummy, sum, backward/forward difference, and Helmert coding. More generally, +users can specify a custom contrast or hypothesis matrix, and each feature can be encoded +using a different method. # Training data @@ -93,26 +93,36 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters $features_doc -- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. -If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different -contrast encoding scheme for each feature -- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, -where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or -hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`. 
+ +- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, + `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. If `ignore=false` + (features to be encoded are listed explicitly in `features`), then this can be a vector + of the same length as `features` to specify a different contrast encoding scheme for + each feature + +- `buildmatrix=nothing`: A function or other callable with signature + `buildmatrix(colname, k)`, where `colname` is the name of the feature levels and `k` is + its length, and which returns a contrast or hypothesis matrix with row/column ordering + consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or + `:hypothesis`. + $ignore_doc + $ordered_factor_doc # Operations -- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and - return the new table. Features that are neither `Multiclass` nor `OrderedFactor` - are always left unchanged. +- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or + `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new + table. Features that are neither `Multiclass` nor `OrderedFactor` are always left + unchanged. # Fitted parameters The fields of `fitted_params(mach)` are: -- `vector_given_value_given_feature`: A dictionary that maps each level for each column in a subset of the categorical features of X into its frequency. +- `vector_given_value_given_feature`: A dictionary that maps each level for each column in + a subset of the categorical features of X into its frequency. 
# Report @@ -138,7 +148,7 @@ schema(X) encoder = ContrastEncoder( features = [:name, :favnum], - ignore = false, + ignore = false, mode = [:dummy, :helmert], ) mach = fit!(machine(encoder, X)) @@ -157,4 +167,4 @@ julia > Xnew See also [`OneHotEncoder`](@ref) """ -ContrastEncoder \ No newline at end of file +ContrastEncoder diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl index 7c08f41..2d4f981 100644 --- a/src/encoders/missingness_encoding/interface_mlj.jl +++ b/src/encoders/missingness_encoding/interface_mlj.jl @@ -77,10 +77,10 @@ MMI.metadata_model( """ $(MMI.doc_header(MissingnessEncoder)) -`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing"). -By this, missingness will be treated as a new -level by any subsequent model. This assumes that the categorical features have raw -types that are in `Char`, `AbstractString`, and `Number`. +`MissingnessEncoder` maps any missing level of a categorical feature into a new level +(e.g., "Missing"). By this, missingness will be treated as a new level by any subsequent +model. This assumes that the categorical features have raw types that are in `Char`, +`AbstractString`, and `Number`. # Training data @@ -97,25 +97,32 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters $features_doc + $ignore_doc + $ordered_factor_doc -- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A -dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value -signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` -then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'` -and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. 
+ +- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => + 'm', )`: A dictionary where the possible values for keys are the types in `Char`, + `AbstractString`, and `Number` and where each value signifies the new level to map into + given a column raw super type. By default, if the raw type of the column subtypes + `AbstractString` then missing values will be replaced with `"missing"` and if the raw + type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number` + then the new value is the lowest value in the column - 1. # Operations -- `transform(mach, Xnew)`: Apply cardinality reduction to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and - return the new table. Features that are neither `Multiclass` nor `OrderedFactor` - are always left unchanged. +- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or + `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new + table. Features that are neither `Multiclass` nor `OrderedFactor` are always left + unchanged. 
# Fitted parameters The fields of `fitted_params(mach)` are: -- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing` +- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` + into some value according to `label_for_missing` # Report @@ -154,4 +161,4 @@ julia> Xnew See also [`CardinalityReducer`](@ref) """ -MissingnessEncoder \ No newline at end of file +MissingnessEncoder diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl index 3b9443b..36e9b82 100644 --- a/src/encoders/target_encoding/interface_mlj.jl +++ b/src/encoders/target_encoding/interface_mlj.jl @@ -49,7 +49,7 @@ struct TargetEncoderResult{ } <: MMI.MLJType # target statistic for each level of each categorical feature y_stat_given_feat_level::Dict{A, A} - task::S # "Regression", "Classification" + task::S # "Regression", "Classification" num_classes::I # num_classes in case of classification y_classes::A # y_classes in case of classification @@ -120,7 +120,7 @@ MMI.target_in_fit(::Type{<:TargetEncoder}) = true """ $(MMI.doc_header(TargetEncoder)) -`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables +`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables into continuous ones using statistics from the target variable. 
# Training data @@ -133,34 +133,42 @@ Here: $X_doc_mlj -- `y` is the target, which can be any `AbstractVector` whose element - scitype is `Continuous` or `Count` for regression problems and - `Multiclass` or `OrderedFactor` for classification problems; check the scitype with `schema(y)` +- `y` is the target, which can be any `AbstractVector` whose element scitype is + `Continuous` or `Count` for regression problems and `Multiclass` or `OrderedFactor` for + classification problems; check the scitype with `schema(y)` Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters $features_doc + $ignore_doc + $ordered_factor_doc -- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as described in [1] -- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto` then m will be computed using - empirical Bayes estimation as described in [1] + +- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as + described in [1] + +- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto` + then m will be computed using empirical Bayes estimation as described in [1] # Operations -- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and - return the new table. Features that are neither `Multiclass` nor `OrderedFactor` - are always left unchanged. +- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or + `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new + table. Features that are neither `Multiclass` nor `OrderedFactor` are always left + unchanged. # Fitted parameters The fields of `fitted_params(mach)` are: - `task`: Whether the task is `Classification` or `Regression` -- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode each categorical feature. 
It maps each - level in each categorical feature to a statistic computed over the target. + +- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode + each categorical feature. It maps each level in each categorical feature to a statistic + computed over the target. # Report @@ -174,13 +182,13 @@ $encoded_features_doc using MLJ # Define categorical features -A = ["g", "b", "g", "r", "r",] +A = ["g", "b", "g", "r", "r",] B = [1.0, 2.0, 3.0, 4.0, 5.0,] -C = ["f", "f", "f", "m", "f",] +C = ["f", "f", "f", "m", "f",] D = [true, false, true, false, true,] E = [1, 2, 3, 4, 5,] -# Define the target variable +# Define the target variable y = ["c1", "c2", "c3", "c1", "c2",] # Combine into a named tuple @@ -219,11 +227,11 @@ julia > schema(Xnew) ``` # Reference -[1] Micci-Barreca, Daniele. - “A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems” +[1] Micci-Barreca, Daniele. + “A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems” SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. See also [`OneHotEncoder`](@ref) """ -TargetEncoder \ No newline at end of file +TargetEncoder diff --git a/src/transformers/cardinality_reducer/interface_mlj.jl b/src/transformers/cardinality_reducer/interface_mlj.jl index 201d268..4e21ddc 100644 --- a/src/transformers/cardinality_reducer/interface_mlj.jl +++ b/src/transformers/cardinality_reducer/interface_mlj.jl @@ -88,10 +88,10 @@ MMI.metadata_model( """ $(MMI.doc_header(CardinalityReducer)) -`CardinalityReducer` maps any level of a categorical feature that occurs with -frequency < `min_frequency` into a new level (e.g., "Other"). This is useful when some categorical features have -high cardinality and many levels are infrequent. This assumes that the categorical features have raw -types that are in `Union{AbstractString, Char, Number}`. 
+`CardinalityReducer` maps any level of a categorical feature that occurs with frequency `< +min_frequency` into a new level (e.g., "Other"). This is useful when some categorical +features have high cardinality and many levels are infrequent. This assumes that the +categorical features have raw types that are in `Union{AbstractString, Char, Number}`. # Training data @@ -109,28 +109,36 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters $features_doc + $ignore_doc + $ordered_factor_doc -- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be -an integer or a float which decides whether raw counts or normalized frequencies are used. -- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A -dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies -the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString` -then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'` -and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1. + +- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < + `min_frequency` will be mapped to a new level. Could be an integer or a float which + decides whether raw counts or normalized frequencies are used. + +- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => + 'O', )`: A dictionary where the possible values for keys are the types in `Char`, + `AbstractString`, and `Number` and each value signifies the new level to map into given + a column raw super type. 
By default, if the raw type of the column subtypes + `AbstractString` then the new value is `"Other"` and if the raw type subtypes `Char` + then the new value is `'O'` and if the raw type subtypes `Number` then the new value is + the lowest value in the column - 1. # Operations -- `transform(mach, Xnew)`: Apply cardinality reduction to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and - return the new table. Features that are neither `Multiclass` nor `OrderedFactor` - are always left unchanged. +- `transform(mach, Xnew)`: Apply cardinality reduction to selected `Multiclass` or + `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new + table. Features that are neither `Multiclass` nor `OrderedFactor` are always left + unchanged. # Fitted parameters The fields of `fitted_params(mach)` are: -- `new_cat_given_col_val`: A dictionary that maps each level in a - categorical feature to a new level (either itself or the new level specified in `label_for_infrequent`) +- `new_cat_given_col_val`: A dictionary that maps each level in a categorical feature to a + new level (either itself or the new level specified in `label_for_infrequent`) # Report @@ -176,4 +184,4 @@ Dict{CategoricalArrays.CategoricalValue{Int64, UInt32}, Float64} with 2 entries: See also [`FrequencyEncoder`](@ref) """ -CardinalityReducer \ No newline at end of file +CardinalityReducer