Merged
1 change: 1 addition & 0 deletions .gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
.DS_Store
/*.jl
scratchpad/
+examples/test.jl
6 changes: 5 additions & 1 deletion src/encoders/frequency_encoding/frequency_encoding.jl
@@ -24,11 +24,15 @@ function frequency_encoder_fit(
ignore::Bool = true,
ordered_factor::Bool = false,
normalize::Bool = false,
+output_type::Type = Float32,
)
# 1. Define feature mapper
function feature_mapper(col, name)
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+feat_levels = levels(col)
+statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
+level => frequency_map[level] for level in feat_levels
+)
return statistic_given_feat_val
end

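A minimal sketch of what this change buys, assuming `frequency_encoder_fit` is in scope; the data is made up and the results are the expected ones, not verified output. The mapping is now concretely typed by the levels' `eltype` and the requested `output_type`, instead of the old `Dict{Any, Real}`:

```julia
using CategoricalArrays

# Hypothetical single-column table
X = (grade = categorical(["A", "B", "A", "C", "A"]),)

# Raw counts, stored as Float32 (the default output_type)
cache = frequency_encoder_fit(X)
stats = cache[:statistic_given_feat_val][:grade]
stats["A"]           # expected: 3.0f0
typeof(stats["A"])   # expected: Float32

# Normalized proportions, stored as Float64
cache64 = frequency_encoder_fit(X; normalize = true, output_type = Float64)
cache64[:statistic_given_feat_val][:grade]["A"]  # expected: 0.6
```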
6 changes: 5 additions & 1 deletion src/encoders/frequency_encoding/interface_mlj.jl
@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
ignore::Bool
ordered_factor::Bool
normalize::Bool
+output_type::Type
end;

# 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
ignore = true,
ordered_factor = false,
normalize = false,
+output_type = Float32,
)
-return FrequencyEncoder(features, ignore, ordered_factor, normalize)
+return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
end;


@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
ignore = transformer.ignore,
ordered_factor = transformer.ordered_factor,
normalize = transformer.normalize,
+output_type = transformer.output_type,
)
fitresult = generic_cache[:statistic_given_feat_val]

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+- `output_type=Float32`: The type of the output values. Set it to `Float64` or any other type that can hold the frequency values.

# Operations

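For the user-facing side, a sketch of how the new hyperparameter is expected to behave through the MLJ interface (made-up data; assumes `MLJBase` supplies `machine`/`fit!`/`transform` as in the docstring above):

```julia
using MLJBase, CategoricalArrays

X = (grade = categorical(["A", "B", "A", "C", "A"]),)

enc  = FrequencyEncoder(normalize = true, output_type = Float64)
mach = fit!(machine(enc, X))
Xt   = transform(mach, X)

eltype(Xt.grade)  # expected: Float64, instead of an abstractly typed column
```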
20 changes: 13 additions & 7 deletions src/encoders/missingness_encoding/missingness_encoding.jl
@@ -30,7 +30,7 @@ function missingness_encoder_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
-label_for_missing::Dict{<:Type, <:Any} = Dict(
+label_for_missing::Dict{<:Type, <:Any} = Dict(
AbstractString => "missing",
Char => 'm',
),
@@ -40,8 +40,8 @@

# 1. Define feature mapper
function feature_mapper(col, name)
-col_type = nonmissingtype(eltype(col)).parameters[1]
-feat_levels = levels(col; skipmissing=true)
+feat_levels = levels(col; skipmissing = true)
+col_type = nonmissingtype(eltype(feat_levels))

# Ensure column type is valid (can't test because never occurs)
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@

# Check no collision between keys(label_for_missing) and feat_levels
for value in values(label_for_missing)
-if !ismissing(value)
+if !ismissing(value)
if value in feat_levels
throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
end
@@ -73,7 +73,7 @@
break
end
end

# Nonmissing levels remain as is
label_for_missing_given_feature = Dict{Missing, col_type}()

@@ -91,7 +91,8 @@

# 2. Pass it to generic_fit
label_for_missing_given_feature, encoded_features = generic_fit(
-X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+X, features; ignore = ignore, ordered_factor = ordered_factor,
+feature_mapper = feature_mapper,
)
cache = Dict(
:label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
"""
function missingness_encoder_transform(X, cache::Dict)
label_for_missing_given_feature = cache[:label_for_missing_given_feature]
-return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
+return generic_transform(
+X,
+label_for_missing_given_feature;
+ignore_unknown = true,
+ensure_categorical = true,
+)
end
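The new `ensure_categorical = true` argument matters here because the missingness encoder substitutes a label (not a number) for `missing`, so its output column should remain categorical rather than being unwrapped. A rough sketch of the intended behavior, with made-up data and assuming the fit/transform pair above is in scope:

```julia
using CategoricalArrays

X = (city = categorical(["Cairo", missing, "Lagos"]),)

cache = missingness_encoder_fit(X)   # default label for string levels is "missing"
Xt = missingness_encoder_transform(X, cache)

Xt.city isa CategoricalArray   # expected: true, thanks to ensure_categorical
levels(Xt.city)                # expected to include the "missing" label
```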

6 changes: 5 additions & 1 deletion src/encoders/ordinal_encoding/interface_mlj.jl
@@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
ignore::Bool
ordered_factor::Bool
+output_type::Type
end;

# 2. Constructor
function OrdinalEncoder(;
features = Symbol[],
ignore = true,
ordered_factor = false,
+output_type = Float32,
)
-return OrdinalEncoder(features, ignore, ordered_factor)
+return OrdinalEncoder(features, ignore, ordered_factor, output_type)
end;


@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
transformer.features;
ignore = transformer.ignore,
ordered_factor = transformer.ordered_factor,
+output_type = transformer.output_type,
)
fitresult =
generic_cache[:index_given_feat_level]
@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.

# Operations

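A sketch of the expected end-to-end effect of `output_type` for this encoder (made-up data; assumes `MLJBase` is loaded):

```julia
using MLJBase, CategoricalArrays

X = (size = categorical(["S", "M", "L", "M"]),)

mach = fit!(machine(OrdinalEncoder(output_type = Float32), X))
Xt   = transform(mach, X)

# Levels sort as "L" < "M" < "S", so their ordinal codes are 1, 2, 3
Xt.size  # expected: Float32[3.0, 2.0, 1.0, 2.0]
```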
5 changes: 3 additions & 2 deletions src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-
+- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
# Returns (in a dict)

- `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
+output_type::Type = Float32,
)
# 1. Define feature mapper
function feature_mapper(col, name)
feat_levels = levels(col)
index_given_feat_val =
-Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
+Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
return index_given_feat_val
end

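At the fit level, the payoff of the diff above is a concretely typed mapping; a small sketch under the same assumptions as the earlier examples:

```julia
using CategoricalArrays

X = (size = categorical(["S", "M", "L", "M"]),)
d = ordinal_encoder_fit(X)[:index_given_feat_level][:size]

typeof(d)  # expected: Dict{String, Float32}, formerly Dict{Any, Integer}
d["M"]     # expected: 2.0f0
```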
3 changes: 2 additions & 1 deletion src/encoders/target_encoding/target_encoding.jl
@@ -166,8 +166,9 @@ function target_encoder_fit(

# 3. Define function to compute the new value(s) for each level given a column
function feature_mapper(col, name)
+feat_levels = levels(col)
y_stat_given_feat_level_for_col =
-Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
+Dict{eltype(feat_levels), Any}()
for level in levels(col)
# Get the targets of an example that belong to this level
targets_for_level = y[col.==level]
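The same keying idea as in the other encoders: `eltype(levels(col))` yields the concrete level type, so the per-feature dictionary no longer needs `Any` keys. A standalone illustration, independent of this package:

```julia
using CategoricalArrays

col = categorical(["a", "b", "a"])
eltype(levels(col))  # String, hence Dict{eltype(feat_levels), Any} above
```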
50 changes: 33 additions & 17 deletions src/generic.jl
@@ -49,11 +49,13 @@ function generic_fit(X,
feat_col = Tables.getcolumn(X, feat_name)
feat_type = elscitype(feat_col)
feat_has_allowed_type =
-feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+feat_type <: Union{Missing, Multiclass} ||
+(ordered_factor && feat_type <: Union{Missing, OrderedFactor})
if feat_has_allowed_type # then should be encoded
push!(encoded_features, feat_name)
# Compute the dict using the given feature_mapper function
-mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+mapping_per_feat_level[feat_name] =
+feature_mapper(feat_col, feat_name, args...; kwargs...)
end
end
return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)

new_column_names = []
while conflict
-suffix = repeat("_", count)
+suffix = repeat("_", count)
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
conflict = any(name -> name in existing_names, new_column_names)
count += 1
@@ -85,22 +87,29 @@
"""
**Private method.**

-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

-- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-into a scalar (single_feat=true)
+- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+into a scalar (single_feat=true)

-- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-into a set of k features where k is the length of the vector (single_feat=false)
+- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+into a set of k features where k is the length of the vector (single_feat=false)
+- In both cases it attempts to preserve the type of the table.
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-assumption is necessary because any column in X must correspond to a constant number of features
+assumption is necessary because any column in X must correspond to a constant number of features
in the output table (which is equal to k).
- Features not in the dictionary are mapped to themselves (i.e., not changed).
-- Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
+- If `ensure_categorical` is true, then any input categorical column will remain categorical
"""
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+X,
+mapping_per_feat_level;
+single_feat = true,
+ignore_unknown = false,
+ensure_categorical = false,
+)
feat_names = Tables.schema(X).names
new_feat_names = Symbol[]
new_cols = []
@@ -115,18 +124,25 @@
if !issubset(test_levels, train_levels)
# get the levels in test that are not in train
lost_levels = setdiff(test_levels, train_levels)
-error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+error(
+"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+)
end
end

if single_feat
level2scalar = mapping_per_feat_level[feat_name]
-new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+if ensure_categorical
+new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+else
+new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+end

push!(new_cols, new_col)
push!(new_feat_names, feat_name)
else
level2vector = mapping_per_feat_level[feat_name]
-new_multi_col = map(x->get(level2vector, x, x), col)
+new_multi_col = map(x -> get(level2vector, x, x), col)
new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
push!(new_cols, new_multi_col...)

@@ -144,8 +160,8 @@
end
end

-transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
# Attempt to preserve table type
transformed_X = Tables.materializer(X)(transformed_X)
return transformed_X
-end
+end
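The `ensure_categorical` branch above relies on a property of `CategoricalArrays.recode`: recoding a `CategoricalArray` yields another `CategoricalArray`, while broadcasting `unwrap` over the result strips the categorical wrapper. A standalone sketch of that mechanic:

```julia
using CategoricalArrays

col = categorical(["a", "b", "a"])
level2scalar = Dict("a" => 1.0f0, "b" => 2.0f0)

recoded = recode(col, level2scalar...)  # still a CategoricalArray
plain   = unwrap.(recoded)              # plain vector, wrapper stripped

recoded isa CategoricalArray  # expected: true
typeof(plain)                 # expected: Vector{Float32}
```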
19 changes: 12 additions & 7 deletions src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -35,20 +35,20 @@ function cardinality_reducer_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
-min_frequency::Real = 3,
-label_for_infrequent::Dict{<:Type, <:Any} = Dict(
+min_frequency::Real = 3,
+label_for_infrequent::Dict{<:Type, <:Any} = Dict(
AbstractString => "Other",
Char => 'O',
),
-)
+)
supportedtypes_list = [Char, AbstractString, Number]
supportedtypes = Union{supportedtypes_list...}

# 1. Define feature mapper
function feature_mapper(col, name)
val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-col_type = eltype(col).parameters[1]
+feat_levels = levels(col)
+col_type = eltype(feat_levels)

# Ensure column type is valid (can't test because never occurs)
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -88,7 +88,11 @@
elseif elgrandtype == Number
new_cat_given_col_val[level] = minimum(feat_levels) - 1
else
-throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
+throw(
+ArgumentError(
+UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
+),
+)
end
end
end
Expand All @@ -98,7 +102,8 @@ function cardinality_reducer_fit(

# 2. Pass it to generic_fit
new_cat_given_col_val, encoded_features = generic_fit(
-X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+X, features; ignore = ignore, ordered_factor = ordered_factor,
+feature_mapper = feature_mapper,
)
cache = Dict(
:new_cat_given_col_val => new_cat_given_col_val,
@@ -125,5 +130,5 @@
"""
function cardinality_reducer_transform(X, cache::Dict)
new_cat_given_col_val = cache[:new_cat_given_col_val]
-return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
+return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
end
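A sketch of the reducer's behavior with the new flag, using made-up data and assuming the fit/transform pair above is in scope:

```julia
using CategoricalArrays

X = (city = categorical(["Cairo", "Cairo", "Cairo", "Lagos", "Lima"]),)

cache = cardinality_reducer_fit(X; min_frequency = 2)
Xt = cardinality_reducer_transform(X, cache)

# Levels seen fewer than 2 times collapse to the default "Other" label, and
# the column stays categorical because of ensure_categorical = true
levels(Xt.city)  # expected: ["Cairo", "Other"]
```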
38 changes: 36 additions & 2 deletions test/encoders/contrast_encoder.jl
@@ -51,9 +51,7 @@
cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
k = length(levels(X.name))
contrast_matrix = get_dummy_contrast(k)
-print()
for (i, level) in enumerate(levels(X.name))
-println(cache[:vector_given_value_given_feature])
@test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
end
end
@@ -289,4 +287,40 @@

# Test report
@test report(mach) == (encoded_features = generic_cache[:encoded_features],)
end
+
+
+@testset "Test Contrast Encoder Output Types" begin
+X = (
+name = categorical(["Ben", "John", "Mary", "John"]),
+height = [1.85, 1.67, 1.5, 1.67],
+favnum = categorical([7, 5, 10, 1]),
+age = [23, 23, 14, 23],
+)
+
+methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+
+for (i, method) in enumerate(methods)
+encoder = ContrastEncoder(
+features = [:name, :favnum],
+ignore = false,
+mode = method,
+buildmatrix = matrix_func[i],
+)
+mach = fit!(machine(encoder, X))
+Xnew = MMI.transform(mach, X)
+
+# Test consistency of output types
+scs = schema(Xnew).scitypes
+ts = schema(Xnew).types
+
+# Check scitypes and types for previously continuous or categorical features
+@test all(scs[1:end-1] .== Continuous)
+@test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+# Check type and scitype for the previously Count feature
+last_type, last_sctype = ts[end], scs[end]
+@test last_type <: Integer && isconcretetype(last_type)
+@test last_sctype <: Count
+end
+end