Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to FillImputer #289

Merged
merged 3 commits into from
Aug 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Expand All @@ -35,6 +36,7 @@ Parameters = "^0.12"
Requires = "^0.5, ^1"
ScientificTypes = "^0.8"
StatsBase = "^0.32,^0.33"
StableRNGs = "^0.1"
Tables = "^0.2,^1.0"
julia = "1"

Expand Down
4 changes: 2 additions & 2 deletions src/MLJModels.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module MLJModels
module MLJModels

import MLJModelInterface
import MLJModelInterface: MODEL_TRAITS
Expand Down Expand Up @@ -28,7 +28,7 @@ export ConstantRegressor, ConstantClassifier,
# from model/Transformers
export FeatureSelector, StaticTransformer, UnivariateDiscretizer,
UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,
OneHotEncoder, ContinuousEncoder, FillImputer
OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer

const srcdir = dirname(@__FILE__) # the directory containing this file

Expand Down
178 changes: 150 additions & 28 deletions src/builtins/Transformers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ const N_VALUES_THRESH = 16 # for BoxCoxTransformation
const FILL_IMPUTER_DESCR = "Imputes missing data with a fixed value "*
"computed on the non-missing values. A different imputing function "*
"can be specified for `Continuous`, `Count` and `Finite` data. "
const UNIVARIATE_FILL_IMPUTER_DESCR = "Univariate form of FillImpututer. "*
FILL_IMPUTER_DESCR
const FEATURE_SELECTOR_DESCR = "Filter features (columns) of a table by name."
const UNIVARIATE_STD_DESCR = "Standardize (whiten) univariate data."
const UNIVARIATE_DISCR_DESCR = "Discretize a continuous variable via "*
Expand All @@ -28,9 +30,60 @@ const CONTINUOUS_ENCODER_DESCR = "Convert all `Finite` (categorical) and "*

round_median(v::AbstractVector) = v -> round(eltype(v), median(v))

_median = e -> skipmissing(e) |> median
_round_median = e -> skipmissing(e) |> (f -> round(eltype(f), median(f)))
_mode = e -> skipmissing(e) |> mode
_median(e) = skipmissing(e) |> median
_round_median(e) = skipmissing(e) |> (f -> round(eltype(f), median(f)))
_mode(e) = skipmissing(e) |> mode

@with_kw_noshow mutable struct UnivariateFillImputer <: Unsupervised
continuous_fill::Function = _median
count_fill::Function = _round_median
finite_fill::Function = _mode
end

function MLJBase.fit(transformer::UnivariateFillImputer,
verbosity::Integer,
v)

filler(v, ::Type) = throw(ArgumentError(
"Imputation is not supported for vectors "*
"of elscitype $(elscitype(v))."))
filler(v, ::Type{<:Union{Continuous,Missing}}) =
transformer.continuous_fill(v)
filler(v, ::Type{<:Union{Count,Missing}}) =
transformer.count_fill(v)
filler(v, ::Type{<:Union{Finite,Missing}}) =
transformer.finite_fill(v)

fitresult = (filler=filler(v, elscitype(v)),)
cache = nothing
report = nothing

return fitresult, cache, report

end

function MLJBase.transform(transformer::UnivariateFillImputer,
fitresult,
vnew)

filler = fitresult.filler
scitype(filler) <: elscitype(vnew) || error(
"Attempting to impute a value of scitype $(scitype(filler)) "*
"into a vector of incompatible elscitype, namely $(elscitype(vnew)). ")

if Missing <: elscitype(vnew)
w = copy(vnew) # transform must be non-mutating
w[ismissing.(w)] .= filler
w_tight = convert.(nonmissing(eltype(w)), w)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ablaom I know this has already been merged. I hope you don't mind me making a slight change here.

 w = copy(vnew) # transform must be non-mutating
        w[ismissing.(w)] .= filler
        w_tight = convert.(nonmissing(eltype(w)), w)

the above code has too much allocations (w = copy(vnew) and w_tight = convert.(nonmissing(eltype(w)), w). Since a new array must always be created due the requirement that transform must be non-mutating , i feel that the following rewrite is slightly more efficient (Any slight increase in efficiency is required. right?)

w_tight = similar(vnew, nonmissing(eltype(vnew)))
 @inbounds for i in eachindex(vnew)
       ismissing(vnew[i]) ? (w_tight[i] = filler ) : (w_tight[i] = vnew[i])
     end

Maybe the following contrived example can help show why?

using Random, BenchmarkTools
function h1!(vnew, filler) #code avoiding double allocation
       w = similar(vnew, nonmissingtype(eltype(vnew)))
       @inbounds for i in eachindex(vnew)
       ismissing(vnew[i]) ? (w[i] = filler ) : (w[i] = vnew[i])
       end
       w
end
function h2!(vnew, filler) #code with double allocations
       w = copy(vnew) # transform must be non-mutating
               w[ismissing.(w)] .= filler
               w_tight = convert.(nonmissingtype(eltype(w)), w)
       w_tight
end
n = [repeat([missing],10000)..., rand(20000)...]; #array containing missing values
n1 = copy(n)
n2 = copy(n)
shuffle!(n);
n3 = copy(n);
n4 = copy(n);
julia> @btime h1!($n1, 0);
  38.010 μs (2 allocations: 234.45 KiB)
julia> @btime h2!($n2, 0);
  144.883 μs (9 allocations: 584.45 KiB)
julia> @btime h1!($n3, 0);
  166.380 μs (2 allocations: 234.45 KiB)

julia> @btime h2!($n4, 0);
  173.802 μs (9 allocations: 584.45 KiB)

h1! compared to h2! is slightly faster and involves little allocations (this becomes important if this code is called by other functions).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes great idea! Can you make a new PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And thank you for investigating.

else
w_tight = vnew
end

return w_tight
end

MLJBase.fitted_params(::UnivariateFillImputer, fitresult) = fitresult


"""
FillImputer(
Expand Down Expand Up @@ -61,44 +114,101 @@ $FILL_IMPUTER_DESCR
end

function MLJBase.fit(transformer::FillImputer, verbosity::Int, X)
if isempty(transformer.features)
features = Tables.schema(X).names |> collect
else
features = transformer.features

s = MLJBase.schema(X)
features_seen = s.names |> collect # "seen" = "seen in fit"
scitypes_seen = s.scitypes |> collect

features = isempty(transformer.features) ? features_seen :
transformer.features

issubset(features, features_seen) || throw(ArgumentError(
"Some features specified do not exist in the supplied table. "))

# get corresponding scitypes:
mask = map(features_seen) do ftr
ftr in features
end
features = features_seen[mask] # `features` re-ordered
scitypes = scitypes_seen[mask]
features_and_scitypes = zip(features, scitypes) |> collect

# now keep those features that are imputable:
function isimputable(ftr, T::Type)
if verbosity > 0 && !isempty(transformer.features)
@info "Feature $ftr will not be imputed "*
"(imputation for $T not supported). "
end
return false
end
isimputable(ftr, ::Type{<:Union{Continuous,Missing}}) = true
isimputable(ftr, ::Type{<:Union{Count,Missing}}) = true
isimputable(ftr, ::Type{<:Union{Finite,Missing}}) = true
mask = map(features_and_scitypes) do tup
isimputable(tup...)
end
fitresult = features
features_to_be_imputed = features[mask]

univariate_transformer =
UnivariateFillImputer(continuous_fill=transformer.continuous_fill,
count_fill=transformer.count_fill,
finite_fill=transformer.finite_fill)
univariate_fitresult(ftr) = MLJBase.fit(univariate_transformer,
verbosity - 1,
selectcols(X, ftr))[1]

fitresult_given_feature =
Dict(ftr=> univariate_fitresult(ftr) for ftr in features_to_be_imputed)

fitresult = (features_seen=features_seen,
univariate_transformer=univariate_transformer,
fitresult_given_feature=fitresult_given_feature)
report = nothing
cache = nothing

return fitresult, cache, report
end

function MLJBase.transform(transformer::FillImputer, fitresult, X)
features = Tables.schema(X).names
# check that the features match that of the transformer
all(e -> e in fitresult, features) ||
error("Attempting to transform table with feature labels not seen in fit. ")

cols = map(features) do ftr
features_seen = fitresult.features_seen # seen in fit
univariate_transformer = fitresult.univariate_transformer
fitresult_given_feature = fitresult.fitresult_given_feature

all_features = Tables.schema(X).names

# check that no new features have appeared:
all(e -> e in features_seen, all_features) || throw(ArgumentError(
"Attempting to transform table with "*
"feature labels not seen in fit.\n"*
"Features seen in fit = $features_seen.\n"*
"Current features = $([all_features...]). "))

features = tuple(keys(fitresult_given_feature)...)

cols = map(all_features) do ftr
col = MLJBase.selectcols(X, ftr)
if eltype(col) >: Missing
T = scitype_union(col)
if T <: Union{Continuous,Missing}
filler = transformer.continuous_fill(col)
elseif T <: Union{Count,Missing}
filler = transformer.count_fill(col)
elseif T <: Union{Finite,Missing}
filler = transformer.finite_fill(col)
end
col = copy(col) # carries the same name but not attached to the same memory
col[ismissing.(col)] .= filler
col = convert.(nonmissing(eltype(col)), col)
if ftr in features
fr = fitresult_given_feature[ftr]
return transform(univariate_transformer, fr, col)
end
col
return col
end
named_cols = NamedTuple{features}(tuple(cols...))

named_cols = NamedTuple{all_features}(tuple(cols...))
return MLJBase.table(named_cols, prototype=X)

end

function MLJBase.fitted_params(::FillImputer, fr)
dict = fr.fitresult_given_feature
filler_given_feature = Dict(ftr=>dict[ftr].filler for ftr in keys(dict))
return (features_seen_in_fit=fr.features_seen,
univariate_transformer=fr.univariate_transformer,
filler_given_feature=filler_given_feature)
end


##
## FOR FEATURE (COLUMN) SELECTION
##
Expand Down Expand Up @@ -985,7 +1095,7 @@ end
metadata_pkg.(
(FeatureSelector, UnivariateStandardizer,
UnivariateDiscretizer, Standardizer,
UnivariateBoxCoxTransformer,
UnivariateBoxCoxTransformer, UnivariateFillImputer,
OneHotEncoder, FillImputer, ContinuousEncoder),
name = "MLJModels",
uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7",
Expand All @@ -994,6 +1104,16 @@ metadata_pkg.(
license = "MIT",
is_wrapper = false)

metadata_model(UnivariateFillImputer,
input = Union{AbstractVector{<:Union{Continuous,Missing}},
AbstractVector{<:Union{Count,Missing}},
AbstractVector{<:Union{Finite,Missing}}},
output = Union{AbstractVector{<:Continuous},
AbstractVector{<:Count},
AbstractVector{<:Finite}},
descr = UNIVARIATE_FILL_IMPUTER_DESCR,
path = "MLJModels.UnivariateFillImputer")

metadata_model(FillImputer,
input = Table,
output = Table,
Expand Down Expand Up @@ -1049,3 +1169,5 @@ metadata_model(ContinuousEncoder,
weights = false,
descr = CONTINUOUS_ENCODER_DESCR,
path = "MLJModels.ContinuousEncoder")


Loading