diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index f49313b..623860f 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -12,4 +12,4 @@ jobs: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} - ssh: ${{ secrets.DOCUMENTER_KEY }} + ssh: ${{ secrets.DOCUMENTER_KEY }} \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e4906a..a4cf307 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,11 +13,12 @@ jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: fail-fast: false matrix: version: - - '1.0' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia. os: - ubuntu-latest @@ -29,6 +30,27 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} + - name: "Replace julia libstdcxx ubuntu + julia v1.6" + shell: bash + if: ${{ matrix.version == '1.6' && matrix.os == 'ubuntu-latest' }} + # The following is needed for Julia <=1.8.3 on Linux OS + # due to old version of libstcxx used by Julia + # taken from https://github.com/hhaensel/ReplaceLibstdcxx.jl/blob/main/src/ReplaceLibstdcxx.jl + run: | + julia -e ' + libs = filter(x -> ! occursin("32", x), getindex.(split.(readlines(pipeline(`ldconfig -p`, `grep libstdc`)), r"\s*=>\s*"), 2)) + source_dir = dirname(libs[end]) + julia_lib_dir = joinpath(dirname(Sys.BINDIR), "lib", "julia") + julia_lib_file = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(julia_lib_dir, join = true)), 1, nothing) + julia_lib_version = match(r"so(\.\d+)\.", julia_lib_file).captures[1] + source_lib = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(source_dir, join = true)), 1, nothing) + julia_lib = joinpath(dirname(Sys.BINDIR), "lib", "julia", "libstdc++.so") + for src in [julia_lib, julia_lib * julia_lib_version] + islink(src) && rm(src, force = true) + symlink(source_lib, src) + @info read(`ls -al $src`, String) + end + ' - uses: actions/cache@v1 env: cache-name: cache-artifacts @@ -65,19 +87,23 @@ jobs: end end event_name = "${{ github.event_name }}" + ref = "${{ github.ref }}" + ref_is_master = ref == "refs/heads/master" + ref_is_dev = ref == "refs/heads/dev" + ref_is_tag = startswith(ref, "refs/tags/") if event_name == "pull_request" base_ref = "${{ github.base_ref }}" head_ref = "${{ github.head_ref }}" base_repository = "${{ github.repository }}" head_repository = "${{ github.event.pull_request.head.repo.full_name }}" - build_docs = (base_ref == "master") && (head_ref == "dev") && (base_repository == head_repository) + is_not_fork = base_repository == head_repository + build_docs = (base_ref == "master") && (head_ref == "dev") && (is_not_fork) elseif event_name == "push" - ref = "${{ github.ref }}" - build_docs = (ref == "refs/heads/master") || (startswith(ref, "refs/tags/")) + build_docs = ref_is_master || ref_is_dev || ref_is_tag elseif event_name == "schedule" - build_docs = ref == "refs/heads/master" + build_docs = ref_is_master || ref_is_dev elseif event_name == "workflow_dispatch" - build_docs = ref == "refs/heads/master" + build_docs = ref_is_master || ref_is_dev else build_docs = false end diff --git a/Project.toml b/Project.toml index bd2d192..7473f24 100644 --- a/Project.toml +++ b/Project.toml @@ -4,21 +4,45 @@ authors = ["Anthony D. 
Blaom "] version = "0.1.0" [deps] -Example = "7876af07-990d-54b4-ab0e-23690620f79a" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -Example = "0.5" -MLJModelInterface = "1" -ScientificTypesBase = "1, 2, 3" -julia = "1" +Aqua = "0.8" +Distributions = "0.25" +julia = "1.6" +MLJBase = "1.1" +MLJTuning = "0.8" +MLJDecisionTreeInterface = "0.4" +MLJScikitLearnInterface = "0.6" +MLJModelInterface = "1.4" +ScientificTypesBase = "3" +StableRNGs = "1" +StatisticalMeasures = "0.1" +Tables = "1.2" +Test = "1.6" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f" +MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" +MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Distributions", "MLJBase", "StableRNGs", "Test"] +test = [ + "Aqua", + "Distributions", + "MLJBase", + "MLJTuning", + "MLJDecisionTreeInterface", + "MLJScikitLearnInterface", + "StableRNGs", + "StatisticalMeasures", + "Test" +] \ No newline at end of file diff --git a/README.md b/README.md index f85f2f0..46555e8 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,104 @@ # FeatureSelection.jl -This repository is a template for creating repositories that contain -glue code between (i) packages providing machine learning algorithms; and (ii) -the machine learning toolbox -[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) - that is, -for so-called *interface-only packages*. - -## When to use this template - -This template is intended for use when a package providing a machine -learning model algorithm is not hosting the code that implements the -MLJ model API, and a separate package for this purpose is to be -created. This repo is itself a working implementation but should -be used in conjunction with the more detailed [model implementation -guidelines](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/). - -## How to use this template - -1. Clone this repository or use it as a template if available from your organization. - -2. Rename this repository, replacing the word "Example" with the name of the model-providing package. - -1. Develop the contents of src/MLJExampleInterface.jl appropriately. - -2. Rename src/MLJExampleInterface.jl appropriately. - -3. Remove Example from Project.toml and instead add the model-providing package. - -3. **GENERATE A NEW UUID in Project.toml** and change the Project.toml - name and author appropriately. - -1. You may want to remove the Distributions test dependency if you don't need it. - -4. Replace every instance of "Example" in this README.md with the name - of the model-providing package and adjust the organization name in - the link. - -5. Remove everything in this REAMDE.md except what is below the line - you are currently reading 😉. - - -# MLJ.jl <--> Example.jl - -Repository implementing the [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) model interface for models provided by -[Example.jl](https://github.com/JuliaLang/Example.jl). 
-
-| Linux | Coverage |
-| :------------ | :------- |
-| [![Build Status](https://github.com/JuliaAI/MLJExampleInterface.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/MLJExampleInterface.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/MLJExampleInterface.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/MLJExampleInterface.jl?branch=master) |
+| Linux | Coverage | Code Style |
+| :------------ | :------- | :------------- |
+| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) |
+
+Repository housing feature selection algorithms for use with the machine learning toolbox
+[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/).
+
+The `FeatureSelector` model builds on contributions originally residing at [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl/blob/v0.16.15/src/builtins/Transformers.jl#L189-L266).
+
+# Installation
+On a running instance of Julia with at least version 1.6, run
+```julia
+import Pkg;
+Pkg.add("FeatureSelection")
+```
+
+# Example Usage
+Let's build a supervised recursive feature eliminator with `RandomForestRegressor`
+from DecisionTree.jl as our base model.
+But first we need a dataset to train on. We shall create a synthetic dataset, popularly
+known in the R community as the Friedman #1 dataset. Notice how the target vector for this
+dataset depends on only the first five columns of the feature table. So we expect
+recursive feature elimination to return those first five columns as the important features.
+```julia
+using MLJ, FeatureSelection
+using StableRNGs
+rng = StableRNG(10)
+A = rand(rng, 50, 10)
+X = MLJ.table(A) # features
+y = @views(
+    10 .* sin.(
+        pi .* A[:, 1] .* A[:, 2]
+    ) .+ 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5]
+) # target
+```
+Now that we have our data, we can create our recursive feature elimination model and
+train it on our dataset:
+```julia
+RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
+forest = RandomForestRegressor(rng=rng)
+rfe = RecursiveFeatureElimination(
+    model = forest, n_features=5, step=1
+) # see docstring for description of defaults
+mach = machine(rfe, X, y)
+fit!(mach)
+```
+We can inspect the feature importances in two ways:
+```julia
+# A variable with lower rank has more significance than a variable with higher rank.
+# A variable with higher feature importance is better than a variable with lower
+# feature importance.
+report(mach).ranking # returns [1, 1, 1, 1, 1, 2, 3, 4, 5, 6]
+feature_importances(mach) # returns a vector of feature => importance pairs
+```
+We can view the important features used by our model by inspecting the `fitted_params`
+object.
+```julia
+p = fitted_params(mach)
+p.features_left == [:x1, :x2, :x3, :x4, :x5]
+```
+We can also call the `predict` method on the fitted machine to predict using a
+random forest regressor trained on only the important features, or call the `transform`
+method to select just those features from some new table containing all the original
+features. For more info, type `?RecursiveFeatureElimination` in a Julia REPL.
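+For illustration, here is a minimal sketch of both calls, reusing `mach` and `rng` from
+above (`Xfresh` is just an arbitrary name for a new table with the same ten features):
+```julia
+Xfresh = MLJ.table(rand(rng, 3, 10))
+transform(mach, Xfresh)  # table containing only the five selected features
+predict(mach, Xfresh)    # predictions from the forest trained on those features
+```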
+
+Okay, let's say that we didn't know that our synthetic dataset depends on only five
+columns from our feature table. We could apply cross-validation,
+`StratifiedCV(nfolds=5)`, with our recursive feature elimination model to select the
+optimal value of `n_features` for our model. In this case we will use a simple grid
+search with root mean squared error (`rms`) as the measure.
+```julia
+rfe = RecursiveFeatureElimination(model = forest)
+tuning_rfe_model = TunedModel(
+    model = rfe,
+    measure = rms,
+    tuning = Grid(rng=rng),
+    resampling = StratifiedCV(nfolds = 5),
+    range = range(
+        rfe, :n_features, values = 1:10
+    )
+)
+self_tuning_rfe_mach = machine(tuning_rfe_model, X, y)
+fit!(self_tuning_rfe_mach)
+```
+As before, we can inspect the important features by inspecting the object returned by
+`fitted_params` or `feature_importances`, as shown below.
+```julia
+fitted_params(self_tuning_rfe_mach).best_fitted_params.features_left == [:x1, :x2, :x3, :x4, :x5]
+feature_importances(self_tuning_rfe_mach) # returns a vector of feature => importance pairs
+```
+We can also call `predict` on the tuned model machine, as shown below:
+```julia
+Xnew = MLJ.table(rand(rng, 50, 10)) # create test data
+predict(self_tuning_rfe_mach, Xnew)
+```
+In this case, prediction is done using the best recursive feature elimination model
+obtained from the tuning process above.
+
+For resampling methods different from cross-validation, and for other `TunedModel`
+options, such as parallelization, see the
+[Tuning Models](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/) section of the MLJ manual.
+See also the [MLJ Documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/).
\ No newline at end of file
diff --git a/src/FeatureSelection.jl b/src/FeatureSelection.jl
new file mode 100644
index 0000000..2b39d53
--- /dev/null
+++ b/src/FeatureSelection.jl
@@ -0,0 +1,27 @@
+module FeatureSelection
+
+using MLJModelInterface, Tables, ScientificTypesBase
+
+export FeatureSelector, RecursiveFeatureElimination
+
+const MMI = MLJModelInterface
+
+## Includes
+include("models/featureselector.jl")
+include("models/rfe.jl")
+
+## Pkg Traits
+MMI.metadata_pkg.(
+    (
+        DeterministicRecursiveFeatureElimination,
+        ProbabilisticRecursiveFeatureElimination,
+        FeatureSelector
+    ),
+    package_name = "FeatureSelection",
+    package_uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6",
+    package_url = "https://github.com/JuliaAI/FeatureSelection.jl",
+    is_pure_julia = true,
+    package_license = "MIT"
+)
+
+end # module
diff --git a/src/MLJExampleInterface.jl b/src/MLJExampleInterface.jl
deleted file mode 100644
index e0f7185..0000000
--- a/src/MLJExampleInterface.jl
+++ /dev/null
@@ -1,116 +0,0 @@
-module MLJExampleInterface
-
-# Example.jl doesn't actually provide machine learning models, so we
-# provide a module below with the same name to furnish us with simple
-# constant probabilistic classification. Note that `Example.fit`
-# ignores the training features `Xmatrix`.
-
-module Example
-
-function fit(Xmatrix::Matrix, yint::AbstractVector{<:Integer})
-    classes = sort(unique(yint))
-    counts = [count(==(c), yint) for c in classes]
-    Θ = counts / sum(counts)
-end
-
-predict(Xnew::Matrix, Θ) = vcat(fill(Θ', size(Xnew, 1))...)
- -# julia> yint = rand([1,3,4], 100); - -# julia> Θ = fit(rand(100, 3), yint) -# 3-element Vector{Float64}: -# 0.35 -# 0.23 -# 0.42 - -# julia> predict(rand(5, 3), Θ) -# 5×3 Matrix{Float64}: -# 0.35 0.23 0.42 -# 0.35 0.23 0.42 -# 0.35 0.23 0.42 -# 0.35 0.23 0.42 -# 0.35 0.23 0.42 - -end # of module - - -### CONTINUATION OF TEMPLATE - -import .Example # substitute model-providing package name here (no dot) -import MLJModelInterface -import ScientificTypesBase - -const PKG = "Example" # substitute model-providing package name -const MMI = MLJModelInterface -const STB = ScientificTypesBase - -""" - CoolProbabilisticClassifier() - -A cool classifier that predicts `UnivariateFinite` probability -distributions. These are distributions for a finite sample space whose -elements are *labeled*. - -""" -MMI.@mlj_model mutable struct CoolProbabilisticClassifier <: MMI.Probabilistic - dummy_hyperparameter1::Float64 = 1.0::(_ ≥ 0) - dummy_hyperparameter2::Int = 1::(0 < _ ≤ 1) - dummy_hyperparameter3 -end - -function MMI.fit(::CoolProbabilisticClassifier, verbosity, X, y) - - Xmatrix = MMI.matrix(X) - - yint = MMI.int(y) - decode = MMI.decoder(y[1]) # for decoding int repr. - classes_seen = decode(sort(unique(yint))) # ordered by int repr. - - Θ = Example.fit(Xmatrix, yint) # probability vector - fitresult = (Θ, classes_seen) - report = (n_classes_seen = length(classes_seen),) - cache = nothing - - return fitresult, cache, report - -end - -function MMI.predict(::CoolProbabilisticClassifier, fitresult, Xnew) - Xmatrix = MMI.matrix(Xnew) - - Θ, classes_seen = fitresult - prob_matrix = Example.predict(Xmatrix, Θ) - - # `classes_seen` is a categorical vector whose pool actually - # includes *all* classes. The `UnivariateFinite` constructor - # automatically assigns zero probability to the unseen classes. 
- - return MMI.UnivariateFinite(classes_seen, prob_matrix) -end - -# for returning user-friendly form of the learned parameters: -function MMI.fitted_params(::CoolProbabilisticClassifier, fitresult) - Θ, classes_seen = fitresult - return (raw_probabilities = Θ, classes_seen_in_training = classes_seen) -end - - -## META DATA - -MMI.metadata_pkg(CoolProbabilisticClassifier, - name="$PKG", - uuid="7876af07-990d-54b4-ab0e-23690620f79a", - url="https://github.com/JuliaLang/Example.jl", - is_pure_julia=true, - license="MIT", - is_wrapper=false -) - -MMI.metadata_model(CoolProbabilisticClassifier, - input_scitype = MMI.Table(STB.Continuous), - target_scitype = AbstractVector{<:STB.Finite},# ie, a classifier - docstring = "Really cool classifier", # brief description - path = "$PKG.CoolProbabilisiticClassifier" - ) - -end # module diff --git a/src/models/featureselector.jl b/src/models/featureselector.jl new file mode 100644 index 0000000..a95045c --- /dev/null +++ b/src/models/featureselector.jl @@ -0,0 +1,167 @@ +# # FOR FEATURE (COLUMN) SELECTION + +mutable struct FeatureSelector <: Unsupervised + # features to be selected; empty means all + features::Union{Vector{Symbol}, Function} + ignore::Bool # features to be ignored +end + +# keyword constructor +function FeatureSelector( + ; + features::Union{AbstractVector{Symbol}, Function}=Symbol[], + ignore::Bool=false +) + transformer = FeatureSelector(features, ignore) + message = MMI.clean!(transformer) + isempty(message) || throw(ArgumentError(message)) + return transformer +end + +function MMI.clean!(transformer::FeatureSelector) + err = "" + if ( + typeof(transformer.features) <: AbstractVector{Symbol} && + isempty(transformer.features) && + transformer.ignore + ) + err *= "Features to be ignored must be specified in features field." 
    end
+    return err
+end
+
+function MMI.fit(transformer::FeatureSelector, verbosity::Int, X)
+    all_features = Tables.schema(X).names
+
+    if transformer.features isa AbstractVector{Symbol}
+        if isempty(transformer.features)
+            features = collect(all_features)
+        else
+            features = if transformer.ignore
+                !issubset(transformer.features, all_features) && verbosity > -1 &&
+                    @warn("Excluding non-existent feature(s).")
+                filter!(all_features |> collect) do ftr
+                    !(ftr in transformer.features)
+                end
+            else
+                issubset(transformer.features, all_features) ||
+                    throw(ArgumentError("Attempting to select non-existent feature(s)."))
+                transformer.features |> collect
+            end
+        end
+    else
+        features = if transformer.ignore
+            filter!(all_features |> collect) do ftr
+                !(transformer.features(ftr))
+            end
+        else
+            filter!(all_features |> collect) do ftr
+                transformer.features(ftr)
+            end
+        end
+        isempty(features) && throw(
+            ArgumentError("No feature(s) selected.\n The specified Bool-valued"*
+            " callable with the `ignore` option set to `$(transformer.ignore)` "*
+            "resulted in an empty feature set for selection")
+        )
+    end
+
+    fitresult = features
+    report = NamedTuple()
+    return fitresult, nothing, report
+end
+
+MMI.fitted_params(::FeatureSelector, fitresult) = (features_to_keep=fitresult,)
+
+function MMI.transform(::FeatureSelector, features, X)
+    all(e -> e in Tables.schema(X).names, features) ||
+        throw(ArgumentError("Supplied frame does not admit previously selected features."))
+    return MMI.selectcols(X, features)
+end
+
+## Traits definitions
+MMI.metadata_model(
+    FeatureSelector,
+    input_scitype = Table,
+    output_scitype = Table,
+    load_path = "FeatureSelection.FeatureSelector"
+)
+
+"""
+$(MMI.doc_header(FeatureSelector))
+
+Use this model to select features (columns) of a table, usually as
+part of a model `Pipeline`.
+
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+where
+
+- `X`: any table of input features, where "table" is in the sense of Tables.jl
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+- `features`: one of the following, with the behavior indicated:
+
+  - `[]` (empty, the default): filter out all features (columns) which
+    were not encountered in training
+
+  - non-empty vector of feature names (symbols): keep only the
+    specified features (`ignore=false`) or keep only unspecified
+    features (`ignore=true`)
+
+  - function or other callable: keep a feature if the callable returns
+    `true` on its name. For example, specifying
+    `FeatureSelector(features = name -> name in [:x1, :x3], ignore =
+    true)` has the same effect as `FeatureSelector(features = [:x1,
+    :x3], ignore = true)`, namely to select all features, with the
+    exception of `:x1` and `:x3`.
+ +- `ignore`: whether to ignore or keep specified `features`, as + explained above + + +# Operations + +- `transform(mach, Xnew)`: select features from the table `Xnew` as + specified by the model, taking features seen during training into + account, if relevant + + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `features_to_keep`: the features that will be selected + + +# Example + +``` +using MLJ + +X = (ordinal1 = [1, 2, 3], + ordinal2 = coerce(["x", "y", "x"], OrderedFactor), + ordinal3 = [10.0, 20.0, 30.0], + ordinal4 = [-20.0, -30.0, -40.0], + nominal = coerce(["Your father", "he", "is"], Multiclass)); + +selector = FeatureSelector(features=[:ordinal3, ], ignore=true); + +julia> transform(fit!(machine(selector, X)), X) +(ordinal1 = [1, 2, 3], + ordinal2 = CategoricalValue{Symbol,UInt32}["x", "y", "x"], + ordinal4 = [-20.0, -30.0, -40.0], + nominal = CategoricalValue{String,UInt32}["Your father", "he", "is"],) + +``` +""" +FeatureSelector \ No newline at end of file diff --git a/src/models/rfe.jl b/src/models/rfe.jl new file mode 100644 index 0000000..1b10260 --- /dev/null +++ b/src/models/rfe.jl @@ -0,0 +1,391 @@ +function warn_double_spec(arg, model) + return "Using `model=$arg`. Ignoring keyword specification `model=$model`. " +end + +const ERR_SPECIFY_MODEL = ArgumentError( + "You need to specify model as positional argument or specify `model=...`." +) + +const ERR_MODEL_TYPE = ArgumentError( + "Only `Deterministic` and `Probabilistic` model types supported." +) + +const ERR_FEATURE_IMPORTANCE_SUPPORT = ArgumentError( + "Model does not report feature importance, hence recursive feature algorithm "* + "can't be applied." +) + +const ERR_FEATURES_SEEN = ArgumentError( + "Features of new table must be same as those seen during fit process." +) + +const MODEL_TYPES = [ + :ProbabilisticRecursiveFeatureElimination, :DeterministicRecursiveFeatureElimination +] +const SUPER_TYPES = [:Deterministic, :Probabilistic] +const MODELTYPE_GIVEN_SUPERTYPES = zip(MODEL_TYPES, SUPER_TYPES) + +for (ModelType, ModelSuperType) in MODELTYPE_GIVEN_SUPERTYPES + ex = quote + mutable struct $ModelType{M<:Supervised} <: $ModelSuperType + model::M + n_features::Float64 + step::Float64 + end + end + eval(ex) +end + +eval(:(const RFE{M} = Union{$((Expr(:curly, modeltype, :M) for modeltype in MODEL_TYPES)...)})) + +# Common keyword constructor for both model types +""" + RecursiveFeatureElimination(model, n_features, step) + +This model implements a recursive feature elimination algorithm for feature selection. +It recursively removes features, training a base model on the remaining features and +evaluating their importance until the desired number of features is selected. + +Construct an instance with default hyper-parameters using the syntax +`model = RecursiveFeatureElimination(model=...)`. Provide keyword arguments to override +hyper-parameter defaults. + +# Training data +In MLJ or MLJBase, bind an instance `model` to data with + + mach = machine(model, X, y) + +OR, if the base model supports weights, as + + mach = machine(model, X, y, w) + +Here: + +- `X` is any table of input features (eg, a `DataFrame`) whose columns are of the scitype + as that required by the base model; check column scitypes with `schema(X)` and column + scitypes required by base model with `input_scitype(basemodel)`. 
+
+- `y` is the target, which can be any vector of responses whose element scitype is
+  `Continuous` or `Finite`, depending on the `target_scitype` required by the base model;
+  check the scitype with `scitype(y)`.
+
+- `w` is the observation weights, which can be either `nothing` (default) or an
+  `AbstractVector` whose element scitype is `Count` or `Continuous`. This is distinct
+  from any `weights` hyper-parameter of the base model itself.
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+- `model`: A base model with a `fit` method that provides information on feature
+  importances (i.e. `reports_feature_importances(model) == true`)
+
+- `n_features::Real = 0`: The number of features to select. If `0`, half of the
+  features are selected. If a positive integer, the parameter is the absolute number
+  of features to select. If a real number between 0 and 1, it is the fraction of features
+  to select.
+
+- `step::Real=1`: If the value of `step` is at least 1, it signifies the number of
+  features to eliminate in each iteration. Conversely, if `step` falls strictly within the
+  range of 0.0 to 1.0, it denotes the proportion (rounded down) of features to remove
+  during each iteration.
+
+# Operations
+
+- `transform(mach, X)`: transform the input table `X` into a new table containing only
+  the columns corresponding to features selected by the RFE algorithm.
+
+- `predict(mach, X)`: transform the input table `X` as in `transform(mach, X)` above,
+  then predict using the fitted base model on the transformed table.
+
+# Fitted parameters
+The fields of `fitted_params(mach)` are:
+- `features_left`: names of features remaining after recursive feature elimination.
+
+- `model_fitresult`: fitted parameters of the base model.
+
+# Report
+The fields of `report(mach)` are:
+- `ranking`: The ranking of each feature in the training dataset.
+
+- `model_report`: report for the fitted base model.
+
+- `features`: names of features seen during the training process.
+
+# Examples
+```
+using FeatureSelection, MLJ, StableRNGs
+
+RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
+rng = StableRNG(10)
+
+# Create a dataset where the target only depends on the first 5 columns of the input table.
+A = rand(rng, 50, 10);
+y = 10 .* sin.(
+        pi .* A[:, 1] .* A[:, 2]
+    ) + 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5];
+X = MLJ.table(A);
+
+# fit an rfe model
+rf = RandomForestRegressor()
+selector = RecursiveFeatureElimination(model = rf)
+mach = machine(selector, X, y)
+fit!(mach)
+
+# view the feature importances
+feature_importances(mach)
+
+# predict using the base model
+Xnew = MLJ.table(rand(rng, 50, 10));
+predict(mach, Xnew)
+
+```
+
+"""
+function RecursiveFeatureElimination(
+    args...;
+    model=nothing,
+    n_features::Real=0,
+    step::Real = 1
+)
+    # user can specify model as argument instead of kwarg:
+    length(args) < 2 || throw(
+        ArgumentError("At most one non-keyword argument, a model, is allowed.")
+    )
+    if length(args) == 1
+        arg = first(args)
+        model === nothing ||
+            @warn warn_double_spec(arg, model)
+        model = arg
+    else
+        model === nothing && throw(ERR_SPECIFY_MODEL)
+    end
+
+    #TODO: Check that the specified model implements the predict method.
+    # probably add a trait to check this
+    MMI.reports_feature_importances(model) || throw(ERR_FEATURE_IMPORTANCE_SUPPORT)
+    if model isa Deterministic
+        selector = DeterministicRecursiveFeatureElimination{typeof(model)}(
+            model, Float64(n_features), Float64(step)
+        )
+    elseif model isa Probabilistic
+        selector = ProbabilisticRecursiveFeatureElimination{typeof(model)}(
+            model, Float64(n_features), Float64(step)
+        )
+    else
+        throw(ERR_MODEL_TYPE)
+    end
+    message = MMI.clean!(selector)
+    isempty(message) || @warn(message)
+    return selector
+end
+
+function MMI.clean!(selector::RFE)
+    msg = ""
+    if selector.step <= 0
+        msg *= "specified `step` must be greater than zero.\n"*
+            "Resetting `step = 1`"
+    end
+
+    if selector.n_features < 0
+        msg *= "specified `n_features` must be non-negative.\n"*
+            "Resetting `n_features = 0`"
+    end
+
+    return msg
+end
+
+function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
+    args = (y, args...)
+    Xcols = Tables.Columns(X)
+    features = collect(Tables.columnnames(Xcols))
+    nfeatures = length(features)
+    nfeatures < 2 && throw(
+        ArgumentError("The number of features in the feature matrix must be at least 2.")
+    )
+
+    # Compute required number of features to select
+    n_features_select = selector.n_features
+    ## zero indicates that half of the features will be selected.
+    if n_features_select == 0
+        n_features_select = div(nfeatures, 2)
+    elseif 0 < n_features_select < 1
+        n_features_select = round(Int, n_features_select * nfeatures)
+    else
+        n_features_select = round(Int, n_features_select)
+    end
+
+    step = selector.step
+
+    if 0 < step < 1
+        step = round(Int, max(1, step * n_features_select))
+    else
+        step = round(Int, step)
+    end
+
+    support = trues(nfeatures)
+    ranking = ones(Int, nfeatures) # every feature has equal rank initially
+    mask = trues(nfeatures) # for boolean indexing of ranking vector in while loop below
+
+    # Elimination
+    features_left = features
+    n_features_left = length(features_left)
+    while n_features_left > n_features_select
+        # Rank the remaining features
+        model = selector.model
+        verbosity > 0 && @info("Fitting estimator with $(n_features_left) features.")
+
+        data = MMI.reformat(model, MMI.selectcols(X, features_left), args...)
+
+        fitresult, _, report = MMI.fit(model, verbosity - 1, data...)
+
+        # Get absolute values of importance and rank them
+        importances = abs.(
+            last.(
+                MMI.feature_importances(
+                    selector.model,
+                    fitresult,
+                    report
+                )
+            )
+        )
+
+        ranks = sortperm(importances)
+
+        # Eliminate the worst features
+        threshold = min(step, n_features_left - n_features_select)
+        @views(support[support][ranks[1:threshold]]) .= false
+        mask .= support .== false
+        @views(ranking[mask]) .+= 1
+
+        # Remaining features
+        features_left = features[support]
+        n_features_left = length(features_left)
+    end
+
+    # Set final attributes
+    data = MMI.reformat(selector.model, MMI.selectcols(X, features_left), args...)
+    verbosity > 0 && @info("Fitting estimator with $(n_features_left) features.")
+    model_fitresult, _, model_report = MMI.fit(selector.model, verbosity - 1, data...)
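+    # The refit above uses only the finally selected features. Bundle the boolean support
+    # mask, the fitted base model, the selected feature names and the original feature
+    # names into the fitresult; the per-feature ranking and the base model's report are
+    # returned separately in the report.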
+ + fitresult = ( + support = support, + model_fitresult = model_fitresult, + features_left = features_left, + features = features + ) + report = ( + ranking = ranking, + model_report = model_report + ) + + return fitresult, nothing, report + +end + +function MMI.fitted_params(model::RFE, fitresult) + ( + features_left = copy(fitresult.features_left), + model_fitresult = MMI.fitted_params(model.model, fitresult.model_fitresult) + ) +end + +function MMI.predict(model::RFE, fitresult, X) + X_ = reformat(model.model, MMI.transform(model, fitresult, X))[1] + yhat = MMI.predict(model.model, fitresult.model_fitresult, X_) + return yhat +end + +function MMI.transform(::RFE, fitresult, X) + sch = Tables.schema(Tables.columns(X)) + if (length(fitresult.features) == length(sch.names) && + !all(e -> e in sch.names, fitresult.features)) + throw( + ERR_FEATURES_SEEN + ) + end + return MMI.selectcols(X, fitresult.features_left) +end + +function MMI.feature_importances(::RFE, fitresult, report) + return Pair.(fitresult.features, Iterators.reverse(report.ranking)) +end + +function MMI.save(model::RFE, fitresult) + support = fitresult.support + atomic_fitresult = fitresult.model_fitresult + features_left = fitresult.features_left + features = fitresult.features + + atom = model.model + return ( + support = copy(support), + model_fitresult = MMI.save(atom, atomic_fitresult), + features_left = copy(features_left), + features = copy(features) + ) +end + +function MMI.restore(model::RFE, serializable_fitresult) + support = serializable_fitresult.support + atomic_serializable_fitresult = serializable_fitresult.model_fitresult + features_left = serializable_fitresult.features_left + features = serializable_fitresult.features + + atom = model.model + return ( + support = support, + model_fitresult = MMI.restore(atom, atomic_serializable_fitresult), + features_left = features_left, + features = features + ) +end + +## Traits definitions +function MMI.load_path(::Type{<:DeterministicRecursiveFeatureElimination}) + return "FeatureSelection.DeterministicRecursiveFeatureElimination" +end + +function MMI.load_path(::Type{<:ProbabilisticRecursiveFeatureElimination}) + return "FeatureSelection.ProbabilisticRecursiveFeatureElimination" +end + +for trait in [ + :supports_weights, + :supports_class_weights, + :is_pure_julia, + :is_wrapper, + :input_scitype, + :target_scitype, + :output_scitype, + :supports_training_losses, + :reports_feature_importances, + ] + + # try to get trait at level of types ("failure" here just + # means falling back to `Unknown`): + quote + MMI.$trait(::Type{<:RFE{M}}) where M = MMI.$trait(M) + end |> eval + + # needed because traits are not always deducable from + # the type (eg, `target_scitype` and `Pipeline` models): + eval(:(MMI.$trait(model::RFE) = MMI.$trait(model.model))) +end + +# ## Iteration parameter +# at level of types: +prepend(s::Symbol, ::Nothing) = nothing +prepend(s::Symbol, t::Symbol) = Expr(:(.), s, QuoteNode(t)) +prepend(s::Symbol, ex::Expr) = Expr(:(.), prepend(s, ex.args[1]), ex.args[2]) + +function MMI.iteration_parameter(::Type{<:RFE{M}}) where {M} + return prepend(:model, MMI.iteration_parameter(M)) +end + +# at level of instances: +function MMI.iteration_parameter(model::RFE) + return prepend(:model, MMI.iteration_parameter(model.model)) +end + +## TRAINING LOSSES SUPPORT +function MMI.training_losses(model::RFE, rfe_report) + return MMI.training_losses(model.model, rfe_report.model_report) +end \ No newline at end of file diff --git a/test/Aqua.jl b/test/Aqua.jl new 
file mode 100644 index 0000000..b6e3423 --- /dev/null +++ b/test/Aqua.jl @@ -0,0 +1,5 @@ +using Aqua + +@testset "Aqua.jl" begin + Aqua.test_all(FeatureSelection) +end \ No newline at end of file diff --git a/test/models/dummy_test_models.jl b/test/models/dummy_test_models.jl new file mode 100644 index 0000000..ae3b98a --- /dev/null +++ b/test/models/dummy_test_models.jl @@ -0,0 +1,67 @@ +module DummyTestModels + +using MLJBase +using Distributions + +## THE CONSTANT DETERMINISTIC REGRESSOR (FOR TESTING) +## + +struct DeterministicConstantRegressor <: MLJBase.Deterministic end + +function MLJBase.fit(::DeterministicConstantRegressor, verbosity::Int, X, y) + fitresult = mean(y) + cache = nothing + report = nothing + return fitresult, cache, report +end + +MLJBase.reformat(::DeterministicConstantRegressor, X) = (MLJBase.matrix(X),) +MLJBase.reformat(::DeterministicConstantRegressor, X, y) = (MLJBase.matrix(X), y) +MLJBase.selectrows(::DeterministicConstantRegressor, I, A) = (view(A, I, :),) +function MLJBase.selectrows(::DeterministicConstantRegressor, I, A, y) + return (view(A, I, :), y[I]) +end + +function MLJBase.predict(::DeterministicConstantRegressor, fitresult, Xnew) + return fill(fitresult, nrows(Xnew)) +end + +## THE EphemeralClassifier (FOR TESTING) +## Define a Deterministic Classifier with non-persistent `fitresult`, but which addresses +## this by overloading `save`/`restore`: +struct EphemeralClassifier <: MLJBase.Deterministic end +thing = [] + +function MLJBase.fit(::EphemeralClassifier, verbosity, X, y) + # if I serialize/deserialized `thing` then `id` below changes: + id = objectid(thing) + p = Distributions.fit(UnivariateFinite, y) + fitresult = (thing, id, p) + report = (features = MLJBase.schema(X).names,) + return fitresult, nothing, report +end + +function MLJBase.predict(::EphemeralClassifier, fitresult, X) + thing, id, p = fitresult + id == objectid(thing) || throw(ErrorException("dead fitresult")) + return [mode(p) for _ in 1:MLJBase.nrows(X)] +end + +function MLJBase.feature_importances(model::EphemeralClassifier, fitresult, report) + return [ftr => 1.0 for ftr in report.features] +end + +MLJBase.target_scitype(::Type{<:EphemeralClassifier}) = AbstractVector{OrderedFactor{2}} +MLJBase.reports_feature_importances(::Type{<:EphemeralClassifier}) = true + +function MLJBase.save(::EphemeralClassifier, fitresult) + thing, _, p = fitresult + return (thing, p) +end +function MLJBase.restore(::EphemeralClassifier, serialized_fitresult) + thing, p = serialized_fitresult + id = objectid(thing) + return (thing, id, p) +end + +end \ No newline at end of file diff --git a/test/models/featureselector.jl b/test/models/featureselector.jl new file mode 100644 index 0000000..d643128 --- /dev/null +++ b/test/models/featureselector.jl @@ -0,0 +1,70 @@ +#### FEATURE SELECTOR #### + +@testset "Feat Selector" begin + N = 100 + X = ( + Zn = rand(N), + Crim = rand(N), + x3 = categorical(rand("YN", N)), + x4 = categorical(rand("YN", N)) + ) + + # Test feature selection with `features=Symbol[]` + namesX = MLJBase.schema(X).names |> collect + selector = FeatureSelector() + f, = MLJBase.fit(selector, 1, X) + @test f == namesX + Xt = MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2)) + @test Set(MLJBase.schema(Xt).names) == Set(namesX) + @test length(Xt.Zn) == 2 + + # Test on selecting features if `features` keyword is defined + selector = FeatureSelector(features=[:Zn, :Crim]) + f, = MLJBase.fit(selector, 1, X) + @test MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2)) == + 
MLJBase.select(X, 1:2, [:Zn, :Crim]) + + # test on ignoring a feature, even if it's listed in the `features` + selector.ignore = true + f, = MLJBase.fit(selector, 1, X) + Xnew = MLJBase.transform(selector, f, X) + @test MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2)) == + MLJBase.select(X, 1:2, [:x3, :x4]) + + # test error about features selected or excluded in fit. + selector = FeatureSelector(features=[:x1, :mickey_mouse]) + @test_throws( + ArgumentError, + MLJBase.fit(selector, 1, X) + ) + selector.ignore = true + @test_logs( + (:warn, r"Excluding non-existent"), + MLJBase.fit(selector, 1, X) + ) + + # features must be specified if ignore=true + @test_throws ArgumentError FeatureSelector(ignore=true) + + # test logs for no features selected when using Bool-Callable function interface: + selector = FeatureSelector(features= x-> x == (:x1)) + @test_throws( + ArgumentError, + MLJBase.fit(selector, 1, X) + ) + selector.ignore = true + selector.features = x-> x in [:Zn, :Crim, :x3, :x4] + @test_throws( + ArgumentError, + MLJBase.fit(selector, 1, X) + ) + + # Test model Metadata + @test MLJBase.input_scitype(selector) == MLJBase.Table + @test MLJBase.output_scitype(selector) == MLJBase.Table +end + +# To be added with FeatureSelectorRule X = (n1=["a", "b", "a"], n2=["g", "g", "g"], n3=[7, 8, 9], +# n4 =UInt8[3,5,10], o1=[4.5, 3.6, 4.0], ) +# MLJBase.schema(X) +# Xc = coerce(X, :n1=>Multiclass, :n2=>Multiclass) \ No newline at end of file diff --git a/test/models/rfe.jl b/test/models/rfe.jl new file mode 100644 index 0000000..3910b36 --- /dev/null +++ b/test/models/rfe.jl @@ -0,0 +1,130 @@ +using .DummyTestModels +const DTM = DummyTestModels + +@testset "RecursiveFeatureElimination" begin + # Data For use in testset + X = rand(rng, 50, 10) + y = @views( + 10 .* sin.( + pi .* X[:, 1] .* X[:, 2] + ) + 20 .* (X[:, 3] .- 0.5).^ 2 .+ 10 .* X[:, 4] .+ 5 * X[:, 5] + ) + Xt = MLJBase.table(X) + Xnew = MLJBase.table(rand(rng, 50, 10)) + Xnew2 = MLJBase.table(rand(rng, 50, 10), names = [Symbol("y$i") for i in 1:10]) + + # Constructor + @test_throws FeatureSelection.ERR_SPECIFY_MODEL RecursiveFeatureElimination() + reg = DTM.DeterministicConstantRegressor() + @test_throws( + FeatureSelection.ERR_FEATURE_IMPORTANCE_SUPPORT, + RecursiveFeatureElimination(model = DTM.DeterministicConstantRegressor()) + ) + rf = MLJDecisionTreeInterface.RandomForestRegressor(rng = rng) + selector = RecursiveFeatureElimination(model = rf) + @test selector isa FeatureSelection.DeterministicRecursiveFeatureElimination + + # Fit + selector_mach = machine(selector, Xt, y) + fit!(selector_mach) + selector_fp = fitted_params(selector_mach) + @test propertynames(selector_fp) == (:features_left, :model_fitresult) + @test selector_fp.features_left == [:x1, :x2, :x3, :x4, :x5] + @test selector_fp.model_fitresult == MLJBase.fitted_params( + selector_mach.model.model, selector_mach.fitresult.model_fitresult + ) + @test feature_importances(selector_mach) == [ + :x1 => 6.0, :x2 => 5.0, :x3 => 4.0, :x4 => 3.0, :x5 => 2.0, + :x6 => 1.0, :x7 => 1.0, :x8 => 1.0, :x9 => 1.0, :x10 => 1.0 + ] + rpt = report(selector_mach) + @test rpt.ranking == [ + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 + ] + + # predict + yhat = predict(selector_mach, Xnew) + @test scitype(yhat) === AbstractVector{Continuous} + + # transform + trf = transform(selector_mach, Xnew) + sch = MLJBase.schema(trf) + @test sch.names === (:x1, :x2, :x3, :x4, :x5) + @test sch.scitypes === (Continuous, Continuous, Continuous, Continuous, Continuous) + @test_throws 
FeatureSelection.ERR_FEATURES_SEEN transform(selector_mach, Xnew2) +end + +@testset "Compare results for RFE with scikit-learn" begin + ## Sklearn rfe and rfecv + sklearn = MLJScikitLearnInterface.pyimport("sklearn") + make_friedman1 = MLJScikitLearnInterface.pyimport("sklearn.datasets"=>"make_friedman1") + RFE_sklearn = MLJScikitLearnInterface.pyimport("sklearn.feature_selection"=>"RFE") + RFECV_sklearn = MLJScikitLearnInterface.pyimport("sklearn.feature_selection"=>"RFECV") + SVR_sklearn = MLJScikitLearnInterface.pyimport("sklearn.svm"=>"SVR") + sklearn_svr_estimator = SVR_sklearn(kernel="linear") + sklearn_rfe_selector = RFE_sklearn( + sklearn_svr_estimator, n_features_to_select=5, step=1 + ) + sklearn_rfecv_selector = RFECV_sklearn( + sklearn_svr_estimator, step=1, cv=5 + ) + Xs, ys = make_friedman1(n_samples=50, n_features=10, random_state=0) + sklearn_rfe_selector = sklearn_rfe_selector.fit(Xs, ys) + sklearn_rfecv_selector = sklearn_rfecv_selector.fit(Xs, ys) + + ## MLJ RFE and RFE with CV + ## We use the same data and base estimator + ys = MLJScikitLearnInterface.pyconvert(Vector{Float64}, ys) + Xs = MLJScikitLearnInterface.pyconvert(Matrix{Float64}, Xs) + Xs = MLJBase.table(Xs) + SVR = SVMRegressor + MLJBase.reports_feature_importances(::SVR) = true + function MLJBase.feature_importances(::SVR, fitresult, report) + coef = MLJScikitLearnInterface.pyconvert(Array, fitresult[1].coef_); + imp = [Pair(Symbol("x$i"), coef[i]) for i in eachindex(coef)] + return imp + end + svm = SVR(kernel = "linear") + rfe = RecursiveFeatureElimination(model = svm, n_features=5) + mach = machine(rfe, Xs, ys) + fit!(mach) + rfecv = RecursiveFeatureElimination(model = svm) + tuning_rfe_model = TunedModel( + model = rfecv, + measure = rms, + tuning = Grid(rng=rng), + resampling = StratifiedCV(nfolds = 5), + range = range(rfecv, :n_features, values = 1:10) + ) + self_tuning_rfe_mach = machine(tuning_rfe_model, Xs, ys) + fit!(self_tuning_rfe_mach) + + ## Compare results + @test report(mach).ranking == MLJScikitLearnInterface.pyconvert( + Vector{Float64}, sklearn_rfe_selector.ranking_ + ) + @test( + report( + self_tuning_rfe_mach + ).best_report.ranking == MLJScikitLearnInterface.pyconvert( + Vector{Float64}, sklearn_rfecv_selector.ranking_ + ) + ) +end + +@testset "serialization for atomic models with non-persistent fitresults" begin + # https://github.com/alan-turing-institute/MLJ.jl/issues/1099 + X, y = (;x1=rand(8), x2 = rand(8)), categorical(collect("OXXXXOOX"), ordered=true) + rfe_model = RecursiveFeatureElimination( + DTM.EphemeralClassifier() + ) # i.e rfe based on a deterministic_classifier + mach = MLJBase.machine(rfe_model, X, y) + MLJBase.fit!(mach, verbosity=0) + yhat = MLJBase.predict(mach, MLJBase.selectrows(X, 1:2)) + io = IOBuffer() + MLJBase.save(io, mach) + seekstart(io) + mach2 = MLJBase.machine(io) + close(io) + @test MLJBase.predict(mach2, (; x1=rand(2), x2 = rand(2))) == yhat +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 67cdd20..07a5bac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,24 +1,10 @@ -using MLJExampleInterface # substitute for correct interface pkg name -using Test -using MLJBase +using FeatureSelection, MLJBase, MLJTuning, MLJDecisionTreeInterface, + MLJScikitLearnInterface, StableRNGs, StatisticalMeasures, Test import Distributions -using StableRNGs # for RNGs stable across all julia versions -rng = StableRNGs.StableRNG(123) -@testset "cool classifier" begin - n = 100 - p = 3 - nclasses = 5 - X, y = make_blobs(n, p, 
centers=nclasses, rng=rng); - model = MLJExampleInterface.CoolProbabilisticClassifier() - mach = machine(model, X, y) - fit!(mach, rows=1:2, verbosity=0) - yhat = predict(mach, rows=3) - @test size(Distributions.pdf(yhat, levels(y)) ) == (1, nclasses) +const rng = StableRNG(123) - e = evaluate!(mach, measure=BrierLoss(), verbosity=0) - @test e.measurement[1] < 1.0 - - @test fitted_params(mach).classes_seen_in_training == levels(y) - @test report(mach).n_classes_seen == nclasses -end +include("Aqua.jl") +include("models/dummy_test_models.jl") +include("models/featureselector.jl") +include("models/rfe.jl")
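+
+# One way to run this test suite locally, assuming the current directory is the package
+# root and the test dependencies are installed:
+#     using Pkg; Pkg.activate("."); Pkg.test()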