2 changes: 1 addition & 1 deletion .github/workflows/TagBot.yml
@@ -12,4 +12,4 @@ jobs:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
-        ssh: ${{ secrets.DOCUMENTER_KEY }}
+        ssh: ${{ secrets.DOCUMENTER_KEY }}
38 changes: 32 additions & 6 deletions .github/workflows/ci.yml
@@ -13,11 +13,12 @@ jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
version:
-        - '1.0'
+        - '1.6'
- '1' # automatically expands to the latest stable 1.x release of Julia.
os:
- ubuntu-latest
@@ -29,6 +30,27 @@ jobs:
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- name: "Replace julia libstdcxx ubuntu + julia v1.6"
shell: bash
if: ${{ matrix.version == '1.6' && matrix.os == 'ubuntu-latest' }}
# The following is needed for Julia <= 1.8.3 on Linux, due to the old
# version of libstdc++ shipped with Julia; taken from
# https://github.com/hhaensel/ReplaceLibstdcxx.jl/blob/main/src/ReplaceLibstdcxx.jl
run: |
julia -e '
libs = filter(x -> ! occursin("32", x), getindex.(split.(readlines(pipeline(`ldconfig -p`, `grep libstdc`)), r"\s*=>\s*"), 2))
source_dir = dirname(libs[end])
julia_lib_dir = joinpath(dirname(Sys.BINDIR), "lib", "julia")
julia_lib_file = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(julia_lib_dir, join = true)), 1, nothing)
julia_lib_version = match(r"so(\.\d+)\.", julia_lib_file).captures[1]
source_lib = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(source_dir, join = true)), 1, nothing)
julia_lib = joinpath(dirname(Sys.BINDIR), "lib", "julia", "libstdc++.so")
for src in [julia_lib, julia_lib * julia_lib_version]
islink(src) && rm(src, force = true)
symlink(source_lib, src)
@info read(`ls -al $src`, String)
end
'
- uses: actions/cache@v1
env:
cache-name: cache-artifacts
@@ -65,19 +87,23 @@ jobs:
end
end
event_name = "${{ github.event_name }}"
> **Review comment (Member):** Maybe we can just dump all the Documenter.jl stuff since we are not using it. In any case, I don't think we need all that complicated logic for this package.
ref = "${{ github.ref }}"
ref_is_master = ref == "refs/heads/master"
ref_is_dev = ref == "refs/heads/dev"
ref_is_tag = startswith(ref, "refs/tags/")
if event_name == "pull_request"
base_ref = "${{ github.base_ref }}"
head_ref = "${{ github.head_ref }}"
base_repository = "${{ github.repository }}"
head_repository = "${{ github.event.pull_request.head.repo.full_name }}"
-          build_docs = (base_ref == "master") && (head_ref == "dev") && (base_repository == head_repository)
+          is_not_fork = base_repository == head_repository
+          build_docs = (base_ref == "master") && (head_ref == "dev") && (is_not_fork)
elseif event_name == "push"
ref = "${{ github.ref }}"
build_docs = (ref == "refs/heads/master") || (startswith(ref, "refs/tags/"))
build_docs = ref_is_master || ref_is_dev || ref_is_tag
elseif event_name == "schedule"
-          build_docs = ref == "refs/heads/master"
+          build_docs = ref_is_master || ref_is_dev
elseif event_name == "workflow_dispatch"
-          build_docs = ref == "refs/heads/master"
+          build_docs = ref_is_master || ref_is_dev
else
build_docs = false
end
36 changes: 30 additions & 6 deletions Project.toml
@@ -4,21 +4,45 @@ authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
version = "0.1.0"

[deps]
Example = "7876af07-990d-54b4-ab0e-23690620f79a"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Example = "0.5"
MLJModelInterface = "1"
ScientificTypesBase = "1, 2, 3"
julia = "1"
Aqua = "0.8"
Distributions = "0.25"
julia = "1.6"
MLJBase = "1.1"
MLJTuning = "0.8"
MLJDecisionTreeInterface = "0.4"
MLJScikitLearnInterface = "0.6"
MLJModelInterface = "1.4"
ScientificTypesBase = "3"
StableRNGs = "1"
StatisticalMeasures = "0.1"
Tables = "1.2"
Test = "1.6"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Distributions", "MLJBase", "StableRNGs", "Test"]
test = [
"Aqua",
"Distributions",
"MLJBase",
"MLJTuning",
"MLJDecisionTreeInterface",
"MLJScikitLearnInterface",
"StableRNGs",
"StatisticalMeasures",
"Test"
]
150 changes: 102 additions & 48 deletions README.md
@@ -1,50 +1,104 @@
# FeatureSelection.jl

This repository is a template for creating repositories that contain
glue code between (i) packages providing machine learning algorithms; and (ii)
the machine learning toolbox
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) - that is,
for so-called *interface-only packages*.

## When to use this template

This template is intended for use when a package providing a machine
learning model algorithm is not hosting the code that implements the
MLJ model API, and a separate package for this purpose is to be
created. This repo is itself a working implementation but should
be used in conjunction with the more detailed [model implementation
guidelines](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/).
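
For orientation only, here is a minimal sketch of the model API contract that such a package implements for each exposed model, using a toy regressor; the names below are illustrative and not part of this template:

```julia
import MLJModelInterface as MMI

# Toy deterministic regressor that predicts the training-target mean,
# purely to illustrate the fit/predict contract:
mutable struct MeanRegressor <: MMI.Deterministic end

function MMI.fit(::MeanRegressor, verbosity::Int, X, y)
    fitresult = sum(y) / length(y)   # the "learned parameter"
    cache = nothing                  # nothing to reuse between calls to fit
    report = (; nobs=length(y))      # user-accessible byproducts of training
    return fitresult, cache, report
end

MMI.predict(::MeanRegressor, fitresult, Xnew) =
    fill(fitresult, size(MMI.matrix(Xnew), 1))
```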

## How to use this template

1. Clone this repository or use it as a template if available from your organization.

2. Rename this repository, replacing the word "Example" with the name of the model-providing package.

3. Develop the contents of src/MLJExampleInterface.jl appropriately.

4. Rename src/MLJExampleInterface.jl appropriately.

5. Remove Example from Project.toml and instead add the model-providing package.

6. **GENERATE A NEW UUID in Project.toml** and change the Project.toml name and author appropriately.

7. You may want to remove the Distributions test dependency if you don't need it.

8. Replace every instance of "Example" in this README.md with the name of the model-providing package and adjust the organization name in the link.

9. Remove everything in this README.md except what is below the line you are currently reading 😉.


# MLJ.jl <--> Example.jl

Repository implementing the [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) model interface for models provided by
[Example.jl](https://github.com/JuliaLang/Example.jl).

-| Linux | Coverage |
-| :------------ | :------- |
-| [![Build Status](https://github.com/JuliaAI/MLJExampleInterface.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/MLJExampleInterface.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/MLJExampleInterface.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/MLJExampleInterface.jl?branch=master) |
+| Linux | Coverage | Code Style |
+| :------------ | :------- | :------------- |
+| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) |

Repository housing feature selection algorithms for use with the machine learning toolbox
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/).

The `FeatureSelector` model builds on contributions originally residing in [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl/blob/v0.16.15/src/builtins/Transformers.jl#L189-L266).
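
For a quick taste of the transformer, here is a minimal sketch (the table is illustrative; the standard MLJ machine workflow is assumed):

```julia
using MLJ, FeatureSelection

X = (ordinal=[1, 2, 3], name=["a", "b", "c"])
selector = FeatureSelector(features=[:ordinal])  # retain only the :ordinal column
mach = machine(selector, X)
fit!(mach)
transform(mach, X)  # table containing just the :ordinal column
```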

# Installation
In a running Julia session (version 1.6 or newer), run
```julia
import Pkg;
Pkg.add("FeatureSelection")
```

# Example Usage
Let's build a supervised recursive feature eliminator with `RandomForestRegressor`
from DecisionTree.jl as our base model. First we need a dataset to train on, so we
create a synthetic dataset popularly known in the R community as the Friedman #1
dataset; its target is `10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5` (the
classical version adds Gaussian noise, omitted here). Since the target depends on
only the first five columns of the feature table, we expect recursive feature
elimination to return those five columns as the important features.
```julia
using MLJ, FeatureSelection
# Review thread (from the PR discussion):
#   Member: "I am proposing that FeatureSelection be a dep of MLJ with all names
#   re-exported. So you won't need FeatureSelection here, right?"
#   Member (Author): "Yeah"

using StableRNGs
rng = StableRNG(10)
A = rand(rng, 50, 10)
X = MLJ.table(A) # features
y = @views(
    10 .* sin.(
        pi .* A[:, 1] .* A[:, 2]
    ) .+ 20 .* (A[:, 3] .- 0.5).^2 .+ 10 .* A[:, 4] .+ 5 .* A[:, 5]
) # target
```
Now that we have our data, we can create our recursive feature elimination model and
train it on our dataset:
```julia
RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
forest = RandomForestRegressor(rng=rng)
rfe = RecursiveFeatureElimination(
    model=forest, n_features=5, step=1
)  # see the docstring for a description of the defaults
mach = machine(rfe, X, y)
fit!(mach)
```
We can inspect the feature importances in two ways:
```julia
# A feature with lower rank is more significant than one with higher rank; a
# feature with higher importance is more significant than one with lower importance.
report(mach).ranking      # returns [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
feature_importances(mach) # returns feature => importance pairs
```
We can view the important features used by our model by inspecting the `fitted_params`
object.
```julia
p = fitted_params(mach)
p.features_left == [:x1, :x2, :x3, :x4, :x5]
```
We can also call the `predict` method on the fitted machine, to predict using a
random forest regressor trained on only the important features, or call the
`transform` method, to select just those features from a new table containing all
the original features. For more information, type `?RecursiveFeatureElimination`
in the Julia REPL.
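
As a quick illustration of both calls (a sketch; the test table here is illustrative and assumed to share the schema of `X`):
```julia
Xnew = MLJ.table(rand(rng, 5, 10))
predict(mach, Xnew)    # predictions from the forest retrained on the selected features
transform(mach, Xnew)  # Xnew reduced to the five selected features
```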

Suppose we didn't know that our synthetic dataset depends on only five columns of
the feature table. We could then wrap our recursive feature elimination model in a
tuning strategy, using five-fold cross-validation (`StratifiedCV(nfolds=5)`) to
select the optimal value of `n_features`. Here we use a simple grid search with
root mean squared error as the measure.
```julia
rfe = RecursiveFeatureElimination(model=forest)
tuning_rfe_model = TunedModel(
    model=rfe,
    measure=rms,
    tuning=Grid(rng=rng),
    resampling=StratifiedCV(nfolds=5),
    range=range(rfe, :n_features, values=1:10)
)
self_tuning_rfe_mach = machine(tuning_rfe_model, X, y)
fit!(self_tuning_rfe_mach)
```
As before, we can inspect the important features via the object returned by
`fitted_params`, or via `feature_importances`, as shown below.
```julia
fitted_params(self_tuning_rfe_mach).best_fitted_params.features_left == [:x1, :x2, :x3, :x4, :x5]
feature_importances(self_tuning_rfe_mach) # returns feature => importance pairs
```
and call `predict` on the tuned model machine as shown below
```julia
Xnew = MLJ.table(rand(rng, 50, 10)) # create test data
predict(self_tuning_rfe_mach, Xnew)
```
In this case, prediction is done using the best recursive feature elimination model
obtained from the tuning process above.
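
To check which value of `n_features` the grid search settled on, one can query the tuning report (a sketch, assuming the standard `MLJTuning` report fields):
```julia
report(self_tuning_rfe_mach).best_model.n_features  # expected to be 5 for this dataset
```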

For resampling methods different from cross-validation, and for other
`TunedModel` options, such as parallelization, see the
-[Tuning Models](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/) section of the MLJ manual.
+[MLJ Documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/)
27 changes: 27 additions & 0 deletions src/FeatureSelection.jl
@@ -0,0 +1,27 @@
module FeatureSelection

using MLJModelInterface, Tables, ScientificTypesBase

export FeatureSelector, RecursiveFeatureElimination

const MMI = MLJModelInterface

## Includes
include("models/featureselector.jl")
include("models/rfe.jl")

## Pkg Traits
MMI.metadata_pkg.(
    (
        DeterministicRecursiveFeatureElimination,
        ProbabilisticRecursiveFeatureElimination,
        FeatureSelector
    ),
    package_name = "FeatureSelection",
    package_uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6",
    package_url = "https://github.com/JuliaAI/FeatureSelection.jl",
    is_pure_julia = true,
    package_license = "MIT"
)

end # module