2 changes: 1 addition & 1 deletion .github/workflows/TagBot.yml
@@ -12,4 +12,4 @@ jobs:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
-        ssh: ${{ secrets.DOCUMENTER_KEY }}
+        ssh: ${{ secrets.DOCUMENTER_KEY }}
38 changes: 32 additions & 6 deletions .github/workflows/ci.yml
@@ -13,11 +13,12 @@ jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
version:
-        - '1.0'
+        - '1.6'
- '1' # automatically expands to the latest stable 1.x release of Julia.
os:
- ubuntu-latest
@@ -29,6 +30,27 @@ jobs:
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- name: "Replace julia libstdcxx ubuntu + julia v1.6"
shell: bash
if: ${{ matrix.version == '1.6' && matrix.os == 'ubuntu-latest' }}
# The following is needed for Julia <= 1.8.3 on Linux, due to the old
# version of libstdc++ shipped with Julia; taken from
# https://github.com/hhaensel/ReplaceLibstdcxx.jl/blob/main/src/ReplaceLibstdcxx.jl
run: |
julia -e '
libs = filter(x -> ! occursin("32", x), getindex.(split.(readlines(pipeline(`ldconfig -p`, `grep libstdc`)), r"\s*=>\s*"), 2))
source_dir = dirname(libs[end])
julia_lib_dir = joinpath(dirname(Sys.BINDIR), "lib", "julia")
julia_lib_file = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(julia_lib_dir, join = true)), 1, nothing)
julia_lib_version = match(r"so(\.\d+)\.", julia_lib_file).captures[1]
source_lib = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(source_dir, join = true)), 1, nothing)
julia_lib = joinpath(dirname(Sys.BINDIR), "lib", "julia", "libstdc++.so")
for src in [julia_lib, julia_lib * julia_lib_version]
islink(src) && rm(src, force = true)
symlink(source_lib, src)
@info read(`ls -al $src`, String)
end
'
- uses: actions/cache@v1
env:
cache-name: cache-artifacts
@@ -65,19 +87,23 @@ jobs:
end
end
event_name = "${{ github.event_name }}"
> **Review comment (Member):** Maybe we can just dump all the Documenter.jl stuff since we are not using it. In any case, I don't think we need all that complicated logic for this package.
ref = "${{ github.ref }}"
ref_is_master = ref == "refs/heads/master"
ref_is_dev = ref == "refs/heads/dev"
ref_is_tag = startswith(ref, "refs/tags/")
if event_name == "pull_request"
base_ref = "${{ github.base_ref }}"
head_ref = "${{ github.head_ref }}"
base_repository = "${{ github.repository }}"
head_repository = "${{ github.event.pull_request.head.repo.full_name }}"
-          build_docs = (base_ref == "master") && (head_ref == "dev") && (base_repository == head_repository)
+          is_not_fork = base_repository == head_repository
+          build_docs = (base_ref == "master") && (head_ref == "dev") && (is_not_fork)
elseif event_name == "push"
ref = "${{ github.ref }}"
build_docs = (ref == "refs/heads/master") || (startswith(ref, "refs/tags/"))
build_docs = ref_is_master || ref_is_dev || ref_is_tag
elseif event_name == "schedule"
-          build_docs = ref == "refs/heads/master"
+          build_docs = ref_is_master || ref_is_dev
elseif event_name == "workflow_dispatch"
-          build_docs = ref == "refs/heads/master"
+          build_docs = ref_is_master || ref_is_dev
else
build_docs = false
end
36 changes: 30 additions & 6 deletions Project.toml
@@ -4,21 +4,45 @@ authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
version = "0.1.0"

[deps]
Example = "7876af07-990d-54b4-ab0e-23690620f79a"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Example = "0.5"
MLJModelInterface = "1"
ScientificTypesBase = "1, 2, 3"
julia = "1"
Aqua = "0.8"
Distributions = "0.25"
julia = "1.6"
MLJBase = "1.1"
MLJTuning = "0.8"
MLJDecisionTreeInterface = "0.4"
MLJScikitLearnInterface = "0.6"
MLJModelInterface = "1.4"
ScientificTypesBase = "3"
StableRNGs = "1"
StatisticalMeasures = "0.1"
Tables = "1.2"
Test = "1.6"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Distributions", "MLJBase", "StableRNGs", "Test"]
test = [
"Aqua",
"Distributions",
"MLJBase",
"MLJTuning",
"MLJDecisionTreeInterface",
"MLJScikitLearnInterface",
"StableRNGs",
"StatisticalMeasures",
"Test"
]
150 changes: 102 additions & 48 deletions README.md
@@ -1,50 +1,104 @@
# FeatureSelection.jl

This repository is a template for creating repositories that contain
glue code between (i) packages providing machine learning algorithms; and (ii)
the machine learning toolbox
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) - that is,
for so-called *interface-only packages*.

## When to use this template

This template is intended for use when a package providing a machine
learning model algorithm is not hosting the code that implements the
MLJ model API, and a separate package for this purpose is to be
created. This repo is itself a working implementation but should
be used in conjunction with the more detailed [model implementation
guidelines](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/).
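
For orientation only, here is a minimal sketch of the model API contract that such a package implements for each exposed model, using a toy regressor; the names below are illustrative and not part of this template:

```julia
import MLJModelInterface as MMI

# Toy deterministic regressor that predicts the training-target mean,
# purely to illustrate the fit/predict contract:
mutable struct MeanRegressor <: MMI.Deterministic end

function MMI.fit(::MeanRegressor, verbosity::Int, X, y)
    fitresult = sum(y) / length(y)   # the "learned parameter"
    cache = nothing                  # nothing to reuse between calls to fit
    report = (; nobs=length(y))      # user-accessible byproducts of training
    return fitresult, cache, report
end

MMI.predict(::MeanRegressor, fitresult, Xnew) =
    fill(fitresult, size(MMI.matrix(Xnew), 1))
```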

## How to use this template

1. Clone this repository or use it as a template if available from your organization.

2. Rename this repository, replacing the word "Example" with the name of the model-providing package.

3. Develop the contents of src/MLJExampleInterface.jl appropriately.

4. Rename src/MLJExampleInterface.jl appropriately.

5. Remove Example from Project.toml and instead add the model-providing package.

6. **GENERATE A NEW UUID in Project.toml** and change the Project.toml name and author appropriately.

7. You may want to remove the Distributions test dependency if you don't need it.

8. Replace every instance of "Example" in this README.md with the name of the model-providing package and adjust the organization name in the link.

9. Remove everything in this README.md except what is below the line you are currently reading 😉.


# MLJ.jl <--> Example.jl

Repository implementing the [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) model interface for models provided by
[Example.jl](https://github.com/JuliaLang/Example.jl).

-| Linux | Coverage |
-| :------------ | :------- |
-| [![Build Status](https://github.com/JuliaAI/MLJExampleInterface.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/MLJExampleInterface.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/MLJExampleInterface.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/MLJExampleInterface.jl?branch=master) |
+| Linux | Coverage | Code Style |
+| :------------ | :------- | :------------- |
+| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) |

Repository housing feature selection algorithms for use with the machine learning toolbox
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/).

The `FeatureSelector` model builds on contributions originally residing in [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl/blob/v0.16.15/src/builtins/Transformers.jl#L189-L266).
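
For a quick taste of the transformer, here is a minimal sketch (the table is illustrative; the standard MLJ machine workflow is assumed):

```julia
using MLJ, FeatureSelection

X = (ordinal=[1, 2, 3], name=["a", "b", "c"])
selector = FeatureSelector(features=[:ordinal])  # retain only the :ordinal column
mach = machine(selector, X)
fit!(mach)
transform(mach, X)  # table containing just the :ordinal column
```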

# Installation
In a running Julia session (version 1.6 or newer), run
```julia
import Pkg;
Pkg.add("FeatureSelection")
```

# Example Usage
Let's build a supervised recursive feature eliminator with `RandomForestRegressor`
from DecisionTree.jl as our base model. First we need a dataset to train on, so we
create a synthetic dataset popularly known in the R community as the Friedman #1
dataset; its target is `10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5` (the
classical version adds Gaussian noise, omitted here). Since the target depends on
only the first five columns of the feature table, we expect recursive feature
elimination to return those five columns as the important features.
```julia
using MLJ, FeatureSelection
# Review thread (from the PR discussion):
#   Member: "I am proposing that FeatureSelection be a dep of MLJ with all names
#   re-exported. So you won't need FeatureSelection here, right?"
#   Member (Author): "Yeah"

using StableRNGs
rng = StableRNG(10)
A = rand(rng, 50, 10)
X = MLJ.table(A) # features
y = @views(
    10 .* sin.(
        pi .* A[:, 1] .* A[:, 2]
    ) .+ 20 .* (A[:, 3] .- 0.5).^2 .+ 10 .* A[:, 4] .+ 5 .* A[:, 5]
) # target
```
Now that we have our data, we can create our recursive feature elimination model and
train it on our dataset:
```julia
RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
forest = RandomForestRegressor(rng=rng)
rfe = RecursiveFeatureElimination(
    model=forest, n_features=5, step=1
)  # see the docstring for a description of the defaults
mach = machine(rfe, X, y)
fit!(mach)
```
We can inspect the feature importances in two ways:
```julia
# A feature with lower rank is more significant than one with higher rank; a
# feature with higher importance is more significant than one with lower importance.
report(mach).ranking      # returns [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
feature_importances(mach) # returns feature => importance pairs
```
We can view the important features used by our model by inspecting the `fitted_params`
object.
```julia
p = fitted_params(mach)
p.features_left == [:x1, :x2, :x3, :x4, :x5]
```
We can also call the `predict` method on the fitted machine, to predict using a
random forest regressor trained on only the important features, or call the
`transform` method, to select just those features from a new table containing all
the original features. For more information, type `?RecursiveFeatureElimination`
in the Julia REPL.
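
As a quick illustration of both calls (a sketch; the test table here is illustrative and assumed to share the schema of `X`):
```julia
Xnew = MLJ.table(rand(rng, 5, 10))
predict(mach, Xnew)    # predictions from the forest retrained on the selected features
transform(mach, Xnew)  # Xnew reduced to the five selected features
```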

Suppose we didn't know that our synthetic dataset depends on only five columns of
the feature table. We could then wrap our recursive feature elimination model in a
tuning strategy, using five-fold cross-validation (`StratifiedCV(nfolds=5)`) to
select the optimal value of `n_features`. Here we use a simple grid search with
root mean squared error as the measure.
```julia
rfe = RecursiveFeatureElimination(model=forest)
tuning_rfe_model = TunedModel(
    model=rfe,
    measure=rms,
    tuning=Grid(rng=rng),
    resampling=StratifiedCV(nfolds=5),
    range=range(rfe, :n_features, values=1:10)
)
self_tuning_rfe_mach = machine(tuning_rfe_model, X, y)
fit!(self_tuning_rfe_mach)
```
As before, we can inspect the important features via the object returned by
`fitted_params`, or via `feature_importances`, as shown below.
```julia
fitted_params(self_tuning_rfe_mach).best_fitted_params.features_left == [:x1, :x2, :x3, :x4, :x5]
feature_importances(self_tuning_rfe_mach) # returns feature => importance pairs
```
and call `predict` on the tuned model machine as shown below
```julia
Xnew = MLJ.table(rand(rng, 50, 10)) # create test data
predict(self_tuning_rfe_mach, Xnew)
```
In this case, prediction is done using the best recursive feature elimination model
obtained from the tuning process above.
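
To check which value of `n_features` the grid search settled on, one can query the tuning report (a sketch, assuming the standard `MLJTuning` report fields):
```julia
report(self_tuning_rfe_mach).best_model.n_features  # expected to be 5 for this dataset
```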

For resampling methods different from cross-validation, and for other
`TunedModel` options, such as parallelization, see the
-[Tuning Models](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/) section of the MLJ manual.
+[MLJ Documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/)
27 changes: 27 additions & 0 deletions src/FeatureSelection.jl
@@ -0,0 +1,27 @@
module FeatureSelection

using MLJModelInterface, Tables, ScientificTypesBase

export FeatureSelector, RecursiveFeatureElimination

const MMI = MLJModelInterface

## Includes
include("models/featureselector.jl")
include("models/rfe.jl")

## Pkg Traits
MMI.metadata_pkg.(
    (
        DeterministicRecursiveFeatureElimination,
        ProbabilisticRecursiveFeatureElimination,
        FeatureSelector
    ),
    package_name = "FeatureSelection",
    package_uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6",
    package_url = "https://github.com/JuliaAI/FeatureSelection.jl",
    is_pure_julia = true,
    package_license = "MIT"
)

end # module