diff --git a/.travis.yml b/.travis.yml index 3b311a04f..29282c438 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,7 @@ after_success: jobs: include: - stage: "Documentation" - julia: 1.1 + julia: 1.2 os: linux # disable global before_script in order not to install Compose twice before_script: diff --git a/docs/Project.toml b/docs/Project.toml index 4ca116109..35b242382 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -4,7 +4,9 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" @@ -12,6 +14,8 @@ MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" @@ -19,3 +23,5 @@ TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [compat] Documenter = "~0.22" +Missings = "<0.4.2" +julia = "1.2" diff --git a/docs/make.jl b/docs/make.jl index 472657671..d0f495887 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -10,17 +10,24 @@ using MLJModels.Constant using MLJModels using ScientificTypes +# using Literate +# Literate.markdown("common_mlj_workflows.jl", ".", +# codefence = "```@example workflows" => "```") + pages = Any["Getting Started"=>"index.md", - "Evaluating model performance"=>"evaluating_model_performance.md", + "Common MLJ Workflows" => "common_mlj_workflows.md", + "Model Search" => "model_search.md", + "Machines" => "machines.md", + "Evaluating Model Performance"=>"evaluating_model_performance.md", "Performance Measures"=> "performance_measures.md", - "Tuning models"=>"tuning_models.md", + "Tuning Models"=>"tuning_models.md", "Built-in Transformers" => "built_in_transformers.md", "Composing Models" => "composing_models.md", "Homogeneous Ensembles" => "homogeneous_ensembles.md", "Simple User Defined Models" => "simple_user_defined_models.md", "Adding Models for General Use" => "adding_models_for_general_use.md", "Benchmarking" => "benchmarking.md", - "Working with tasks" => "working_with_tasks.md", + "Working with Tasks" => "working_with_tasks.md", "Internals"=>"internals.md", "Glossary"=>"glossary.md", "API"=>"api.md", diff --git a/docs/src/common_mlj_workflows.ipynb b/docs/src/common_mlj_workflows.ipynb new file mode 100644 index 000000000..58786215d --- /dev/null +++ b/docs/src/common_mlj_workflows.ipynb @@ -0,0 +1,1855 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Common MLJ Workflows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "using MLJ\n", + "using RDatasets\n", + "channing = dataset(\"boot\", \"channing\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspecting 
metadata, including column scientific types:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(names = (:Sex, :Entry, :Exit, :Time, :Cens),\n", + " types = (CategoricalString{UInt8}, Int32, Int32, Int32, Int32),\n", + " scitypes = (Multiclass{2}, Count, Count, Count, Count),\n", + " nrows = 462,)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema(channing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Unpacking data and correcting for wrong scitypes:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m──────────────────────────\u001b[0m┬\u001b[0m────────────\u001b[0m┬\u001b[0m───────────────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m Sex \u001b[0m│\u001b[0m\u001b[1m Entry \u001b[0m│\u001b[0m\u001b[1m Cens \u001b[0m│\u001b[0m\n", + "│\u001b[0m\u001b[90m CategoricalString{UInt8} \u001b[0m│\u001b[0m\u001b[90m Float64 \u001b[0m│\u001b[0m\u001b[90m CategoricalValue{Int32,UInt8} \u001b[0m│\u001b[0m\n", + "│\u001b[0m\u001b[90m Multiclass{2} \u001b[0m│\u001b[0m\u001b[90m Continuous \u001b[0m│\u001b[0m\u001b[90m Multiclass{2} \u001b[0m│\u001b[0m\n", + "├\u001b[0m──────────────────────────\u001b[0m┼\u001b[0m────────────\u001b[0m┼\u001b[0m───────────────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m Male \u001b[0m│\u001b[0m 782.0 \u001b[0m│\u001b[0m 1 \u001b[0m│\u001b[0m\n", + "│\u001b[0m Male \u001b[0m│\u001b[0m 1020.0 \u001b[0m│\u001b[0m 1 \u001b[0m│\u001b[0m\n", + "│\u001b[0m Male \u001b[0m│\u001b[0m 856.0 \u001b[0m│\u001b[0m 1 \u001b[0m│\u001b[0m\n", + "│\u001b[0m Male \u001b[0m│\u001b[0m 915.0 \u001b[0m│\u001b[0m 1 \u001b[0m│\u001b[0m\n", + "└\u001b[0m──────────────────────────\u001b[0m┴\u001b[0m────────────\u001b[0m┴\u001b[0m───────────────────────────────\u001b[0m┘\u001b[0m\n" + ] + } + ], + "source": [ + "y, X = unpack(channing,\n", + " ==(:Exit), # y is the :Exit column\n", + " !=(:Time); # X is the rest, except :Time\n", + " :Exit=>Continuous,\n", + " :Entry=>Continuous,\n", + " :Cens=>Multiclass)\n", + "first(X, 4) |> pretty" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4-element Array{Float64,1}:\n", + " 909.0\n", + " 1128.0\n", + " 969.0\n", + " 957.0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y[1:4]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading a built-in supervised dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m──────────────\u001b[0m┬\u001b[0m─────────────\u001b[0m┬\u001b[0m──────────────\u001b[0m┬\u001b[0m─────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m sepal_length \u001b[0m│\u001b[0m\u001b[1m sepal_width \u001b[0m│\u001b[0m\u001b[1m petal_length \u001b[0m│\u001b[0m\u001b[1m petal_width \u001b[0m│\u001b[0m\n", + "│\u001b[0m\u001b[90m Float64 \u001b[0m│\u001b[0m\u001b[90m Float64 \u001b[0m│\u001b[0m\u001b[90m Float64 \u001b[0m│\u001b[0m\u001b[90m Float64 \u001b[0m│\u001b[0m\n", + "│\u001b[0m\u001b[90m Continuous \u001b[0m│\u001b[0m\u001b[90m Continuous \u001b[0m│\u001b[0m\u001b[90m Continuous \u001b[0m│\u001b[0m\u001b[90m Continuous 
\u001b[0m│\u001b[0m\n", + "├\u001b[0m──────────────\u001b[0m┼\u001b[0m─────────────\u001b[0m┼\u001b[0m──────────────\u001b[0m┼\u001b[0m─────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m 5.1 \u001b[0m│\u001b[0m 3.5 \u001b[0m│\u001b[0m 1.4 \u001b[0m│\u001b[0m 0.2 \u001b[0m│\u001b[0m\n", + "│\u001b[0m 4.9 \u001b[0m│\u001b[0m 3.0 \u001b[0m│\u001b[0m 1.4 \u001b[0m│\u001b[0m 0.2 \u001b[0m│\u001b[0m\n", + "│\u001b[0m 4.7 \u001b[0m│\u001b[0m 3.2 \u001b[0m│\u001b[0m 1.3 \u001b[0m│\u001b[0m 0.2 \u001b[0m│\u001b[0m\n", + "│\u001b[0m 4.6 \u001b[0m│\u001b[0m 3.1 \u001b[0m│\u001b[0m 1.5 \u001b[0m│\u001b[0m 0.2 \u001b[0m│\u001b[0m\n", + "└\u001b[0m──────────────\u001b[0m┴\u001b[0m─────────────\u001b[0m┴\u001b[0m──────────────\u001b[0m┴\u001b[0m─────────────\u001b[0m┘\u001b[0m\n" + ] + } + ], + "source": [ + "X, y = @load_iris;\n", + "first(X, 4) |> pretty" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4-element CategoricalArray{String,1,UInt32}:\n", + " \"setosa\"\n", + " \"setosa\"\n", + " \"setosa\"\n", + " \"setosa\"" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y[1:4]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Searching for a supervised model:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "37-element Array{NamedTuple,1}:\n", + " (name = ARDRegressor, package_name = ScikitLearn, ... ) \n", + " (name = AdaBoostRegressor, package_name = ScikitLearn, ... ) \n", + " (name = BaggingRegressor, package_name = ScikitLearn, ... ) \n", + " (name = BayesianRidgeRegressor, package_name = ScikitLearn, ... ) \n", + " (name = ConstantRegressor, package_name = MLJModels, ... ) \n", + " (name = DecisionTreeRegressor, package_name = DecisionTree, ... ) \n", + " (name = DeterministicConstantRegressor, package_name = MLJModels, ... ) \n", + " (name = ElasticNetCVRegressor, package_name = ScikitLearn, ... ) \n", + " (name = ElasticNetRegressor, package_name = ScikitLearn, ... ) \n", + " (name = EpsilonSVR, package_name = LIBSVM, ... ) \n", + " (name = GaussianProcessRegressor, package_name = ScikitLearn, ... ) \n", + " (name = GradientBoostingRegressor, package_name = ScikitLearn, ... ) \n", + " (name = HuberRegressor, package_name = ScikitLearn, ... ) \n", + " ⋮ \n", + " (name = OrthogonalMatchingPursuitRegressor, package_name = ScikitLearn, ... )\n", + " (name = PassiveAggressiveRegressor, package_name = ScikitLearn, ... ) \n", + " (name = RandomForestRegressor, package_name = ScikitLearn, ... ) \n", + " (name = RidgeCVRegressor, package_name = ScikitLearn, ... ) \n", + " (name = RidgeRegressor, package_name = MultivariateStats, ... ) \n", + " (name = RidgeRegressor, package_name = ScikitLearn, ... ) \n", + " (name = SGDRegressor, package_name = ScikitLearn, ... ) \n", + " (name = SVMLRegressor, package_name = ScikitLearn, ... ) \n", + " (name = SVMNuRegressor, package_name = ScikitLearn, ... ) \n", + " (name = SVMRegressor, package_name = ScikitLearn, ... ) \n", + " (name = TheilSenRegressor, package_name = ScikitLearn, ... ) \n", + " (name = XGBoostRegressor, package_name = XGBoost, ... 
) " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = @load_boston\n", + "models(matching(X, y))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[35mDecisionTreeRegressor from DecisionTree.jl.\u001b[39m\n", + "\u001b[35m[Documentation](https://github.com/bensadeghi/DecisionTree.jl).\u001b[39m\n", + "(name = \"DecisionTreeRegressor\",\n", + " package_name = \"DecisionTree\",\n", + " is_supervised = true,\n", + " docstring = \"DecisionTreeRegressor from DecisionTree.jl.\\n[Documentation](https://github.com/bensadeghi/DecisionTree.jl).\",\n", + " hyperparameter_types = [\"Float64\", \"Int64\", \"Int64\", \"Int64\", \"Float64\", \"Int64\", \"Bool\"],\n", + " hyperparameters = Symbol[:pruning_purity_threshold, :max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune],\n", + " implemented_methods = Symbol[:fit, :predict, :clean!, :fitted_params],\n", + " is_pure_julia = true,\n", + " is_wrapper = false,\n", + " load_path = \"MLJModels.DecisionTree_.DecisionTreeRegressor\",\n", + " package_license = \"unknown\",\n", + " package_url = \"https://github.com/bensadeghi/DecisionTree.jl\",\n", + " package_uuid = \"7806a523-6efd-50cb-b5f6-3fa6f1930dbb\",\n", + " prediction_type = :deterministic,\n", + " supports_weights = false,\n", + " input_scitype = ScientificTypes.Table{_s13} where _s13<:(AbstractArray{_s12,1} where _s12<:Continuous),\n", + " target_scitype = AbstractArray{_s491,1} where _s491<:Continuous,)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "models(matching(X, y))[6]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More refined searches:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4-element Array{NamedTuple,1}:\n", + " (name = DecisionTreeRegressor, package_name = DecisionTree, ... ) \n", + " (name = DeterministicConstantRegressor, package_name = MLJModels, ... )\n", + " (name = KNNRegressor, package_name = NearestNeighbors, ... ) \n", + " (name = RidgeRegressor, package_name = MultivariateStats, ... ) " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "models() do model\n", + " matching(model, X, y) &&\n", + " model.prediction_type == :deterministic &&\n", + " model.is_pure_julia\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Searching for an unsupervised model:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9-element Array{NamedTuple,1}:\n", + " (name = FeatureSelector, package_name = MLJModels, ... ) \n", + " (name = ICA, package_name = MultivariateStats, ... ) \n", + " (name = KMeans, package_name = Clustering, ... ) \n", + " (name = KMedoids, package_name = Clustering, ... ) \n", + " (name = KernelPCA, package_name = MultivariateStats, ... )\n", + " (name = OneClassSVM, package_name = LIBSVM, ... ) \n", + " (name = OneHotEncoder, package_name = MLJModels, ... ) \n", + " (name = PCA, package_name = MultivariateStats, ... ) \n", + " (name = Standardizer, package_name = MLJModels, ... 
) " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "models(matching(X))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Getting the metadata entry for a given model type:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[35mRidgeRegressor from MultivariateStats.jl.\u001b[39m\n", + "\u001b[35m[Documentation](https://github.com/JuliaStats/MultivariateStats.jl).\u001b[39m\n", + "(name = \"RidgeRegressor\",\n", + " package_name = \"MultivariateStats\",\n", + " is_supervised = true,\n", + " docstring = \"RidgeRegressor from MultivariateStats.jl.\\n[Documentation](https://github.com/JuliaStats/MultivariateStats.jl).\",\n", + " hyperparameter_types = [\"Float64\"],\n", + " hyperparameters = Symbol[:lambda],\n", + " implemented_methods = Symbol[:fit, :predict, :clean!, :fitted_params],\n", + " is_pure_julia = true,\n", + " is_wrapper = false,\n", + " load_path = \"MLJModels.MultivariateStats_.RidgeRegressor\",\n", + " package_license = \"unknown\",\n", + " package_url = \"https://github.com/JuliaStats/MultivariateStats.jl\",\n", + " package_uuid = \"6f286f6a-111f-5878-ab1e-185364afe411\",\n", + " prediction_type = :deterministic,\n", + " supports_weights = false,\n", + " input_scitype = ScientificTypes.Table{_s13} where _s13<:(AbstractArray{_s12,1} where _s12<:Continuous),\n", + " target_scitype = AbstractArray{Continuous,1},)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "info(\"PCA\")\n", + "info(\"RidgeRegressor\", pkg=\"MultivariateStats\") # a model type in multiple packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### *More on model matching*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `model` is in the list returned by `models(test)` exactly when\n", + " `test(model) == true`. (Here `model` is some model type metadata\n", + " entry, as returned by `info(...)`.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `matching(model, X, y) == true` exactly when `model` is supervised\n", + " and admits inputs and targets with the scientific types of `X` and\n", + " `y`, respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `matching(model, X) == true` exaclty when `model` is unsupervised\n", + " and admits inputs with the scientific types of `X`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- The testing objects `matching(model)`, `matching(X, y)` and `matching(X)`,\n", + " which are callable and `Bool`-valued, are just the curried versions of\n", + " the above. So, for example, `matching(X, y)(model) =\n", + " matching(model, X, y)`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiating a model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading model code:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(pruning_purity = 1.0,\n", + " max_depth = -1,\n", + " min_samples_leaf = 1,\n", + " min_samples_split = 2,\n", + " min_purity_increase = 0.0,\n", + " n_subfeatures = 0,\n", + " display_depth = 5,\n", + " post_prune = false,\n", + " merge_purity_threshold = 0.9,\n", + " pdf_smoothing = 0.05,)\u001b[34m @ 7…72\u001b[39m" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@load DecisionTreeClassifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instantiating a model:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(pruning_purity = 1.0,\n", + " max_depth = 4,\n", + " min_samples_leaf = 1,\n", + " min_samples_split = 5,\n", + " min_purity_increase = 0.0,\n", + " n_subfeatures = 0,\n", + " display_depth = 5,\n", + " post_prune = false,\n", + " merge_purity_threshold = 0.9,\n", + " pdf_smoothing = 0.05,)\u001b[34m @ 7…86\u001b[39m" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = DecisionTreeClassifier(min_samples_split=5, max_depth=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "or" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: A model type \"DecisionTreeClassifier\" is already loaded. \n", + "│ No new code loaded. 
\n", + "└ @ MLJModels /Users/anthony/Dropbox/Julia7/MLJ/MLJModels/src/loading.jl:41\n" + ] + }, + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = @load DecisionTreeClassifier\n", + "model.min_samples_split = 5\n", + "model.max_depth = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating a model:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33mEvaluating over 5 folds: 100%[=========================] Time: 0:00:02\u001b[39m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m─────────\u001b[0m┬\u001b[0m───────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m─────────\u001b[0m┼\u001b[0m───────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m rms \u001b[0m│\u001b[0m 8.668102471357711 \u001b[0m│\u001b[0m\n", + "│\u001b[0m mav \u001b[0m│\u001b[0m 6.047643564356435 \u001b[0m│\u001b[0m\n", + "└\u001b[0m─────────\u001b[0m┴\u001b[0m───────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = MLJBase.Measure[rms, mav],\n", + " measurement = [8.668102471357711, 6.047643564356435],\n", + " per_fold = Array{Float64,1}[[8.525465870955774, 8.52461967445231, 10.74455588603451, 9.393386761519249, 6.152484163826722], [6.489306930693069, 5.434059405940592, 7.613069306930692, 6.033663366336635, 4.668118811881189]],\n", + " per_observation = Missing[missing, missing],)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = @load_boston\n", + "model = @load KNNRegressor\n", + "evaluate(model, X, y, resampling=CV(nfolds=5), measure=[rms, mav])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic fit/evaluate/predict by hand:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: A model type \"DecisionTreeClassifier\" is already loaded. \n", + "│ No new code loaded. 
\n", + "└ @ MLJModels /Users/anthony/Dropbox/Julia7/MLJ/MLJModels/src/loading.jl:41\n" + ] + }, + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "using RDatasets\n", + "vaso = dataset(\"robustbase\", \"vaso\"); # a DataFrame\n", + "y, X = unpack(vaso, ==(:Y), c -> true; :Y => Multiclass)\n", + "\n", + "tree_model = @load DecisionTreeClassifier\n", + "tree_model.max_depth=2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bind the model and data together in a *machine* , which will\n", + "additionally store the learned parameters (*fitresults*) when fit:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[34mMachine{DecisionTreeClassifier} @ 1…17\u001b[39m\n" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree = machine(tree_model, X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split row indices into training and evaluation rows:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = partition(eachindex(y), 0.7, shuffle=true, rng=1234); # 70:30 split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fit on train and evaluate on test:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{DecisionTreeClassifier} @ 1…17\u001b[39m.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/machines.jl:141\n" + ] + }, + { + "data": { + "text/plain": [ + "1.135369212298553" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit!(tree, rows=train)\n", + "yhat = predict(tree, rows=test);\n", + "mean(cross_entropy(yhat, y[test]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict on new data:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3-element Array{UnivariateFinite{Int64,UInt8,Float64},1}:\n", + " UnivariateFinite(0=>0.2727272727272727, 1=>0.7272727272727273) \n", + " UnivariateFinite(0=>0.02439024390243903, 1=>0.9756097560975611)\n", + " UnivariateFinite(0=>0.02439024390243903, 1=>0.9756097560975611)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Xnew = (Volume=3*rand(3), Rate=3*rand(3))\n", + "predict(tree, Xnew) # a vector of distributions" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3-element Array{CategoricalValue{Int64,UInt8},1}:\n", + " 1\n", + " 1\n", + " 1" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_mode(tree, Xnew) # a vector of point-predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### *More on machines (implementation detail)*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Under the hood, calling `fit!` on a machine calls either\n", + "`MLJBase.fit` or `MLJBase.update` depending on the machine's\n", + "internal state, as recorded in additional fields 
`previous_model`\n", + "and `rows`. These lower level methods dispatch on the model and a\n", + "view of the data depending on the optional `rows` keyword argument\n", + "of `fit!` (all rows by default). In this way, if a model `update`\n", + "method is implemented, calls to `fit!` can avoid redundant\n", + "calculations for certain kinds of model mutations (eg, increasing\n", + "the number of epochs in a neural network)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is a complete list of the fields of a machine:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `model` - the struct containing the hyperparameters to be used\n", + "in calls to `fit!`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `fitresult` - the learned parameters in a raw form, initially undefined" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `args` - a tuple of the data (in the supervised learning example above, `args = (X, y)`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `report` - outputs of training not encoded in `fitresult` (eg, feature rankings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `previous_model` - a deep copy of the model used in the last call to `fit!`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `rows` - a copy of the row indices used in last call to `fit!`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `cache`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More performance evaluation examples:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import LossFunctions.ZeroOneLoss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluating model + data directly:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m───────────────\u001b[0m┬\u001b[0m────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m───────────────\u001b[0m┼\u001b[0m────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m cross_entropy \u001b[0m│\u001b[0m 1.135369212298553 \u001b[0m│\u001b[0m\n", + "│\u001b[0m ZeroOneLoss \u001b[0m│\u001b[0m 0.4166666666666667 \u001b[0m│\u001b[0m\n", + "└\u001b[0m───────────────\u001b[0m┴\u001b[0m────────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = Any[cross_entropy, ZeroOneLoss()],\n", + " measurement = [1.135369212298553, 0.4166666666666667],\n", + " per_fold = Array{Float64,1}[[1.135369212298553], [0.4166666666666667]],\n", + " per_observation = Array{Array{Float64,1},1}[[[0.10536051565782628, 3.7135720667043075, 0.10536051565782628, 2.3025850929940455, 0.10536051565782628, 0.3184537311185346, 0.02469261259037141, 0.3184537311185346, 0.3184537311185346, 1.2992829841302609, 3.7135720667043075, 1.2992829841302609]], [[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate(tree_model, X, y,\n", + " resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),\n", + " measure=[cross_entropy, ZeroOneLoss()])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "If a machine is already defined, as above:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m───────────────\u001b[0m┬\u001b[0m────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m───────────────\u001b[0m┼\u001b[0m────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m cross_entropy \u001b[0m│\u001b[0m 1.135369212298553 \u001b[0m│\u001b[0m\n", + "│\u001b[0m ZeroOneLoss \u001b[0m│\u001b[0m 0.4166666666666667 \u001b[0m│\u001b[0m\n", + "└\u001b[0m───────────────\u001b[0m┴\u001b[0m────────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = Any[cross_entropy, ZeroOneLoss()],\n", + " measurement = [1.135369212298553, 0.4166666666666667],\n", + " per_fold = Array{Float64,1}[[1.135369212298553], [0.4166666666666667]],\n", + " per_observation = Array{Array{Float64,1},1}[[[0.10536051565782628, 3.7135720667043075, 0.10536051565782628, 2.3025850929940455, 0.10536051565782628, 0.3184537311185346, 0.02469261259037141, 0.3184537311185346, 0.3184537311185346, 1.2992829841302609, 3.7135720667043075, 1.2992829841302609]], [[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]]],)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate!(tree,\n", + " resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),\n", + " measure=[cross_entropy, ZeroOneLoss()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using cross-validation:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33mEvaluating over 5 folds: 100%[=========================] Time: 0:00:00\u001b[39m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m───────────────\u001b[0m┬\u001b[0m────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m───────────────\u001b[0m┼\u001b[0m────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m cross_entropy \u001b[0m│\u001b[0m 0.8107153382628913 \u001b[0m│\u001b[0m\n", + "│\u001b[0m ZeroOneLoss \u001b[0m│\u001b[0m 0.4 \u001b[0m│\u001b[0m\n", + "└\u001b[0m───────────────\u001b[0m┴\u001b[0m────────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = Any[cross_entropy, ZeroOneLoss()],\n", + " measurement = [0.8107153382628913, 0.4],\n", + " per_fold = Array{Float64,1}[[0.44130929246809064, 1.2635805032959784, 0.6459172309118898, 0.8778906002819279, 0.8248790643565697], [0.5714285714285714, 0.2857142857142857, 0.2857142857142857, 0.5714285714285714, 0.2857142857142857]],\n", + " per_observation = Array{Array{Float64,1},1}[[[0.02469261259037141, 0.02469261259037141, 0.7537718023763802, 0.7537718023763802, 0.7537718023763802, 0.7537718023763802, 0.02469261259037141], [0.3483066942682157, 0.3483066942682157, 0.3483066942682157, 0.3483066942682157, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141], [0.02469261259037141, 0.1823215567939546, 0.1823215567939546, 2.0149030205422647, 1.791759469228055, 0.1823215567939546, 0.1431008436406733], [1.3862943611198906, 1.3862943611198906, 1.3862943611198906, 0.2876820724517809, 
0.02469261259037141, 0.2876820724517809, 1.3862943611198906], [0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 0.8109302162163288, 0.587786664902119, 0.587786664902119]], [[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]]],)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate!(tree, resampling=CV(nfolds=5, shuffle=true, rng=1234),\n", + " measure=[cross_entropy, ZeroOneLoss()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With user-specified train/evaluation pairs of row indices:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33mEvaluating over 3 folds: 100%[=========================] Time: 0:00:00\u001b[39m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m───────────────\u001b[0m┬\u001b[0m─────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m───────────────\u001b[0m┼\u001b[0m─────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m cross_entropy \u001b[0m│\u001b[0m 0.895254695800462 \u001b[0m│\u001b[0m\n", + "│\u001b[0m ZeroOneLoss \u001b[0m│\u001b[0m 0.24136008918617616 \u001b[0m│\u001b[0m\n", + "└\u001b[0m───────────────\u001b[0m┴\u001b[0m─────────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = Any[cross_entropy, ZeroOneLoss()],\n", + " measurement = [0.895254695800462, 0.24136008918617616],\n", + " per_fold = Array{Float64,1}[[0.7538091986662944, 1.1473950551467866, 0.7845598335883047], [0.30434782608695654, 0.30434782608695654, 0.11538461538461539]],\n", + " per_observation = Array{Array{Float64,1},1}[[[0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836, 1.9459101490553135, 0.15415067982725836, 0.02469261259037141, 1.9459101490553135, 1.9459101490553135 … 0.15415067982725836, 1.9459101490553135, 0.15415067982725836, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 1.9459101490553135, 0.15415067982725836, 0.15415067982725836, 0.15415067982725836], [0.02469261259037141, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141 … 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141], [0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.02469261259037141 … 0.02469261259037141, 0.6931471805599453, 3.7135720667043075, 0.02469261259037141, 0.6931471805599453, 0.6931471805599453, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141, 0.6931471805599453]], [[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0 … 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 
0.0, 0.0]]],)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f1, f2, f3 = 1:13, 14:26, 27:36\n", + "pairs = [(f1, vcat(f2, f3)), (f2, vcat(f3, f1)), (f3, vcat(f1, f2))];\n", + "evaluate!(tree,\n", + " resampling=pairs,\n", + " measure=[cross_entropy, ZeroOneLoss()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Changing a hyperparameter and re-evaluating:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[33mEvaluating over 5 folds: 100%[=========================] Time: 0:00:00\u001b[39m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌\u001b[0m───────────────\u001b[0m┬\u001b[0m─────────────────────\u001b[0m┐\u001b[0m\n", + "│\u001b[0m\u001b[1m measure \u001b[0m│\u001b[0m\u001b[1m measurement \u001b[0m│\u001b[0m\n", + "├\u001b[0m───────────────\u001b[0m┼\u001b[0m─────────────────────\u001b[0m┤\u001b[0m\n", + "│\u001b[0m cross_entropy \u001b[0m│\u001b[0m 0.7857788118033404 \u001b[0m│\u001b[0m\n", + "│\u001b[0m ZeroOneLoss \u001b[0m│\u001b[0m 0.37142857142857133 \u001b[0m│\u001b[0m\n", + "└\u001b[0m───────────────\u001b[0m┴\u001b[0m─────────────────────\u001b[0m┘\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "(measure = Any[cross_entropy, ZeroOneLoss()],\n", + " measurement = [0.7857788118033404, 0.37142857142857133],\n", + " per_fold = Array{Float64,1}[[0.5192479199123463, 1.1617214839057737, 0.7334426224354447, 0.6982881261612496, 0.816193906601888], [0.42857142857142855, 0.2857142857142857, 0.2857142857142857, 0.5714285714285714, 0.2857142857142857]],\n", + " per_observation = Array{Array{Float64,1},1}[[[0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 1.1786549963416462, 1.1786549963416462, 1.1786549963416462, 0.02469261259037141], [0.6061358035703156, 0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 3.7135720667043075, 0.02469261259037141], [0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 1.0986122886681098, 0.02469261259037141, 0.2231435513142097], [0.9808292530117262, 0.9808292530117262, 0.9808292530117262, 0.4700036292457356, 0.02469261259037141, 0.4700036292457356, 0.9808292530117262], [0.02469261259037141, 0.02469261259037141, 0.02469261259037141, 3.7135720667043075, 1.252762968495368, 0.3364722366212129, 0.3364722366212129]], [[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]]],)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree_model.max_depth = 3\n", + "evaluate!(tree,\n", + " resampling=CV(nfolds=5, shuffle=true, rng=1234),\n", + " measure=[cross_entropy, ZeroOneLoss()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspecting training results:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fit a ordinary least square model to some synthetic data:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{LinearRegressor} @ 7…80\u001b[39m.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/machines.jl:141\n" + ] + }, + { + "data": { + "text/plain": 
[ + "\u001b[34mMachine{LinearRegressor} @ 7…80\u001b[39m\n" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x1 = rand(100)\n", + "x2 = rand(100)\n", + "\n", + "X = (x1=x1, x2=x2)\n", + "y = x1 - 2x2 + 0.1*rand(100);\n", + "\n", + "ols_model = @load LinearRegressor pkg=GLM\n", + "ols = machine(ols_model, X, y)\n", + "fit!(ols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a named tuple representing the learned parameters,\n", + "human-readable if appropriate:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(coef = [0.9985128951528446, -1.9981845372437947],\n", + " intercept = 0.05141139704717806,)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fitted_params(ols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get other training-related information:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(deviance = 0.08067317714592058,\n", + " dof_residual = 97.0,\n", + " stderror = [0.009386992893038402, 0.00995817861943297, 0.0073417739672351065],\n", + " vcov = [8.811563557395346e-5 -9.558303404671843e-6 -4.056936372724475e-5; -9.558303404671843e-6 9.916532141653193e-5 -4.6982822143496706e-5; -4.056936372724475e-5 -4.6982822143496706e-5 5.390164498597111e-5],)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report(ols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic fit/transform for unsupervised models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load data:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([125, 100, 130, 9, 70, 148, 39, 64, 6, 107 … 134, 114, 52, 74, 44, 61, 83, 18, 122, 26], [97, 78, 30, 108, 101, 24, 85, 91, 135, 96 … 112, 144, 140, 72, 109, 41, 106, 147, 47, 5])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = @load_iris\n", + "train, test = partition(eachindex(y), 0.7, shuffle=true, rng=123)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instantiate and fit the model/machine:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{PCA} @ 9…33\u001b[39m.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/machines.jl:141\n" + ] + }, + { + "data": { + "text/plain": [ + "\u001b[34mMachine{PCA} @ 9…33\u001b[39m\n" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@load PCA\n", + "pca_model = PCA(maxoutdim=2)\n", + "pca = machine(pca_model, X)\n", + "fit!(pca, rows=train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform selected data bound to the machine:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "transform(pca, rows=test);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform new data:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "(x1 = [4.819158264829177, 4.8208386973047, 5.111185670643473],\n", + " x2 = [-4.4441147103696315, -4.4288641941901625, -4.71503950609489],)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Xnew = (sepal_length=rand(3), sepal_width=rand(3),\n", + " petal_length=rand(3), petal_width=rand(3));\n", + "transform(pca, Xnew)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inverting learned transformations" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{UnivariateStandardizer} @ 9…82\u001b[39m.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/machines.jl:141\n" + ] + } + ], + "source": [ + "y = rand(100);\n", + "stand_model = UnivariateStandardizer()\n", + "stand = machine(stand_model, y)\n", + "fit!(stand)\n", + "z = transform(stand, y);\n", + "@assert inverse_transform(stand, z) ≈ y # true" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nested hyperparameter tuning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load data:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(150×4 DataFrame\n", + "│ Row │ sepal_length │ sepal_width │ petal_length │ petal_width │\n", + "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", + "├─────┼──────────────┼─────────────┼──────────────┼─────────────┤\n", + "│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │\n", + "│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │\n", + "│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │\n", + "│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │\n", + "│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │\n", + "│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │\n", + "│ 7 │ 4.6 │ 3.4 │ 1.4 │ 0.3 │\n", + "│ 8 │ 5.0 │ 3.4 │ 1.5 │ 0.2 │\n", + "│ 9 │ 4.4 │ 2.9 │ 1.4 │ 0.2 │\n", + "│ 10 │ 4.9 │ 3.1 │ 1.5 │ 0.1 │\n", + "⋮\n", + "│ 140 │ 6.9 │ 3.1 │ 5.4 │ 2.1 │\n", + "│ 141 │ 6.7 │ 3.1 │ 5.6 │ 2.4 │\n", + "│ 142 │ 6.9 │ 3.1 │ 5.1 │ 2.3 │\n", + "│ 143 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │\n", + "│ 144 │ 6.8 │ 3.2 │ 5.9 │ 2.3 │\n", + "│ 145 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │\n", + "│ 146 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │\n", + "│ 147 │ 6.3 │ 2.5 │ 5.0 │ 1.9 │\n", + "│ 148 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │\n", + "│ 149 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │\n", + "│ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │, CategoricalString{UInt32}[\"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\" … \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\", \"virginica\"])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = @load_iris" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a model with nested hyperparameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: A model type \"DecisionTreeClassifier\" is already loaded. \n", + "│ No new code loaded. 
\n", + "└ @ MLJModels /Users/anthony/Dropbox/Julia7/MLJ/MLJModels/src/loading.jl:41\n" + ] + }, + { + "data": { + "text/plain": [ + "MLJ.ProbabilisticEnsembleModel(atom = DecisionTreeClassifier(pruning_purity = 1.0,\n", + " max_depth = -1,\n", + " min_samples_leaf = 1,\n", + " min_samples_split = 2,\n", + " min_purity_increase = 0.0,\n", + " n_subfeatures = 0,\n", + " display_depth = 5,\n", + " post_prune = false,\n", + " merge_purity_threshold = 0.9,\n", + " pdf_smoothing = 0.05,),\n", + " weights = Float64[],\n", + " bagging_fraction = 0.8,\n", + " rng = MersenneTwister(UInt32[0x71271325, 0x5861ba72, 0x34abacc2, 0x27102d83]),\n", + " n = 300,\n", + " parallel = true,\n", + " out_of_bag_measure = Any[],)\u001b[34m @ 8…22\u001b[39m" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree_model = @load DecisionTreeClassifier\n", + "forest_model = EnsembleModel(atom=tree_model, n=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect all hyperparameters, even nested ones (returns nested named tuple):" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(atom = (pruning_purity = 1.0,\n", + " max_depth = -1,\n", + " min_samples_leaf = 1,\n", + " min_samples_split = 2,\n", + " min_purity_increase = 0.0,\n", + " n_subfeatures = 0,\n", + " display_depth = 5,\n", + " post_prune = false,\n", + " merge_purity_threshold = 0.9,\n", + " pdf_smoothing = 0.05,),\n", + " weights = Float64[],\n", + " bagging_fraction = 0.8,\n", + " rng = MersenneTwister(UInt32[0x71271325, 0x5861ba72, 0x34abacc2, 0x27102d83]),\n", + " n = 300,\n", + " parallel = true,\n", + " out_of_bag_measure = Any[],)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "params(forest_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define ranges for hyperparameters to be tuned:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.NumericRange(field = :bagging_fraction,\n", + " lower = 0.5,\n", + " upper = 1.0,\n", + " scale = :log10,)\u001b[34m @ 1…28\u001b[39m" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r1 = range(forest_model, :bagging_fraction, lower=0.5, upper=1.0, scale=:log10)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.NumericRange(field = :(atom.n_subfeatures),\n", + " lower = 1,\n", + " upper = 4,\n", + " scale = :linear,)\u001b[34m @ 1…75\u001b[39m" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2 = range(forest_model, :(atom.n_subfeatures), lower=1, upper=4) # nested" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wrap the model in a tuning strategy:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.ProbabilisticTunedModel(model = MLJ.ProbabilisticEnsembleModel(atom = \u001b[34mDecisionTreeClassifier @ 1…80\u001b[39m,\n", + " weights = Float64[],\n", + " bagging_fraction = 0.8,\n", + " rng = MersenneTwister(UInt32[0x71271325, 0x5861ba72, 0x34abacc2, 0x27102d83]),\n", + " n = 300,\n", + " parallel = true,\n", + " out_of_bag_measure = Any[],),\n", + 
" tuning = Grid(resolution = 12,\n", + " parallel = true,),\n", + " resampling = CV(nfolds = 6,\n", + " shuffle = false,\n", + " rng = MersenneTwister(UInt32[0x71271325, 0x5861ba72, 0x34abacc2, 0x27102d83]),),\n", + " measure = MLJBase.CrossEntropy(),\n", + " weights = nothing,\n", + " operation = StatsBase.predict,\n", + " ranges = MLJ.NumericRange{T,Symbol} where T[\u001b[34mNumericRange @ 1…28\u001b[39m, \u001b[34mNumericRange @ 1…75\u001b[39m],\n", + " full_report = true,\n", + " train_best = true,)\u001b[34m @ 1…49\u001b[39m" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuned_forest = TunedModel(model=forest_model,\n", + " tuning=Grid(resolution=12),\n", + " resampling=CV(nfolds=6),\n", + " ranges=[r1, r2],\n", + " measure=cross_entropy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bound the wrapped model to data:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[34mMachine{ProbabilisticTunedModel} @ 1…60\u001b[39m\n" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuned = machine(tuned_forest, X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fitting the resultant machine optimizes the hyperaparameters specified in\n", + "`range`, using the specified resampling strategy and performance\n", + "measure, and retrains on all data bound to the machine:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{ProbabilisticTunedModel} @ 1…60\u001b[39m.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/machines.jl:141\n", + "┌ Info: Mimimizing cross_entropy. 
\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/tuning.jl:160\n", + "\u001b[33mIterating over a 48-point grid: 100%[=========================] Time: 0:00:40\u001b[39m\n", + "┌ Info: Training best model on all supplied data.\n", + "└ @ MLJ /Users/anthony/Dropbox/Julia7/MLJ/MLJ/src/tuning.jl:252\n" + ] + }, + { + "data": { + "text/plain": [ + "\u001b[34mMachine{ProbabilisticTunedModel} @ 1…60\u001b[39m\n" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit!(tuned)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspecting the optimal model:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(best_model = \u001b[34mProbabilisticEnsembleModel{DecisionTreeClassifier} @ 1…63\u001b[39m,)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F = fitted_params(tuned)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.ProbabilisticEnsembleModel(atom = DecisionTreeClassifier(pruning_purity = 1.0,\n", + " max_depth = -1,\n", + " min_samples_leaf = 1,\n", + " min_samples_split = 2,\n", + " min_purity_increase = 0.0,\n", + " n_subfeatures = 3,\n", + " display_depth = 5,\n", + " post_prune = false,\n", + " merge_purity_threshold = 0.9,\n", + " pdf_smoothing = 0.05,),\n", + " weights = Float64[],\n", + " bagging_fraction = 0.5,\n", + " rng = MersenneTwister(UInt32[0x71271325, 0x5861ba72, 0x34abacc2, 0x27102d83]),\n", + " n = 300,\n", + " parallel = true,\n", + " out_of_bag_measure = Any[],)\u001b[34m @ 1…63\u001b[39m" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.best_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspecting details of tuning procedure:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(parameter_names = [\"bagging_fraction\" \"atom.n_subfeatures\"],\n", + " parameter_scales = Symbol[:log10 :linear],\n", + " parameter_values = Any[0.5 1; 0.5325205447199813 1; … ; 0.9389309106617063 4; 1.0 4],\n", + " measurements = [0.23836844761285972, 0.24310768116519496, 0.23155959227133427, 0.2358303191590729, 0.23388918367157183, 0.23944002555125055, 0.22931761600908399, 0.22924432030705047, 0.22621287086704908, 0.23123283225576788 … 0.1830737398891659, 0.19017188641338933, 0.2062563314942637, 0.2041996514962502, 0.210012168891926, 0.21305031478959782, 0.22735490003858747, 0.2359797272653158, 0.2584476524785048, 0.32572198859316304],\n", + " best_measurement = 0.17477371176810844,)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report(tuned)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To plot result of a 2D parameter tune, use `using Plots; pyplot();\n", + "plot(tuned)`." 
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Predicting on new data using the optimized model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3-element Array{UnivariateFinite{String,UInt32,Float64},1}:\n",
+ " UnivariateFinite(setosa=>0.9677419354838652, versicolor=>0.01612903225806445, virginica=>0.01612903225806445)\n",
+ " UnivariateFinite(setosa=>0.9677419354838652, versicolor=>0.01612903225806445, virginica=>0.01612903225806445)\n",
+ " UnivariateFinite(setosa=>0.9677419354838652, versicolor=>0.01612903225806445, virginica=>0.01612903225806445)"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predict(tuned, Xnew)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Julia 1.2.0",
+ "language": "julia",
+ "name": "julia-1.2"
+ },
+ "language_info": {
+ "file_extension": ".jl",
+ "mimetype": "application/julia",
+ "name": "julia",
+ "version": "1.2.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 3
+}
diff --git a/docs/src/common_mlj_workflows.md b/docs/src/common_mlj_workflows.md
new file mode 100644
index 000000000..7114ee40e
--- /dev/null
+++ b/docs/src/common_mlj_workflows.md
@@ -0,0 +1,438 @@
+# Common MLJ Workflows
+
+## Data ingestion
+
+```@example workflows
+using MLJ; color_off() #hide
+using RDatasets
+channing = dataset("boot", "channing")
+first(channing, 4)
+```
+
+Inspecting metadata, including column scientific types:
+
+```@example workflows
+schema(channing)
+```
+
+Unpacking data and correcting for wrong scitypes:
+
+```@example workflows
+y, X = unpack(channing,
+              ==(:Exit),            # y is the :Exit column
+              !=(:Time);            # X is the rest, except :Time
+              :Exit=>Continuous,
+              :Entry=>Continuous,
+              :Cens=>Multiclass)
+first(X, 4)
+```
+
+```@example workflows
+y[1:4]
+```
+
+Loading a built-in supervised dataset:
+
+```@example workflows
+X, y = @load_iris;
+first(X, 4)
+```
+
+```@example workflows
+y[1:4]
+```
+
+## Model search
+
+*Reference:* [Model Search](model_search.md)
+
+Searching for a supervised model:
+
+```@example workflows
+X, y = @load_boston
+models(matching(X, y))
+```
+
+```@example workflows
+models(matching(X, y))[6]
+```
+
+More refined searches:
+
+```@example workflows
+models() do model
+    matching(model, X, y) &&
+    model.prediction_type == :deterministic &&
+    model.is_pure_julia
+end
+```
+
+Searching for an unsupervised model:
+
+```@example workflows
+models(matching(X))
+```
+
+Getting the metadata entry for a given model type:
+
+```@example workflows
+info("PCA")
+info("RidgeRegressor", pkg="MultivariateStats") # a model type in multiple packages
+```
+
+## Instantiating a model
+
+*Reference:* [Getting Started](index.md)
+
+```@example workflows
+@load DecisionTreeClassifier
+model = DecisionTreeClassifier(min_samples_split=5, max_depth=4)
+```
+
+or
+
+```julia
+model = @load DecisionTreeClassifier
+model.min_samples_split = 5
+model.max_depth = 4
+```
+
+## Evaluating a model
+
+*Reference:* [Evaluating Model Performance](evaluating_model_performance.md)
+
+
+```@example workflows
+X, y = @load_boston
+model = @load KNNRegressor
+evaluate(model, X, y, resampling=CV(nfolds=5), measure=[rms, mav])
+```
+
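+The `resampling` strategy and `measure`(s) can be varied independently;
+for example (an illustrative variation on the call above, with `Holdout`
+and `rms` as used elsewhere in this manual):
+
+```julia
+evaluate(model, X, y, resampling=Holdout(fraction_train=0.7), measure=rms)
+```
+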
[Machines](machines.md),
+[Evaluating Model Performance](evaluating_model_performance.md), [Performance Measures](performance_measures.md)
+
+```@example workflows
+using RDatasets
+vaso = dataset("robustbase", "vaso"); # a DataFrame
+first(vaso, 3)
+```
+
+```@example workflows
+y, X = unpack(vaso, ==(:Y), c -> true; :Y => Multiclass)
+
+tree_model = @load DecisionTreeClassifier
+tree_model.max_depth=2; nothing # hide
+```
+
+Bind the model and data together in a *machine*, which will
+additionally store the learned parameters (*fitresults*) when fit:
+
+```@example workflows
+tree = machine(tree_model, X, y)
+```
+
+Split row indices into training and evaluation rows:
+
+```@example workflows
+train, test = partition(eachindex(y), 0.7, shuffle=true, rng=1234); # 70:30 split
+```
+
+Fit on train and evaluate on test:
+
+```@example workflows
+fit!(tree, rows=train)
+yhat = predict(tree, rows=test);
+mean(cross_entropy(yhat, y[test]))
+```
+
+Predict on new data:
+
+```@example workflows
+Xnew = (Volume=3*rand(3), Rate=3*rand(3))
+predict(tree, Xnew) # a vector of distributions
+```
+
+```@example workflows
+predict_mode(tree, Xnew) # a vector of point-predictions
+```
+
+## More performance evaluation examples
+
+```@example workflows
+import LossFunctions.ZeroOneLoss
+```
+
+Evaluating model + data directly:
+
+```@example workflows
+evaluate(tree_model, X, y,
+         resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
+         measure=[cross_entropy, ZeroOneLoss()])
+```
+
+If a machine is already defined, as above:
+
+```@example workflows
+evaluate!(tree,
+          resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1234),
+          measure=[cross_entropy, ZeroOneLoss()])
+```
+
+Using cross-validation:
+
+```@example workflows
+evaluate!(tree, resampling=CV(nfolds=5, shuffle=true, rng=1234),
+          measure=[cross_entropy, ZeroOneLoss()])
+```
+
+With user-specified train/evaluation pairs of row indices:
+
+```@example workflows
+f1, f2, f3 = 1:13, 14:26, 27:36
+pairs = [(f1, vcat(f2, f3)), (f2, vcat(f3, f1)), (f3, vcat(f1, f2))];
+evaluate!(tree,
+          resampling=pairs,
+          measure=[cross_entropy, ZeroOneLoss()])
+```
+
+Changing a hyperparameter and re-evaluating:
+
+```@example workflows
+tree_model.max_depth = 3
+evaluate!(tree,
+          resampling=CV(nfolds=5, shuffle=true, rng=1234),
+          measure=[cross_entropy, ZeroOneLoss()])
+```
+
+## Inspecting training results
+
+Fit an ordinary least squares model to some synthetic data:
+
+```@example workflows
+x1 = rand(100)
+x2 = rand(100)
+
+X = (x1=x1, x2=x2)
+y = x1 - 2x2 + 0.1*rand(100);
+
+ols_model = @load LinearRegressor pkg=GLM
+ols = machine(ols_model, X, y)
+fit!(ols)
+```
+
+Get a named tuple representing the learned parameters,
+human-readable if appropriate:
+
+```@example workflows
+fitted_params(ols)
+```
+
+Get other training-related information:
+
+```@example workflows
+report(ols)
+```
+
+## Basic fit/transform for unsupervised models
+
+Load data:
+
+```@example workflows
+X, y = @load_iris
+train, test = partition(eachindex(y), 0.97, shuffle=true, rng=123)
+```
+
+Instantiate and fit the model/machine:
+
+```@example workflows
+@load PCA
+pca_model = PCA(maxoutdim=2)
+pca = machine(pca_model, X)
+fit!(pca, rows=train)
+```
+
+Transform selected data bound to the machine:
+
+```@example workflows
+transform(pca, rows=test);
+```
+
+Transform new data:
+
+```@example workflows
+Xnew = (sepal_length=rand(3), sepal_width=rand(3),
+        petal_length=rand(3), petal_width=rand(3));
+transform(pca, Xnew)
+```
+
+## Inverting learned transformations
+
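+Unsupervised models implementing `inverse_transform` can recover the
+data originally supplied to `transform`, as this
+`UnivariateStandardizer` example shows:
+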
+```@example workflows
+y = rand(100);
+stand_model = UnivariateStandardizer()
+stand = machine(stand_model, y)
+fit!(stand)
+z = transform(stand, y);
+@assert inverse_transform(stand, z) ≈ y # true
+```
+
+## Nested hyperparameter tuning
+
+*Reference:* [Tuning Models](tuning_models.md)
+
+```@example workflows
+X, y = @load_iris; nothing # hide
+```
+
+Define a model with nested hyperparameters:
+
+```@example workflows
+tree_model = @load DecisionTreeClassifier
+forest_model = EnsembleModel(atom=tree_model, n=300)
+```
+
+Inspect all hyperparameters, even nested ones (returns a nested named tuple):
+
+```@example workflows
+params(forest_model)
+```
+
+Define ranges for hyperparameters to be tuned:
+
+```@example workflows
+r1 = range(forest_model, :bagging_fraction, lower=0.5, upper=1.0, scale=:log10)
+```
+
+```@example workflows
+r2 = range(forest_model, :(atom.n_subfeatures), lower=1, upper=4) # nested
+```
+
+Wrap the model in a tuning strategy:
+
+```@example workflows
+tuned_forest = TunedModel(model=forest_model,
+                          tuning=Grid(resolution=12),
+                          resampling=CV(nfolds=6),
+                          ranges=[r1, r2],
+                          measure=cross_entropy)
+```
+
+Bind the wrapped model to data:
+
+```@example workflows
+tuned = machine(tuned_forest, X, y)
+```
+
+Fitting the resultant machine optimizes the hyperparameters specified
+in `ranges`, using the specified `tuning` and `resampling` strategies
+and performance `measure` (possibly a vector of measures), and
+retrains on all data bound to the machine:
+
+```@example workflows
+fit!(tuned)
+```
+
+Inspecting the optimal model:
+
+```@example workflows
+F = fitted_params(tuned)
+```
+
+```@example workflows
+F.best_model
+```
+
+Inspecting details of the tuning procedure:
+
+```@example workflows
+report(tuned)
+```
+
+Visualizing these results:
+
+```julia
+using Plots
+plot(tuned)
+```
+
+![](workflows_tuning_plot.png)
+
+Predicting on new data using the optimized model:
+
+```@example workflows
+predict(tuned, Xnew)
+```
+
+## Constructing a linear pipeline
+
+*Reference:* [Composing Models](composing_models.md)
+
+Constructing a linear (unbranching) pipeline with a learned target
+transformation/inverse transformation:
+
+```@example workflows
+X, y = @load_reduced_ames
+@load KNNRegressor
+pipe = @pipeline MyPipe(X -> coerce(X, :age=>Continuous),
+                        hot = OneHotEncoder(),
+                        knn = KNNRegressor(K=3),
+                        target = UnivariateStandardizer())
+```
+
+Evaluating the pipeline (just as you would any other model):
+
+```@example workflows
+pipe.knn.K = 2
+pipe.hot.drop_last = true
+evaluate(pipe, X, y, resampling=Holdout(), measure=rms, verbosity=2)
+```
+
+Constructing a linear (unbranching) pipeline with a static (unlearned)
+target transformation/inverse transformation:
+
+```@example workflows
+@load DecisionTreeRegressor
+pipe2 = @pipeline MyPipe2(X -> coerce(X, :age=>Continuous),
+                          hot = OneHotEncoder(),
+                          tree = DecisionTreeRegressor(max_depth=4),
+                          target = y -> log.(y),
+                          inverse = z -> exp.(z))
+```
+
+## Creating a homogeneous ensemble of models
+
+*Reference:* [Homogeneous Ensembles](homogeneous_ensembles.md)
+
+```@example workflows
+X, y = @load_iris
+tree_model = @load DecisionTreeClassifier
+forest_model = EnsembleModel(atom=tree_model, bagging_fraction=0.8, n=300)
+forest = machine(forest_model, X, y)
+evaluate!(forest, measure=cross_entropy)
+```
+
+## Performance curves
+
+Generate a plot of performance as a function of some hyperparameter
+(building on the preceding example):
+
+```@example workflows
+r = range(forest_model, :n, lower=1, upper=1000, scale=:log10)
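+# `scale=:log10` spaces the evaluated values of `n` logarithmically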
+curve = MLJ.learning_curve!(forest,
+                            range=r,
+                            resampling=Holdout(),
+                            measure=cross_entropy,
+                            n=4,
+                            verbosity=0)
+```
+
+```julia
+using Plots
+plot(curve.parameter_values, curve.measurements,
+     xlab=curve.parameter_name, xscale=curve.parameter_scale)
+```
+
+![](workflows_learning_curves.png)
+
+
diff --git a/docs/src/evaluating_model_performance.md b/docs/src/evaluating_model_performance.md
index e63886983..766edcfac 100644
--- a/docs/src/evaluating_model_performance.md
+++ b/docs/src/evaluating_model_performance.md
@@ -1,8 +1,9 @@
-# Evaluation of supervised models
+# Evaluating Model Performance
 
-MLJ allows quick evaluation of a model's performance against a battery
-of selected losses or scores. For more on available performance
-measures, see [Performance Measures](performance_measures.md).
+MLJ allows quick evaluation of a supervised model's performance
+against a battery of selected losses or scores. For more on available
+performance measures, see [Performance
+Measures](performance_measures.md).
 
 In addition to hold-out and cross-validation, the user can specify
 their own list of train/evaluation pairs of row indices for
diff --git a/docs/src/index.md b/docs/src/index.md
index 1134485d9..842a01352 100755
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,8 +1,8 @@
-# Getting Started
+### [Installation](https://github.com/alan-turing-institute/MLJ.jl/blob/master/README.md) | [Cheatsheet](mlj_cheatsheet.md) | [Workflows](common_mlj_workflows.md)
+
 
-#### [Installation instructions](https://github.com/alan-turing-institute/MLJ.jl/blob/master/README.md)
+# Getting Started
 
-#### [Cheatsheet](mlj_cheatsheet.md)
 
 ```@setup doda
 import Base.eval # hack b/s auto docs puts code in baremodule
@@ -12,10 +12,9 @@ MLJ.color_off()
 seed!(1234)
 ```
 
+### Choosing and evaluating a model
 
-### Plug-and-play model evaluation
-
-To load some data add
+To load some demonstration data, add
 [RDatasets](https://github.com/JuliaStats/RDatasets.jl) to your load
 path and enter
 ```@repl doda
@@ -27,13 +26,20 @@ and then split the data into input and target parts:
 ```@repl doda
 using MLJ
 y, X = unpack(iris, ==(:Species), colname -> true);
-first(X, 3)
+first(X, 3) |> pretty
+```
+
+To search MLJ's [model registry](model_search.md) for models that can
+be immediately trained on the data, we run
+
+```@repl doda
+models(matching(X, y))
 ```
 
 In MLJ a *model* is a struct storing the hyperparameters of the
 learning algorithm indicated by the struct name.
 
-Assuming the DecisionTree package is in your load path, we can use
+Assuming the DecisionTree.jl package is in your load path, we can use
 `@load` to load the code defining the `DecisionTreeClassifier` model
 type. This macro also returns an instance, with default
 hyperparameters.
 
@@ -44,7 +50,7 @@ Drop the `verbosity=1` declaration for silent loading:
 tree_model = @load DecisionTreeClassifier verbosity=1
 ```
 
-*Important:* DecisionTree and most other packages implementing machine
+*Important:* DecisionTree.jl and most other packages implementing machine
 learning algorithms for use in MLJ are not MLJ dependencies. If such a
 package is not in your load path you will receive an error explaining
 how to add the package to your current environment.
 
@@ -57,7 +63,7 @@ evaluate(tree_model, X, y,
 ```
 
 Evaluating against multiple performance measures is also possible. See
-[Evaluating model performance](evaluating_model_performance.md) for details.
+[Evaluating Model Performance](evaluating_model_performance.md) for details.
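+
+For example (a sketch only; `ZeroOneLoss` is imported from
+LossFunctions.jl, as in [Common MLJ Workflows](common_mlj_workflows.md)):
+
+```julia
+import LossFunctions.ZeroOneLoss
+evaluate(tree_model, X, y,
+         resampling=CV(nfolds=5, shuffle=true, rng=1234),
+         measure=[cross_entropy, ZeroOneLoss()])
+```
+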
 ### Fit and predict
@@ -92,20 +98,35 @@ broadcast(pdf, yhat[3:5], "virginica") # predicted probabilities of virginica
 mode.(yhat[3:5])
 ```
 
-One can explicitly get modes by using `predict_mode` instead of `predict`:
+Or, one can explicitly get modes by using `predict_mode` instead of
+`predict`:
 
 ```@repl doda
 predict_mode(tree, rows=test[3:5])
 ```
 
-Machines have an internal state which allows them to avoid redundant
-calculations when retrained, in certain conditions - for example when
-increasing the number of trees in a random forest, or the number of
-epochs in a neural network. The machine building syntax also
-anticipates a more general syntax for composing multiple models, as
-explained in [Composing Models](composing_models.md).
+Unsupervised models have a `transform` method instead of `predict`,
+and may optionally implement an `inverse_transform` method:
 
-There is a version of `evaluate` for machines as well as models:
+```@repl doda
+v = [1, 2, 3, 4]
+stand_model = UnivariateStandardizer()
+stand = machine(stand_model, v)
+fit!(stand)
+w = transform(stand, v)
+inverse_transform(stand, w)
+```
+
+[Machines](machines.md) have an internal state which allows them to
+avoid redundant calculations when retrained, in certain conditions -
+for example when increasing the number of trees in a random forest, or
+the number of epochs in a neural network. The machine building syntax
+also anticipates a more general syntax for composing multiple models,
+as explained in [Composing Models](composing_models.md).
+
+There is a version of `evaluate` for machines as well as models. An
+exclamation point is added to the method name because machines are
+generally mutated when trained:
 
 ```@repl doda
 evaluate!(tree, resampling=Holdout(fraction_train=0.5, shuffle=true),
@@ -124,9 +145,10 @@ evaluate!(tree, resampling=Holdout(fraction_train=0.5, shuffle=true),
 
 ### Next steps
 
 To learn a little more about what MLJ can do, take the MLJ
-[tour](https://github.com/alan-turing-institute/MLJ.jl/blob/master/examples/tour/tour.ipynb),
-and then return to the manual as needed. *Read at least the remainder
-of this page before considering serious use of MLJ.*
+[tour](https://github.com/alan-turing-institute/MLJ.jl/blob/master/examples/tour/tour.ipynb)
+or browse [Common MLJ Workflows](common_mlj_workflows.md), returning to
+the manual as needed. *Read at least the remainder of this page before
+considering serious use of MLJ.*
 
 
 ### Prerequisites
 
@@ -155,9 +177,11 @@ Each supervised model in MLJ declares the permitted *scientific type*
 of the inputs `X` and targets `y` that can be bound to it in the first
 constructor above, rather than specifying specific machine types (such
 as `Array{Float32, 2}`). Similar remarks apply to the input `X` of an
-unsupervised model. Scientific types are julia types defined in the
+unsupervised model.
+
+Scientific types are julia types defined in the
 package
 [ScientificTypes.jl](https://github.com/alan-turing-institute/ScientificTypes.jl),
 which also defines the convention used here (and there called *mlj*)
 for assigning a specific scientific type (interpretation) to each
 julia object (see the `scitype` examples below).
@@ -165,7 +189,12 @@ julia object (see the `scitype` examples below).
 The basic "scalar" scientific types are `Continuous`, `Multiclass{N}`,
 `OrderedFactor{N}` and `Count`. Be sure you read [Container element
 types](@ref) below to guarantee your scalar data is interpreted
-correctly. Additionally, most data containers - such as tuples,
+correctly. Tools exist to coerce the data to have the appropriate
+scientific type; see
+[ScientificTypes.jl](https://github.com/alan-turing-institute/ScientificTypes.jl)
+or run `?coerce` for details.
+
+Additionally, most data containers - such as tuples,
 vectors, matrices and tables - have a scientific type.
 
 
@@ -246,9 +275,6 @@ entry, using `info`:
 info("DecisionTreeClassifier")
 ```
 
-See also [Working with tasks](working_with_tasks.md) on searching for
-models solving a specified task.
-
 
 #### Container element types
 
@@ -273,9 +299,6 @@ are the key aspects of that convention:
 - In particular, *integers* (including `Bool`s) *cannot be used to
   represent categorical data.*
 
-To coerce the scientific type of a vector or table, use the `coerce`
-method (re-exported from
-[ScientificTypes.jl](https://github.com/alan-turing-institute/ScientificTypes.jl)).
diff --git a/docs/src/internals.md b/docs/src/internals.md
index 09d1cd637..90ddae3f7 100755
--- a/docs/src/internals.md
+++ b/docs/src/internals.md
@@ -1,4 +1,4 @@
-# Internals
+# Internals
 
 ### The machine interface, simplified
 
@@ -45,7 +45,8 @@ function fit!(machine::Machine; rows=nothing, force=false, verbosity=1)
         rows = (:)
     end
 
-    rows_have_changed = (!isdefined(mach, :previous_rows) || rows != mach.previous_rows)
+    rows_have_changed = (!isdefined(mach, :previous_rows) ||
+                         rows != mach.previous_rows)
 
     args = [MLJ.selectrows(arg, rows) for arg in mach.args]
 
diff --git a/docs/src/machines.md b/docs/src/machines.md
new file mode 100644
index 000000000..b27f63a7c
--- /dev/null
+++ b/docs/src/machines.md
@@ -0,0 +1,95 @@
+# Machines
+
+Under the hood, calling `fit!` on a machine calls either `MLJBase.fit`
+or `MLJBase.update`, depending on the machine's internal state, as
+recorded in additional fields `previous_model` and
+`previous_rows`. These lower-level `fit` and `update` methods dispatch
+on the model and a view of the data defined by the optional `rows`
+keyword argument of `fit!` (all rows by default). In this way, if a
+model `update` method is implemented, calls to `fit!` can avoid
+redundant calculations for certain kinds of model mutations (eg,
+increasing the number of epochs in a neural network).
+
+The interested reader can learn more about machine internals by
+examining the simplified code excerpt in [Internals](internals.md).
+
+```@example machines
+using MLJ; color_off() # hide
+forest = EnsembleModel(atom=(@load DecisionTreeClassifier), n=10);
+X, y = @load_iris;
+mach = machine(forest, X, y)
+fit!(mach, verbosity=2);
+```
+
+Generally, changing a hyperparameter triggers retraining on calls to
+subsequent `fit!`:
+
+```@repl machines
+forest.bagging_fraction=0.5
+fit!(mach, verbosity=2);
+```
+
+However, for this iterative model, increasing the iteration parameter
+only adds models to the existing ensemble:
+
+```@repl machines
+forest.n=15
+fit!(mach, verbosity=2);
+```
+
+Call `fit!` again without making a change and no retraining occurs:
+
+```@repl machines
+fit!(mach);
+```
+
+However, retraining can be forced:
+
+```@repl machines
+fit!(mach, force=true);
+```
+
+And is retriggered if the view of the data changes:
+
+```@repl machines
+fit!(mach, rows=1:100);
+```
+
+But not if the same view is specified a second time:
+
+```@repl machines
+fit!(mach, rows=1:100);
+```
+
+For a supervised machine the `predict` method calls a lower-level
+`MLJBase.predict` method, dispatched on the underlying model and the
+`fitresult` (see below). To see `predict` in action, as well as its
+unsupervised cousins `transform` and `inverse_transform`, see
+[Getting Started](index.md).
+
+Here is a complete list of the fields of a machine:
+
+- `model` - the struct containing the hyperparameters to be used in
+  calls to `fit!`
+
+- `fitresult` - the learned parameters in a raw form, initially undefined
+
+- `args` - a tuple of the data (in the supervised learning example above, `args = (X, y)`)
+
+- `report` - outputs of training not encoded in `fitresult` (eg, feature rankings)
+
+- `previous_model` - a deep copy of the model used in the last call to `fit!`
+
+- `previous_rows` - a copy of the row indices used in the last call to `fit!`
+
+- `cache`
+
+Instead of data `X` and `y`, the `machine` constructor can be provided
+`Node` or `Source` objects ("dynamic data") to obtain a
+`NodalMachine`, rather than a regular `Machine`; a `NodalMachine` has
+the same fields listed above. See [Composing
+Models](composing_models.md) for more on this advanced feature.
+
+
+### API Reference
+
+```@docs
+fit!
+```
diff --git a/docs/src/mlj_cheatsheet.md b/docs/src/mlj_cheatsheet.md
index 6e96060e6..5143ee057 100644
--- a/docs/src/mlj_cheatsheet.md
+++ b/docs/src/mlj_cheatsheet.md
@@ -10,22 +10,34 @@ MLJ_VERSION # version of MLJ for this cheatsheet
 
 #### Model search and code loading
 
-`models()` to list all registered models.
+`info("PCA")` retrieves registry metadata for the model called "PCA"
+
+`info("RidgeRegressor", pkg="MultivariateStats")` retrieves metadata
+for "RidgeRegressor", which is provided by multiple packages
+
+`models()` lists metadata of every registered model.
 
-`models(x -> x.is_supervised && x.is_pure_julia)` to find all supervised models written in pure julia.
+`models(x -> x.is_supervised && x.is_pure_julia)` lists all supervised models written in pure julia.
 
+`models(matching(X))` lists all unsupervised models compatible with input `X`.
 
-`info("PCA")` retrieves registry metadata on the model called "PCA"
+`models(matching(X, y))` lists all supervised models compatible with input/target `X/y`.
 
-`info("RidgeRegressor", pkg="MultivariateStats")` retrieves metadata
-for "RidgeRegresssor", which is provided by multiple packages
 
+With additional conditions:
+```julia
+models(matching(X, y)) do model
+    model.prediction_type == :probabilistic &&
+    model.is_pure_julia
+end
+```
 
 `tree = @load DecisionTreeClassifier` to load code and instantiate "DecisionTreeClassifier" model
 
 `tree2 = DecisionTreeClassifier(max_depth=2)` instantiates a model type already in scope
 
-`ridge = @load RidgeRegressor pkg=MultivariateStats` load and instantiate a "RidgeRegressor" model
+`ridge = @load RidgeRegressor pkg=MultivariateStats` loads and
+instantiates a model provided by multiple packages
 
 
 #### Scitypes and coercion
 
@@ -51,10 +63,11 @@ Use `schema(X)` to get the column scitypes of a table `X`
 
 ### Ingesting data
 
-Splitting any table into target and input:
+Splitting any table into target and input (note semicolon):
 
 ```julia
-using RDatasets; channing = dataset("boot", "channing")
+using RDatasets
+channing = dataset("boot", "channing")
 y, X = unpack(channing,
               ==(:Exit),            # y is the :Exit column
               !=(:Time);            # X is the rest, except :Time
@@ -128,6 +141,7 @@ or a list of pairs of row indices:
 `evaluate(model, X, y, resampling=CV(), measure=rms, operation=predict, weights=..., verbosity=1)`
 
 `evaluate!(mach, resampling=Holdout(), measure=[rms, mav], operation=predict, weights=..., verbosity=1)`
+
+`evaluate!(mach, resampling=[(fold1, fold2), (fold2, fold1)], measure=rms)`
 
 
 #### Ranges for tuning
 
@@ -216,7 +230,7 @@ Supervised, with final node `yhat` returning point-predictions:
 
 Supervised, with `yhat` final node returning probabilistic predictions:
 
-`@from_network Composite(knn=network_knn) <= yhat is_probabistic=true`
+`@from_network Composite(knn=network_knn) <= yhat is_probabilistic=true`
 
 Unsupervised, with final node `Xout`:
 
diff --git a/docs/src/model_search.md b/docs/src/model_search.md
new file mode 100644
index 000000000..5d5956cdf
--- /dev/null
+++ b/docs/src/model_search.md
@@ -0,0 +1,104 @@
+# Model Search
+
+MLJ has a model registry, allowing the user to search models and their
+properties without loading all the packages containing model code. In
+turn, this allows one to efficiently find all models solving a given
+machine learning task. The task itself is specified with the help of
+the `matching` method, and the search executed with the `models`
+methods, as detailed below.
+
+### Model metadata
+
+*Terminology.* In this section the word "model" refers to a model's
+metadata entry in the registry, rather than to an actual model
+`struct`, as the word is used elsewhere in the manual. One can obtain
+such an entry with the `info` command:
+
+```@setup tokai
+using MLJ
+MLJ.color_off()
+```
+
+```@repl tokai
+info("PCA")
+```
+
+If two models with the same name occur in different packages, the
+package name must be specified, as in `info("LinearRegressor",
+pkg="GLM")`.
+
+
+### General model queries
+
+We list all models with `models()`, and list the models for which code is already
+loaded with `localmodels()`:
+
+```@repl tokai
+localmodels()
+localmodels()[2]
+```
+
+If `models` is passed any `Bool`-valued function `test`, it returns every `model` for which `test(model)` is true, as in
+
+```@repl tokai
+test(model) = model.is_supervised &&
+              MLJ.Table(Continuous) <: model.input_scitype &&
+              AbstractVector{<:Multiclass{3}} <: model.target_scitype &&
+              model.prediction_type == :deterministic
+models(test)
+```
+
+Multiple test arguments may be passed to `models`, which are applied conjunctively.
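+
+For example (an illustrative sketch; a model is listed only if every
+test returns `true`):
+
+```julia
+models(m -> m.is_supervised, m -> m.is_pure_julia)
+```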
+
+
+### Matching models to data
+
+Common searches are streamlined with the help of the `matching`
+command, defined as follows:
+
+- `matching(model, X, y) == true` exactly when `model` is supervised
+  and admits inputs and targets with the scientific types of `X` and
+  `y`, respectively.
+
+- `matching(model, X) == true` exactly when `model` is unsupervised
+  and admits inputs with the scientific type of `X`.
+
+So, to search for all supervised probabilistic models handling input
+`X` and target `y`, one can define the testing function `task` by
+
+```julia
+task(model) = matching(model, X, y) && model.prediction_type == :probabilistic
+```
+
+and execute the search with
+
+```julia
+models(task)
+```
+
+Also defined are `Bool`-valued callable objects `matching(model)`,
+`matching(X, y)` and `matching(X)`, with obvious behaviour. For example,
+`matching(X, y)(model) = matching(model, X, y)`.
+
+So, to search for all models compatible with input `X` and target `y`,
+for example, one executes
+
+```julia
+models(matching(X, y))
+```
+
+while the preceding (probabilistic) search can be compactly written
+
+```julia
+models(matching(X, y)) do model
+    model.prediction_type == :probabilistic
+end
+```
+
+### API
+
+```@docs
+models
+localmodels
+matching
+```
diff --git a/docs/src/workflows_learning_curves.png b/docs/src/workflows_learning_curves.png
new file mode 100644
index 000000000..814165108
Binary files /dev/null and b/docs/src/workflows_learning_curves.png differ
diff --git a/docs/src/workflows_learning_curves_large.png b/docs/src/workflows_learning_curves_large.png
new file mode 100644
index 000000000..26025fd5a
Binary files /dev/null and b/docs/src/workflows_learning_curves_large.png differ
diff --git a/docs/src/workflows_tuning_plot.png b/docs/src/workflows_tuning_plot.png
new file mode 100644
index 000000000..aadf74652
Binary files /dev/null and b/docs/src/workflows_tuning_plot.png differ
diff --git a/docs/src/workflows_tuning_plot_large.png b/docs/src/workflows_tuning_plot_large.png
new file mode 100644
index 000000000..aadf74652
Binary files /dev/null and b/docs/src/workflows_tuning_plot_large.png differ
diff --git a/docs/src/working_with_tasks.jl b/docs/src/working_with_tasks.jl
index 2ccf22fc7..a8a910dbf 100644
--- a/docs/src/working_with_tasks.jl
+++ b/docs/src/working_with_tasks.jl
@@ -1,7 +1,13 @@
 # Working with Tasks
 
-*Warning.* The task API described here is likely change soon, with the notion of
-task being not bound to any particular data set.
+
+*Warning.* The formal task API described here may be deprecated in
+the future. It is now recommended that the user use the `matching`
+method described in [Model Search](model_search.md) to match models to
+machine learning tasks (which can be understood there as simply
+`Bool`-valued functions on models).
+
+---
 
 In MLJ a *task* is a synthesis of three elements: *data*, an
 *interpretation* of that data, and a *learning objective*. Once one has a
@@ -125,10 +131,4 @@ supervised
 unsupervised
 ```
 
-```@docs
-models
-```
-
-```@docs
-localmodels
-```
 
diff --git a/src/MLJ.jl b/src/MLJ.jl
index 642539210..6d7bf315c 100644
--- a/src/MLJ.jl
+++ b/src/MLJ.jl
@@ -5,19 +5,20 @@ module MLJ
 export MLJ_VERSION
 
 # defined in include files:
-export @curve, @pcurve, pretty,                              # utilities.jl
-    coerce, supervised, unsupervised,                        # tasks.jl
-    report,                                                  # machines.jl
-    Holdout, CV, evaluate!, Resampler,                       # resampling.jl
-    Params, params, set_params!,                             # parameters.jl
-    strange, iterator,                                       # parameters.jl
-    Grid, TunedModel, learning_curve!,                       # tuning.jl
-    EnsembleModel,                                           # ensembles.jl
-    rebind!,                                                 # networks.jl
-    machines, sources, anonymize!,                           # composites.jl
-    @from_network,                                           # composites.jl
-    fitresults,                                              # composites.jl
-    @pipeline                                                # pipelines.jl
+export @curve, @pcurve, pretty,                              # utilities.jl
+    coerce, supervised, unsupervised,                        # tasks.jl
+    report,                                                  # machines.jl
+    Holdout, CV, evaluate!, Resampler,                       # resampling.jl
+    Params, params, set_params!,                             # parameters.jl
+    strange, iterator,                                       # parameters.jl
+    Grid, TunedModel, learning_curve!,                       # tuning.jl
+    EnsembleModel,                                           # ensembles.jl
+    rebind!,                                                 # networks.jl
+    machines, sources, anonymize!,                           # composites.jl
+    @from_network,                                           # composites.jl
+    fitresults,                                              # composites.jl
+    @pipeline,                                               # pipelines.jl
+    matching                                                 # model_matching.jl
 
 # defined in include files "machines.jl" and "networks.jl":
 export Machine, NodalMachine, machine, AbstractNode,
@@ -28,7 +29,7 @@ export pdf, mode, median, mean, shuffle!, categorical, shuffle,
 levels, levels!
 export std, support
 
 # re-export from MLJBase and ScientificTypes:
-export nrows, nfeatures,
+export nrows, nfeatures, color_off, color_on,
     selectrows, selectcols,
     SupervisedTask, UnsupervisedTask, MLJTask,
     Deterministic, Probabilistic, Unsupervised, Supervised,
@@ -116,6 +117,7 @@ include("resampling.jl")    # resampling strategies and model evaluation
 include("parameters.jl")    # hyperparameter ranges and grid generation
 include("tuning.jl")
 include("ensembles.jl")     # homogeneous ensembles
+include("model_matching.jl") # inferring model search criterion from data
 include("tasks.jl")         # enhancements to MLJBase task interface
 include("scitypes.jl")      # extensions to ScientificTypes.scitype
 include("plotrecipes.jl")
diff --git a/src/model_matching.jl b/src/model_matching.jl
new file mode 100644
index 000000000..58845fa6c
--- /dev/null
+++ b/src/model_matching.jl
@@ -0,0 +1,145 @@
+# Note. `ModelProxy` is the type of a model's metadata entry (a named
+# tuple). So, `info("PCA")` has this type, for example.
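+#
+# An illustrative sketch (entries also carry other fields, such as
+# `name` and `package_name`):
+#
+#   entry = info("PCA")
+#   entry.is_supervised    # false
+#   entry.input_scitype    # scitype of admissible input data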
+ + +## BASIC IDEA + +if false + + matching(model::MLJModels.ModelProxy, X) = + !(model.is_supervised) && scitype(X) <: model.input_scitype + + matching(model::MLJModels.ModelProxy, X, y) = + model.is_supervised && + scitype(X) <: model.input_scitype && + scitype(y) <: model.target_scitype + + matching(model::MLJModels.ModelProxy, X, y, w::AbstractVector{<:Real}) = + model.is_supervised && + model.supports_weights && + scitype(X) <: model.input_scitype && + scitype(y) <: model.target_scitype + +end + + +## IMPLEMENTATION + + +struct ModelChecker{is_supervised, + supports_weights, + input_scitype, + target_scitype} end + +function Base.getproperty(::ModelChecker{is_supervised, + supports_weights, + input_scitype, + target_scitype}, + field::Symbol) where {is_supervised, + supports_weights, + input_scitype, + target_scitype} + if field === :is_supervised + return is_supervised + elseif field === :supports_weights + return supports_weights + elseif field === :input_scitype + return input_scitype + elseif field === :target_scitype + return target_scitype + else + throw(ArgumentError("Unsupported property. ")) + end +end + +Base.propertynames(::ModelChecker) = + (:is_supervised, :supports_weights, :input_scitype, :target_scitype) + +function _as_named_tuple(s::ModelChecker) + names = propertynames(s) + NamedTuple{names}(Tuple(getproperty(s, p) for p in names)) +end + +# function Base.show(io::IO, ::MIME"text/plain", S::ModelChecker) +# show(io, MIME("text/plain"), _as_named_tuple(S)) +# end + +""" + matching(model, X, y) + +Returns `true` exactly when the registry metadata entry `model` is +supervised and admits inputs and targets with the scientific types of +`X` and `y`, respectively. + + matching(model, X) + +Returns `true` exactly when `model` is unsupervised and admits inputs +with the scientific types of `X`. + + matching(model), matching(X, y), matching(X) + +Curried versions of the preceding methods, i.e., `Bool`-valued +callable objects satisfying `matching(X, y)(model) = matching(model, +X, y)`, etc. + +### Example + + models(matching(X)) + +Finds all unsupervised models compatible with input data `X`. + + models() do model + matching(model, X, y) && model.prediction_type == :probabilistic + end + +Finds all supervised models compatible with input data `X` and target +data `y` and making probabilistic predictions. + + +See also [`models`](@ref) + +""" +matching(X) = ModelChecker{false,false,scitype(X),missing}() +matching(X, y) = ModelChecker{true,false,scitype(X),scitype(y)}() +matching(X, y, w) = ModelChecker{true,true,scitype(X),scitype(y)}() + +(f::ModelChecker{false,false,XS,missing})(model::MLJModels.ModelProxy) where XS = + !(model.is_supervised) && + XS <: model.input_scitype + +(f::ModelChecker{true,false,XS,yS})(model::MLJModels.ModelProxy) where {XS,yS} = + model.is_supervised && + XS <: model.input_scitype && + yS <: model.target_scitype + +(f::ModelChecker{true,true,XS,yS})(model::MLJModels.ModelProxy) where {XS,yS} = + model.is_supervised && + model.supports_weights && + XS <: model.input_scitype && + yS <: model.target_scitype + +(f::ModelChecker)(name::String; pkg=nothing) = f(info(name, pkg=pkg)) +(f::ModelChecker)(realmodel::Model) = f(info(realmodel)) + +matching(model::MLJModels.ModelProxy, args...) = matching(args...)(model) +matching(name::String, args...; pkg=nothing) = + matching(info(name, pkg=pkg), args...) +matching(realmodel::Model, args...) = matching(info(realmodel), args...) 
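+
+# Usage sketch (hypothetical data; compare test/model_matching.jl):
+#
+#   X, y, w = (a = rand(5),), rand(5), rand(5)
+#   checker = matching(X, y, w)  # requires supervised models supporting weights
+#   models(checker)              # all such models admitting these scitypes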
+
+
+## DUAL NOTION
+
+struct DataChecker
+    model::MLJModels.ModelProxy
+end
+
+matching(model::MLJModels.ModelProxy) = DataChecker(model)
+matching(name::String; pkg=nothing) = DataChecker(info(name, pkg=pkg))
+matching(realmodel::Model) = matching(info(realmodel))
+
+(f::DataChecker)(args...) = matching(f.model, args...)
+
+
+
+
diff --git a/src/networks.jl b/src/networks.jl
index 2fd0a4797..6da36d8fb 100644
--- a/src/networks.jl
+++ b/src/networks.jl
@@ -296,15 +296,14 @@ MLJBase.selectrows(X::AbstractNode, r) = X(rows=r)
 (y::Node{Nothing})(Xnew) = (y.operation)([arg(Xnew) for arg in y.args]...)
 
 """
-$SIGNATURES
+    fit!(N::Node; rows=nothing, verbosity::Int=1, force::Bool=false)
+
+Train all machines in the learning network terminating at node `N`, in an
+appropriate order. These machines are those returned by `machines(N)`.
 
-Train the machines of all dynamic nodes in the learning network terminating at
-`N` in an appropriate order.
 """
-function fit!(y::Node; rows=nothing, verbosity::Int=1, force::Bool=false)
-    if rows === nothing
-        rows = (:)
-    end
+function fit!(y::Node; rows=nothing, verbosity::Int=1,
+              force::Bool=false)
+    if rows === nothing
+        rows = (:)
+    end
 
     # get non-source nodes:
     nodes_ = filter(nodes(y)) do n
diff --git a/test/model_matching.jl b/test/model_matching.jl
new file mode 100644
index 000000000..a73595874
--- /dev/null
+++ b/test/model_matching.jl
@@ -0,0 +1,30 @@
+module TestModelMatching
+
+using MLJ
+using Test
+
+X = (a = rand(5), b = categorical(1:5))
+y = rand(5)
+w = rand(5)
+
+@test matching(X) == MLJ.ModelChecker{false,false,scitype(X),missing}()
+@test matching(X, y) == MLJ.ModelChecker{true,false,scitype(X),scitype(y)}()
+@test matching(X, y, w) == MLJ.ModelChecker{true,true,scitype(X),scitype(y)}()
+
+@test !matching("RidgeRegressor", pkg="MultivariateStats", X)
+@test matching("FeatureSelector", X)
+
+m1 = models(matching(X))
+@test issubset([info("FeatureSelector"),
+                info("OneHotEncoder"),
+                info("Standardizer")], m1)
+
+@test !("PCA" in m1)
+@test !(info("PCA") in m1)
+
+m2 = models(matching(X, y))
+@test info("ConstantRegressor") in m2
+@test !(info("DecisionTreeRegressor") in m2)
+
+end
+true
diff --git a/test/runtests.jl b/test/runtests.jl
index 2ffe0ff1e..e814044fe 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -46,6 +46,10 @@ end
     @test include("ensembles.jl")
 end
 
+@testset "matching models to data" begin
+    @test include("model_matching.jl")
+end
+
 @testset "tasks" begin
     @test include("tasks.jl")
 end