Use repl language tag for sample #1107

Merged
merged 26 commits into dev from abhro:patch-1 on May 19, 2024
Changes from 9 commits (the file diffs below reflect only 9 of the 26 commits, so they may not match the final merged state)
Commits (26):
a89faf7 Use repl language tag for sample (abhro, Apr 22, 2024)
8e45385 Update language tags for code samples (abhro, Apr 22, 2024)
fddc289 Follow blue style in docs/src/working_with_categorical_data.md (abhro, Apr 24, 2024)
3d6d15f Update mlj_cheatsheet.md (abhro, Apr 29, 2024)
ae28151 Consistenly use @example in common_mlj_workflows.md (abhro, Apr 30, 2024)
9f274ad Fix @example namespace in common workflows (abhro, May 3, 2024)
367db46 Break up predicting transformers into separate @example blocks (abhro, May 3, 2024)
f86b01b Use @example instead of pre-built repl sample in learning_networks.md (abhro, May 3, 2024)
dc71382 Merge branch 'dev' into patch-1 (abhro, May 3, 2024)
ce4bce2 Merge branch 'dev' into patch-1 (abhro, May 11, 2024)
211bcf9 Do mechanical fixes of spacing, semicolons, and punc (abhro, May 15, 2024)
c7b5d3a Fix indentation of markdown line (abhro, May 15, 2024)
925ec42 Move hidden example block to setup (abhro, May 15, 2024)
2a1202f Pull code sample into list (abhro, May 15, 2024)
f8518f4 Use proper markdown lists (abhro, May 15, 2024)
ad9129b Use example block for workflows (abhro, May 15, 2024)
da2e45a Remove lambdas (abhro, May 15, 2024)
72f2be2 Use repl blocks for user defined models (abhro, May 15, 2024)
c24a96b Use bigger fences for cheatsheet code (abhro, May 15, 2024)
331bac8 Promote headers in cheatsheet (abhro, May 15, 2024)
0acc876 Use Clustering.jl instead of ParallelKMeans (abhro, May 15, 2024)
18e9c9f Remove unsupported use of info() from cheatsheet (abhro, May 15, 2024)
739ca21 Remove comments to have not as wide code lines (abhro, May 15, 2024)
f211322 Add description of data coercion in cheatsheet (abhro, May 15, 2024)
d079644 Update docs/src/mlj_cheatsheet.md (abhro, May 15, 2024)
650ebbd Remove other occurence of `info` on measure (abhro, May 16, 2024)
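
For readers skimming the commit list: in Documenter.jl-built manuals such as MLJ's, a plain `julia` fence is only syntax-highlighted as source code, a `julia-repl` fence is highlighted as a REPL transcript (prompts and output are shown verbatim, nothing is executed), and `@example`/`@repl` blocks are executed when the docs are built, with their output generated and inserted automatically. A rough sketch of the distinction the commits keep referring to (the `1 + 1` snippet is illustrative and not taken from the diffs; the `workflows` block name is the one used in common_mlj_workflows.md):

````markdown
<!-- highlighted as a REPL transcript; the output line is written by hand -->
```julia-repl
julia> 1 + 1
2
```

<!-- executed by Documenter at build time; the output is generated and inserted -->
```@example workflows
1 + 1
```
````

Swapping `julia` fences for `julia-repl` on transcript-style samples, and for `@example`/`@repl` blocks where the output should be regenerated at build time, is essentially what the file diffs below do.
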
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -15,6 +15,7 @@ MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
ParallelKMeans = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
abhro marked this conversation as resolved.
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
10 changes: 5 additions & 5 deletions docs/src/about_mlj.md
@@ -110,7 +110,7 @@ X, y = @load_reduced_ames;
Evaluating the "self-tuning" pipeline model's performance using 5-fold
cross-validation (implies multiple layers of nested resampling):

```julia
```julia-repl
julia> evaluate(self_tuning_pipe, X, y,
measures=[l1, l2],
resampling=CV(nfolds=5, rng=123),
@@ -229,19 +229,19 @@ installed in a new
[environment](https://julialang.github.io/Pkg.jl/v1/environments/) to
avoid package conflicts. You can do this with

```julia
```julia-repl
julia> using Pkg; Pkg.activate("my_MLJ_env", shared=true)
```

Installing MLJ is also done with the package manager:

```julia
```julia-repl
julia> Pkg.add("MLJ")
```

**Optional:** To test your installation, run

```julia
```julia-repl
julia> Pkg.test("MLJ")
```

@@ -252,7 +252,7 @@ environment to make model-specific code available. This
happens automatically when you use MLJ's interactive load command
`@iload`, as in

```julia
```julia-repl
julia> Tree = @iload DecisionTreeClassifier # load type
julia> tree = Tree() # instance
```
33 changes: 14 additions & 19 deletions docs/src/common_mlj_workflows.md
@@ -38,16 +38,10 @@ coerce!(channing, :Sex => Multiclass)
```julia
import RDatasets
channing = RDatasets.dataset("boot", "channing")
```

julia> first(channing, 4)
4×5 DataFrame
 Row │ Sex   Entry  Exit   Time   Cens
     │ Cat…  Int32  Int32  Int32  Int32
─────┼──────────────────────────────────
   1 │ Male    782    909    127      1
   2 │ Male   1020   1128    108      1
   3 │ Male    856    969    113      1
   4 │ Male    915    957     42      1
```@example workflows
first(channing, 4)
```

Inspecting metadata, including column scientific types:
@@ -61,17 +55,17 @@ Horizontally splitting data and shuffling rows.
Here `y` is the `:Exit` column and `X` a table with everything else:

```@example workflows
y, X = unpack(channing, ==(:Exit), rng=123);
y, X = unpack(channing, ==(:Exit), rng=123);
nothing # hide
```

Here `y` is the `:Exit` column and `X` everything else except `:Time`:

```@example workflows
y, X = unpack(channing,
==(:Exit),
!=(:Time);
rng=123);
y, X = unpack(channing,
==(:Exit),
!=(:Time);
rng=123);
scitype(y)
```

@@ -115,7 +109,7 @@ nothing # hide
Or, if already horizontally split:

```@example workflows
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.6, multi=true, rng=123)
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.6, multi=true, rng=123)
```


@@ -183,7 +177,7 @@ tree = Tree(min_samples_split=5, max_depth=4)

or

```@julia
```julia
tree = (@load DecisionTreeClassifier)()
tree.min_samples_split = 5
tree.max_depth = 4
@@ -251,7 +245,7 @@ Note `LogLoss()` has aliases `log_loss` and `cross_entropy`.
Predict on the new data set:

```@example workflows
Xnew = (FL = rand(3), RW = rand(3), CL = rand(3), CW = rand(3), BD =rand(3))
Xnew = (FL = rand(3), RW = rand(3), CL = rand(3), CW = rand(3), BD = rand(3))
predict(mach, Xnew) # a vector of distributions
```

@@ -538,7 +532,8 @@ curve = learning_curve(mach,

```julia
using Plots
plot(curve.parameter_values, curve.measurements, xlab=curve.parameter_name, xscale=curve.parameter_scale)
plot(curve.parameter_values, curve.measurements,
xlab=curve.parameter_name, xscale=curve.parameter_scale)
```

![](img/workflows_learning_curve.png)
@@ -558,7 +553,7 @@ curve = learning_curve(mach,

```julia
plot(curve.parameter_values, curve.measurements,
xlab=curve.parameter_name, xscale=curve.parameter_scale)
xlab=curve.parameter_name, xscale=curve.parameter_scale)
```

![](img/workflows_learning_curves.png)
1 change: 0 additions & 1 deletion docs/src/controlling_iterative_models.md
@@ -253,7 +253,6 @@ In the code, `wrapper` is an object that wraps the training machine
in this example).

```julia

import IterationControl # or MLJ.IterationControl

struct IterateFromList
12 changes: 6 additions & 6 deletions docs/src/evaluating_model_performance.md
@@ -27,7 +27,7 @@ using MLJ
X = (a=rand(12), b=rand(12), c=rand(12));
y = X.a + 2X.b + 0.05*rand(12);
model = (@load RidgeRegressor pkg=MultivariateStats verbosity=0)()
cv=CV(nfolds=3)
cv = CV(nfolds=3)
evaluate(model, X, y, resampling=cv, measure=l2, verbosity=0)
```

@@ -51,8 +51,8 @@ Multiple measures are specified as a vector:
evaluate!(
mach,
resampling=cv,
measures=[l1, rms, rmslp1],
verbosity=0,
measures=[l1, rms, rmslp1],
verbosity=0,
)
```

@@ -70,7 +70,7 @@ evaluate!(
mach,
resampling=CV(nfolds=3),
measure=[l2, rsquared],
weights=weights,
weights=weights,
)
```

@@ -91,8 +91,8 @@ fold1 = 1:6; fold2 = 7:12;
evaluate!(
mach,
resampling = [(fold1, fold2), (fold2, fold1)],
measures=[l1, l2],
verbosity=0,
measures=[l1, l2],
verbosity=0,
)
```

10 changes: 5 additions & 5 deletions docs/src/getting_started.md
@@ -5,14 +5,14 @@ For an outline of MLJ's **goals** and **features**, see

This page introduces some MLJ basics, assuming some familiarity with
machine learning. For a complete list of other MLJ learning resources,
see [Learning MLJ](@ref).
see [Learning MLJ](@ref).

MLJ collects together the functionality provided by multiple packages. To learn how to
install components separately, run `using MLJ; @doc MLJ`.

This section introduces only the most basic MLJ operations and
concepts. It assumes MLJ has been successfully installed. See
[Installation](@ref) if this is not the case.
[Installation](@ref) if this is not the case.


```@setup doda
Expand All @@ -31,7 +31,7 @@ column vectors:
```@repl doda
using MLJ
iris = load_iris();
selectrows(iris, 1:3) |> pretty
selectrows(iris, 1:3) |> pretty
schema(iris)
```

@@ -114,8 +114,8 @@ computing the mode of each prediction):
```@repl doda
evaluate(tree, X, y,
resampling=CV(shuffle=true),
measures=[log_loss, accuracy],
verbosity=0)
measures=[log_loss, accuracy],
verbosity=0)
```

Under the hood, `evaluate` calls lower level functions `predict` or
2 changes: 1 addition & 1 deletion docs/src/internals.md
@@ -49,7 +49,7 @@ function fit!(mach::Machine; rows=nothing, force=false, verbosity=1)
end

rows_have_changed = (!isdefined(mach, :previous_rows) ||
rows != mach.previous_rows)
rows != mach.previous_rows)

args = [MLJ.selectrows(arg, rows) for arg in mach.args]

2 changes: 1 addition & 1 deletion docs/src/learning_curves.md
@@ -48,7 +48,7 @@ used using `rngs=...` (an integer automatically generates the number
specified):

```@example hooking
atom.lambda= 7.3
atom.lambda = 7.3
r_n = range(ensemble, :n, lower=1, upper=50)
curves = MLJ.learning_curve(mach;
range=r_n,
75 changes: 36 additions & 39 deletions docs/src/learning_networks.md
@@ -320,18 +320,18 @@ has the same signature as `MLJModelInterface.fit`):
import MLJBase
function MLJBase.prefit(composite::CompositeA, verbosity, X, y)

# the learning network from above:
Xs = source(X)
ys = source(y)
mach1 = machine(:preprocessor, Xs)
x = transform(mach1, Xs)
mach2 = machine(:classifier, x, ys)
yhat = predict(mach2, x)

verbosity > 0 && @info "I'm a noisy fellow!"

# return "learning network interface":
return (; predict=yhat)
# the learning network from above:
Xs = source(X)
ys = source(y)
mach1 = machine(:preprocessor, Xs)
x = transform(mach1, Xs)
mach2 = machine(:classifier, x, ys)
yhat = predict(mach2, x)

verbosity > 0 && @info "I'm a noisy fellow!"

# return "learning network interface":
return (; predict=yhat)
end
```

@@ -594,10 +594,10 @@ using MLJ
import MLJBase

mutable struct CompositeE <: DeterministicNetworkComposite
clusterer # `:kmeans` or `:kmedoids`
k::Int # number of clusters
solver # a ridge regression parameter we want to expose
c::Float64 # a "coupling" coefficient
clusterer # `:kmeans` or `:kmedoids`
k::Int # number of clusters
solver # a ridge regression parameter we want to expose
c::Float64 # a "coupling" coefficient
end
```

@@ -610,26 +610,26 @@ KMedoids = @load KMedoids pkg=Clustering verbosity=0

function MLJBase.prefit(composite::CompositeE, verbosity, X, y)

Xs = source(X)
ys = source(y)
Xs = source(X)
ys = source(y)

k = composite.k
solver = composite.solver
c = composite.c
k = composite.k
solver = composite.solver
c = composite.c

clusterer = composite.clusterer == :kmeans ? KMeans(; k) : KMedoids(; k)
mach1 = machine(clusterer, Xs)
Xsmall = transform(mach1, Xs)
clusterer = composite.clusterer == :kmeans ? KMeans(; k) : KMedoids(; k)
mach1 = machine(clusterer, Xs)
Xsmall = transform(mach1, Xs)

# the coupling - ridge regularization depends on the number of
# clusters `k` and the coupling coefficient `c`:
lambda = exp(-c/k)
# the coupling - ridge regularization depends on the number of
# clusters `k` and the coupling coefficient `c`:
lambda = exp(-c/k)

ridge = RidgeRegressor(; lambda, solver)
mach2 = machine(ridge, Xsmall, ys)
yhat = predict(mach2, Xsmall)
ridge = RidgeRegressor(; lambda, solver)
mach2 = machine(ridge, Xsmall, ys)
yhat = predict(mach2, Xsmall)

return (predict=yhat,)
return (predict=yhat,)
end
```

@@ -748,20 +748,17 @@ Q = @node sqrt(Z)
(so that `Q() == 4`). Here's a more complicated application of `@node` to row-shuffle a
table:

```julia
using Random
```@example
using MLJ, Random
X = (x1 = [1, 2, 3, 4, 5],
x2 = [:one, :two, :three, :four, :five])
x2 = [:one, :two, :three, :four, :five])
rows(X) = 1:nrows(X)

Xs = source(X)
rs = @node rows(Xs)
rs = @node rows(Xs)
W = @node selectrows(Xs, @node shuffle(rs))

julia> W()
(x1 = [5, 1, 3, 2, 4],
x2 = Symbol[:five, :one, :three, :two, :four],)

W()
```

**Important.** An argument not in global scope is assumed by `@node` to be a node or
2 changes: 1 addition & 1 deletion docs/src/linear_pipelines.md
@@ -29,7 +29,7 @@ model type `KNNRegressor` assumes the features are all
`X` with `coerce(X, :age=>Continuous)`
- standardizing continuous features and one-hot encoding the
`Multiclass` features using the `ContinuousEncoder` model

However, we can avoid separately applying these preprocessing steps
(two of which require `fit!` steps) by combining them with the
supervised `KNNRegressor` model in a new *pipeline* model, using
8 changes: 4 additions & 4 deletions docs/src/loading_model_code.md
@@ -32,7 +32,7 @@ provided by the
package. Then, to determine which package provides the MLJ interface
you call `load_path`:

```julia
```julia-repl
julia> load_path("DecisionTreeClassifier", pkg="DecisionTree")
"MLJDecisionTreeInterface.DecisionTreeClassifier"
```
@@ -41,22 +41,22 @@ In this case, we see that the package required is
MLJDecisionTreeInterface.jl. If this package is not in `my_env` (do
`Pkg.status()` to check) you add it by running

```julia
```julia-repl
julia> Pkg.add("MLJDecisionTreeInterface");
```

So long as `my_env` is the active environment, this action need never
be repeated (unless you run `Pkg.rm("MLJDecisionTreeInterface")`). You
are now ready to instantiate a decision tree classifier:

```julia
```julia-repl
julia> Tree = @load DecisionTree pkg=DecisionTree
julia> tree = Tree()
```

which is equivalent to

```julia
```julia-repl
julia> import MLJDecisionTreeInterface.DecisionTreeClassifier
julia> Tree = MLJDecisionTreeInterface.DecisionTreeClassifier
julia> tree = Tree()