From 2a063381ff5c03203a09b1ee05eeef17bdfc2900 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 2 Dec 2025 08:45:52 +1300 Subject: [PATCH 1/5] add uncertainty_radius_95 to PerformanceEvaluation structs; --- src/resampling.jl | 51 ++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index 51071e34..ea4bcdc3 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -575,7 +575,8 @@ When displayed, a `PerformanceEvaluation` object includes a value under the head `1.96*SE`, derived from the standard error of the `per_fold` entries. This value is suitable for constructing a formal 95% confidence interval for the given `measurement`. Such intervals should be interpreted with caution. See, for example, [Bates -et al. (2021)](https://arxiv.org/abs/2104.00673). +et al. (2021)](https://arxiv.org/abs/2104.00673). It is also stored in the field +`uncertainty_radius_95`. ### Fields @@ -591,6 +592,9 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. applied for a given measure `m` is `StatisticalMeasuresBase.external_aggregation_mode(m)` (commonly `Mean()` or `Sum()`) +- `uncertainty_radius_95`: vector of radii of uncertainty for 95% confidence intervals, + one for each element of `measures`. See cautionary note above. + - `operation` (e.g., `predict_mode`): the operations applied for each measure to generate predictions to be evaluated. Possibilities are: $PREDICT_OPERATIONS_STRING. @@ -638,6 +642,7 @@ struct PerformanceEvaluation{M, model::M measure::Measure measurement::Measurement + uncertainty_radius_95::Union{Nothing,Measurement} operation::Operation per_fold::PerFold per_observation::PerObservation @@ -670,6 +675,7 @@ struct CompactPerformanceEvaluation{M, model::M measure::Measure measurement::Measurement + uncertainty_radius_95::Union{Nothing,Measurement} operation::Operation per_fold::PerFold per_observation::PerObservation @@ -682,6 +688,7 @@ compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( e.model, e.measure, e.measurement, + e.uncertainty_radius_95, e.operation, e.per_fold, e. per_observation, @@ -693,18 +700,6 @@ compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( round3(x) = x round3(x::AbstractFloat) = round(x, sigdigits=3) -const SE_FACTOR = 1.96 # For a 95% confidence interval. - -_standard_error(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) -_standard_error(v) = "N/A" - -function _standard_errors(e::AbstractPerformanceEvaluation) - measure = e.measure - length(e.per_fold[1]) == 1 && return [nothing] - std_errors = map(_standard_error, e.per_fold) - return std_errors -end - # to address #874, while preserving the display worked out in #757: _repr_(f::Function) = repr(f) _repr_(x) = repr("text/plain", x) @@ -722,14 +717,14 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) _measure = [_repr_(m) for m in e.measure] _measurement = round3.(e.measurement) _per_fold = [round3.(v) for v in e.per_fold] - _sterr = round3.(_standard_errors(e)) + _uncertainty_radius_95 = round3.(e.uncertainty_radius_95) row_labels = _label.(eachindex(e.measure)) # Define header and data for main table data = hcat(_measure, e.operation, _measurement) header = ["measure", "operation", "measurement"] - if length(row_labels) > 1 + if length(row_labels) > 1 && length(first(e.per_fold)) > 1 data = hcat(row_labels, data) header =["", header...]
end @@ -738,14 +733,14 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) println(io, "PerformanceEvaluation object "* "with these fields:") println(io, " model, measure, operation,\n"* - " measurement, per_fold, per_observation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " fitted_params_per_fold, report_per_fold,\n"* " train_test_rows, resampling, repeats") else println(io, "CompactPerformanceEvaluation object "* "with these fields:") println(io, " model, measure, operation,\n"* - " measurement, per_fold, per_observation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " train_test_rows, resampling, repeats") end @@ -764,8 +759,8 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) # Show the per-fold table if needed: if length(first(e.per_fold)) > 1 - show_sterr = any(!isnothing, _sterr) - data2 = hcat(_per_fold, _sterr) + show_sterr = any(!isnothing, _uncertainty_radius_95) + data2 = hcat(_per_fold, _uncertainty_radius_95) header2 = ["per_fold", "1.96*SE"] if length(row_labels) > 1 data2 = hcat(row_labels, data2) @@ -783,14 +778,19 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) show_color ? color_on() : color_off() end -_summary(e) = Tuple(round3.(e.measurement)) +function _summary(e) + confidence_intervals = map(zip(e.measurement, e.uncertainty_radius_95)) do (μ, δ) + "$(round3(μ))±$(round3(δ))" + end + return "(\"$(name(e.model))\", "*join(confidence_intervals, ", ")*")" +end + Base.show(io::IO, e::PerformanceEvaluation) = print(io, "PerformanceEvaluation$(_summary(e))") Base.show(io::IO, e::CompactPerformanceEvaluation) = print(io, "CompactPerformanceEvaluation$(_summary(e))") - # =============================================================== ## USER CONTROL OF DEFAULT LOGGING @@ -1426,6 +1426,11 @@ end _view(::Nothing, rows) = nothing _view(weights, rows) = view(weights, rows) +const SE_FACTOR = 1.96 # For a 95% confidence interval. + +radius_95(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) +radius_95(v) = "N/A" + # Evaluation when `resampling` is a TrainTestPairs (CORE EVALUATOR): function evaluate!( mach::Machine, @@ -1579,10 +1584,14 @@ function evaluate!( ) end + confidence_radii_95 = length(per_fold[1]) == 1 ? nothing : + map(radius_95, per_fold) + evaluation = PerformanceEvaluation( mach.model, measures, per_measure, + confidence_radii_95, operations, per_fold, per_observation, From e41417c948830b1ef493fca5c6cc891bb2b32b21 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 2 Dec 2025 15:52:48 +1300 Subject: [PATCH 2/5] add the `tag` field to PerformanceEvaluation objects --- src/resampling.jl | 192 ++++++++++++++++++++++++++++++++++++++------- test/resampling.jl | 166 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 326 insertions(+), 32 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index ea4bcdc3..eebcd75a 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -585,7 +585,13 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. - `model`: model used to create the performance evaluation. In the case a tuning model, this is the best model found. -- `measure`: vector of measures (metrics) used to evaluate performance +- `tag`: a string label associated with the evaluation, specified by the user when + replacing `model` in `evaluate(model, ...)` with `tag => model`, or `mach` in + `evaluate!(mach, ...)` with `tag => mach`. 
If unspecified, it is auto-generated, but + tag-uniqueness is not 100% guaranteed. + + - `measure`: vector of measures (metrics) used + to evaluate performance - `measurement`: vector of measurements - one for each element of `measure` - aggregating the performance measurements over all train/test pairs (folds). The aggregation method @@ -633,6 +639,7 @@ See also [`CompactPerformanceEvaluation`](@ref). struct PerformanceEvaluation{M, Measure, Measurement, + Uncertainty, Operation, PerFold, PerObservation, @@ -640,9 +647,10 @@ struct PerformanceEvaluation{M, ReportPerFold, R} <: AbstractPerformanceEvaluation model::M + tag::String measure::Measure measurement::Measurement - uncertainty_radius_95::Union{Nothing,Measurement} + uncertainty_radius_95::Uncertainty operation::Operation per_fold::PerFold per_observation::PerObservation @@ -666,16 +674,18 @@ For more on the remaining fields, see [`PerformanceEvaluation`](@ref). """ struct CompactPerformanceEvaluation{M, - Measure, - Measurement, - Operation, - PerFold, - PerObservation, - R} <: AbstractPerformanceEvaluation + Measure, + Measurement, + Uncertainty, + Operation, + PerFold, + PerObservation, + R} <: AbstractPerformanceEvaluation model::M + tag::String measure::Measure measurement::Measurement - uncertainty_radius_95::Union{Nothing,Measurement} + uncertainty_radius_95::Uncertainty operation::Operation per_fold::PerFold per_observation::PerObservation @@ -686,6 +696,7 @@ end compactify(e::CompactPerformanceEvaluation) = e compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( e.model, + e.tag, e.measure, e.measurement, e.uncertainty_radius_95, @@ -703,6 +714,13 @@ round3(x::AbstractFloat) = round(x, sigdigits=3) # to address #874, while preserving the display worked out in #757: _repr_(f::Function) = repr(f) _repr_(x) = repr("text/plain", x) +_repr(::Nothing) = "" + +function uncertainty_as_string(δ) + isnothing(δ) && return "" + δ isa Real && isinf(δ) && return "" + return string(round3(δ)) +end # helper for row labels: _label(1) ="A", _label(2) = "B", _label(27) = "BA", etc const alphabet = Char.(65:90) @@ -716,8 +734,9 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) _measure = [_repr_(m) for m in e.measure] _measurement = round3.(e.measurement) - _per_fold = [round3.(v) for v in e.per_fold] - _uncertainty_radius_95 = round3.(e.uncertainty_radius_95) + _per_fold = reshape([round3.(v) for v in e.per_fold], length(e.per_fold), 1) + _uncertainty_radius_95 = uncertainty_as_string.(e.uncertainty_radius_95) + show_radius = any(x -> !isempty(x), _uncertainty_radius_95) row_labels = _label.(eachindex(e.measure)) # Define header and data for main table @@ -732,18 +751,18 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) if e isa PerformanceEvaluation println(io, "PerformanceEvaluation object "* "with these fields:") - println(io, " model, measure, operation,\n"* + println(io, " model, tag, measure, operation,\n"* " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " fitted_params_per_fold, report_per_fold,\n"* " train_test_rows, resampling, repeats") else println(io, "CompactPerformanceEvaluation object "* "with these fields:") - println(io, " model, measure, operation,\n"* + println(io, " model, tag, measure, operation,\n"* " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " train_test_rows, resampling, repeats") end - + println(io, "Tag: $(e.tag)") println(io, "Extract:") show_color = MLJBase.SHOW_COLOR[] color_off() @@ 
-759,9 +778,12 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) # Show the per-fold table if needed: if length(first(e.per_fold)) > 1 - show_sterr = any(!isnothing, _uncertainty_radius_95) - data2 = hcat(_per_fold, _uncertainty_radius_95) - header2 = ["per_fold", "1.96*SE"] + data2 = _per_fold + header2 = ["per_fold", ] + if show_radius + data2 = hcat(_per_fold, _uncertainty_radius_95) + header2 = [header2..., "1.96*SE"] + end if length(row_labels) > 1 data2 = hcat(row_labels, data2) header2 =["", header2...] @@ -769,7 +791,7 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) PrettyTables.pretty_table( io, data2; - column_labels = header2, + column_labels = [header2,], alignment=:l, line_breaks=true, style, @@ -780,9 +802,11 @@ end function _summary(e) confidence_intervals = map(zip(e.measurement, e.uncertainty_radius_95)) do (μ, δ) - "$(round3(μ))±$(round3(δ))" + a = round3(μ) + b = uncertainty_as_string(δ) + isempty(b) ? a : "$a ± $b" end - return "(\"$(name(e.model))\", "*join(confidence_intervals, ", ")*")" + return "(\"$(e.tag)\", "*join(confidence_intervals, ", ")*")" end Base.show(io::IO, e::PerformanceEvaluation) = @@ -854,6 +878,12 @@ end # --------------------------------------------------------------- # Helpers +function machine_and_tag(mach::Machine) + tag = string(name(mach.model), "-", rand(100:999)) + return mach, tag +end +machine_and_tag(pair::Pair{String,<:Machine}) = last(pair), first(pair) + function actual_rows(rows, N, verbosity) unspecified_rows = (rows === nothing) _rows = unspecified_rows ? (1:N) : rows @@ -1111,6 +1141,9 @@ the specified `resampling` strategy (defaulting to 6-fold cross-validation) and which can be a single measure or vector. Returns a [`PerformanceEvaluation`](@ref) object. +In place of `mach`, one can use `tag_string => mach`, or a vector of either of these forms, +to return a vector of performance evaluation objects. + Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` is not an instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)` is expected. For example, setting @@ -1175,12 +1208,49 @@ Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. - `compact=false` - if `true`, the returned evaluation object excludes these fields: `fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. +# Example + +Setup: + +```julia +using MLJ # or using MLJBase, StatisticalMeasures +X, y = make_moons(rng=123) # a table and a vector +model = ConstantClassifier() +mach = machine(model, X, y) +``` + +Perform a simple evaluation on a holdout set, against accuracy and area under the ROC +curve: + +```julia +evaluate!(mach; resampling=Holdout(fraction_train=0.8), measure=[accuracy, auc]) +``` + +Perform Monte Carlo cross-validation, with 2 folds and 5 repeats, against the Brier score: + +```julia +evaluate!(mach; resampling=CV(nfolds=2, rng=123), repeats=5, measures=brier_score) +``` + +Evaluate on explicitly specified train-test pairs, against cross entropy, and tag the +result, "explicit folds": + +```julia +e = evaluate!( + "explicit folds" => mach; + resampling=[(1:140, 141:150), (11:150, 1:10)], + measure=log_loss, +) +show(e) +# PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) +``` + See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref), [`CompactPerformanceEvaluation`](@ref).
""" function evaluate!( - mach::Machine; + mach_or_pair::Union{Machine,Pair{String,<:Machine}}; resampling=CV(), measures=nothing, measure=measures, @@ -1203,6 +1273,8 @@ function evaluate!( # weights, measures, operations, and dispatches a # strategy-specific `evaluate!` + mach, tag = machine_and_tag(mach_or_pair) + length(mach.args) > 1 || throw(ERR_NEED_TARGET) repeats > 0 || error("Need `repeats > 0`. ") @@ -1252,6 +1324,7 @@ function evaluate!( evaluate!( mach, + tag, resampling, weights, class_weights, @@ -1277,11 +1350,52 @@ See the machine version `evaluate!` for the complete list of options. Returns a [`PerformanceEvaluation`](@ref) object. +In place of `model`, one can use `tag_string => model`, or a vector of either of these +forms, to return a vector of performance evaluation objects. + +# Example + +Setup: + +```julia +using MLJ # or using MLJBase, StatisticalMeasures +X, y = make_moons(rng=123) # a table and a vector +model = ConstantClassifier() +``` + +Perform a simple evaluation on a holdout set, against accuracy and area under the ROC +curve: + +```julia +evaluate(model, X, y; resampling=Holdout(fraction_train=0.8), measure=[accuracy, auc]) +``` + +Perform Monte Carlo cross-validation, with 2 folds and 5 repeats, against area Brier score: + +```julia +evaluate(model, X, y; resampling=CV(nfolds=2, rng=123), repeats=5, measures=brier_score) +``` + +Evaluate on explicitly specified train-test pairs, against cross entropy, and tag the +result, "explicit folds": + +```julia +e = evaluate( + "explicit folds" => model, X, y; + resampling=[(1:140, 141:150), (11:150, 1:10)], + measure=log_loss, +) +show(e) +# PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) +``` + See also [`evaluate!`](@ref). """ evaluate(model::Model, args...; cache=true, kwargs...) = evaluate!(machine(model, args...; cache=cache); kwargs...) +evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = + evaluate!(first(pair) => machine(last(pair), args...; cache=cache); kwargs...) # ------------------------------------------------------------------- # Resource-specific methods to distribute a function parameterized by @@ -1428,12 +1542,16 @@ _view(weights, rows) = view(weights, rows) const SE_FACTOR = 1.96 # For a 95% confidence interval. -radius_95(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) -radius_95(v) = "N/A" +function radius_95(v::AbstractVector{<:Real}) + length(v) < 2 && return Inf + return SE_FACTOR*std(v) / sqrt(length(v) - 1) +end +radius_95(v) = nothing # Evaluation when `resampling` is a TrainTestPairs (CORE EVALUATOR): function evaluate!( mach::Machine, + tag, resampling, weights, class_weights, @@ -1584,14 +1702,17 @@ function evaluate!( ) end - confidence_radii_95 = length(per_fold[1]) == 1 ? nothing : - map(radius_95, per_fold) + # The following is a vector with `nothing` values for each measure not returning + # `Real` values (e.g., confmat), and `Inf` values for any other kind of measure when + # there is only one train-test fold. + uncertainty_radius_95 = map(radius_95, per_fold) evaluation = PerformanceEvaluation( mach.model, + tag, measures, per_measure, - confidence_radii_95, + uncertainty_radius_95, operations, per_fold, per_observation, @@ -1610,8 +1731,17 @@ end # ---------------------------------------------------------------- # Evaluation when `resampling` is a ResamplingStrategy -function evaluate!(mach::Machine, resampling::ResamplingStrategy, - weights, class_weights, rows, verbosity, repeats, args...) 
+function evaluate!( + mach::Machine, + tag, + resampling::ResamplingStrategy, + weights, + class_weights, + rows, + verbosity, + repeats, + args..., + ) train_args = Tuple(a() for a in mach.args) y = train_args[2] @@ -1625,6 +1755,7 @@ function evaluate!(mach::Machine, resampling::ResamplingStrategy, evaluate!( mach, + tag, repeated_train_test_pairs, weights, class_weights, @@ -1758,6 +1889,7 @@ end function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) mach = machine(resampler.model, args...; cache=resampler.cache) + tag = "" _measures = _actual_measures(resampler.measure, resampler.model) @@ -1786,6 +1918,7 @@ function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) # `PerformanceEvaluation` object.) e = evaluate!( mach, + tag, resampler.resampling, resampler.weights, resampler.class_weights, @@ -1842,8 +1975,10 @@ function MLJModelInterface.update( end mach, e = fitresult + tag = "" train_test_rows = e.train_test_rows + # since `resampler.model` could have changed, so might the actual measures and # operations that should be passed to the (low level) `evaluate!`: measures = _actual_measures(resampler.measure, resampler.model) @@ -1860,6 +1995,7 @@ function MLJModelInterface.update( # re-evaluate: e = evaluate!( mach2, + tag, train_test_rows, resampler.weights, resampler.class_weights, diff --git a/test/resampling.jl b/test/resampling.jl index d1928d39..d7161f1e 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -938,11 +938,13 @@ end fit!(mach) end +model = ConstantClassifier() +X = (; x = ones(10)) +y = coerce([1, 2, 2, 2, 1, 2, 1, 2, 1, 1], Multiclass) + @testset "compact evaluation objects" begin - model = ConstantClassifier() - X, y = make_blobs(10) - e = evaluate(model, X, y) - ec = evaluate(model, X, y, compact=true) + e = evaluate("tag" => model, X, y) + ec = evaluate("tag" => model, X, y, compact=true) @test MLJBase.compactify(ec) == ec == MLJBase.compactify(e) @test e isa PerformanceEvaluation @test ec isa CompactPerformanceEvaluation @@ -963,6 +965,162 @@ end end end +bogus(yhat, y) = [1,] +# The measures that get stored in a PerformanceEvaluation object are the user-specified +# measures wrapped in `robust_measure(...)`! 
+MLJBase._repr_(::API.RobustMeasure{<:typeof(bogus)}) = "bogus" + +@testset "more display tests" begin + # no extra table (only one train-test pair) + e = evaluate("tag" => model, X, y; resampling=Holdout(), + measures = [bogus, log_loss]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌──────────────────────┬───────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├──────────────────────┼───────────┼─────────────┤\n"* + "│ bogus │ predict │ [1.0] │\n"* + "│ LogLoss( │ predict │ 0.751 │\n"* + "│ tol = 2.22045e-16) │ │ │\n"* + "└──────────────────────┴───────────┴─────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.751)" + + # extra table - one non-numeric measure: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus,]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌─────────┬───────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├─────────┼───────────┼─────────────┤\n"* + "│ bogus │ predict │ [1.0] │\n"* + "└─────────┴───────────┴─────────────┘\n"* + "┌────────────────┐\n"* + "│ per_fold │\n"* + "├────────────────┤\n"* + "│ [[1.0], [1.0]] │\n"* + "└────────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0])" + + # extra table - one numeric measure: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [accuracy,]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌────────────┬──────────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├────────────┼──────────────┼─────────────┤\n"* + "│ Accuracy() │ predict_mode │ 0.4 │\n"* + "└────────────┴──────────────┴─────────────┘\n"* + "┌────────────┬─────────┐\n"* + "│ per_fold │ 1.96*SE │\n"* + "├────────────┼─────────┤\n"* + "│ [0.4, 0.4] │ 0.0 │\n"* + "└────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", 0.4 ± 0.0)" + + # extra table - two numeric measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [accuracy, log_loss]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬──────────────────────┬──────────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼──────────────────────┼──────────────┼─────────────┤\n"* + "│ A │ Accuracy() │ predict_mode │ 0.4 │\n"* + "│ B │ LogLoss( │ predict │ 0.754 │\n"* + "│ │ tol = 2.22045e-16) 
│ │ │\n"* + "└───┴──────────────────────┴──────────────┴─────────────┘\n"* + "┌───┬────────────────┬─────────┐\n"* + "│ │ per_fold │ 1.96*SE │\n"* + "├───┼────────────────┼─────────┤\n"* + "│ A │ [0.4, 0.4] │ 0.0 │\n"* + "│ B │ [0.754, 0.754] │ 0.0 │\n"* + "└───┴────────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", 0.4 ± 0.0, 0.754 ± 0.0)" + + # extra table - two non-numeric measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus, bogus]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬─────────┬───────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼─────────┼───────────┼─────────────┤\n"* + "│ A │ bogus │ predict │ [1.0] │\n"* + "│ B │ bogus │ predict │ [1.0] │\n"* + "└───┴─────────┴───────────┴─────────────┘\n"* + "┌───┬────────────────┐\n"* + "│ │ per_fold │\n"* + "├───┼────────────────┤\n"* + "│ A │ [[1.0], [1.0]] │\n"* + "│ B │ [[1.0], [1.0]] │\n"* + "└───┴────────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], [1.0])" + + # extra table - mixed type of measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus, macro_f1score]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬──────────────────────────────┬──────────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼──────────────────────────────┼──────────────┼─────────────┤\n"* + "│ A │ bogus │ predict │ [1.0] │\n"* + "│ B │ MulticlassFScore( │ predict_mode │ 0.286 │\n"* + "│ │ beta = 1.0, │ │ │\n"* + "│ │ average = MacroAvg(), │ │ │\n"* + "│ │ return_type = LittleDict, │ │ │\n"* + "│ │ levels = nothing, │ │ │\n"* + "│ │ perm = nothing, │ │ │\n"* + "│ │ rev = nothing, │ │ │\n"* + "│ │ checks = true) │ │ │\n"* + "└───┴──────────────────────────────┴──────────────┴─────────────┘\n"* + "┌───┬────────────────┬─────────┐\n"* + "│ │ per_fold │ 1.96*SE │\n"* + "├───┼────────────────┼─────────┤\n"* + "│ A │ [[1.0], [1.0]] │ │\n"* + "│ B │ [0.286, 0.286] │ 0.0 │\n"* + "└───┴────────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.286 ± 0.0)" +end + # # TRANSFORMER WITH PREDICT From dcc588993a69de2c350dd06561d0bbe9d8969489 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Tue, 2 Dec 2025 17:20:16 +1300 Subject: [PATCH 3/5] add multi-model support for `evaluate`/`evaluate!` --- src/composition/models/stacking.jl | 2 ++ src/resampling.jl | 46 ++++++++++++++++++++++++++---- test/resampling.jl | 23 +++++++++++++++ 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 70be5041..8a805c74 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -399,8 +399,10 @@ function internal_stack_report( results = NamedTuple{modelnames}( [( model = model, + tag = "", measure = stack.measures, measurement = Vector{Any}(undef, n_measures), + uncertainty_radius_95 = fill(nothing, n_measures), operation = _actual_operations(nothing, stack.measures, model, verbosity), per_fold = [Vector{Any}(undef, nfolds) for _ in 1:n_measures], per_observation = [Vector{Vector{Any}}(undef, nfolds) for _ in 1:n_measures], diff --git a/src/resampling.jl b/src/resampling.jl index eebcd75a..7e3c0bb5 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -586,8 +586,8 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. tuning model, this is the best model found. - `tag`: a string label associated with the evaluation, specified by the user when - replacing `model` in `evaluate(model, ...)` with `tag => model`, or `mach` in - `evaluate!(mach, ...)` with `tag => mach`. If unspecified, it is auto-generated, but + replacing `mach` in `evaluate!(mach, ...)` with `tag => mach` (or `model` in + `evaluate(model, ...)` with `tag => model`). If unspecified, it is auto-generated, but tag-uniqueness is not 100% guaranteed. - `measure`: vector of measures (metrics) used @@ -1208,12 +1208,12 @@ Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. - `compact=false` - if `true`, the returned evaluation object excludes these fields: `fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. -# Example +# Examples Setup: ```julia -using MLJ # or using MLJBase, StatisticalMeasures +using MLJ X, y = make_moons(rng=123) # a table and a vector model = ConstantClassifier() mach = machine(model, X, y) @@ -1245,6 +1245,18 @@ show(e) # PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) ``` +Evaluate multiple machines: + +```julia +@load KNNClassifier pkg=NearestNeighborModels +mach1 = machine(ConstantClassifier(), X, y) +mach2 = machine(KNNClassifier(), X , y) +evaluate!(["const" => mach1, "knn" => mach2]) +# 2-element Vector{...} +# PerformanceEvaluation("const", 0.698 ± 0.0062) +# PerformanceEvaluation("knn", 2.22e-16 ± 0.0) +``` + See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref), [`CompactPerformanceEvaluation`](@ref). @@ -1342,6 +1354,12 @@ function evaluate!( ) end +# multiple machine evaluations: +evaluate!( + machines_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Machine}}}; + kwargs..., +) = [evaluate!(x; kwargs...) for x in machines_or_pairs] + """ evaluate(model, data...; cache=true, options...) @@ -1353,12 +1371,12 @@ Returns a [`PerformanceEvaluation`](@ref) object. In place of `model`, one can use `tag_string => model`, or a vector of either of these forms, to return a vector of performance evaluation objects. 
-# Example +# Examples Setup: ```julia -using MLJ # or using MLJBase, StatisticalMeasures +using MLJ X, y = make_moons(rng=123) # a table and a vector model = ConstantClassifier() ``` @@ -1389,6 +1407,16 @@ show(e) # PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) ``` +Evaluate multiple models: + +```julia +@load KNNClassifier pkg=NearestNeighborModels +evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) +# 2-element Vector{...} +# PerformanceEvaluation("const", 0.698 ± 0.0062) +# PerformanceEvaluation("knn", 2.22e-16 ± 0.0) +``` + See also [`evaluate!`](@ref). """ @@ -1397,6 +1425,12 @@ evaluate(model::Model, args...; cache=true, kwargs...) = evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = evaluate!(first(pair) => machine(last(pair), args...; cache=cache); kwargs...) +# multiple model evaluations: +evaluate( + models_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Model}}}, args...; + kwargs..., +) = [evaluate(x, args...; kwargs...) for x in models_or_pairs] + # ------------------------------------------------------------------- # Resource-specific methods to distribute a function parameterized by # fold number `k` over processes/threads. diff --git a/test/resampling.jl b/test/resampling.jl index d7161f1e..8deda201 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -1121,6 +1121,29 @@ MLJBase._repr_(::API.RobustMeasure{<:typeof(bogus)}) = "bogus" @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.286 ± 0.0)" end +@testset "multiple performance evaluations" begin + # model form: + e1 = evaluate("const" => ConstantClassifier(), X, y) + e2 = evaluate("knn" => KNNClassifier(), X, y) + es = evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) + @test es[1].measurement == e1.measurement + + # machine form: + mach1 = machine(ConstantClassifier(), X, y) + mach2 = machine(KNNClassifier(), X, y) + e1 = evaluate!("const" => mach1) + e2 = evaluate!("knn" => mach2) + es = evaluate!(["const" => mach1, "knn" => mach2]) + @test es[1].measurement == e1.measurement + + # display: + @test contains( + sprint(show, es), + "[PerformanceEvaluation(\"const\", 0.774 ± 0.0998), "* + "PerformanceEvaluation(\"knn\", 0.795 ± 0.0973)]", + ) +end + # # TRANSFORMER WITH PREDICT From 0aa2f78a5eb9e1fcb296bb773fcde6e453722149 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 9 Dec 2025 09:03:07 +1300 Subject: [PATCH 4/5] fix mistake found in review and add test to catch --- src/resampling.jl | 2 +- test/resampling.jl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/resampling.jl b/src/resampling.jl index 7e3c0bb5..915f4b67 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -1427,7 +1427,7 @@ evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = # multiple model evaluations: evaluate( - models_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Model}}}, args...; + models_or_pairs::AbstractVector{<:Union{Model,Pair{String,<:Model}}}, args...; kwargs..., ) = [evaluate(x, args...; kwargs...)
for x in models_or_pairs] diff --git a/test/resampling.jl b/test/resampling.jl index 8deda201..20d6cc25 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -1127,6 +1127,8 @@ end e2 = evaluate("knn" => KNNClassifier(), X, y) es = evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) @test es[1].measurement == e1.measurement + es = evaluate([ConstantClassifier(), KNNClassifier()], X, y) + @test es[1].measurement == e1.measurement # machine form: mach1 = machine(ConstantClassifier(), X, y) @@ -1135,6 +1137,8 @@ end e2 = evaluate!("knn" => mach2) es = evaluate!(["const" => mach1, "knn" => mach2]) @test es[1].measurement == e1.measurement + es2 = evaluate!([mach1, mach2]) + @test es2[1].measurement == e1.measurement # display: @test contains( From aeb7d2990c6cab76dd6eebbe4dcdc277a64e0538 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 9 Dec 2025 11:06:37 +1300 Subject: [PATCH 5/5] fix typo to close #1025 --- src/resampling.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index 480b5895..74837d26 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -303,7 +303,7 @@ function train_test_pairs(cv::CV, rows) if n < 1 throw(ArgumentError( - """Inusufficient data for $n_folds-fold cross-validation. + """Insufficient data for $n_folds-fold cross-validation. Try reducing nfolds. """ )) end @@ -407,7 +407,7 @@ function train_test_pairs(tscv::TimeSeriesCV, rows) if m < 1 throw(ArgumentError( - "Inusufficient data for $n_folds-fold " * + "Insufficient data for $n_folds-fold " * "time-series cross-validation.\n" * "Try reducing nfolds. " ))