From 2a063381ff5c03203a09b1ee05eeef17bdfc2900 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 2 Dec 2025 08:45:52 +1300 Subject: [PATCH 1/5] add uncertainty_radius_95 to PerformanceEvaluation structs; --- src/resampling.jl | 51 ++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index 51071e34..ea4bcdc3 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -575,7 +575,8 @@ When displayed, a `PerformanceEvaluation` object includes a value under the head `1.96*SE`, derived from the standard error of the `per_fold` entries. This value is suitable for constructing a formal 95% confidence interval for the given `measurement`. Such intervals should be interpreted with caution. See, for example, [Bates -et al. (2021)](https://arxiv.org/abs/2104.00673). +et al. (2021)](https://arxiv.org/abs/2104.00673). It is also stored in the field +`uncertainty_radius_95`. ### Fields @@ -591,6 +592,9 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. applied for a given measure `m` is `StatisticalMeasuresBase.external_aggregation_mode(m)` (commonly `Mean()` or `Sum()`) +- `uncertainty_radius_95`: vector of radii of uncertainty for 95% confidence intervals, + one for each element of `measures`. See cautionary note above. + - `operation` (e.g., `predict_mode`): the operations applied for each measure to generate predictions to be evaluated. Possibilities are: $PREDICT_OPERATIONS_STRING. @@ -638,6 +642,7 @@ struct PerformanceEvaluation{M, model::M measure::Measure measurement::Measurement + uncertainty_radius_95::Union{Nothing,Measurement} operation::Operation per_fold::PerFold per_observation::PerObservation @@ -670,6 +675,7 @@ struct CompactPerformanceEvaluation{M, model::M measure::Measure measurement::Measurement + uncertainty_radius_95::Union{Nothing,Measurement} operation::Operation per_fold::PerFold per_observation::PerObservation @@ -682,6 +688,7 @@ compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( e.model, e.measure, e.measurement, + e.uncertainty_radius_95, e.operation, e.per_fold, e. per_observation, @@ -693,18 +700,6 @@ compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( round3(x) = x round3(x::AbstractFloat) = round(x, sigdigits=3) -const SE_FACTOR = 1.96 # For a 95% confidence interval. - -_standard_error(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) -_standard_error(v) = "N/A" - -function _standard_errors(e::AbstractPerformanceEvaluation) - measure = e.measure - length(e.per_fold[1]) == 1 && return [nothing] - std_errors = map(_standard_error, e.per_fold) - return std_errors -end - # to address #874, while preserving the display worked out in #757: _repr_(f::Function) = repr(f) _repr_(x) = repr("text/plain", x) @@ -722,14 +717,14 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) _measure = [_repr_(m) for m in e.measure] _measurement = round3.(e.measurement) _per_fold = [round3.(v) for v in e.per_fold] - _sterr = round3.(_standard_errors(e)) + _uncertainty_radius_95 = round3.(e.uncertainty_radius_95) row_labels = _label.(eachindex(e.measure)) # Define header and data for main table data = hcat(_measure, e.operation, _measurement) header = ["measure", "operation", "measurement"] - if length(row_labels) > 1 + if length(row_labels) > 1 && length(first(e.per_fold)) > 1 data = hcat(row_labels, data) header =["", header...]
end @@ -738,14 +733,14 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) println(io, "PerformanceEvaluation object "* "with these fields:") println(io, " model, measure, operation,\n"* - " measurement, per_fold, per_observation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " fitted_params_per_fold, report_per_fold,\n"* " train_test_rows, resampling, repeats") else println(io, "CompactPerformanceEvaluation object "* "with these fields:") println(io, " model, measure, operation,\n"* - " measurement, per_fold, per_observation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " train_test_rows, resampling, repeats") end @@ -764,8 +759,8 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) # Show the per-fold table if needed: if length(first(e.per_fold)) > 1 - show_sterr = any(!isnothing, _sterr) - data2 = hcat(_per_fold, _sterr) + show_sterr = any(!isnothing, _uncertainty_radius_95) + data2 = hcat(_per_fold, _uncertainty_radius_95) header2 = ["per_fold", "1.96*SE"] if length(row_labels) > 1 data2 = hcat(row_labels, data2) @@ -783,14 +778,19 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) show_color ? color_on() : color_off() end -_summary(e) = Tuple(round3.(e.measurement)) +function _summary(e) + confidence_intervals = map(zip(e.measurement, e.uncertainty_radius_95)) do (μ, δ) + "$(round3(μ))±$(round3(δ))" + end + return "(\"$(name(e.model))\", "*join(confidence_intervals, ", ")*")" +end + Base.show(io::IO, e::PerformanceEvaluation) = print(io, "PerformanceEvaluation$(_summary(e))") Base.show(io::IO, e::CompactPerformanceEvaluation) = print(io, "CompactPerformanceEvaluation$(_summary(e))") - # =============================================================== ## USER CONTROL OF DEFAULT LOGGING @@ -1426,6 +1426,11 @@ end _view(::Nothing, rows) = nothing _view(weights, rows) = view(weights, rows) +const SE_FACTOR = 1.96 # For a 95% confidence interval. + +radius_95(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) +radius_95(v) = "N/A" + # Evaluation when `resampling` is a TrainTestPairs (CORE EVALUATOR): function evaluate!( mach::Machine, @@ -1579,10 +1584,14 @@ function evaluate!( ) end + confidence_radii_95 = length(per_fold[1]) == 1 ? nothing : + map(radius_95, per_fold) + evaluation = PerformanceEvaluation( mach.model, measures, per_measure, + confidence_radii_95, operations, per_fold, per_observation, From e41417c948830b1ef493fca5c6cc891bb2b32b21 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 2 Dec 2025 15:52:48 +1300 Subject: [PATCH 2/5] add the `tag` field to PerformanceEvaluation objects --- src/resampling.jl | 192 ++++++++++++++++++++++++++++++++++++++------- test/resampling.jl | 166 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 326 insertions(+), 32 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index ea4bcdc3..eebcd75a 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -585,7 +585,13 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. - `model`: model used to create the performance evaluation. In the case a tuning model, this is the best model found. -- `measure`: vector of measures (metrics) used to evaluate performance +- `tag`: a string label associated with the evaluation, specified by the user when + replacing `model` in `evaluate(model, ...)` with `tag => model`, or `mach` in + `evaluate!(mach, ...)` with `tag => mach`. 
If unspecified, it is auto-generated, but + tag-uniqueness is not 100% guaranteed. + + - `measure`: vector of measures (metrics) used + to evaluate performance - `measurement`: vector of measurements - one for each element of `measure` - aggregating the performance measurements over all train/test pairs (folds). The aggregation method @@ -633,6 +639,7 @@ See also [`CompactPerformanceEvaluation`](@ref). struct PerformanceEvaluation{M, Measure, Measurement, + Uncertainty, Operation, PerFold, PerObservation, @@ -640,9 +647,10 @@ struct PerformanceEvaluation{M, ReportPerFold, R} <: AbstractPerformanceEvaluation model::M + tag::String measure::Measure measurement::Measurement - uncertainty_radius_95::Union{Nothing,Measurement} + uncertainty_radius_95::Uncertainty operation::Operation per_fold::PerFold per_observation::PerObservation @@ -666,16 +674,18 @@ For more on the remaining fields, see [`PerformanceEvaluation`](@ref). """ struct CompactPerformanceEvaluation{M, - Measure, - Measurement, - Operation, - PerFold, - PerObservation, - R} <: AbstractPerformanceEvaluation + Measure, + Measurement, + Uncertainty, + Operation, + PerFold, + PerObservation, + R} <: AbstractPerformanceEvaluation model::M + tag::String measure::Measure measurement::Measurement - uncertainty_radius_95::Union{Nothing,Measurement} + uncertainty_radius_95::Uncertainty operation::Operation per_fold::PerFold per_observation::PerObservation @@ -686,6 +696,7 @@ end compactify(e::CompactPerformanceEvaluation) = e compactify(e::PerformanceEvaluation) = CompactPerformanceEvaluation( e.model, + e.tag, e.measure, e.measurement, e.uncertainty_radius_95, @@ -703,6 +714,13 @@ round3(x::AbstractFloat) = round(x, sigdigits=3) # to address #874, while preserving the display worked out in #757: _repr_(f::Function) = repr(f) _repr_(x) = repr("text/plain", x) +_repr(::Nothing) = "" + +function uncertainty_as_string(δ) + isnothing(δ) && return "" + δ isa Real && isinf(δ) && return "" + return string(round3(δ)) +end # helper for row labels: _label(1) ="A", _label(2) = "B", _label(27) = "BA", etc const alphabet = Char.(65:90) @@ -716,8 +734,9 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) _measure = [_repr_(m) for m in e.measure] _measurement = round3.(e.measurement) - _per_fold = [round3.(v) for v in e.per_fold] - _uncertainty_radius_95 = round3.(e.uncertainty_radius_95) + _per_fold = reshape([round3.(v) for v in e.per_fold], length(e.per_fold), 1) + _uncertainty_radius_95 = uncertainty_as_string.(e.uncertainty_radius_95) + show_radius = any(x -> !isempty(x), _uncertainty_radius_95) row_labels = _label.(eachindex(e.measure)) # Define header and data for main table @@ -732,18 +751,18 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) if e isa PerformanceEvaluation println(io, "PerformanceEvaluation object "* "with these fields:") - println(io, " model, measure, operation,\n"* + println(io, " model, tag, measure, operation,\n"* " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " fitted_params_per_fold, report_per_fold,\n"* " train_test_rows, resampling, repeats") else println(io, "CompactPerformanceEvaluation object "* "with these fields:") - println(io, " model, measure, operation,\n"* + println(io, " model, tag, measure, operation,\n"* " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* " train_test_rows, resampling, repeats") end - + println(io, "Tag: $(e.tag)") println(io, "Extract:") show_color = MLJBase.SHOW_COLOR[] color_off() @@ 
-759,9 +778,12 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) # Show the per-fold table if needed: if length(first(e.per_fold)) > 1 - show_sterr = any(!isnothing, _uncertainty_radius_95) - data2 = hcat(_per_fold, _uncertainty_radius_95) - header2 = ["per_fold", "1.96*SE"] + data2 = _per_fold + header2 = ["per_fold", ] + if show_radius + data2 = hcat(_per_fold, _uncertainty_radius_95) + header2 = [header2..., "1.96*SE"] + end if length(row_labels) > 1 data2 = hcat(row_labels, data2) header2 =["", header2...] @@ -769,7 +791,7 @@ function Base.show(io::IO, ::MIME"text/plain", e::AbstractPerformanceEvaluation) PrettyTables.pretty_table( io, data2; - column_labels = header2, + column_labels = [header2,], alignment=:l, line_breaks=true, style, @@ -780,9 +802,11 @@ end function _summary(e) confidence_intervals = map(zip(e.measurement, e.uncertainty_radius_95)) do (μ, δ) - "$(round3(μ))±$(round3(δ))" + a = round3(μ) + b = uncertainty_as_string(δ) + isempty(b) ? a : "$a ± $b" end - return "(\"$(name(e.model))\", "*join(confidence_intervals, ", ")*")" + return "(\"$(e.tag)\", "*join(confidence_intervals, ", ")*")" end Base.show(io::IO, e::PerformanceEvaluation) = @@ -854,6 +878,12 @@ end # --------------------------------------------------------------- # Helpers +function machine_and_tag(mach::Machine) + tag = string(name(mach.model), "-", rand(100:999)) + return mach, tag +end +machine_and_tag(pair::Pair{String,<:Machine}) = last(pair), first(pair) + function actual_rows(rows, N, verbosity) unspecified_rows = (rows === nothing) _rows = unspecified_rows ? (1:N) : rows @@ -1111,6 +1141,9 @@ the specified `resampling` strategy (defaulting to 6-fold cross-validation) and which can be a single measure or vector. Returns a [`PerformanceEvaluation`](@ref) object. +In place of `mach`, one can use `tag_string => mach`, or a vector of either of these forms, +to return a vector of performance evaluation objects. + Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` is not an instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)` is expected. For example, setting @@ -1175,12 +1208,49 @@ Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. - `compact=false` - if `true`, the returned evaluation object excludes these fields: `fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. +# Example + +Setup: + +```julia +using MLJ # or using MLJBase, StatisticalMeasures +X, y = make_moons(rng=123) # a table and a vector +model = ConstantClassifier() +mach = machine(model, X, y) +``` + +Perform a simple evaluation on a holdout set, against accuracy and area under the ROC +curve: + +```julia +evaluate!(mach; resampling=Holdout(fraction_train=0.8), measure=[accuracy, auc]) +``` + +Perform Monte Carlo cross-validation, with 2 folds and 5 repeats, against the Brier score: + +```julia +evaluate!(mach; resampling=CV(nfolds=2, rng=123), repeats=5, measures=brier_score) +``` + +Evaluate on explicitly specified train-test pairs, against cross entropy, and tag the +result, "explicit folds": + +```julia +e = evaluate!( + "explicit folds" => mach; + resampling=[(1:140, 141:150), (11:150, 1:10)], + measure=log_loss, +) +show(e) +# PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) +``` + See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref), [`CompactPerformanceEvaluation`](@ref).
""" function evaluate!( - mach::Machine; + mach_or_pair::Union{Machine,Pair{String,<:Machine}}; resampling=CV(), measures=nothing, measure=measures, @@ -1203,6 +1273,8 @@ function evaluate!( # weights, measures, operations, and dispatches a # strategy-specific `evaluate!` + mach, tag = machine_and_tag(mach_or_pair) + length(mach.args) > 1 || throw(ERR_NEED_TARGET) repeats > 0 || error("Need `repeats > 0`. ") @@ -1252,6 +1324,7 @@ function evaluate!( evaluate!( mach, + tag, resampling, weights, class_weights, @@ -1277,11 +1350,52 @@ See the machine version `evaluate!` for the complete list of options. Returns a [`PerformanceEvaluation`](@ref) object. +In place of `model`, one can use `tag_string => model`, or a vector of either of these +forms, to return a vector of performance evaluation objects. + +# Example + +Setup: + +```julia +using MLJ # or using MLJBase, StatisticalMeasures +X, y = make_moons(rng=123) # a table and a vector +model = ConstantClassifier() +``` + +Perform a simple evaluation on a holdout set, against accuracy and area under the ROC +curve: + +```julia +evaluate(model, X, y; resampling=Holdout(fraction_train=0.8), measure=[accuracy, auc]) +``` + +Perform Monte Carlo cross-validation, with 2 folds and 5 repeats, against area Brier score: + +```julia +evaluate(model, X, y; resampling=CV(nfolds=2, rng=123), repeats=5, measures=brier_score) +``` + +Evaluate on explicitly specified train-test pairs, against cross entropy, and tag the +result, "explicit folds": + +```julia +e = evaluate( + "explicit folds" => model, X, y; + resampling=[(1:140, 141:150), (11:150, 1:10)], + measure=log_loss, +) +show(e) +# PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) +``` + See also [`evaluate!`](@ref). """ evaluate(model::Model, args...; cache=true, kwargs...) = evaluate!(machine(model, args...; cache=cache); kwargs...) +evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = + evaluate!(first(pair) => machine(last(pair), args...; cache=cache); kwargs...) # ------------------------------------------------------------------- # Resource-specific methods to distribute a function parameterized by @@ -1428,12 +1542,16 @@ _view(weights, rows) = view(weights, rows) const SE_FACTOR = 1.96 # For a 95% confidence interval. -radius_95(v::AbstractVector{<:Real}) = SE_FACTOR*std(v) / sqrt(length(v) - 1) -radius_95(v) = "N/A" +function radius_95(v::AbstractVector{<:Real}) + length(v) < 2 && return Inf + return SE_FACTOR*std(v) / sqrt(length(v) - 1) +end +radius_95(v) = nothing # Evaluation when `resampling` is a TrainTestPairs (CORE EVALUATOR): function evaluate!( mach::Machine, + tag, resampling, weights, class_weights, @@ -1584,14 +1702,17 @@ function evaluate!( ) end - confidence_radii_95 = length(per_fold[1]) == 1 ? nothing : - map(radius_95, per_fold) + # The following is a vector with `nothing` values for each measure not returning + # `Real` values (e.g., confmat), and `Inf` values for any other kind of measure when + # there is only one train-test fold. + uncertainty_radius_95 = map(radius_95, per_fold) evaluation = PerformanceEvaluation( mach.model, + tag, measures, per_measure, - confidence_radii_95, + uncertainty_radius_95, operations, per_fold, per_observation, @@ -1610,8 +1731,17 @@ end # ---------------------------------------------------------------- # Evaluation when `resampling` is a ResamplingStrategy -function evaluate!(mach::Machine, resampling::ResamplingStrategy, - weights, class_weights, rows, verbosity, repeats, args...) 
+function evaluate!( + mach::Machine, + tag, + resampling::ResamplingStrategy, + weights, + class_weights, + rows, + verbosity, + repeats, + args..., + ) train_args = Tuple(a() for a in mach.args) y = train_args[2] @@ -1625,6 +1755,7 @@ function evaluate!(mach::Machine, resampling::ResamplingStrategy, evaluate!( mach, + tag, repeated_train_test_pairs, weights, class_weights, @@ -1758,6 +1889,7 @@ end function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) mach = machine(resampler.model, args...; cache=resampler.cache) + tag = "" _measures = _actual_measures(resampler.measure, resampler.model) @@ -1786,6 +1918,7 @@ function MLJModelInterface.fit(resampler::Resampler, verbosity::Int, args...) # `PerformanceEvaluation` object.) e = evaluate!( mach, + tag, resampler.resampling, resampler.weights, resampler.class_weights, @@ -1842,8 +1975,10 @@ function MLJModelInterface.update( end mach, e = fitresult + tag = "" train_test_rows = e.train_test_rows + # since `resampler.model` could have changed, so might the actual measures and # operations that should be passed to the (low level) `evaluate!`: measures = _actual_measures(resampler.measure, resampler.model) @@ -1860,6 +1995,7 @@ function MLJModelInterface.update( # re-evaluate: e = evaluate!( mach2, + tag, train_test_rows, resampler.weights, resampler.class_weights, diff --git a/test/resampling.jl b/test/resampling.jl index d1928d39..d7161f1e 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -938,11 +938,13 @@ end fit!(mach) end +model = ConstantClassifier() +X = (; x = ones(10)) +y = coerce([1, 2, 2, 2, 1, 2, 1, 2, 1, 1], Multiclass) + @testset "compact evaluation objects" begin - model = ConstantClassifier() - X, y = make_blobs(10) - e = evaluate(model, X, y) - ec = evaluate(model, X, y, compact=true) + e = evaluate("tag" => model, X, y) + ec = evaluate("tag" => model, X, y, compact=true) @test MLJBase.compactify(ec) == ec == MLJBase.compactify(e) @test e isa PerformanceEvaluation @test ec isa CompactPerformanceEvaluation @@ -963,6 +965,162 @@ end end end +bogus(yhat, y) = [1,] +# The measures that get stored in a PerformanceEvaluation object are the user-specified +# measures wrapped in `robust_measure(...)`! 
+MLJBase._repr_(::API.RobustMeasure{<:typeof(bogus)}) = "bogus" + +@testset "more display tests" begin + # no extra table (only one train-test pair) + e = evaluate("tag" => model, X, y; resampling=Holdout(), + measures = [bogus, log_loss]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌──────────────────────┬───────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├──────────────────────┼───────────┼─────────────┤\n"* + "│ bogus │ predict │ [1.0] │\n"* + "│ LogLoss( │ predict │ 0.751 │\n"* + "│ tol = 2.22045e-16) │ │ │\n"* + "└──────────────────────┴───────────┴─────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.751)" + + # extra table - one non-numeric measure: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus,]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌─────────┬───────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├─────────┼───────────┼─────────────┤\n"* + "│ bogus │ predict │ [1.0] │\n"* + "└─────────┴───────────┴─────────────┘\n"* + "┌────────────────┐\n"* + "│ per_fold │\n"* + "├────────────────┤\n"* + "│ [[1.0], [1.0]] │\n"* + "└────────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0])" + + # extra table - one numeric measure: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [accuracy,]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌────────────┬──────────────┬─────────────┐\n"* + "│ measure │ operation │ measurement │\n"* + "├────────────┼──────────────┼─────────────┤\n"* + "│ Accuracy() │ predict_mode │ 0.4 │\n"* + "└────────────┴──────────────┴─────────────┘\n"* + "┌────────────┬─────────┐\n"* + "│ per_fold │ 1.96*SE │\n"* + "├────────────┼─────────┤\n"* + "│ [0.4, 0.4] │ 0.0 │\n"* + "└────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", 0.4 ± 0.0)" + + # extra table - two numeric measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [accuracy, log_loss]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬──────────────────────┬──────────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼──────────────────────┼──────────────┼─────────────┤\n"* + "│ A │ Accuracy() │ predict_mode │ 0.4 │\n"* + "│ B │ LogLoss( │ predict │ 0.754 │\n"* + "│ │ tol = 2.22045e-16) 
│ │ │\n"* + "└───┴──────────────────────┴──────────────┴─────────────┘\n"* + "┌───┬────────────────┬─────────┐\n"* + "│ │ per_fold │ 1.96*SE │\n"* + "├───┼────────────────┼─────────┤\n"* + "│ A │ [0.4, 0.4] │ 0.0 │\n"* + "│ B │ [0.754, 0.754] │ 0.0 │\n"* + "└───┴────────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", 0.4 ± 0.0, 0.754 ± 0.0)" + + # extra table - two non-numeric measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus, bogus]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬─────────┬───────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼─────────┼───────────┼─────────────┤\n"* + "│ A │ bogus │ predict │ [1.0] │\n"* + "│ B │ bogus │ predict │ [1.0] │\n"* + "└───┴─────────┴───────────┴─────────────┘\n"* + "┌───┬────────────────┐\n"* + "│ │ per_fold │\n"* + "├───┼────────────────┤\n"* + "│ A │ [[1.0], [1.0]] │\n"* + "│ B │ [[1.0], [1.0]] │\n"* + "└───┴────────────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], [1.0])" + + # extra table - mixed type of measures: + e = evaluate("tag" => model, X, y; resampling=CV(nfolds=2), + measures = [bogus, macro_f1score]) + @test sprint(show, MIME("text/plain"), e) == + "PerformanceEvaluation object with these fields:\n"* + " model, tag, measure, operation,\n"* + " measurement, uncertainty_radius_95, per_fold, per_observation,\n"* + " fitted_params_per_fold, report_per_fold,\n"* + " train_test_rows, resampling, repeats\n"* + "Tag: tag\n"* + "Extract:\n"* + "┌───┬──────────────────────────────┬──────────────┬─────────────┐\n"* + "│ │ measure │ operation │ measurement │\n"* + "├───┼──────────────────────────────┼──────────────┼─────────────┤\n"* + "│ A │ bogus │ predict │ [1.0] │\n"* + "│ B │ MulticlassFScore( │ predict_mode │ 0.286 │\n"* + "│ │ beta = 1.0, │ │ │\n"* + "│ │ average = MacroAvg(), │ │ │\n"* + "│ │ return_type = LittleDict, │ │ │\n"* + "│ │ levels = nothing, │ │ │\n"* + "│ │ perm = nothing, │ │ │\n"* + "│ │ rev = nothing, │ │ │\n"* + "│ │ checks = true) │ │ │\n"* + "└───┴──────────────────────────────┴──────────────┴─────────────┘\n"* + "┌───┬────────────────┬─────────┐\n"* + "│ │ per_fold │ 1.96*SE │\n"* + "├───┼────────────────┼─────────┤\n"* + "│ A │ [[1.0], [1.0]] │ │\n"* + "│ B │ [0.286, 0.286] │ 0.0 │\n"* + "└───┴────────────────┴─────────┘\n" + @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.286 ± 0.0)" +end + # # TRANSFORMER WITH PREDICT From dcc588993a69de2c350dd06561d0bbe9d8969489 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Tue, 2 Dec 2025 17:20:16 +1300 Subject: [PATCH 3/5] add multi-model support for `evaluate`/`evaluate!` --- src/composition/models/stacking.jl | 2 ++ src/resampling.jl | 46 ++++++++++++++++++++++++++---- test/resampling.jl | 23 +++++++++++++++ 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 70be5041..8a805c74 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -399,8 +399,10 @@ function internal_stack_report( results = NamedTuple{modelnames}( [( model = model, + tag = "", measure = stack.measures, measurement = Vector{Any}(undef, n_measures), + uncertainty_radius_95 = fill(nothing, n_measures), operation = _actual_operations(nothing, stack.measures, model, verbosity), per_fold = [Vector{Any}(undef, nfolds) for _ in 1:n_measures], per_observation = [Vector{Vector{Any}}(undef, nfolds) for _ in 1:n_measures], diff --git a/src/resampling.jl b/src/resampling.jl index eebcd75a..7e3c0bb5 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -586,8 +586,8 @@ These fields are part of the public API of the `PerformanceEvaluation` struct. tuning model, this is the best model found. - `tag`: a string label associated with the evaluation, specified by the user when - replacing `model` in `evaluate(model, ...)` with `tag => model`, or `mach` in - `evaluate!(mach, ...)` with `tag => mach`. If unspecified, it is auto-generated, but + replacing `mach` in `evaluate!(mach, ...)` with `tag => mach` (or `model` in + `evaluate(model, ...)` with `tag => model`). If unspecified, it is auto-generated, but tag-uniqueness is not 100% guaranteed. - `measure`: vector of measures (metrics) used @@ -1208,12 +1208,12 @@ Although `evaluate!` is mutating, `mach.model` and `mach.args` are not mutated. - `compact=false` - if `true`, the returned evaluation object excludes these fields: `fitted_params_per_fold`, `report_per_fold`, `train_test_rows`. -# Example +# Examples Setup: ```julia -using MLJ # or using MLJBase, StatisticalMeasures +using MLJ X, y = make_moons(rng=123) # a table and a vector model = ConstantClassifier() mach = machine(model, X, y) @@ -1245,6 +1245,18 @@ show(e) # PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) ``` +Evaluate multiple machines: + +```julia +@load KNNClassifier pkg=NearestNeighborModels +mach1 = machine(ConstantClassifier(), X, y) +mach2 = machine(KNNClassifier(), X , y) +evaluate!(["const" => mach1, "knn" => mach2]) +# 2-element Vector{...} +# PerformanceEvaluation("const", 0.698 ± 0.0062) +# PerformanceEvaluation("knn", 2.22e-16 ± 0.0) +``` + See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref), [`CompactPerformanceEvaluation`](@ref). @@ -1342,6 +1354,12 @@ function evaluate!( ) end +# multiple machine evaluations: +evaluate!( + machines_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Machine}}}; + kwargs..., +) = [evaluate!(x; kwargs...) for x in machines_or_pairs] + """ evaluate(model, data...; cache=true, options...) @@ -1353,12 +1371,12 @@ Returns a [`PerformanceEvaluation`](@ref) object. In place of `model`, one can use `tag_string => model`, or a vector of either of these forms, to return a vector of performance evaluation objects. 
-# Example +# Examples Setup: ```julia -using MLJ # or using MLJBase, StatisticalMeasures +using MLJ X, y = make_moons(rng=123) # a table and a vector model = ConstantClassifier() ``` @@ -1389,6 +1407,16 @@ show(e) # PerformanceEvaluation("explicit folds", 0.708 ± 0.0328) ``` +Evaluate multiple models: + +```julia +@load KNNClassifier pkg=NearestNeighborModels +evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) +# 2-element Vector{...} +# PerformanceEvaluation("const", 0.698 ± 0.0062) +# PerformanceEvaluation("knn", 2.22e-16 ± 0.0) +``` + See also [`evaluate!`](@ref). """ @@ -1397,6 +1425,12 @@ evaluate(model::Model, args...; cache=true, kwargs...) = evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = evaluate!(first(pair) => machine(last(pair), args...; cache=cache); kwargs...) +# multiple model evaluations: +evaluate( + models_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Model}}}, args...; + kwargs..., +) = [evaluate(x, args...; kwargs...) for x in models_or_pairs] + # ------------------------------------------------------------------- # Resource-specific methods to distribute a function parameterized by # fold number `k` over processes/threads. diff --git a/test/resampling.jl b/test/resampling.jl index d7161f1e..8deda201 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -1121,6 +1121,29 @@ MLJBase._repr_(::API.RobustMeasure{<:typeof(bogus)}) = "bogus" @test sprint(show, e) == "PerformanceEvaluation(\"tag\", [1.0], 0.286 ± 0.0)" end +@testset "multiple performance evaluations" begin + # model form: + e1 = evaluate("const" => ConstantClassifier(), X, y) + e2 = evaluate("knn" => KNNClassifier(), X, y) + es = evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) + @test es[1].measurement == e1.measurement + + # machine form: + mach1 = machine(ConstantClassifier(), X, y) + mach2 = machine(KNNClassifier(), X, y) + e1 = evaluate!("const" => mach1) + e2 = evaluate!("knn" => mach2) + es = evaluate!(["const" => mach1, "knn" => mach2]) + @test es[1].measurement == e1.measurement + + # display: + @test contains( + sprint(show, es), + "[PerformanceEvaluation(\"const\", 0.774 ± 0.0998), "* + "PerformanceEvaluation(\"knn\", 0.795 ± 0.0973)]", + ) +end + # # TRANSFORMER WITH PREDICT From 0aa2f78a5eb9e1fcb296bb773fcde6e453722149 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 9 Dec 2025 09:03:07 +1300 Subject: [PATCH 4/5] fix mistake found in review and add test to catch --- src/resampling.jl | 2 +- test/resampling.jl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/resampling.jl b/src/resampling.jl index 7e3c0bb5..915f4b67 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -1427,7 +1427,7 @@ evaluate(pair::Pair{String,<:Model}, args...; cache=true, kwargs...) = # multiple model evaluations: evaluate( - models_or_pairs::AbstractVector{<:Union{Machine,Pair{String,<:Model}}}, args...; + models_or_pairs::AbstractVector{<:Union{Model,Pair{String,<:Model}}}, args...; kwargs..., ) = [evaluate(x, args...; kwargs...)
for x in models_or_pairs] diff --git a/test/resampling.jl b/test/resampling.jl index 8deda201..20d6cc25 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -1127,6 +1127,8 @@ end e2 = evaluate("knn" => KNNClassifier(), X, y) es = evaluate(["const" => ConstantClassifier(), "knn" => KNNClassifier()], X, y) @test es[1].measurement == e1.measurement + es = evaluate([ConstantClassifier(), KNNClassifier()], X, y) + @test es[1].measurement == e1.measurement # machine form: mach1 = machine(ConstantClassifier(), X, y) @@ -1135,6 +1137,8 @@ end e2 = evaluate!("knn" => mach2) es = evaluate!(["const" => mach1, "knn" => mach2]) @test es[1].measurement == e1.measurement + es2 = evaluate!([mach1, mach2]) + @test es2[1].measurement == e1.measurement # display: @test contains( From aeb7d2990c6cab76dd6eebbe4dcdc277a64e0538 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 9 Dec 2025 11:06:37 +1300 Subject: [PATCH 5/5] fix typo to close #1025 --- src/resampling.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index 480b5895..74837d26 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -303,7 +303,7 @@ function train_test_pairs(cv::CV, rows) if n < 1 throw(ArgumentError( - """Inusufficient data for $n_folds-fold cross-validation. + """Insufficient data for $n_folds-fold cross-validation. Try reducing nfolds. """ )) end @@ -407,7 +407,7 @@ function train_test_pairs(tscv::TimeSeriesCV, rows) if m < 1 throw(ArgumentError( - "Inusufficient data for $n_folds-fold " * + "Insufficient data for $n_folds-fold " * "time-series cross-validation.\n" * "Try reducing nfolds. " ))