From 82ba27c2bd01118a56c456554179ad668470bc3e Mon Sep 17 00:00:00 2001 From: "Anthony Blaom, PhD" Date: Wed, 6 Jul 2022 19:37:53 +1200 Subject: [PATCH 1/2] Implement StableRNGs throughout tests to fix reproducibility problems in CI (#26) * add StableRNGs * bump compat DecisionTree = "0.11" * fix an invalid test * add rng as hyper-parameter for AdaBoostStumpClassifier * update docstring * add reproducibility test for AdaBoostStumpClassifier * srng -> stable_rng to improve readability of code --- Project.toml | 5 +-- src/MLJDecisionTreeInterface.jl | 6 ++-- test/runtests.jl | 56 ++++++++++++++++++++++----------- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/Project.toml b/Project.toml index b7786d8..552c4e6 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -DecisionTree = "0.10" +DecisionTree = "0.11" MLJModelInterface = "1.4" Tables = "1.6" julia = "1.6" @@ -18,7 +18,8 @@ julia = "1.6" [extras] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["CategoricalArrays", "MLJBase", "Test"] +test = ["CategoricalArrays", "MLJBase", "StableRNGs", "Test"] diff --git a/src/MLJDecisionTreeInterface.jl b/src/MLJDecisionTreeInterface.jl index 6ac287d..bd22487 100644 --- a/src/MLJDecisionTreeInterface.jl +++ b/src/MLJDecisionTreeInterface.jl @@ -156,6 +156,7 @@ end MMI.@mlj_model mutable struct AdaBoostStumpClassifier <: MMI.Probabilistic n_iter::Int = 10::(_ ≥ 1) + rng::Union{AbstractRNG,Integer} = GLOBAL_RNG end function MMI.fit(m::AdaBoostStumpClassifier, verbosity::Int, X, y) @@ -165,8 +166,8 @@ function MMI.fit(m::AdaBoostStumpClassifier, verbosity::Int, X, y) classes_seen = filter(in(unique(y)), MMI.classes(y[1])) integers_seen = MMI.int(classes_seen) - stumps, coefs = DT.build_adaboost_stumps(yplain, Xmatrix, - m.n_iter) + stumps, coefs = + DT.build_adaboost_stumps(yplain, Xmatrix, m.n_iter, rng=m.rng) cache = nothing report = NamedTuple() return (stumps, coefs, classes_seen, integers_seen), cache, report @@ -586,6 +587,7 @@ Train the machine with `fit!(mach, rows=...)`. - `n_iter=10`: number of iterations of AdaBoost +- `rng=Random.GLOBAL_RNG`: random number generator or seed # Operations diff --git a/test/runtests.jl b/test/runtests.jl index 1ccdc25..89d207d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,9 +2,12 @@ using Test import CategoricalArrays import CategoricalArrays.categorical using MLJBase +using StableRNGs using Random Random.seed!(1234) +stable_rng() = StableRNGs.StableRNG(123) + # load code to be tested: import DecisionTree using MLJDecisionTreeInterface @@ -12,7 +15,7 @@ using MLJDecisionTreeInterface # get some test data: X, y = @load_iris -baretree = DecisionTreeClassifier() +baretree = DecisionTreeClassifier(rng=stable_rng()) baretree.max_depth = 1 fitresult, cache, report = MLJBase.fit(baretree, 2, X, y); @@ -50,13 +53,17 @@ using Random: seed! seed!(0) n,m = 10^3, 5; -raw_features = rand(n,m); -weights = rand(-1:1,m); +raw_features = rand(stable_rng(), n,m); +weights = rand(stable_rng(), -1:1,m); labels = raw_features * weights; features = MLJBase.table(raw_features); -R1Tree = DecisionTreeRegressor(min_samples_leaf=5, merge_purity_threshold=0.1) -R2Tree = DecisionTreeRegressor(min_samples_split=5) +R1Tree = DecisionTreeRegressor( + min_samples_leaf=5, + merge_purity_threshold=0.1, + rng=stable_rng(), +) +R2Tree = DecisionTreeRegressor(min_samples_split=5, rng=stable_rng()) model1, = MLJBase.fit(R1Tree,1, features, labels) vals1 = MLJBase.predict(R1Tree,model1,features) @@ -75,11 +82,15 @@ vals2 = MLJBase.predict(R2Tree, model2, features) ## TEST ON ORDINAL FEATURES OTHER THAN CONTINUOUS N = 20 -X = (x1=rand(N), x2=categorical(rand("abc", N), ordered=true), x3=collect(1:N)) +X = ( + x1=rand(stable_rng(),N), + x2=categorical(rand(stable_rng(), "abc", N), ordered=true), + x3=collect(1:N), +) yfinite = X.x2 ycont = float.(X.x3) -rgs = DecisionTreeRegressor() +rgs = DecisionTreeRegressor(rng=stable_rng()) fitresult, _, _ = MLJBase.fit(rgs, 1, X, ycont) @test rms(predict(rgs, fitresult, X), ycont) < 1.5 @@ -90,10 +101,10 @@ fitresult, _, _ = MLJBase.fit(clf, 1, X, yfinite) # -- Ensemble -rfc = RandomForestClassifier() -abs = AdaBoostStumpClassifier() +rfc = RandomForestClassifier(rng=stable_rng()) +abs = AdaBoostStumpClassifier(rng=stable_rng()) -X, y = MLJBase.make_blobs(100, 3; rng=555) +X, y = MLJBase.make_blobs(100, 3; rng=stable_rng()) m = machine(rfc, X, y) fit!(m) @@ -103,19 +114,21 @@ m = machine(abs, X, y) fit!(m) @test accuracy(predict_mode(m, X), y) > 0.95 -X, y = MLJBase.make_regression(rng=5124) -rfr = RandomForestRegressor() +X, y = MLJBase.make_regression(rng=stable_rng()) +rfr = RandomForestRegressor(rng=stable_rng()) m = machine(rfr, X, y) fit!(m) @test rms(predict(m, X), y) < 0.4 N = 10 function reproducibility(model, X, y, loss) - model.rng = 123 - model.n_subfeatures = 1 + if !(model isa AdaBoostStumpClassifier) + model.n_subfeatures = 1 + end mach = machine(model, X, y) train, test = partition(eachindex(y), 0.7) errs = map(1:N) do i + model.rng = stable_rng() fit!(mach, rows=train, force=true, verbosity=0) yhat = predict(mach, rows=test) loss(yhat, y[test]) |> mean @@ -124,14 +137,21 @@ function reproducibility(model, X, y, loss) end @testset "reporoducibility" begin - X, y = make_blobs(); + X, y = make_blobs(rng=stable_rng()); loss = BrierLoss() - for model in [DecisionTreeClassifier(), RandomForestClassifier()] + for model in [ + DecisionTreeClassifier(), + RandomForestClassifier(), + AdaBoostStumpClassifier(), + ] @test reproducibility(model, X, y, loss) end - X, y = make_regression(); + X, y = make_regression(rng=stable_rng()); loss = LPLoss(p=2) - for model in [DecisionTreeRegressor(), RandomForestRegressor()] + for model in [ + DecisionTreeRegressor(), + RandomForestRegressor(), + ] @test reproducibility(model, X, y, loss) end end From d94341aa65080a01de86cdb12e5e4a243803b81a Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 19:38:29 +1200 Subject: [PATCH 2/2] bump 0.2.3 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 552c4e6..3a3f67c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJDecisionTreeInterface" uuid = "c6f25543-311c-4c74-83dc-3ea6d1015661" authors = ["Anthony D. Blaom "] -version = "0.2.2" +version = "0.2.3" [deps] DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"