From b5be6376e5f67a08ca17815212015146322a1afe Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 12:23:23 +1200 Subject: [PATCH 1/7] add StableRNGs --- Project.toml | 3 ++- test/runtests.jl | 51 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index b7786d8..dc80eaf 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,8 @@ julia = "1.6" [extras] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["CategoricalArrays", "MLJBase", "Test"] +test = ["CategoricalArrays", "MLJBase", "StableRNGs", "Test"] diff --git a/test/runtests.jl b/test/runtests.jl index 1ccdc25..d7c5d3a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,9 +2,12 @@ using Test import CategoricalArrays import CategoricalArrays.categorical using MLJBase +using StableRNGs using Random Random.seed!(1234) +srng() = StableRNGs.StableRNG(123) + # load code to be tested: import DecisionTree using MLJDecisionTreeInterface @@ -12,7 +15,7 @@ using MLJDecisionTreeInterface # get some test data: X, y = @load_iris -baretree = DecisionTreeClassifier() +baretree = DecisionTreeClassifier(rng=srng()) baretree.max_depth = 1 fitresult, cache, report = MLJBase.fit(baretree, 2, X, y); @@ -50,13 +53,17 @@ using Random: seed! seed!(0) n,m = 10^3, 5; -raw_features = rand(n,m); -weights = rand(-1:1,m); +raw_features = rand(srng(), n,m); +weights = rand(srng(), -1:1,m); labels = raw_features * weights; features = MLJBase.table(raw_features); -R1Tree = DecisionTreeRegressor(min_samples_leaf=5, merge_purity_threshold=0.1) -R2Tree = DecisionTreeRegressor(min_samples_split=5) +R1Tree = DecisionTreeRegressor( + min_samples_leaf=5, + merge_purity_threshold=0.1, + rng=srng(), +) +R2Tree = DecisionTreeRegressor(min_samples_split=5, rng=srng()) model1, = MLJBase.fit(R1Tree,1, features, labels) vals1 = MLJBase.predict(R1Tree,model1,features) @@ -75,11 +82,15 @@ vals2 = MLJBase.predict(R2Tree, model2, features) ## TEST ON ORDINAL FEATURES OTHER THAN CONTINUOUS N = 20 -X = (x1=rand(N), x2=categorical(rand("abc", N), ordered=true), x3=collect(1:N)) +X = ( + x1=rand(srng(),N), + x2=categorical(rand(srng(), "abc", N), ordered=true), + x3=collect(1:N), +) yfinite = X.x2 ycont = float.(X.x3) -rgs = DecisionTreeRegressor() +rgs = DecisionTreeRegressor(rng=srng()) fitresult, _, _ = MLJBase.fit(rgs, 1, X, ycont) @test rms(predict(rgs, fitresult, X), ycont) < 1.5 @@ -90,10 +101,10 @@ fitresult, _, _ = MLJBase.fit(clf, 1, X, yfinite) # -- Ensemble -rfc = RandomForestClassifier() -abs = AdaBoostStumpClassifier() +rfc = RandomForestClassifier(rng=srng()) +abs = AdaBoostStumpClassifier(rng=srng()) -X, y = MLJBase.make_blobs(100, 3; rng=555) +X, y = MLJBase.make_blobs(100, 3; rng=srng()) m = machine(rfc, X, y) fit!(m) @@ -103,15 +114,15 @@ m = machine(abs, X, y) fit!(m) @test accuracy(predict_mode(m, X), y) > 0.95 -X, y = MLJBase.make_regression(rng=5124) -rfr = RandomForestRegressor() +X, y = MLJBase.make_regression(rng=srng()) +rfr = RandomForestRegressor(rng=srng()) m = machine(rfr, X, y) fit!(m) @test rms(predict(m, X), y) < 0.4 N = 10 function reproducibility(model, X, y, loss) - model.rng = 123 + model.rng = srng() model.n_subfeatures = 1 mach = machine(model, X, y) train, test = partition(eachindex(y), 0.7) @@ -124,14 +135,20 @@ function reproducibility(model, X, y, loss) end @testset "reporoducibility" begin - X, y = make_blobs(); + X, y = make_blobs(rng=srng()); loss = BrierLoss() - for model in [DecisionTreeClassifier(), RandomForestClassifier()] + for model in [ + DecisionTreeClassifier(rng=srng()), + RandomForestClassifier(rng=srng()), + ] @test reproducibility(model, X, y, loss) end - X, y = make_regression(); + X, y = make_regression(rng=srng()); loss = LPLoss(p=2) - for model in [DecisionTreeRegressor(), RandomForestRegressor()] + for model in [ + DecisionTreeRegressor(rng=srng()), + RandomForestRegressor(rng=srng()), + ] @test reproducibility(model, X, y, loss) end end From 9652153ad41b6c46c0099871c3bfc8c2272b8c39 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 12:24:21 +1200 Subject: [PATCH 2/7] bump compat DecisionTree = "0.11" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index dc80eaf..552c4e6 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -DecisionTree = "0.10" +DecisionTree = "0.11" MLJModelInterface = "1.4" Tables = "1.6" julia = "1.6" From 2fd027afbcfd7918a9bd25bf0b1681cbc1fb45e6 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 12:40:23 +1200 Subject: [PATCH 3/7] fix an invalid test --- test/runtests.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index d7c5d3a..40a6c0b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -122,11 +122,11 @@ fit!(m) N = 10 function reproducibility(model, X, y, loss) - model.rng = srng() model.n_subfeatures = 1 mach = machine(model, X, y) train, test = partition(eachindex(y), 0.7) errs = map(1:N) do i + model.rng = srng() fit!(mach, rows=train, force=true, verbosity=0) yhat = predict(mach, rows=test) loss(yhat, y[test]) |> mean @@ -138,16 +138,16 @@ end X, y = make_blobs(rng=srng()); loss = BrierLoss() for model in [ - DecisionTreeClassifier(rng=srng()), - RandomForestClassifier(rng=srng()), + DecisionTreeClassifier(), + RandomForestClassifier(), ] @test reproducibility(model, X, y, loss) end X, y = make_regression(rng=srng()); loss = LPLoss(p=2) for model in [ - DecisionTreeRegressor(rng=srng()), - RandomForestRegressor(rng=srng()), + DecisionTreeRegressor(), + RandomForestRegressor(), ] @test reproducibility(model, X, y, loss) end From bcd52de59960132f53e383634076dd5ac28095b8 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 12:40:32 +1200 Subject: [PATCH 4/7] add rng as hyper-parameter for AdaBoostStumpClassifier --- src/MLJDecisionTreeInterface.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/MLJDecisionTreeInterface.jl b/src/MLJDecisionTreeInterface.jl index 6ac287d..f51919a 100644 --- a/src/MLJDecisionTreeInterface.jl +++ b/src/MLJDecisionTreeInterface.jl @@ -156,6 +156,7 @@ end MMI.@mlj_model mutable struct AdaBoostStumpClassifier <: MMI.Probabilistic n_iter::Int = 10::(_ ≥ 1) + rng::Union{AbstractRNG,Integer} = GLOBAL_RNG end function MMI.fit(m::AdaBoostStumpClassifier, verbosity::Int, X, y) @@ -165,8 +166,8 @@ function MMI.fit(m::AdaBoostStumpClassifier, verbosity::Int, X, y) classes_seen = filter(in(unique(y)), MMI.classes(y[1])) integers_seen = MMI.int(classes_seen) - stumps, coefs = DT.build_adaboost_stumps(yplain, Xmatrix, - m.n_iter) + stumps, coefs = + DT.build_adaboost_stumps(yplain, Xmatrix, m.n_iter, rng=m.rng) cache = nothing report = NamedTuple() return (stumps, coefs, classes_seen, integers_seen), cache, report From 64fdd5c274fcc4b0ae017f232c6fe2c2f6e8ca36 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 12:42:06 +1200 Subject: [PATCH 5/7] update docstring --- src/MLJDecisionTreeInterface.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/MLJDecisionTreeInterface.jl b/src/MLJDecisionTreeInterface.jl index f51919a..bd22487 100644 --- a/src/MLJDecisionTreeInterface.jl +++ b/src/MLJDecisionTreeInterface.jl @@ -587,6 +587,7 @@ Train the machine with `fit!(mach, rows=...)`. - `n_iter=10`: number of iterations of AdaBoost +- `rng=Random.GLOBAL_RNG`: random number generator or seed # Operations From 5a498615d950c42db93d4f9c9c8713299b589e36 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 13:14:47 +1200 Subject: [PATCH 6/7] add reproducibility test for AdaBoostStumpClassifier --- test/runtests.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 40a6c0b..04da493 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -122,7 +122,9 @@ fit!(m) N = 10 function reproducibility(model, X, y, loss) - model.n_subfeatures = 1 + if !(model isa AdaBoostStumpClassifier) + model.n_subfeatures = 1 + end mach = machine(model, X, y) train, test = partition(eachindex(y), 0.7) errs = map(1:N) do i @@ -140,6 +142,7 @@ end for model in [ DecisionTreeClassifier(), RandomForestClassifier(), + AdaBoostStumpClassifier(), ] @test reproducibility(model, X, y, loss) end From 997e90c546d6f65379d86e38e33cd6255f6b52ce Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 6 Jul 2022 13:15:39 +1200 Subject: [PATCH 7/7] srng -> stable_rng to improve readability of code --- test/runtests.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 04da493..89d207d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,7 +6,7 @@ using StableRNGs using Random Random.seed!(1234) -srng() = StableRNGs.StableRNG(123) +stable_rng() = StableRNGs.StableRNG(123) # load code to be tested: import DecisionTree @@ -15,7 +15,7 @@ using MLJDecisionTreeInterface # get some test data: X, y = @load_iris -baretree = DecisionTreeClassifier(rng=srng()) +baretree = DecisionTreeClassifier(rng=stable_rng()) baretree.max_depth = 1 fitresult, cache, report = MLJBase.fit(baretree, 2, X, y); @@ -53,17 +53,17 @@ using Random: seed! seed!(0) n,m = 10^3, 5; -raw_features = rand(srng(), n,m); -weights = rand(srng(), -1:1,m); +raw_features = rand(stable_rng(), n,m); +weights = rand(stable_rng(), -1:1,m); labels = raw_features * weights; features = MLJBase.table(raw_features); R1Tree = DecisionTreeRegressor( min_samples_leaf=5, merge_purity_threshold=0.1, - rng=srng(), + rng=stable_rng(), ) -R2Tree = DecisionTreeRegressor(min_samples_split=5, rng=srng()) +R2Tree = DecisionTreeRegressor(min_samples_split=5, rng=stable_rng()) model1, = MLJBase.fit(R1Tree,1, features, labels) vals1 = MLJBase.predict(R1Tree,model1,features) @@ -83,14 +83,14 @@ vals2 = MLJBase.predict(R2Tree, model2, features) N = 20 X = ( - x1=rand(srng(),N), - x2=categorical(rand(srng(), "abc", N), ordered=true), + x1=rand(stable_rng(),N), + x2=categorical(rand(stable_rng(), "abc", N), ordered=true), x3=collect(1:N), ) yfinite = X.x2 ycont = float.(X.x3) -rgs = DecisionTreeRegressor(rng=srng()) +rgs = DecisionTreeRegressor(rng=stable_rng()) fitresult, _, _ = MLJBase.fit(rgs, 1, X, ycont) @test rms(predict(rgs, fitresult, X), ycont) < 1.5 @@ -101,10 +101,10 @@ fitresult, _, _ = MLJBase.fit(clf, 1, X, yfinite) # -- Ensemble -rfc = RandomForestClassifier(rng=srng()) -abs = AdaBoostStumpClassifier(rng=srng()) +rfc = RandomForestClassifier(rng=stable_rng()) +abs = AdaBoostStumpClassifier(rng=stable_rng()) -X, y = MLJBase.make_blobs(100, 3; rng=srng()) +X, y = MLJBase.make_blobs(100, 3; rng=stable_rng()) m = machine(rfc, X, y) fit!(m) @@ -114,8 +114,8 @@ m = machine(abs, X, y) fit!(m) @test accuracy(predict_mode(m, X), y) > 0.95 -X, y = MLJBase.make_regression(rng=srng()) -rfr = RandomForestRegressor(rng=srng()) +X, y = MLJBase.make_regression(rng=stable_rng()) +rfr = RandomForestRegressor(rng=stable_rng()) m = machine(rfr, X, y) fit!(m) @test rms(predict(m, X), y) < 0.4 @@ -128,7 +128,7 @@ function reproducibility(model, X, y, loss) mach = machine(model, X, y) train, test = partition(eachindex(y), 0.7) errs = map(1:N) do i - model.rng = srng() + model.rng = stable_rng() fit!(mach, rows=train, force=true, verbosity=0) yhat = predict(mach, rows=test) loss(yhat, y[test]) |> mean @@ -137,7 +137,7 @@ function reproducibility(model, X, y, loss) end @testset "reporoducibility" begin - X, y = make_blobs(rng=srng()); + X, y = make_blobs(rng=stable_rng()); loss = BrierLoss() for model in [ DecisionTreeClassifier(), @@ -146,7 +146,7 @@ end ] @test reproducibility(model, X, y, loss) end - X, y = make_regression(rng=srng()); + X, y = make_regression(rng=stable_rng()); loss = LPLoss(p=2) for model in [ DecisionTreeRegressor(),