In [1]:
# Environment setup for the example notebook.
# Set the package-server variable to an empty string before any Pkg operation runs.
# NOTE(review): presumably this makes Pkg bypass the default package server — confirm.
ENV["JULIA_PKG_SERVER"] = ""
using Pkg
# Activate the project that lives next to this file, then install the
# dependencies recorded for it.
Pkg.activate(@__DIR__)
Pkg.instantiate()

# MLJ provides machine/fit!/predict/partition/TunedModel used in the cells below.
using MLJ
# Only the BalancedModel wrapper is imported from MLJBalancing.
using MLJBalancing: BalancedModel
# Imbalance supplies the synthetic data generator and the balancers.
using Imbalance
# NOTE(review): Random is not referenced by name in the cells visible here.
using Random
using DataFrames

[32m[1m  Activating[22m[39m project at `~/Documents/GitHub/MLJBalancing/example`


#### Load Some Data

In [2]:
# Generate a synthetic, imbalanced 3-class dataset (1000 rows, 5 continuous features).
# The generator is seeded so the data, and everything trained on it below, is
# reproducible from run to run, in line with the seeded partition and balancers.
X, y = Imbalance.generate_imbalanced_data(
    1000, 5;
    class_probs=[0.2, 0.3, 0.5],
    rng=42,
)
X = DataFrame(X)

# Shuffled 80/20 train/test split; multi=true partitions X and y in lockstep.
(X_train, X_test), (y_train, y_test) = partition((X, y), 0.8, rng=123, multi=true)

# Print the per-class counts of the full label vector.
Imbalance.checkbalance(y)

0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 189 (37.4%) 
1: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 305 (60.3%) 
2: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 506 (100.0%) 


#### Load Some Balancers

In [3]:
# Three balancers from Imbalance's MLJ interface, each with its own target
# ratio and all seeded with the same integer.
balancer1 = Imbalance.MLJ.RandomOversampler(; ratios=1.0, rng=42)
balancer2 = Imbalance.MLJ.SMOTENC(; ratios=1.2, k=10, rng=42)
balancer3 = Imbalance.MLJ.ROSE(; ratios=1.3, rng=42)

ROSE(
  s = 1.0, 
  ratios = 1.3, 
  rng = 42, 
  try_perserve_type = true)

#### Load a Classification Model

In [4]:
# Load the LogisticClassifier model type from the MLJLinearModels package and
# bind it to a local name; verbosity=0 is passed to the loader.
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
# Instantiate the classifier with its default hyper-parameters. This is the
# model that the balancers are wrapped around in the next cell.
model_prob = LogisticClassifier()

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l2, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = nothing)

#### Wrap the Balancers and the Classification Model Together

In [5]:
# Bundle the classifier and the three balancers into one composite model.
# `balancerN` after the `;` is shorthand for `balancerN=balancerN`.
balanced_model = BalancedModel(;
    model=model_prob,
    balancer1,
    balancer2,
    balancer3,
)

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 1.0, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        knn_tree = "Brute", 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 1.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))

##### Now they behave as a single model!

In [6]:
# Bind the composite model to the training partition and train it in one step;
# fit! hands back the trained machine, which is kept as `mach`.
mach = fit!(machine(balanced_model, X_train, y_train))

┌ Info: Training machine(BalancedModelProbabilistic(model = LogisticClassifier(lambda = 2.220446049250313e-16, …), …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492


┌ Info: Training machine(ROSE(s = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(SMOTENC(k = 10, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(RandomOversampler(ratios = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(:model, …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  0[39m[K[A

[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:01[39m[K
[34m  class:  1[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  1[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  1[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  1[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  1[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


┌ Info: Solver: MLJLinearModels.LBFGS{Optim.Options{Float64, Nothing}, NamedTuple{(), Tuple{}}}
│   optim_options: Optim.Options{Float64, Nothing}
│   lbfgs_options: NamedTuple{(), Tuple{}} NamedTuple()
└ @ MLJLinearModels /Users/essam/.julia/packages/MLJLinearModels/zSQnL/src/mlj/interface.jl:72


trained Machine; does not cache data
  model: BalancedModelProbabilistic(model = LogisticClassifier(lambda = 2.220446049250313e-16, …), …)
  args: 
    1:	Source @226 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @078 ⏎ AbstractVector{Multiclass{3}}


In [7]:

# Probabilistic predictions (one distribution over the classes per test row).
y_pred = MLJ.predict(mach, X_test)

200-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, Int64, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>1.0, 2=>4.16e-270)
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>1.2e-217, 2=>1.0)
 UnivariateFinite{Multiclass{3}}(0=>2.99e-304, 1=>1.0, 2=>1.19e-221)
 UnivariateFinite{Multiclass{3}}(0=>1.0, 1=>1.35e-179, 2=>2.0900000000000003e-267)
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>1.36e-93, 2=>1.0)
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>4.01e-71, 2=>1.0)
 UnivariateFinite{Multiclass{3}}(0=>1.16e-270, 1=>4.55e-103, 2=>1.0)
 UnivariateFinite{Multiclass{3}}(0=>1.0, 1=>1.0299999999999999e-198, 2=>0.0)
 UnivariateFinite{Multiclass{3}}(0=>1.0, 1=>2.2100000000000002e-73, 2=>1.45e-97)
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>3.4900000000000003e-75, 2=>1.0)
 ⋮
 UnivariateFinite{Multiclass{3}}(0=>1.3699999999999999e-239, 1=>9.34e-140, 2=>1.0)
 UnivariateFinite{Multiclass{3}}(0=>0.0, 1=>1.0, 2=>2.3599999999999997e-256)
 UnivariateFinite{Multiclass{3}}(

#### You can even tune it if you wish

In [None]:
# Hyper-parameter ranges, addressed through the nested fields of the wrapper:
# the first balancer's ratio, the second balancer's k, and the third balancer's s.
r1 = range(balanced_model, :(balancer1.ratios), lower=1.0, upper=1.4)
r2 = range(balanced_model, :(balancer2.k), lower=3, upper=10)
r3 = range(balanced_model, :(balancer3.s), lower=0.0, upper=0.3)

# Grid search over the three ranges, scored by cross-entropy under 4-fold CV.
# The grid is seeded like every other random component in this example.
tuned_balanced_model = TunedModel(
    model=balanced_model,
    tuning=Grid(goal=4, rng=42),
    resampling=CV(nfolds=4),
    range=[r1, r2, r3],
    measure=cross_entropy,
);

# Tune on the training partition only, so X_test / y_test remain unseen data.
mach = machine(tuned_balanced_model, X_train, y_train);
fit!(mach, verbosity=0);

In [None]:
# Pull the learned parameters of the tuned machine and display the winning model.
tuning_results = fitted_params(mach)
tuning_results.best_model