In [1]:
# Set the package-server environment variable to the empty string before Pkg is
# loaded. NOTE(review): presumably this makes Pkg bypass the package server and
# fetch packages directly — confirm against the Pkg docs.
ENV["JULIA_PKG_SERVER"] = ""
using Pkg
# Activate the project that lives in the same directory as this file, so the
# packages used below resolve from its Project.toml / Manifest.toml.
Pkg.activate(@__DIR__)
# Install any dependencies of the active project that are not yet available.
Pkg.instantiate()

using MLJ
using MLJBalancing: BalancedModel
using Imbalance
using Random
using DataFrames

[32m[1m  Activating[22m[39m project at `~/Documents/GitHub/MLJBalancing`


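┌ Warning: The project dependencies or compat requirements have changed since the manifest was last resolved.
│ It is recommended to `Pkg.resolve()` or consider `Pkg.update()` if necessary.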
└ @ Pkg.API /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-4.0/build/default-macmini-aarch64-4-0/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/Pkg/src/API.jl:1535


#### Load Some Data

In [2]:
# Build a synthetic three-class dataset of 1000 observations whose classes are
# drawn with probabilities 0.2 / 0.3 / 0.5, so the labels come out imbalanced.
# NOTE(review): the second positional argument (5) is presumably the number of
# features — confirm against the Imbalance.jl documentation.
X, y = Imbalance.generate_imbalanced_data(1000, 5; probs=[0.2, 0.3, 0.5])
X = X |> DataFrame

# Keep 80% of the rows for training and hold the rest out for testing;
# `multi=true` makes `partition` split `X` and `y` together.
(X_train, X_test), (y_train, y_test) = partition((X, y), 0.8; rng=123, multi=true)

# Show the per-class counts of the full label vector.
Imbalance.checkbalance(y)

0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 200 (39.5%) 
1: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 294 (58.1%) 
2: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 506 (100.0%) 


#### Load Some Balancers

In [3]:
# Three resampling models from Imbalance.jl's MLJ interface, all seeded with the
# same `rng` value so that repeated runs resample identically.
# NOTE(review): `ratios` is presumably the target size of each class relative to
# the majority class — confirm against the Imbalance.jl documentation.
balancer1 = Imbalance.MLJ.RandomOversampler(; ratios=1.0, rng=42)
balancer2 = Imbalance.MLJ.SMOTENC(; k=10, ratios=1.2, rng=42)
balancer3 = Imbalance.MLJ.ROSE(; ratios=1.3, rng=42)

ROSE(
  s = 1.0, 
  ratios = 1.3, 
  rng = 42, 
  try_perserve_type = true)

#### Load a Classification Model

In [4]:
# Load the `LogisticClassifier` model type from MLJLinearModels (with
# `verbosity=0` to keep loading quiet) and bind it to a variable of the same name.
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
# Instantiate the classifier with its default hyper-parameters; this is the
# model that the balancers are wrapped around in the next step.
model_prob = LogisticClassifier()

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l2, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = nothing)

#### Wrap the Balancers and the Classification Model Together

In [6]:
# Combine the classifier and the three balancers into one composite model.
# The balancers are passed as keyword arguments in the same order as before;
# the bare names are keyword shorthand for `balancer1=balancer1`, etc.
balanced_model = BalancedModel(; model=model_prob, balancer1, balancer2, balancer3)

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 1.0, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 1.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))

#### Now they behave as a single model!

In [7]:
# Bind the composite model to the training split and train it in one step;
# `fit!` returns the machine it trained, so the result can be assigned directly.
mach = machine(balanced_model, X_train, y_train) |> fit!

# Predict on the held-out rows; the result holds one class distribution per row.
y_pred = predict(mach, X_test)

┌ Info: Training machine(BalancedModelProbabilistic(model = LogisticClassifier(lambda = 2.220446049250313e-16, …), …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492


┌ Info: Training machine(ROSE(s = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(SMOTENC(k = 10, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(RandomOversampler(ratios = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(:model, …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  0[39m[K[A

[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:02[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:03[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  0[39m[K


┌ Info: Solver: MLJLinearModels.LBFGS{Optim.Options{Float64, Nothing}, NamedTuple{(), Tuple{}}}
│   optim_options: Optim.Options{Float64, Nothing}
│   lbfgs_options: NamedTuple{(), Tuple{}} NamedTuple()
└ @ MLJLinearModels /Users/essam/.julia/packages/MLJLinearModels/zSQnL/src/mlj/interface.jl:72


200-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, Int64, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(0=>0.348, 1=>0.343, 2=>0.309)
 UnivariateFinite{Multiclass{3}}(0=>0.282, 1=>0.306, 2=>0.412)
 UnivariateFinite{Multiclass{3}}(0=>0.306, 1=>0.319, 2=>0.374)
 UnivariateFinite{Multiclass{3}}(0=>0.347, 1=>0.334, 2=>0.319)
 UnivariateFinite{Multiclass{3}}(0=>0.319, 1=>0.333, 2=>0.348)
 UnivariateFinite{Multiclass{3}}(0=>0.372, 1=>0.337, 2=>0.29)
 UnivariateFinite{Multiclass{3}}(0=>0.36, 1=>0.337, 2=>0.303)
 UnivariateFinite{Multiclass{3}}(0=>0.289, 1=>0.293, 2=>0.418)
 UnivariateFinite{Multiclass{3}}(0=>0.312, 1=>0.308, 2=>0.38)
 UnivariateFinite{Multiclass{3}}(0=>0.345, 1=>0.349, 2=>0.306)
 ⋮
 UnivariateFinite{Multiclass{3}}(0=>0.371, 1=>0.374, 2=>0.255)
 UnivariateFinite{Multiclass{3}}(0=>0.365, 1=>0.376, 2=>0.259)
 UnivariateFinite{Multiclass{3}}(0=>0.355, 1=>0.361, 2=>0.284)
 UnivariateFinite{Multiclass{3}}(0=>0.332, 1=>0.35, 2=>0.318)
 UnivariateFinite{M

#### You can even tune it if you wish

In [None]:
# Search ranges for one hyper-parameter of each balancer, addressed through the
# nested fields of the composite model.
r1 = range(balanced_model, :(balancer1.ratios); lower=0.8, upper=1.0)
r2 = range(balanced_model, :(balancer2.k); lower=3, upper=10)
r3 = range(balanced_model, :(balancer3.s); lower=0.0, upper=0.5)

# Wrap the composite model in a grid search scored by cross-entropy under
# 4-fold cross-validation.
# NOTE(review): `goal=4` presumably caps the total number of grid points — confirm.
tuned_balanced_model = TunedModel(;
    model=balanced_model,
    tuning=Grid(; goal=4),
    resampling=CV(; nfolds=4),
    range=[r1, r2, r3],
    measure=cross_entropy,
)

# Train the tuning wrapper on the full dataset without logging; this rebinds
# `mach`, and the trailing `;` keeps the cell output suppressed.
mach = fit!(machine(tuned_balanced_model, X, y); verbosity=0);

In [9]:
# Display the hyper-parameter combination that the tuning run selected.
let tuned = fitted_params(mach)
    tuned.best_model
end

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 0.8, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 0.5, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))