In [1]:
using Pkg
Pkg.activate(@__DIR__)
Pkg.instantiate()

using MLJ
using MLJBalancing
using Imbalance
using Random
using DataFrames

#### Load Some Data

In [2]:
# Generate a synthetic 3-class dataset (1000 rows, 5 features) whose classes are
# drawn with probabilities 0.2 / 0.3 / 0.5. The generator is seeded, like every
# other random step in this notebook, so that re-running reproduces the same data.
X, y = Imbalance.generate_imbalanced_data(1000, 5; probs=[0.2, 0.3, 0.5], rng=42)
X = DataFrame(X)

# Stratified, shuffled 80/20 split of the row indices into train and test sets.
train_inds, test_inds = partition(
    eachindex(y), 0.8; shuffle=true, stratify=y, rng=Random.Xoshiro(42)
)
X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]

# Display the per-class counts of the full label vector.
Imbalance.checkbalance(y)

0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 204 (39.8%) 
1: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 283 (55.2%) 
2: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 513 (100.0%) 


#### Load Some Balancers

In [3]:
# Three balancers from Imbalance.jl's MLJ interface, each given a fixed seed.
# The last assignment is left as the final expression so the cell displays it.
balancer1 = Imbalance.MLJ.RandomOversampler(; ratios=1.0, rng=42)
balancer2 = Imbalance.MLJ.SMOTENC(; k=10, ratios=1.2, rng=42)
balancer3 = Imbalance.MLJ.ROSE(; ratios=1.3, rng=42)

ROSE(
  s = 1.0, 
  ratios = 1.3, 
  rng = 42, 
  try_perserve_type = true)

#### Load a Classification Model

In [4]:
# Load the LogisticClassifier model type from MLJLinearModels (quietly, via
# verbosity=0) and instantiate it with its default hyperparameters.
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
model_prob = LogisticClassifier()

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l2, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = nothing)

#### Wrap the Balancers and the Classification Model Together

In [5]:
# Wrap the classifier together with the three balancers in a single model.
# The balancers are passed as keyword arguments named after their variables.
balanced_model = BalancedModel(;
    model=model_prob,
    balancer1,
    balancer2,
    balancer3,
)

Probabilistic


BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 1.0, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 1.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))

#### Now They Behave as a Single Model!

In [8]:
# Bind the wrapped model to the training split and train it in one step
# (`fit!` returns the machine it was given), then predict on the held-out rows.
mach = fit!(machine(balanced_model, X_train, y_train))
y_pred = predict(mach, X_test)

┌ Info: Training machine(BalancedModelProbabilistic(model = LogisticClassifier(lambda = 2.220446049250313e-16, …), …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(ROSE(s = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(SMOTENC(k = 10, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(RandomOversampler(ratios = 1.0, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492
┌ Info: Training machine(:model, …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/ByFwA/src/machines.jl:492


Progress:  67%|███████████████████████████▍             |  ETA: 0:00:00
  class:  2


Progress: 100%|█████████████████████████████████████████| Time: 0:00:00
  class:  1


Progress:  67%|███████████████████████████▍             |  ETA: 0:00:00
  class:  2


Progress: 100%|█████████████████████████████████████████| Time: 0:00:00
  class:  1


Progress:  67%|███████████████████████████▍             |  ETA: 0:00:00
  class:  2


Progress: 100%|█████████████████████████████████████████| Time: 0:00:00
  class:  1


Progress:  67%|███████████████████████████▍             |  ETA: 0:00:00
  class:  2


Progress: 100%|█████████████████████████████████████████| Time: 0:00:00
  class:  1


┌ Info: Solver: MLJLinearModels.LBFGS{Optim.Options{Float64, Nothing}, NamedTuple{(), Tuple{}}}
│   optim_options: Optim.Options{Float64, Nothing}
│   lbfgs_options: NamedTuple{(), Tuple{}} NamedTuple()
└ @ MLJLinearModels /Users/essam/.julia/packages/MLJLinearModels/zSQnL/src/mlj/interface.jl:72


201-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, Int64, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(0=>0.322, 1=>0.378, 2=>0.3)
 UnivariateFinite{Multiclass{3}}(0=>0.312, 1=>0.414, 2=>0.274)
 UnivariateFinite{Multiclass{3}}(0=>0.336, 1=>0.352, 2=>0.312)
 UnivariateFinite{Multiclass{3}}(0=>0.315, 1=>0.383, 2=>0.302)
 UnivariateFinite{Multiclass{3}}(0=>0.345, 1=>0.272, 2=>0.383)
 UnivariateFinite{Multiclass{3}}(0=>0.333, 1=>0.286, 2=>0.38)
 UnivariateFinite{Multiclass{3}}(0=>0.309, 1=>0.347, 2=>0.344)
 UnivariateFinite{Multiclass{3}}(0=>0.312, 1=>0.362, 2=>0.326)
 UnivariateFinite{Multiclass{3}}(0=>0.358, 1=>0.265, 2=>0.378)
 UnivariateFinite{Multiclass{3}}(0=>0.347, 1=>0.313, 2=>0.34)
 ⋮
 UnivariateFinite{Multiclass{3}}(0=>0.323, 1=>0.37, 2=>0.307)
 UnivariateFinite{Multiclass{3}}(0=>0.332, 1=>0.345, 2=>0.323)
 UnivariateFinite{Multiclass{3}}(0=>0.347, 1=>0.307, 2=>0.346)
 UnivariateFinite{Multiclass{3}}(0=>0.348, 1=>0.272, 2=>0.38)
 UnivariateFinite{Mul

#### You Can Even Tune It If You Wish

In [None]:
# Hyperparameter ranges, addressed through the wrapper's nested fields.
r1 = range(balanced_model, :(balancer1.ratios); lower=0.8, upper=1.0)
r2 = range(balanced_model, :(balancer2.k); lower=3, upper=10)
r3 = range(balanced_model, :(balancer3.s); lower=0.0, upper=0.5)

# Grid search over the three ranges, scored by cross-entropy under 4-fold CV.
tuned_balanced_model = TunedModel(;
    model=balanced_model,
    tuning=Grid(; goal=4),
    resampling=CV(; nfolds=4),
    range=[r1, r2, r3],
    measure=cross_entropy,
)

# Fit quietly; the trailing semicolon keeps the cell from displaying the machine.
mach = machine(tuned_balanced_model, X, y)
fit!(mach; verbosity=0);

In [12]:
# Show the hyperparameter combination selected by the tuning run.
fitted_params(mach).best_model

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 0.8, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 3, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 0.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))