In [1]:
# Prepare the package environment this notebook runs in.
# An empty JULIA_PKG_SERVER is the documented way to make Pkg skip the package
# server and fetch from the package hosts directly; it is set before Pkg is used.
ENV["JULIA_PKG_SERVER"] = ""
using Pkg
# Activate the project in the directory reported by @__DIR__ ...
Pkg.activate(@__DIR__)
# ... and install whatever dependencies that project declares.
Pkg.instantiate()

using MLJ
using MLJBalancing: BalancedModel
using Imbalance
using Random
using DataFrames

#### Load Some Data

In [2]:
# Generate a synthetic three-class dataset with the class proportions given in
# `probs` (positional arguments: 1000 observations and, presumably, 5 features —
# see the Imbalance.jl docs for the exact meaning).
#
# Seed the global RNG first: every other random step in this notebook is seeded
# (`partition` with rng=123, the balancers with rng=42) but the generator call
# was not, so the data changed on every run. This assumes the generator draws
# from the default RNG when no `rng` is passed — TODO confirm for the pinned
# Imbalance version.
# NOTE: class counts recorded from an earlier, unseeded run may differ slightly.
Random.seed!(42)
X, y = Imbalance.generate_imbalanced_data(1000, 5; probs=[0.2, 0.3, 0.5])
X = DataFrame(X)
# Shuffled 80/20 train/test split; `multi=true` splits X and y in lockstep.
(X_train, X_test), (y_train, y_test) = partition((X, y), 0.8, rng=123, multi=true)
# Show the class distribution of the full label vector.
Imbalance.checkbalance(y)

0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 204 (40.9%) 
1: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 297 (59.5%) 
2: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 499 (100.0%) 


#### Load Some Balancers

In [3]:
# Three resampling models, each seeded with the same `rng` so repeated runs
# resample identically. They are defined here and chained together later.
balancer1 = Imbalance.MLJ.RandomOversampler(;
    ratios=1.0,
    rng=42,
)
balancer2 = Imbalance.MLJ.SMOTENC(;
    k=10,
    ratios=1.2,
    rng=42,
)
balancer3 = Imbalance.MLJ.ROSE(;
    ratios=1.3,
    rng=42,
)

ROSE(
  s = 1.0, 
  ratios = 1.3, 
  rng = 42, 
  try_perserve_type = true)

#### Load a Classification Model

In [4]:
# Load the LogisticClassifier model type from MLJLinearModels (quietly, via
# verbosity=0) and bind it to a local name of the same spelling.
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
# Instantiate the classifier with its default hyperparameters.
model_prob = LogisticClassifier()

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l2, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = nothing)

#### Wrap the Balancers and the Classification Model Together

In [5]:
# Bundle the classifier with the three balancers into one composite model.
# The balancers are passed as keyword arguments under their own names, in the
# same order as before.
balanced_model = BalancedModel(;
    model=model_prob,
    balancer1,
    balancer2,
    balancer3,
)

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 1.0, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 1.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))

##### Now they behave as a single model!

In [None]:
# Bind the composite model to the training data and train it in one step;
# `fit!` hands back the machine it trained, so `mach` ends up fitted.
mach = machine(balanced_model, X_train, y_train) |> fit!

In [11]:

# Predict on the held-out rows; the result is one probability distribution
# over the classes per test observation.
y_pred = MLJ.predict(mach, X_test)

200-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, Int64, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(0=>0.359, 1=>0.295, 2=>0.346)
 UnivariateFinite{Multiclass{3}}(0=>0.384, 1=>0.294, 2=>0.322)
 UnivariateFinite{Multiclass{3}}(0=>0.301, 1=>0.395, 2=>0.304)
 UnivariateFinite{Multiclass{3}}(0=>0.285, 1=>0.369, 2=>0.346)
 UnivariateFinite{Multiclass{3}}(0=>0.279, 1=>0.39, 2=>0.331)
 UnivariateFinite{Multiclass{3}}(0=>0.31, 1=>0.34, 2=>0.35)
 UnivariateFinite{Multiclass{3}}(0=>0.292, 1=>0.392, 2=>0.316)
 UnivariateFinite{Multiclass{3}}(0=>0.331, 1=>0.351, 2=>0.318)
 UnivariateFinite{Multiclass{3}}(0=>0.303, 1=>0.35, 2=>0.347)
 UnivariateFinite{Multiclass{3}}(0=>0.311, 1=>0.351, 2=>0.338)
 ⋮
 UnivariateFinite{Multiclass{3}}(0=>0.319, 1=>0.354, 2=>0.326)
 UnivariateFinite{Multiclass{3}}(0=>0.375, 1=>0.291, 2=>0.334)
 UnivariateFinite{Multiclass{3}}(0=>0.345, 1=>0.329, 2=>0.326)
 UnivariateFinite{Multiclass{3}}(0=>0.312, 1=>0.343, 2=>0.345)
 ⋮ (remaining output truncated)

#### You can even tune it if you wish

In [None]:
# Search ranges for hyperparameters nested inside the composite model; each
# one is addressed by a quoted dotted path into `balanced_model`.
r1 = range(balanced_model, :(balancer1.ratios); lower=1.0, upper=1.4)
r2 = range(balanced_model, :(balancer2.k); lower=3, upper=10)
r3 = range(balanced_model, :(balancer3.s); lower=0.0, upper=0.3)

# Grid search over the three ranges, scored by cross-entropy under 4-fold
# cross-validation.
tuned_balanced_model = TunedModel(;
    model=balanced_model,
    tuning=Grid(; goal=4),
    resampling=CV(; nfolds=4),
    range=[r1, r2, r3],
    measure=cross_entropy,
);

# Tuning is run on the full dataset; `mach` is rebound to the tuning machine.
mach = machine(tuned_balanced_model, X, y);
fit!(mach; verbosity=0);

In [10]:
# Display the hyperparameter combination the tuning run selected.
fitted_params(mach).best_model

BalancedModelProbabilistic(
  model = LogisticClassifier(
        lambda = 2.220446049250313e-16, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  balancer1 = RandomOversampler(
        ratios = 1.4, 
        rng = 42, 
        try_perserve_type = true), 
  balancer2 = SMOTENC(
        k = 10, 
        ratios = 1.2, 
        rng = 42, 
        try_perserve_type = true), 
  balancer3 = ROSE(
        s = 0.0, 
        ratios = 1.3, 
        rng = 42, 
        try_perserve_type = true))