Install Julia if running on Google Colab

In [None]:
%%capture
%%shell
wget -O - https://raw.githubusercontent.com/JuliaAI/Imbalance.jl/dev/docs/src/examples/colab.sh | bash
# This should take around one minute to finish. Once it does, change the runtime to `Julia` by choosing `Runtime` 
# from the toolbar then `Change runtime type`. You can then delete this cell.

Install needed packages

In [None]:
using Pkg
Pkg.activate(@__DIR__)
Pkg.instantiate()
Pkg.add(["DataFrames", "Plots", "Colors", "CategoricalArrays", "Random", 
         "Imbalance", "MLJBase", "TableTransforms", "ScientificTypes", "HTTP"])

using DataFrames
using Plots
using CategoricalArrays
using Random
using Imbalance
using MLJBase: machine, transform
using ScientificTypes
using TableTransforms
using DataFrames
using HTTP: download

### 1. Generate Random Data

In [2]:
rng = Random.default_rng()


num_rows = 100
num_cont_feats = 0
class_probs = [0.5, 0.2, 0.3]

num_vals_per_category = [3, 4, 2, 5]

X, y = generate_imbalanced_data(num_rows, num_cont_feats; class_probs, num_vals_per_category)
X = DataFrame(X)

X = coerce(X, autotype(X, :few_to_finite))

Row,Column1,Column2,Column3,Column4
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Cat…
1,3,4,1,1
2,3,3,1,5
3,2,2,1,4
4,2,2,1,1
5,3,2,1,1
6,2,3,2,2
7,1,2,1,5
8,3,2,1,1
9,2,2,1,3
10,2,2,1,3


### 2. Apply SMOTE-N

#### Using Imbalance Only

In [3]:
Xover, yover = smoten(X, y; k = 5, ratios = Dict(0=>1.2, 1=> 1.2, 2=>1.2), rng = 42)

[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m[K
[34m  class:  1[39m[K


([1m210×4 DataFrame[0m
[1m Row [0m│[1m Column1 [0m[1m Column2 [0m[1m Column3 [0m[1m Column4 [0m
     │[90m Cat…    [0m[90m Cat…    [0m[90m Cat…    [0m[90m Cat…    [0m
─────┼────────────────────────────────────
   1 │ 3        4        1        1
   2 │ 3        3        1        5
   3 │ 2        2        1        4
   4 │ 2        2        1        1
   5 │ 3        2        1        1
   6 │ 2        3        2        2
   7 │ 1        2        1        5
   8 │ 3        2        1        1
  ⋮  │    ⋮        ⋮        ⋮        ⋮
 204 │ 1        3        2        4
 205 │ 1        4        1        5
 206 │ 1        4        1        4
 207 │ 2        2        1        2
 208 │ 1        4        2        4
 209 │ 2        2        1        2
 210 │ 1        3        2        4
[36m                          195 rows omitted[0m, CategoricalValue{Int64, UInt32}[0, 0, 0, 0, 0, 0, 0, 2, 0, 0  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### Using MLJ

In [4]:
smotenc_model = Imbalance.MLJ.SMOTEN(k=5, ratios=Dict(0=>1.2, 1=> 1.2, 2=>1.2), rng=42)
mach = machine(smotenc_model)
Xover, yover = transform(mach, X, y)
Xover

Row,Column1,Column2,Column3,Column4
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Cat…
1,3,4,1,1
2,3,3,1,5
3,2,2,1,4
4,2,2,1,1
5,3,2,1,1
6,2,3,2,2
7,1,2,1,5
8,3,2,1,1
9,2,2,1,3
10,2,2,1,3


#### Using TableTransforms

In [7]:
# For TableTransforms
y_ind= 3
Xy, _ = generate_imbalanced_data(num_rows, num_cont_feats; class_probs, insert_y=y_ind, num_vals_per_category, rng=10)
Xy = coerce(Xy, autotype(Xy, :few_to_finite))

(Column1 = CategoricalValue{Int64, UInt32}[3, 3, 2, 3, 1, 3, 3, 2, 3, 1  …  2, 1, 2, 3, 3, 2, 2, 1, 3, 2],
 Column2 = CategoricalValue{Int64, UInt32}[2, 4, 1, 3, 1, 2, 3, 2, 3, 1  …  2, 2, 1, 1, 4, 1, 3, 3, 3, 2],
 Column3 = CategoricalValue{Int64, UInt32}[2, 2, 1, 0, 0, 0, 1, 0, 2, 2  …  1, 1, 2, 0, 0, 0, 1, 1, 2, 0],
 Column4 = CategoricalValue{Int64, UInt32}[1, 2, 1, 1, 2, 2, 2, 1, 1, 2  …  2, 1, 2, 1, 2, 2, 2, 2, 2, 2],
 Column5 = CategoricalValue{Int64, UInt32}[2, 5, 5, 3, 4, 4, 4, 4, 2, 4  …  2, 3, 2, 1, 4, 1, 5, 1, 5, 4],)

In [8]:
using Imbalance: TableTransforms.SMOTEN
smoten_model_t = SMOTEN(y_ind, ratios=Dict(1=> 1.2, 2=>1.2, 3=>1.2, 4=>1.2), rng=42)
Xyover = Xy |> smoten_model_t

[32mProgress:  67%|███████████████████████████▍             |  ETA: 0:00:00[39m[K
[34m  class:  2[39m[K[A


[K[A[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m[K
[34m  class:  1[39m[K


(Column1 = CategoricalValue{Int64, UInt32}[3, 3, 2, 3, 1, 3, 3, 2, 3, 1  …  3, 3, 1, 2, 1, 1, 1, 2, 3, 1],
 Column2 = CategoricalValue{Int64, UInt32}[2, 4, 1, 3, 1, 2, 3, 2, 3, 1  …  1, 2, 3, 3, 2, 2, 2, 2, 2, 2],
 Column3 = CategoricalValue{Int64, UInt32}[2, 2, 1, 0, 0, 0, 1, 0, 2, 2  …  0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 Column4 = CategoricalValue{Int64, UInt32}[1, 2, 1, 1, 2, 2, 2, 1, 1, 2  …  1, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 Column5 = CategoricalValue{Int64, UInt32}[2, 5, 5, 3, 4, 4, 4, 4, 2, 4  …  1, 5, 4, 4, 5, 1, 5, 3, 5, 5],)