In [1]:
using DataFrames
using Plots
using CategoricalArrays
using Random
using MLJBase: machine, transform
using Imbalance
using ScientificTypes

# Evaluate the local helper file so its definitions are available in this session.
include("./utils.jl")

# Bare reference to `plot_data`: displays the binding as the cell result.
# NOTE(review): presumably defined in utils.jl — confirm.
plot_data

### 1. Load the Data

In [2]:
using CSV
using DataFrames

# Read the raw dataset into a DataFrame.
df = CSV.read("dataset.csv", DataFrame)

# Split into features (every column except the target) and the target column
# `Body_Level`; both sides are fresh copies of the underlying columns.
X, y = df[:, Not(:Body_Level)], df[:, :Body_Level]

([1m1477×16 DataFrame[0m
[1m  Row [0m│[1m Gender  [0m[1m Age     [0m[1m Height  [0m[1m Weight   [0m[1m H_Cal_Consump [0m[1m Veg_Consump [0m[1m Water[0m ⋯
      │[90m String7 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64  [0m[90m String3       [0m[90m Float64     [0m[90m Float[0m ⋯
──────┼─────────────────────────────────────────────────────────────────────────
    1 │ Female   22.5473  1.72246   51.8813  yes                2.66342        ⋯
    2 │ Male     19.7991  1.7437    54.9275  yes                2.0
    3 │ Female   17.8234  1.70841   50.0     yes                1.64224
    4 │ Female   19.0072  1.69073   49.8957  yes                1.21291
    5 │ Male     19.7293  1.79331   58.1951  yes                2.50883        ⋯
    6 │ Male     18.4706  1.85641   58.674   yes                2.34222
    7 │ Male     18.0     1.7387    50.2487  yes                1.87121
    8 │ Female   29.9704  1.61086   49.516   yes                2.05914
  ⋮   │    ⋮ 

In [3]:
# Apply the coercions suggested by the `:string_to_multiclass` autotype rule.
X = coerce(X, autotype(X, :string_to_multiclass))

# Collect the positions of the columns whose scitype is a Multiclass,
# then restrict the table to those (categorical) columns only.
types = schema(X).scitypes
cat_inds = [i for (i, T) in enumerate(types) if T <: Multiclass]
X = X[:, cat_inds]

Row,Gender,H_Cal_Consump,Alcohol_Consump,Smoking,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Transport
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…
1,Female,yes,no,no,Frequently,yes,no,Public_Transportation
2,Male,yes,Sometimes,no,Sometimes,yes,no,Public_Transportation
3,Female,yes,Sometimes,no,Sometimes,no,no,Public_Transportation
4,Female,yes,Sometimes,no,Sometimes,no,no,Public_Transportation
5,Male,yes,no,no,Sometimes,yes,no,Automobile
6,Male,yes,no,no,Sometimes,yes,no,Automobile
7,Male,yes,Sometimes,no,Sometimes,yes,no,Public_Transportation
8,Female,yes,no,no,Frequently,yes,no,Public_Transportation
9,Female,no,no,no,Frequently,no,no,Public_Transportation
10,Female,no,Sometimes,yes,Sometimes,no,no,Automobile


In [4]:
# Show column names, scitypes and machine types of the categorical-only table.
schema(X)

┌────────────────────┬───────────────┬────────────────────────────────────┐
│[22m names              [0m│[22m scitypes      [0m│[22m types                              [0m│
├────────────────────┼───────────────┼────────────────────────────────────┤
│ Gender             │ Multiclass{2} │ CategoricalValue{String7, UInt32}  │
│ H_Cal_Consump      │ Multiclass{2} │ CategoricalValue{String3, UInt32}  │
│ Alcohol_Consump    │ Multiclass{4} │ CategoricalValue{String15, UInt32} │
│ Smoking            │ Multiclass{2} │ CategoricalValue{String3, UInt32}  │
│ Food_Between_Meals │ Multiclass{4} │ CategoricalValue{String15, UInt32} │
│ Fam_Hist           │ Multiclass{2} │ CategoricalValue{String3, UInt32}  │
│ H_Cal_Burn         │ Multiclass{2} │ CategoricalValue{String3, UInt32}  │
│ Transport          │ Multiclass{5} │ CategoricalValue{String31, UInt32} │
└────────────────────┴───────────────┴────────────────────────────────────┘


#### Alternative: Generate Random Data

In [15]:
# Handle to the default RNG. It is not passed to the generator call below.
rng = Random.default_rng()

# Generator settings: row count and zero continuous features.
num_rows, num_cont_feats = 100, 0
# NOTE(review): presumably the per-class sampling probabilities (three classes) — confirm.
probs = [0.5, 0.2, 0.3]
# NOTE(review): presumably the number of levels of each categorical feature — confirm.
extra_cat_feats = [3, 4, 2, 5]

# Request matrix output; the cell result is the `(Xc, yc)` tuple.
Xc, yc = generate_imbalanced_data(
    num_rows,
    num_cont_feats;
    probs = probs,
    extra_cat_feats = extra_cat_feats,
    type = "Matrix",
)

([3 3 2 4; 2 1 1 1; … ; 1 4 1 4; 3 4 2 2], CategoricalValue{Int64, UInt32}[2, 2, 0, 0, 1, 0, 2, 0, 1, 2  …  0, 0, 2, 2, 0, 1, 0, 0, 1, 0])

In [None]:
# Derive the suggested scitypes from the generated data itself. The original
# passed the CSV-based `X` here, whose columns are unrelated to `Xc`, so the
# suggestions would not have matched the data being coerced.
Xc = coerce(Xc, autotype(Xc, :few_to_finite))
# Turn any ordered-factor or count column into an unordered Multiclass.
Xc = coerce(Xc, OrderedFactor=>Multiclass, Count=>Multiclass)
# NOTE(review): `Xc` is generated with type = "Matrix", while the pair-based
# `coerce` above and `schema` below are table operations — confirm whether `Xc`
# must be generated as (or converted to) a table before this cell is run.
types = schema(Xc).scitypes

### 2. Apply SMOTEN

In [20]:
# Oversample with SMOTEN on the categorical-only table, using k = 5 and a
# fixed integer seed (rng = 1234) so the result is reproducible.
Xover, yover = smoten(X, y; k = 5, rng = 1234)
# Display the oversampled feature table as the cell result.
Xover

Row,Gender,H_Cal_Consump,Alcohol_Consump,Smoking,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Transport
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…
1,Female,yes,no,no,Frequently,yes,no,Public_Transportation
2,Male,yes,Sometimes,no,Sometimes,yes,no,Public_Transportation
3,Female,yes,Sometimes,no,Sometimes,no,no,Public_Transportation
4,Female,yes,Sometimes,no,Sometimes,no,no,Public_Transportation
5,Male,yes,no,no,Sometimes,yes,no,Automobile
6,Male,yes,no,no,Sometimes,yes,no,Automobile
7,Male,yes,Sometimes,no,Sometimes,yes,no,Public_Transportation
8,Female,yes,no,no,Frequently,yes,no,Public_Transportation
9,Female,no,no,no,Frequently,no,no,Public_Transportation
10,Female,no,Sometimes,yes,Sometimes,no,no,Automobile


### 3. Check the Results

In [22]:
using StatsBase

# Count the observations per class label after oversampling.
countmap(yover)

Dict{String15, Int64} with 4 entries:
  "Body Level 4" => 680
  "Body Level 3" => 680
  "Body Level 2" => 680
  "Body Level 1" => 680