# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

Import packages, classifiers and etc.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-11 00:04:07.474740: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-11 00:04:07.474762: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load some dataset from Imbalanced Learn list (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

dset_name = 'wine_quality'
dset = fetch_datasets()[dset_name]

X, y = StandardScaler().fit_transform(dset.data), dset.target

In [4]:
X

array([[ 1.72096961e-01, -8.17699008e-02,  2.13280202e-01, ...,
        -1.24692128e+00, -3.49184257e-01, -1.39315246e+00],
       [-6.57501128e-01,  2.15895632e-01,  4.80011213e-02, ...,
         7.40028640e-01,  1.34184656e-03, -8.24275678e-01],
       [ 1.47575110e+00,  1.74519434e-02,  5.43838363e-01, ...,
         4.75101984e-01, -4.36815783e-01, -3.36667007e-01],
       ...,
       [-4.20473102e-01, -3.79435433e-01, -1.19159198e+00, ...,
        -1.31315295e+00, -2.61552731e-01, -9.05543789e-01],
       [-1.60561323e+00,  1.16673788e-01, -2.82557040e-01, ...,
         1.00495530e+00, -9.62604939e-01,  1.85757201e+00],
       [-1.01304317e+00, -6.77100966e-01,  3.78559282e-01, ...,
         4.75101984e-01, -1.48839409e+00,  1.04489089e+00]])

In [5]:
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple function to undersample freely.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    X_res = pd.DataFrame(X_res)
    X_res['Class'] = y_res

    X_neg = X_res[y_res != minority_class_id].sample(frac=frac, random_state=random_state)
    X_pos = X_res[y_res == minority_class_id].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [8]:
def train_val(X, y, Xt, yt, random_state=42):
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    scores = []
    predictions = []
    for key, classifier in classifiers.items():
        print('_' * 50)
        name = key
        classifier.fit(X, y)
        print("Classifier: ", name)
        y_pred = classifier.predict(Xt)
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        tn = cm[0,0]
        fp = cm[0,1]
        fn = cm[1,0]
        tp = cm[1,1]
        tnr = tn / (tn + fp)
        tpr = tp / (tp + fn)
        scores.append(tnr * tpr)
        print('TNR:', round(tnr, 5))
        print('TPR:', round(tpr, 5))
        print('TNRxTPR:', round(tnr * tpr, 5))
        print('G-mean:', round(np.sqrt(tnr * tpr), 5))

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    predictions = np.sum(predictions, axis=0)
    predictions[predictions < 3] = 0
    predictions[predictions >= 3] = 1

    cm = confusion_matrix(yt, predictions)
    print(cm)
    tn = cm[0,0]
    fp = cm[0,1]
    fn = cm[1,0]
    tp = cm[1,1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


## Without resampling (base line) 
Which is starting score?


In [9]:
print('Original dataset shape %s' % Counter(y))
print('Ratio->', round(Counter(y)[0]/Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 4715, 1: 183})
Ratio-> 25.8 : 1


In [10]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[939   4]
 [ 30   7]]

TNR: 0.99576
TPR: 0.18919
TNRxTPR: 0.18839
G-mean: 0.43404
__________________________________________________
Classifier:  LGBMClassifier
[[938   5]
 [ 27  10]]

TNR: 0.9947
TPR: 0.27027
TNRxTPR: 0.26884
G-mean: 0.5185
__________________________________________________
Classifier:  XGBClassifier
[[936   7]
 [ 30   7]]

TNR: 0.99258
TPR: 0.18919
TNRxTPR: 0.18778
G-mean: 0.43334
__________________________________________________
Classifier:  BaggingClassifier
[[940   3]
 [ 28   9]]

TNR: 0.99682
TPR: 0.24324
TNRxTPR: 0.24247
G-mean: 0.49241
__________________________________________________
Classifier:  RandomForestClassifier
[[940   3]
 [ 29   8]]

TNR: 0.99682
TPR: 0.21622
TNRxTPR: 0.21553
G-mean: 0.46425
__________________________________________________
Ensemble predictions (majority voting):
[[938   5]
 [ 29   8]]

TNR: 0.9947
TPR: 0.21622
TNRxTPR: 0.21507
G-mean: 0.46376


## Ratio 1:1  
Let's see classifiers scores when dataset is balanced.  


In [11]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [12]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 3772, 1: 3772})
Ratio->  1 : 1.0


In [13]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[917  26]
 [ 23  14]]

TNR: 0.97243
TPR: 0.37838
TNRxTPR: 0.36795
G-mean: 0.60659
__________________________________________________
Classifier:  LGBMClassifier
[[925  18]
 [ 20  17]]

TNR: 0.98091
TPR: 0.45946
TNRxTPR: 0.45069
G-mean: 0.67133
__________________________________________________
Classifier:  XGBClassifier
[[923  20]
 [ 20  17]]

TNR: 0.97879
TPR: 0.45946
TNRxTPR: 0.44971
G-mean: 0.67061
__________________________________________________
Classifier:  BaggingClassifier
[[908  35]
 [ 22  15]]

TNR: 0.96288
TPR: 0.40541
TNRxTPR: 0.39036
G-mean: 0.62479
__________________________________________________
Classifier:  RandomForestClassifier
[[924  19]
 [ 23  14]]

TNR: 0.97985
TPR: 0.37838
TNRxTPR: 0.37075
G-mean: 0.6089
__________________________________________________
Ensemble predictions (majority voting):
[[925  18]
 [ 21  16]]

TNR: 0.98091
TPR: 0.43243
TNRxTPR: 0.42418
G-mean: 0.65129


In [14]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [15]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 3772, 1: 3772})
Ratio->  1 : 1.0


In [16]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[925  18]
 [ 25  12]]

TNR: 0.98091
TPR: 0.32432
TNRxTPR: 0.31813
G-mean: 0.56403
__________________________________________________
Classifier:  LGBMClassifier
[[922  21]
 [ 21  16]]

TNR: 0.97773
TPR: 0.43243
TNRxTPR: 0.4228
G-mean: 0.65023
__________________________________________________
Classifier:  XGBClassifier
[[925  18]
 [ 26  11]]

TNR: 0.98091
TPR: 0.2973
TNRxTPR: 0.29162
G-mean: 0.54002
__________________________________________________
Classifier:  BaggingClassifier
[[926  17]
 [ 27  10]]

TNR: 0.98197
TPR: 0.27027
TNRxTPR: 0.2654
G-mean: 0.51517
__________________________________________________
Classifier:  RandomForestClassifier
[[936   7]
 [ 30   7]]

TNR: 0.99258
TPR: 0.18919
TNRxTPR: 0.18778
G-mean: 0.43334
__________________________________________________
Ensemble predictions (majority voting):
[[929  14]
 [ 27  10]]

TNR: 0.98515
TPR: 0.27027
TNRxTPR: 0.26626
G-mean: 0.516


In [17]:
# VAEOversampler

vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)

2023-07-11 00:05:16.418703: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-11 00:05:16.418727: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-11 00:05:16.418744: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a-Modern-15-A5M): /proc/driver/nvidia/version does not exist
2023-07-11 00:05:16.418903: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 3772, 1.0: 3772})
Ratio->  1 : 1.0


In [19]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[938   5]
 [ 28   9]]

TNR: 0.9947
TPR: 0.24324
TNRxTPR: 0.24195
G-mean: 0.49189
__________________________________________________
Classifier:  LGBMClassifier
[[939   4]
 [ 28   9]]

TNR: 0.99576
TPR: 0.24324
TNRxTPR: 0.24221
G-mean: 0.49215
__________________________________________________
Classifier:  XGBClassifier
[[935   8]
 [ 28   9]]

TNR: 0.99152
TPR: 0.24324
TNRxTPR: 0.24118
G-mean: 0.4911
__________________________________________________
Classifier:  BaggingClassifier
[[938   5]
 [ 27  10]]

TNR: 0.9947
TPR: 0.27027
TNRxTPR: 0.26884
G-mean: 0.5185
__________________________________________________
Classifier:  RandomForestClassifier
[[940   3]
 [ 29   8]]

TNR: 0.99682
TPR: 0.21622
TNRxTPR: 0.21553
G-mean: 0.46425
__________________________________________________
Ensemble predictions (majority voting):
[[939   4]
 [ 30   7]]

TNR: 0.99576
TPR: 0.18919
TNRxTPR: 0.18839
G-mean: 0.43404


## Under/Oversampling combination  
Now we can tuning the number of instances for each class to optimize metric.  


In [30]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [33]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.15)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 3772, 0: 566})
Ratio->  1 : 6.7


In [34]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[722 221]
 [  7  30]]

TNR: 0.76564
TPR: 0.81081
TNRxTPR: 0.62079
G-mean: 0.7879
__________________________________________________
Classifier:  LGBMClassifier
[[776 167]
 [  8  29]]

TNR: 0.82291
TPR: 0.78378
TNRxTPR: 0.64498
G-mean: 0.80311
__________________________________________________
Classifier:  XGBClassifier
[[770 173]
 [  9  28]]

TNR: 0.81654
TPR: 0.75676
TNRxTPR: 0.61792
G-mean: 0.78608
__________________________________________________
Classifier:  BaggingClassifier
[[751 192]
 [ 12  25]]

TNR: 0.79639
TPR: 0.67568
TNRxTPR: 0.5381
G-mean: 0.73356
__________________________________________________
Classifier:  RandomForestClassifier
[[737 206]
 [ 10  27]]

TNR: 0.78155
TPR: 0.72973
TNRxTPR: 0.57032
G-mean: 0.75519
__________________________________________________
Ensemble predictions (majority voting):
[[754 189]
 [  8  29]]

TNR: 0.79958
TPR: 0.78378
TNRxTPR: 0.62669
G-mean: 0.79164


In [27]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [28]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.1)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 3772, 0: 377})
Ratio->  1 : 10.0


In [29]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[711 232]
 [  8  29]]

TNR: 0.75398
TPR: 0.78378
TNRxTPR: 0.59095
G-mean: 0.76874
__________________________________________________
Classifier:  LGBMClassifier
[[792 151]
 [  7  30]]

TNR: 0.83987
TPR: 0.81081
TNRxTPR: 0.68098
G-mean: 0.82521
__________________________________________________
Classifier:  XGBClassifier
[[779 164]
 [  9  28]]

TNR: 0.82609
TPR: 0.75676
TNRxTPR: 0.62515
G-mean: 0.79066
__________________________________________________
Classifier:  BaggingClassifier
[[814 129]
 [ 16  21]]

TNR: 0.8632
TPR: 0.56757
TNRxTPR: 0.48993
G-mean: 0.69995
__________________________________________________
Classifier:  RandomForestClassifier
[[825 118]
 [ 14  23]]

TNR: 0.87487
TPR: 0.62162
TNRxTPR: 0.54384
G-mean: 0.73745
__________________________________________________
Ensemble predictions (majority voting):
[[796 147]
 [ 10  27]]

TNR: 0.84411
TPR: 0.72973
TNRxTPR: 0.61598
G-mean: 0.78484


In [114]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=.5)

In [115]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.05)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 1959, 0.0: 189})
Ratio->  1 : 10.4


In [116]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[740 203]
 [ 10  27]]

TNR: 0.78473
TPR: 0.72973
TNRxTPR: 0.57264
G-mean: 0.75673
__________________________________________________
Classifier:  LGBMClassifier
[[750 193]
 [ 10  27]]

TNR: 0.79533
TPR: 0.72973
TNRxTPR: 0.58038
G-mean: 0.76183
__________________________________________________
Classifier:  XGBClassifier
[[743 200]
 [ 12  25]]

TNR: 0.78791
TPR: 0.67568
TNRxTPR: 0.53237
G-mean: 0.72964
__________________________________________________
Classifier:  BaggingClassifier
[[768 175]
 [ 11  26]]

TNR: 0.81442
TPR: 0.7027
TNRxTPR: 0.5723
G-mean: 0.7565
__________________________________________________
Classifier:  RandomForestClassifier
[[776 167]
 [  8  29]]

TNR: 0.82291
TPR: 0.78378
TNRxTPR: 0.64498
G-mean: 0.80311
__________________________________________________
Ensemble predictions (majority voting):
[[758 185]
 [ 10  27]]

TNR: 0.80382
TPR: 0.72973
TNRxTPR: 0.58657
G-mean: 0.76588


## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
