# Dealing with imbalanced datasets: combining VAE oversampling with undersampling to improve model recognition across all classes  

Import packages, classifiers, metrics, and resamplers.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-10 16:36:31.366345: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-10 16:36:31.366368: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

# Name of the benchmark dataset to pull out of the imbalanced-learn collection.
dset_name = 'abalone'
# fetch_datasets() is indexed by dataset name; the entry exposes `.data` and `.target`.
dset = fetch_datasets()[dset_name]

# Standardize the features; the targets are taken as-is.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split
# performed further down, so test-set statistics leak into the training features.
# Consider fitting the scaler on the training partition only.
X, y = StandardScaler().fit_transform(dset.data), dset.target

In [4]:
X

array([[-0.67483383, -0.68801788,  1.31667716, ..., -0.60768536,
        -0.72621157, -0.63821689],
       [-0.67483383, -0.68801788,  1.31667716, ..., -1.17090984,
        -1.20522124, -1.21298732],
       [ 1.48184628, -0.68801788, -0.75948762, ..., -0.4634999 ,
        -0.35668983, -0.20713907],
       ...,
       [-0.67483383, -0.68801788,  1.31667716, ...,  0.74855917,
         0.97541324,  0.49695471],
       [ 1.48184628, -0.68801788, -0.75948762, ...,  0.77334105,
         0.73362741,  0.41073914],
       [-0.67483383, -0.68801788,  1.31667716, ...,  2.64099341,
         1.78744868,  1.84048058]])

In [5]:
# Relabel the -1 class as 0 so the targets are 0/1 (see the class counts printed below).
# This is an in-place write: `y` is the same array object as `dset.target`, which changes too.
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
# Hold out 20% of the samples for testing. stratify=y keeps the class proportions the same
# in both partitions, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple helper for random undersampling (RUS): it keeps a chosen fraction of the majority-class rows and all of the minority-class rows.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every row that does not belong to the minority class.

    Parameters
    ----------
    X_res : array-like of shape (n_samples, n_features)
        Feature matrix (NumPy array or DataFrame). It is never modified.
    y_res : array-like of shape (n_samples,)
        Class labels, paired with the rows of ``X_res`` by position (any
        pandas index on ``y_res`` is ignored).
    frac : float, default=1
        Fraction of the non-minority rows to keep; forwarded to
        ``DataFrame.sample``.
    minority_class_id : scalar, default=1
        Label of the class that is kept in full.
    random_state : int, default=42
        Seed used for both draws.

    Returns
    -------
    X_eq : pandas.DataFrame
        The sampled non-minority rows followed by the (shuffled) minority
        rows, with a fresh ``RangeIndex``.
    y_eq : pandas.Series
        The matching labels, named ``'Class'``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` do not have the same number of rows.
    """
    # Work on a private, positionally indexed frame. The labels stay in a
    # separate array so that no helper column is written into the caller's
    # data and an existing 'Class' feature column cannot be clobbered.
    features = pd.DataFrame(X_res).reset_index(drop=True)
    labels = np.asarray(y_res).ravel()
    if len(features) != len(labels):
        raise ValueError(
            "X_res and y_res must have the same number of rows "
            "(got %d and %d)" % (len(features), len(labels))
        )

    is_minority = labels == minority_class_id
    kept_majority = features[~is_minority].sample(frac=frac, random_state=random_state)
    # frac=1 keeps every minority row and only shuffles their order.
    kept_minority = features[is_minority].sample(frac=1, random_state=random_state)

    # After reset_index the index labels are row positions, so they can be
    # used directly to pick the labels belonging to the kept rows.
    kept_positions = kept_majority.index.append(kept_minority.index).to_numpy()

    X_eq = pd.concat([kept_majority, kept_minority], ignore_index=True)
    y_eq = pd.Series(labels[kept_positions], name='Class')

    return X_eq, y_eq

In [8]:
def _print_rates(cm):
    """Print TNR, TPR, their product and the G-mean for a 2x2 confusion matrix."""
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    # Guard the denominators so a class that is absent from the true labels
    # yields nan instead of a divide-by-zero warning.
    tnr = tn / (tn + fp) if (tn + fp) else float('nan')
    tpr = tp / (tp + fn) if (tp + fn) else float('nan')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit five classifiers on (X, y) and report their scores on (Xt, yt).

    For each classifier the confusion matrix, TNR, TPR, TNR*TPR and G-mean
    are printed, followed by the same report for a majority vote over all
    classifiers. Nothing is returned.

    The confusion-matrix indexing and the vote counting treat the labels as
    binary 0/1 with 1 as the positive class.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : evaluation features and labels.
    random_state : int, default=42
        Seed handed to every classifier.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so the per-classifier votes stack consistently even if a
        # classifier hands back a column vector.
        y_pred = np.ravel(classifier.predict(Xt))
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        _print_rates(cm)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # Summing 0/1 predictions counts the votes for class 1. A strict majority
    # of however many classifiers are configured wins; for the five above
    # this is the same as "at least 3 votes".
    votes = np.sum(predictions, axis=0)
    ensemble = (votes > len(classifiers) / 2).astype(int)

    cm = confusion_matrix(yt, ensemble)
    print(cm)
    print('')
    _print_rates(cm)
    

## Without resampling (baseline) 
What is the starting score?


In [9]:
# Count the classes once and reuse the tally for both lines of the report.
class_counts = Counter(y)
print('Original dataset shape %s' % class_counts)
print('Ratio->', round(class_counts[0] / class_counts[1], 1), ': 1')

Original dataset shape Counter({0: 3786, 1: 391})
Ratio-> 9.7 : 1


In [10]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[744  14]
 [ 72   6]]

TNR: 0.98153
TPR: 0.07692
TNRxTPR: 0.0755
G-mean: 0.27478
__________________________________________________
Classifier:  LGBMClassifier
[[731  27]
 [ 64  14]]

TNR: 0.96438
TPR: 0.17949
TNRxTPR: 0.17309
G-mean: 0.41605
__________________________________________________
Classifier:  XGBClassifier
[[731  27]
 [ 67  11]]

TNR: 0.96438
TPR: 0.14103
TNRxTPR: 0.136
G-mean: 0.36878
__________________________________________________
Classifier:  BaggingClassifier
[[734  24]
 [ 63  15]]

TNR: 0.96834
TPR: 0.19231
TNRxTPR: 0.18622
G-mean: 0.43153
__________________________________________________
Classifier:  RandomForestClassifier
[[742  16]
 [ 71   7]]

TNR: 0.97889
TPR: 0.08974
TNRxTPR: 0.08785
G-mean: 0.29639
__________________________________________________
Ensemble predictions (majority voting):
[[741  17]
 [ 66  12]]

TNR: 0.97757
TPR: 0.15385
TNRxTPR: 0.1504
G-mean: 0.38781


## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [10]:
# SMOTE

# sampling_strategy=1 requests a minority/majority ratio of 1 after resampling
# (N_rm / N_M: resampled minority count over majority count), i.e. a balanced set.
# Only the training partition is resampled; the test partition is left untouched.
sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [11]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 3028, 1: 3028})
Ratio->  1 : 1.0


In [12]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[669  89]
 [ 39  39]]

TNR: 0.88259
TPR: 0.5
TNRxTPR: 0.44129
G-mean: 0.6643
__________________________________________________
Classifier:  LGBMClassifier
[[685  73]
 [ 45  33]]

TNR: 0.90369
TPR: 0.42308
TNRxTPR: 0.38233
G-mean: 0.61833
__________________________________________________
Classifier:  XGBClassifier
[[688  70]
 [ 52  26]]

TNR: 0.90765
TPR: 0.33333
TNRxTPR: 0.30255
G-mean: 0.55005
__________________________________________________
Classifier:  BaggingClassifier
[[676  82]
 [ 50  28]]

TNR: 0.89182
TPR: 0.35897
TNRxTPR: 0.32014
G-mean: 0.56581
__________________________________________________
Classifier:  RandomForestClassifier
[[684  74]
 [ 50  28]]

TNR: 0.90237
TPR: 0.35897
TNRxTPR: 0.32393
G-mean: 0.56915
__________________________________________________
Ensemble predictions (majority voting):
[[680  78]
 [ 49  29]]

TNR: 0.8971
TPR: 0.37179
TNRxTPR: 0.33354
G-mean: 0.57753


In [13]:
# ROS

# Random oversampling of the training partition. sampling_strategy=1 requests a
# minority/majority ratio of 1 after resampling (N_rm / N_M), i.e. a balanced set.
# Note that X_res / y_res from the SMOTE cells are overwritten here.
ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [14]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 3028, 1: 3028})
Ratio->  1 : 1.0


In [15]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[672  86]
 [ 38  40]]

TNR: 0.88654
TPR: 0.51282
TNRxTPR: 0.45464
G-mean: 0.67427
__________________________________________________
Classifier:  LGBMClassifier
[[682  76]
 [ 44  34]]

TNR: 0.89974
TPR: 0.4359
TNRxTPR: 0.39219
G-mean: 0.62625
__________________________________________________
Classifier:  XGBClassifier
[[702  56]
 [ 52  26]]

TNR: 0.92612
TPR: 0.33333
TNRxTPR: 0.30871
G-mean: 0.55561
__________________________________________________
Classifier:  BaggingClassifier
[[706  52]
 [ 60  18]]

TNR: 0.9314
TPR: 0.23077
TNRxTPR: 0.21494
G-mean: 0.46361
__________________________________________________
Classifier:  RandomForestClassifier
[[721  37]
 [ 62  16]]

TNR: 0.95119
TPR: 0.20513
TNRxTPR: 0.19512
G-mean: 0.44172
__________________________________________________
Ensemble predictions (majority voting):
[[698  60]
 [ 53  25]]

TNR: 0.92084
TPR: 0.32051
TNRxTPR: 0.29514
G-mean: 0.54327


In [11]:
# VAEOversampler

# Build the VAE-based oversampler and resample the training partition with it
# (the class counts printed below show 3028 samples per class afterwards).
# The `vae_sampler` object created here is reused by the later
# `vae_sampler.resample(...)` cell, so this cell has to be executed before that one.
# NOTE(review): the meaning of epochs / intermediate_dim / batch_size is defined by the
# project-local VAEOversampler class, which is not visible in this notebook.
vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)

2023-07-10 16:37:00.887465: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-10 16:37:00.887490: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-10 16:37:00.887507: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a-Modern-15-A5M): /proc/driver/nvidia/version does not exist
2023-07-10 16:37:00.887681: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 3028, 1.0: 3028})
Ratio->  1 : 1.0


In [13]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[742  16]
 [ 72   6]]

TNR: 0.97889
TPR: 0.07692
TNRxTPR: 0.0753
G-mean: 0.27441
__________________________________________________
Classifier:  LGBMClassifier
[[732  26]
 [ 67  11]]

TNR: 0.9657
TPR: 0.14103
TNRxTPR: 0.13619
G-mean: 0.36904
__________________________________________________
Classifier:  XGBClassifier
[[727  31]
 [ 64  14]]

TNR: 0.9591
TPR: 0.17949
TNRxTPR: 0.17215
G-mean: 0.41491
__________________________________________________
Classifier:  BaggingClassifier
[[735  23]
 [ 71   7]]

TNR: 0.96966
TPR: 0.08974
TNRxTPR: 0.08702
G-mean: 0.29499
__________________________________________________
Classifier:  RandomForestClassifier
[[741  17]
 [ 71   7]]

TNR: 0.97757
TPR: 0.08974
TNRxTPR: 0.08773
G-mean: 0.29619
__________________________________________________
Ensemble predictions (majority voting):
[[738  20]
 [ 70   8]]

TNR: 0.97361
TPR: 0.10256
TNRxTPR: 0.09986
G-mean: 0.316


## Under/Oversampling combination  
Now we can tune the number of instances of each class to optimize the metric.  


In [57]:
# SMOTE

# X_res / y_res were overwritten by the ROS and VAE cells above, so the SMOTE-balanced
# training set is regenerated here before it is undersampled in the next cell.
# sampling_strategy=1 -> minority/majority ratio of 1 after resampling (N_rm / N_M).
sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [64]:
# RUS

# Keep 20% of the majority (class 0) rows and every minority (class 1) row of the
# SMOTE-balanced training set, deliberately tilting the training data towards class 1.
X_eq, y_eq = RUS(X_res, y_res, frac=.2)

print('Resampled dataset shape %s' % Counter(y_eq))
# Reported as "1 : k", where k is the number of class-1 samples per class-0 sample.
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 3028, 0: 606})
Ratio->  1 : 5.0


In [65]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[511 247]
 [  9  69]]

TNR: 0.67414
TPR: 0.88462
TNRxTPR: 0.59636
G-mean: 0.77224
__________________________________________________
Classifier:  LGBMClassifier
[[545 213]
 [ 17  61]]

TNR: 0.719
TPR: 0.78205
TNRxTPR: 0.56229
G-mean: 0.74986
__________________________________________________
Classifier:  XGBClassifier
[[536 222]
 [ 17  61]]

TNR: 0.70712
TPR: 0.78205
TNRxTPR: 0.55301
G-mean: 0.74364
__________________________________________________
Classifier:  BaggingClassifier
[[556 202]
 [ 20  58]]

TNR: 0.73351
TPR: 0.74359
TNRxTPR: 0.54543
G-mean: 0.73853
__________________________________________________
Classifier:  RandomForestClassifier
[[544 214]
 [ 12  66]]

TNR: 0.71768
TPR: 0.84615
TNRxTPR: 0.60727
G-mean: 0.77927
__________________________________________________
Ensemble predictions (majority voting):
[[541 217]
 [ 14  64]]

TNR: 0.71372
TPR: 0.82051
TNRxTPR: 0.58562
G-mean: 0.76526


In [31]:
# ROS

# Regenerate the randomly oversampled training set (X_res / y_res currently hold the
# SMOTE result) before it is undersampled in the next cell.
# sampling_strategy=1 -> minority/majority ratio of 1 after resampling (N_rm / N_M).
ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [32]:
# RUS

# Keep 30% of the majority (class 0) rows and every minority (class 1) row of the
# randomly oversampled training set.
X_eq, y_eq = RUS(X_res, y_res, frac=.3)

print('Resampled dataset shape %s' % Counter(y_eq))
# Reported as "1 : k", where k is the number of class-1 samples per class-0 sample.
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 3028, 0: 908})
Ratio->  1 : 3.3


In [33]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[562 196]
 [ 14  64]]

TNR: 0.74142
TPR: 0.82051
TNRxTPR: 0.60835
G-mean: 0.77997
__________________________________________________
Classifier:  LGBMClassifier
[[598 160]
 [ 25  53]]

TNR: 0.78892
TPR: 0.67949
TNRxTPR: 0.53606
G-mean: 0.73216
__________________________________________________
Classifier:  XGBClassifier
[[614 144]
 [ 28  50]]

TNR: 0.81003
TPR: 0.64103
TNRxTPR: 0.51925
G-mean: 0.72059
__________________________________________________
Classifier:  BaggingClassifier
[[627 131]
 [ 36  42]]

TNR: 0.82718
TPR: 0.53846
TNRxTPR: 0.4454
G-mean: 0.66739
__________________________________________________
Classifier:  RandomForestClassifier
[[627 131]
 [ 27  51]]

TNR: 0.82718
TPR: 0.65385
TNRxTPR: 0.54085
G-mean: 0.73542
__________________________________________________
Ensemble predictions (majority voting):
[[607 151]
 [ 25  53]]

TNR: 0.80079
TPR: 0.67949
TNRxTPR: 0.54413
G-mean: 0.73765


In [54]:
# VAEOversampler

# Reuses the `vae_sampler` object created and fitted in the earlier VAEOversampler cell,
# so that cell must have been executed first (hidden-state dependency).
# NOTE(review): presumably `resample` draws new samples without refitting the VAE —
# confirm against the VAEOversampler implementation.
X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1)

In [55]:
# RUS

# Keep 13% of the majority (class 0) rows and every minority (class 1) row of the
# VAE-oversampled training set.
X_eq, y_eq = RUS(X_res, y_res, frac=.13)

print('Resampled dataset shape %s' % Counter(y_eq))
# Reported as "1 : k", where k is the number of class-1 samples per class-0 sample.
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 3028, 0.0: 394})
Ratio->  1 : 7.7


In [56]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[587 171]
 [ 16  62]]

TNR: 0.77441
TPR: 0.79487
TNRxTPR: 0.61555
G-mean: 0.78457
__________________________________________________
Classifier:  LGBMClassifier
[[585 173]
 [ 20  58]]

TNR: 0.77177
TPR: 0.74359
TNRxTPR: 0.57388
G-mean: 0.75755
__________________________________________________
Classifier:  XGBClassifier
[[592 166]
 [ 14  64]]

TNR: 0.781
TPR: 0.82051
TNRxTPR: 0.64082
G-mean: 0.80051
__________________________________________________
Classifier:  BaggingClassifier
[[608 150]
 [ 26  52]]

TNR: 0.80211
TPR: 0.66667
TNRxTPR: 0.53474
G-mean: 0.73126
__________________________________________________
Classifier:  RandomForestClassifier
[[600 158]
 [ 17  61]]

TNR: 0.79156
TPR: 0.78205
TNRxTPR: 0.61904
G-mean: 0.78679
__________________________________________________
Ensemble predictions (majority voting):
[[600 158]
 [ 15  63]]

TNR: 0.79156
TPR: 0.80769
TNRxTPR: 0.63933
G-mean: 0.79958


## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
