# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

Import packages, classifiers and etc.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-11 12:34:41.947793: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-11 12:34:41.947823: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load some dataset from Imbalanced Learn list (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

dset_name = 'mammography'
dset = fetch_datasets()[dset_name]

X, y = StandardScaler().fit_transform(dset.data), dset.target

In [4]:
X

array([[ 0.23002989,  5.07280511, -0.27607289,  0.83248134, -0.37788263,
         0.48034378],
       [ 0.15549807, -0.16939796,  0.67068217, -0.85959098, -0.37788263,
        -0.94576553],
       [-0.78444989, -0.44367356,  5.67495902, -0.85959098, -0.37788263,
        -0.94576553],
       ...,
       [ 1.20504168,  1.76380266, -0.50149077,  1.56247766,  6.48936266,
         0.93133561],
       [ 0.73667692, -0.22248356, -0.05065502,  1.5097322 ,  0.53929325,
         1.31528811],
       [ 0.17701066, -0.19151695, -0.50149077,  1.5789342 ,  7.75105157,
         1.55602027]])

In [5]:
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple function to undersample freely.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    X_res = pd.DataFrame(X_res)
    X_res['Class'] = y_res

    X_neg = X_res[y_res != minority_class_id].sample(frac=frac, random_state=random_state)
    X_pos = X_res[y_res == minority_class_id].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [8]:
def train_val(X, y, Xt, yt, random_state=42):
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    scores = []
    predictions = []
    for key, classifier in classifiers.items():
        print('_' * 50)
        name = key
        classifier.fit(X, y)
        print("Classifier: ", name)
        y_pred = classifier.predict(Xt)
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        tn = cm[0,0]
        fp = cm[0,1]
        fn = cm[1,0]
        tp = cm[1,1]
        tnr = tn / (tn + fp)
        tpr = tp / (tp + fn)
        scores.append(tnr * tpr)
        print('TNR:', round(tnr, 5))
        print('TPR:', round(tpr, 5))
        print('TNRxTPR:', round(tnr * tpr, 5))
        print('G-mean:', round(np.sqrt(tnr * tpr), 5))

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    predictions = np.sum(predictions, axis=0)
    predictions[predictions < 3] = 0
    predictions[predictions >= 3] = 1

    cm = confusion_matrix(yt, predictions)
    print(cm)
    tn = cm[0,0]
    fp = cm[0,1]
    fn = cm[1,0]
    tp = cm[1,1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


## Without resampling (base line) 
Which is starting score?


In [9]:
print('Original dataset shape %s' % Counter(y))
print('Ratio->', round(Counter(y)[0]/Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 10923, 1: 260})
Ratio-> 42.0 : 1


In [10]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2178    7]
 [  18   34]]

TNR: 0.9968
TPR: 0.65385
TNRxTPR: 0.65175
G-mean: 0.80731
__________________________________________________
Classifier:  LGBMClassifier
[[2180    5]
 [  16   36]]

TNR: 0.99771
TPR: 0.69231
TNRxTPR: 0.69072
G-mean: 0.8311
__________________________________________________
Classifier:  XGBClassifier
[[2176    9]
 [  20   32]]

TNR: 0.99588
TPR: 0.61538
TNRxTPR: 0.61285
G-mean: 0.78285
__________________________________________________
Classifier:  BaggingClassifier
[[2180    5]
 [  27   25]]

TNR: 0.99771
TPR: 0.48077
TNRxTPR: 0.47967
G-mean: 0.69258
__________________________________________________
Classifier:  RandomForestClassifier
[[2181    4]
 [  27   25]]

TNR: 0.99817
TPR: 0.48077
TNRxTPR: 0.47989
G-mean: 0.69274
__________________________________________________
Ensemble predictions (majority voting):
[[2182    3]
 [  22   30]]

TNR: 0.99863
TPR: 0.57692
TNRxTPR: 0.57

## Ratio 1:1  
Let's see classifiers scores when dataset is balanced.  


In [11]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [12]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 8738, 1: 8738})
Ratio->  1 : 1.0


In [13]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2163   22]
 [  15   37]]

TNR: 0.98993
TPR: 0.71154
TNRxTPR: 0.70437
G-mean: 0.83927
__________________________________________________
Classifier:  LGBMClassifier
[[2166   19]
 [   7   45]]

TNR: 0.9913
TPR: 0.86538
TNRxTPR: 0.85786
G-mean: 0.92621
__________________________________________________
Classifier:  XGBClassifier
[[2169   16]
 [  12   40]]

TNR: 0.99268
TPR: 0.76923
TNRxTPR: 0.7636
G-mean: 0.87384
__________________________________________________
Classifier:  BaggingClassifier
[[2150   35]
 [   8   44]]

TNR: 0.98398
TPR: 0.84615
TNRxTPR: 0.8326
G-mean: 0.91247
__________________________________________________
Classifier:  RandomForestClassifier
[[2159   26]
 [  10   42]]

TNR: 0.9881
TPR: 0.80769
TNRxTPR: 0.79808
G-mean: 0.89335
__________________________________________________
Ensemble predictions (majority voting):
[[2168   17]
 [  10   42]]

TNR: 0.99222
TPR: 0.80769
TNRxTPR: 0.8014

In [14]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [15]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 8738, 1: 8738})
Ratio->  1 : 1.0


In [16]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2172   13]
 [   8   44]]

TNR: 0.99405
TPR: 0.84615
TNRxTPR: 0.84112
G-mean: 0.91713
__________________________________________________
Classifier:  LGBMClassifier
[[2171   14]
 [  12   40]]

TNR: 0.99359
TPR: 0.76923
TNRxTPR: 0.7643
G-mean: 0.87424
__________________________________________________
Classifier:  XGBClassifier
[[2173   12]
 [  13   39]]

TNR: 0.99451
TPR: 0.75
TNRxTPR: 0.74588
G-mean: 0.86364
__________________________________________________
Classifier:  BaggingClassifier
[[2168   17]
 [  19   33]]

TNR: 0.99222
TPR: 0.63462
TNRxTPR: 0.62968
G-mean: 0.79352
__________________________________________________
Classifier:  RandomForestClassifier
[[2178    7]
 [  19   33]]

TNR: 0.9968
TPR: 0.63462
TNRxTPR: 0.63258
G-mean: 0.79535
__________________________________________________
Ensemble predictions (majority voting):
[[2177    8]
 [  12   40]]

TNR: 0.99634
TPR: 0.76923
TNRxTPR: 0.76641

In [17]:
# VAEOversampler

vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)

2023-07-11 12:35:46.985803: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-11 12:35:46.985842: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-11 12:35:46.985863: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a-Modern-15-A5M): /proc/driver/nvidia/version does not exist
2023-07-11 12:35:46.986094: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 8738, 1.0: 8738})
Ratio->  1 : 1.0


In [19]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2179    6]
 [  21   31]]

TNR: 0.99725
TPR: 0.59615
TNRxTPR: 0.59452
G-mean: 0.77105
__________________________________________________
Classifier:  LGBMClassifier
[[2178    7]
 [  21   31]]

TNR: 0.9968
TPR: 0.59615
TNRxTPR: 0.59424
G-mean: 0.77087
__________________________________________________
Classifier:  XGBClassifier
[[2179    6]
 [  22   30]]

TNR: 0.99725
TPR: 0.57692
TNRxTPR: 0.57534
G-mean: 0.75851
__________________________________________________
Classifier:  BaggingClassifier
[[2179    6]
 [  27   25]]

TNR: 0.99725
TPR: 0.48077
TNRxTPR: 0.47945
G-mean: 0.69242
__________________________________________________
Classifier:  RandomForestClassifier
[[2181    4]
 [  23   29]]

TNR: 0.99817
TPR: 0.55769
TNRxTPR: 0.55667
G-mean: 0.7461
__________________________________________________
Ensemble predictions (majority voting):
[[2181    4]
 [  22   30]]

TNR: 0.99817
TPR: 0.57692
TNRxTPR: 0.57

## Under/Oversampling combination  
Now we can tuning the number of instances for each class to optimize metric.  


In [20]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [21]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.2)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 8738, 0: 1748})
Ratio->  1 : 5.0


In [22]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2082  103]
 [   3   49]]

TNR: 0.95286
TPR: 0.94231
TNRxTPR: 0.89789
G-mean: 0.94757
__________________________________________________
Classifier:  LGBMClassifier
[[2097   88]
 [   3   49]]

TNR: 0.95973
TPR: 0.94231
TNRxTPR: 0.90436
G-mean: 0.95098
__________________________________________________
Classifier:  XGBClassifier
[[2091   94]
 [   4   48]]

TNR: 0.95698
TPR: 0.92308
TNRxTPR: 0.88337
G-mean: 0.93988
__________________________________________________
Classifier:  BaggingClassifier
[[2078  107]
 [   4   48]]

TNR: 0.95103
TPR: 0.92308
TNRxTPR: 0.87787
G-mean: 0.93695
__________________________________________________
Classifier:  RandomForestClassifier
[[2075  110]
 [   4   48]]

TNR: 0.94966
TPR: 0.92308
TNRxTPR: 0.87661
G-mean: 0.93627
__________________________________________________
Ensemble predictions (majority voting):
[[2092   93]
 [   4   48]]

TNR: 0.95744
TPR: 0.92308
TNRxTPR: 0.

In [23]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [24]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.15)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 8738, 0: 1311})
Ratio->  1 : 6.7


In [25]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2099   86]
 [   2   50]]

TNR: 0.96064
TPR: 0.96154
TNRxTPR: 0.92369
G-mean: 0.96109
__________________________________________________
Classifier:  LGBMClassifier
[[2121   64]
 [   5   47]]

TNR: 0.97071
TPR: 0.90385
TNRxTPR: 0.87737
G-mean: 0.93668
__________________________________________________
Classifier:  XGBClassifier
[[2111   74]
 [   3   49]]

TNR: 0.96613
TPR: 0.94231
TNRxTPR: 0.91039
G-mean: 0.95415
__________________________________________________
Classifier:  BaggingClassifier
[[2110   75]
 [   5   47]]

TNR: 0.96568
TPR: 0.90385
TNRxTPR: 0.87282
G-mean: 0.93425
__________________________________________________
Classifier:  RandomForestClassifier
[[2135   50]
 [   7   45]]

TNR: 0.97712
TPR: 0.86538
TNRxTPR: 0.84558
G-mean: 0.91956
__________________________________________________
Ensemble predictions (majority voting):
[[2124   61]
 [   5   47]]

TNR: 0.97208
TPR: 0.90385
TNRxTPR: 0.

In [39]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1.3)

In [40]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.08)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 11297, 0.0: 699})
Ratio->  1 : 16.2


In [41]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[2115   70]
 [   5   47]]

TNR: 0.96796
TPR: 0.90385
TNRxTPR: 0.87489
G-mean: 0.93536
__________________________________________________
Classifier:  LGBMClassifier
[[2108   77]
 [   3   49]]

TNR: 0.96476
TPR: 0.94231
TNRxTPR: 0.9091
G-mean: 0.95347
__________________________________________________
Classifier:  XGBClassifier
[[2119   66]
 [   2   50]]

TNR: 0.96979
TPR: 0.96154
TNRxTPR: 0.93249
G-mean: 0.96566
__________________________________________________
Classifier:  BaggingClassifier
[[2120   65]
 [  10   42]]

TNR: 0.97025
TPR: 0.80769
TNRxTPR: 0.78366
G-mean: 0.88525
__________________________________________________
Classifier:  RandomForestClassifier
[[2129   56]
 [   4   48]]

TNR: 0.97437
TPR: 0.92308
TNRxTPR: 0.89942
G-mean: 0.94838
__________________________________________________
Ensemble predictions (majority voting):
[[2132   53]
 [   3   49]]

TNR: 0.97574
TPR: 0.94231
TNRxTPR: 0.9

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
