# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import packages, classifiers, etc.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-10 17:05:07.099332: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-10 17:05:07.099354: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

# Name of the imbalanced-learn benchmark dataset used throughout the notebook.
dset_name = 'us_crime'
# filter_data restricts the result to the requested dataset instead of
# building every benchmark dataset just to pick one of them.
dset = fetch_datasets(filter_data=(dset_name,))[dset_name]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the training features.
# Fitting it on the training partition only would be cleaner, but it would
# change every score recorded below.
X, y = StandardScaler().fit_transform(dset.data), dset.target

In [4]:
# Show the standardized feature matrix (numpy elides the middle of large arrays).
X

array([[ 1.04361188, -0.81499701, -0.6300017 , ...,  0.13369936,
         0.1673161 ,  0.94039921],
       [-0.45393678, -1.85363638, -0.23533523, ..., -0.55581688,
         1.25903177, -0.39144686],
       [-0.45393678, -0.26512911,  1.22493074, ..., -0.11255644,
        -0.61871918, -0.39144686],
       ...,
       [ 0.80715683, -0.57061128,  0.27773119, ...,  0.42920632,
         0.07997885,  3.39599039],
       [ 0.17661003,  0.2847388 , -0.47213511, ...,  0.72471328,
         0.73500825,  0.52419731],
       [ 1.12243023,  1.9343425 , -0.15640193, ...,  0.330704  ,
        -0.4877133 ,  3.77057209]])

In [5]:
# Recode the majority label from -1 to 0 so the classes are 0/1, which
# train_val's confusion-matrix indexing and vote counting rely on.
# The assignment edits the label array in place.
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# the same in both partitions, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple helper for random undersampling (RUS): it keeps a chosen fraction of the non-minority rows and all of the minority rows.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every row that does not belong to the minority class.

    Parameters
    ----------
    X_res : array-like or pandas.DataFrame of shape (n_samples, n_features)
        Feature matrix. It is copied, so the caller's object is never modified.
    y_res : array-like of length n_samples
        Class labels, matched to the rows of ``X_res`` by position (any pandas
        index carried by ``y_res`` is ignored).
    frac : float, default=1
        Fraction of the non-minority rows to keep.
    minority_class_id : default=1
        Label of the class whose rows are all kept.
    random_state : int, default=42
        Seed passed to ``DataFrame.sample`` for both draws.

    Returns
    -------
    X_eq : pandas.DataFrame
        Sampled non-minority rows followed by the (shuffled) minority rows,
        with a fresh 0..n-1 index.
    y_eq : pandas.Series
        Labels of the rows in ``X_eq``; the series is named ``'Class'``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` do not have the same number of rows.
    """
    # Copy first: adding the helper column must not leak into the caller's frame.
    frame = pd.DataFrame(X_res).copy()

    # Use plain positional labels. Assigning a Series would align on its index,
    # which silently yields NaN labels (or an indexing error on the boolean
    # masks) whenever that index differs from the frame's index.
    labels = np.asarray(y_res).ravel()
    if len(labels) != len(frame):
        raise ValueError(
            "X_res and y_res must have the same number of rows "
            "(got %d and %d)" % (len(frame), len(labels))
        )
    frame['Class'] = labels

    X_neg = frame[labels != minority_class_id].sample(frac=frac, random_state=random_state)
    # frac=1 keeps every minority row; sample() only shuffles their order.
    X_pos = frame[labels == minority_class_id].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [8]:
def _report_binary_metrics(y_true, y_pred):
    """Print the 2x2 confusion matrix for 0/1 labels, then TNR, TPR, TNRxTPR and G-mean."""
    # labels=[0, 1] pins the matrix to 2x2 even if one class never occurs in
    # y_true/y_pred; without it the matrix can shrink and the unpacking fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm)
    print('')
    tn, fp, fn, tp = cm.ravel()
    # A class with no true samples has an undefined rate; report NaN for it.
    tnr = tn / (tn + fp) if (tn + fp) else float('nan')
    tpr = tp / (tp + fn) if (tp + fn) else float('nan')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit a fixed set of classifiers and print their scores on a test set.

    Every classifier is trained on ``(X, y)`` and evaluated on ``(Xt, yt)``.
    For each one the confusion matrix, true-negative rate, true-positive rate,
    their product and the G-mean are printed. Finally the same report is
    printed for a majority vote over all classifiers.

    Labels are expected to be encoded as 0 (negative) and 1 (positive): the
    vote is computed by summing the predicted labels.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : test features and labels.
    random_state : int, default=42
        Seed handed to every classifier.

    Returns
    -------
    None
        Results are only printed.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so every model contributes a 1-D vote vector of equal shape.
        y_pred = np.asarray(classifier.predict(Xt)).ravel()
        predictions.append(y_pred)
        _report_binary_metrics(yt, y_pred)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    votes = np.sum(predictions, axis=0)
    # Strict majority derived from the number of classifiers (3 of 5 here),
    # so the threshold stays right if the classifier set is changed.
    majority = len(classifiers) // 2 + 1
    ensemble_pred = (votes >= majority).astype(int)
    _report_binary_metrics(yt, ensemble_pred)


## Without resampling (baseline) 
What is the starting score?


In [9]:
# Count each class once and reuse the tally for both lines.
class_counts = Counter(y)
print('Original dataset shape %s' % class_counts)
print('Ratio->', round(class_counts[0] / class_counts[1], 1), ': 1')

Original dataset shape Counter({0: 1844, 1: 150})
Ratio-> 12.3 : 1


In [10]:
# Baseline: train on the original, imbalanced training split and score on the held-out test split.
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[362   7]
 [ 15  15]]

TNR: 0.98103
TPR: 0.5
TNRxTPR: 0.49051
G-mean: 0.70037
__________________________________________________
Classifier:  LGBMClassifier
[[361   8]
 [ 18  12]]

TNR: 0.97832
TPR: 0.4
TNRxTPR: 0.39133
G-mean: 0.62556
__________________________________________________
Classifier:  XGBClassifier
[[361   8]
 [ 13  17]]

TNR: 0.97832
TPR: 0.56667
TNRxTPR: 0.55438
G-mean: 0.74457
__________________________________________________
Classifier:  BaggingClassifier
[[361   8]
 [ 18  12]]

TNR: 0.97832
TPR: 0.4
TNRxTPR: 0.39133
G-mean: 0.62556
__________________________________________________
Classifier:  RandomForestClassifier
[[363   6]
 [ 17  13]]

TNR: 0.98374
TPR: 0.43333
TNRxTPR: 0.42629
G-mean: 0.65291
__________________________________________________
Ensemble predictions (majority voting):
[[362   7]
 [ 17  13]]

TNR: 0.98103
TPR: 0.43333
TNRxTPR: 0.42511
G-mean: 0.65201


## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [11]:
# SMOTE

# sampling_strategy is the target ratio N_rm / N_M: minority count after
# resampling divided by the majority count. 1 means fully balanced classes.
sm = SMOTE(
    sampling_strategy=1,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [12]:
# Tally the resampled labels once and reuse the result.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0: 1475, 1: 1475})
Ratio->  1 : 1.0


In [13]:
# Train on the SMOTE-balanced data; evaluation still uses the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[348  21]
 [ 12  18]]

TNR: 0.94309
TPR: 0.6
TNRxTPR: 0.56585
G-mean: 0.75223
__________________________________________________
Classifier:  LGBMClassifier
[[348  21]
 [ 13  17]]

TNR: 0.94309
TPR: 0.56667
TNRxTPR: 0.53442
G-mean: 0.73104
__________________________________________________
Classifier:  XGBClassifier
[[354  15]
 [ 11  19]]

TNR: 0.95935
TPR: 0.63333
TNRxTPR: 0.60759
G-mean: 0.77948
__________________________________________________
Classifier:  BaggingClassifier
[[349  20]
 [ 16  14]]

TNR: 0.9458
TPR: 0.46667
TNRxTPR: 0.44137
G-mean: 0.66436
__________________________________________________
Classifier:  RandomForestClassifier
[[351  18]
 [ 12  18]]

TNR: 0.95122
TPR: 0.6
TNRxTPR: 0.57073
G-mean: 0.75547
__________________________________________________
Ensemble predictions (majority voting):
[[350  19]
 [ 13  17]]

TNR: 0.94851
TPR: 0.56667
TNRxTPR: 0.53749
G-mean: 0.73314


In [14]:
# ROS

# sampling_strategy is the target ratio N_rm / N_M: minority count after
# resampling divided by the majority count. 1 means fully balanced classes.
ros = RandomOverSampler(
    sampling_strategy=1,
    random_state=42,
)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [15]:
# Tally the resampled labels once and reuse the result.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0: 1475, 1: 1475})
Ratio->  1 : 1.0


In [16]:
# Train on the randomly oversampled (balanced) data; evaluation still uses the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[354  15]
 [ 12  18]]

TNR: 0.95935
TPR: 0.6
TNRxTPR: 0.57561
G-mean: 0.75869
__________________________________________________
Classifier:  LGBMClassifier
[[356  13]
 [ 15  15]]

TNR: 0.96477
TPR: 0.5
TNRxTPR: 0.48238
G-mean: 0.69454
__________________________________________________
Classifier:  XGBClassifier
[[353  16]
 [ 13  17]]

TNR: 0.95664
TPR: 0.56667
TNRxTPR: 0.5421
G-mean: 0.73627
__________________________________________________
Classifier:  BaggingClassifier
[[351  18]
 [ 20  10]]

TNR: 0.95122
TPR: 0.33333
TNRxTPR: 0.31707
G-mean: 0.56309
__________________________________________________
Classifier:  RandomForestClassifier
[[357  12]
 [ 15  15]]

TNR: 0.96748
TPR: 0.5
TNRxTPR: 0.48374
G-mean: 0.69551
__________________________________________________
Ensemble predictions (majority voting):
[[355  14]
 [ 14  16]]

TNR: 0.96206
TPR: 0.53333
TNRxTPR: 0.5131
G-mean: 0.71631


In [17]:
# VAEOversampler

# Configure the VAE-based oversampler and resample the training split.
# The fitted `vae_sampler` object is reused by a later cell via `.resample(...)`.
# NOTE(review): the exact meaning of the constructor arguments is defined in
# the VAEOversampler module, which is not part of this notebook - confirm there.
vae_sampler = VAEOversampler(
    epochs=500,
    intermediate_dim=512,
    batch_size=64,
    random_state=42,
    verbose=False,
)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [18]:
# Tally the resampled labels once and reuse the result.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0.0: 1475, 1.0: 1475})
Ratio->  1 : 1.0


In [19]:
# Train on the VAE-oversampled data; evaluation still uses the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[359  10]
 [ 15  15]]

TNR: 0.9729
TPR: 0.5
TNRxTPR: 0.48645
G-mean: 0.69746
__________________________________________________
Classifier:  LGBMClassifier
[[363   6]
 [ 17  13]]

TNR: 0.98374
TPR: 0.43333
TNRxTPR: 0.42629
G-mean: 0.65291
__________________________________________________
Classifier:  XGBClassifier
[[361   8]
 [ 13  17]]

TNR: 0.97832
TPR: 0.56667
TNRxTPR: 0.55438
G-mean: 0.74457
__________________________________________________
Classifier:  BaggingClassifier
[[362   7]
 [ 17  13]]

TNR: 0.98103
TPR: 0.43333
TNRxTPR: 0.42511
G-mean: 0.65201
__________________________________________________
Classifier:  RandomForestClassifier
[[363   6]
 [ 18  12]]

TNR: 0.98374
TPR: 0.4
TNRxTPR: 0.3935
G-mean: 0.62729
__________________________________________________
Ensemble predictions (majority voting):
[[362   7]
 [ 14  16]]

TNR: 0.98103
TPR: 0.53333
TNRxTPR: 0.52322
G-mean: 0.72334


## Under/Oversampling combination  
Now we can tune the number of instances in each class to optimize the metric.  


In [38]:
# SMOTE

# sampling_strategy is the target ratio N_rm / N_M: oversample the minority
# class until it reaches 80% of the majority-class count.
sm = SMOTE(
    sampling_strategy=.8,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [39]:
# RUS

# Keep 20% of the non-minority rows; RUS keeps every minority row.
X_eq, y_eq = RUS(X_res, y_res, frac=0.2)

# Tally the final labels once and reuse the result for both lines.
eq_counts = Counter(y_eq)
print('Resampled dataset shape %s' % eq_counts)
print('Ratio->  1 :', round(eq_counts[1] / eq_counts[0], 1))

Resampled dataset shape Counter({1: 1180, 0: 295})
Ratio->  1 : 4.0


In [40]:
# Train on the SMOTE-oversampled, then randomly undersampled data; evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[306  63]
 [  3  27]]

TNR: 0.82927
TPR: 0.9
TNRxTPR: 0.74634
G-mean: 0.86391
__________________________________________________
Classifier:  LGBMClassifier
[[316  53]
 [  4  26]]

TNR: 0.85637
TPR: 0.86667
TNRxTPR: 0.74219
G-mean: 0.8615
__________________________________________________
Classifier:  XGBClassifier
[[317  52]
 [  4  26]]

TNR: 0.85908
TPR: 0.86667
TNRxTPR: 0.74453
G-mean: 0.86286
__________________________________________________
Classifier:  BaggingClassifier
[[323  46]
 [  4  26]]

TNR: 0.87534
TPR: 0.86667
TNRxTPR: 0.75863
G-mean: 0.87099
__________________________________________________
Classifier:  RandomForestClassifier
[[312  57]
 [  3  27]]

TNR: 0.84553
TPR: 0.9
TNRxTPR: 0.76098
G-mean: 0.87234
__________________________________________________
Ensemble predictions (majority voting):
[[313  56]
 [  4  26]]

TNR: 0.84824
TPR: 0.86667
TNRxTPR: 0.73514
G-mean: 0.8574


In [23]:
# ROS

# sampling_strategy is the target ratio N_rm / N_M: oversample the minority
# class until it reaches 80% of the majority-class count.
ros = RandomOverSampler(
    sampling_strategy=.8,
    random_state=42,
)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [24]:
# RUS

# Keep 20% of the non-minority rows; RUS keeps every minority row.
X_eq, y_eq = RUS(X_res, y_res, frac=0.2)

# Tally the final labels once and reuse the result for both lines.
eq_counts = Counter(y_eq)
print('Resampled dataset shape %s' % eq_counts)
print('Ratio->  1 :', round(eq_counts[1] / eq_counts[0], 1))

Resampled dataset shape Counter({1: 1180, 0: 295})
Ratio->  1 : 4.0


In [25]:
# Train on the randomly oversampled, then randomly undersampled data; evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[311  58]
 [  2  28]]

TNR: 0.84282
TPR: 0.93333
TNRxTPR: 0.78663
G-mean: 0.88692
__________________________________________________
Classifier:  LGBMClassifier
[[317  52]
 [  5  25]]

TNR: 0.85908
TPR: 0.83333
TNRxTPR: 0.7159
G-mean: 0.84611
__________________________________________________
Classifier:  XGBClassifier
[[318  51]
 [  8  22]]

TNR: 0.86179
TPR: 0.73333
TNRxTPR: 0.63198
G-mean: 0.79497
__________________________________________________
Classifier:  BaggingClassifier
[[313  56]
 [  6  24]]

TNR: 0.84824
TPR: 0.8
TNRxTPR: 0.67859
G-mean: 0.82377
__________________________________________________
Classifier:  RandomForestClassifier
[[326  43]
 [  4  26]]

TNR: 0.88347
TPR: 0.86667
TNRxTPR: 0.76567
G-mean: 0.87503
__________________________________________________
Ensemble predictions (majority voting):
[[318  51]
 [  3  27]]

TNR: 0.86179
TPR: 0.9
TNRxTPR: 0.77561
G-mean: 0.88069


In [32]:
# VAEOversampler

# Reuses the `vae_sampler` object created and fitted in the earlier VAEOversampler cell,
# so that cell must have been run first.
# NOTE(review): `resample` presumably draws new samples from the already trained
# model without refitting - confirm in the VAEOversampler module.
X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=.9)



In [33]:
# RUS

# Keep 18% of the non-minority rows; RUS keeps every minority row.
X_eq, y_eq = RUS(X_res, y_res, frac=0.18)

# Tally the final labels once and reuse the result for both lines.
eq_counts = Counter(y_eq)
print('Resampled dataset shape %s' % eq_counts)
print('Ratio->  1 :', round(eq_counts[1] / eq_counts[0], 1))

Resampled dataset shape Counter({1.0: 1339, 0.0: 266})
Ratio->  1 : 5.0


In [34]:
# Train on the VAE-oversampled, then randomly undersampled data; evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[329  40]
 [  3  27]]

TNR: 0.8916
TPR: 0.9
TNRxTPR: 0.80244
G-mean: 0.89579
__________________________________________________
Classifier:  LGBMClassifier
[[327  42]
 [  3  27]]

TNR: 0.88618
TPR: 0.9
TNRxTPR: 0.79756
G-mean: 0.89306
__________________________________________________
Classifier:  XGBClassifier
[[327  42]
 [  6  24]]

TNR: 0.88618
TPR: 0.8
TNRxTPR: 0.70894
G-mean: 0.84199
__________________________________________________
Classifier:  BaggingClassifier
[[329  40]
 [  3  27]]

TNR: 0.8916
TPR: 0.9
TNRxTPR: 0.80244
G-mean: 0.89579
__________________________________________________
Classifier:  RandomForestClassifier
[[326  43]
 [  6  24]]

TNR: 0.88347
TPR: 0.8
TNRxTPR: 0.70678
G-mean: 0.8407
__________________________________________________
Ensemble predictions (majority voting):
[[329  40]
 [  3  27]]

TNR: 0.8916
TPR: 0.9
TNRxTPR: 0.80244
G-mean: 0.89579


## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
