# Dealing with imbalanced datasets: combining VAE-based oversampling with undersampling to improve model recognition across all classes.  

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import packages, classifiers, etc.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-10 17:01:35.712257: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-10 17:01:35.712315: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load one of the datasets from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

# Pick one benchmark dataset by name from the imbalanced-learn collection.
dset_name = 'isolet'
dset = fetch_datasets()[dset_name]

# Standardize the features and keep the labels as provided.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
X = StandardScaler().fit_transform(dset.data)
y = dset.target

In [4]:
# Display the standardized feature matrix (the array repr is abbreviated).
X

array([[-0.22814972, -0.73141297, -0.53755132, ...,  1.34514074,
         1.52756432, -0.58066277],
       [-0.20871631, -0.81514481, -0.30610169, ...,  0.72576266,
         1.12135984,  0.53729779],
       [ 0.64381879,  0.22275509,  0.47152008, ...,  0.1993505 ,
         0.23263731, -0.54962399],
       ...,
       [-1.20066498, -1.60622325, -0.57857546, ..., -0.33712804,
        -0.0257472 , -0.84759631],
       [-0.80692726, -0.99198146,  0.4996859 , ...,  0.34856969,
        -0.37486674, -0.61678063],
       [-1.17024747, -1.48250008,  0.05882947, ..., -0.8173533 ,
        -0.75763635, -0.10040813]])

In [5]:
# Remap label -1 to 0 in place so the classes are encoded as 0/1, which is the
# encoding the evaluation code below relies on when it sums predictions.
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
# Hold out 20% of the samples for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

This is a simple helper that randomly undersamples the majority class by an arbitrary fraction.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every class except the minority one.

    Rows whose label differs from ``minority_class_id`` are sampled without
    replacement down to ``frac`` of their original count; all minority rows
    are kept (in shuffled order). The inputs are never modified.

    Args:
        X_res: Feature matrix (array-like or DataFrame), one row per sample.
        y_res: Labels, one per row of ``X_res``. They are matched to rows by
            position, so a pandas index on ``y_res`` is ignored.
        frac: Fraction of the non-minority rows to keep, between 0 and 1.
        minority_class_id: Label value of the class that is kept in full.
        random_state: Seed passed to pandas' sampling for reproducibility.

    Returns:
        Tuple ``(X_eq, y_eq)``: a DataFrame of the kept rows (non-minority
        rows first, then minority rows, index reset to 0..n-1) and a Series
        named ``'Class'`` with the matching labels.

    Raises:
        ValueError: If ``frac`` is outside [0, 1] or the number of labels
            does not match the number of rows.
    """
    if not 0 <= frac <= 1:
        raise ValueError("frac must be between 0 and 1, got %r" % (frac,))

    features = pd.DataFrame(X_res)
    labels = np.asarray(y_res).ravel()
    if len(labels) != len(features):
        raise ValueError(
            "X_res has %d rows but y_res has %d labels"
            % (len(features), len(labels))
        )

    is_minority = labels == minority_class_id

    # Sample row *positions* rather than rows of a frame with a temporary
    # label column: this leaves the caller's data untouched and does not
    # depend on how X_res / y_res happen to be indexed.
    positions = pd.Series(np.arange(len(labels)))
    kept_majority = positions[~is_minority].sample(frac=frac, random_state=random_state)
    kept_minority = positions[is_minority].sample(frac=1, random_state=random_state)
    keep = np.concatenate([kept_majority.to_numpy(), kept_minority.to_numpy()])

    X_eq = features.iloc[keep].reset_index(drop=True)
    y_eq = pd.Series(labels[keep], name='Class')

    return X_eq, y_eq

In [8]:
def _report_binary_scores(y_true, y_pred):
    """Print the confusion matrix and the TNR/TPR-based scores for 0/1 labels.

    Args:
        y_true: Ground-truth labels encoded as 0 (negative) and 1 (positive).
        y_pred: Predicted labels using the same encoding.
    """
    # Fixing the label order guarantees a 2x2 matrix even when one of the
    # classes never appears in y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm)
    print('')
    tn, fp, fn, tp = cm.ravel()
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42, classifiers=None):
    """Fit several classifiers and print their scores on a held-out set.

    Each classifier is fitted on ``(X, y)`` and evaluated on ``(Xt, yt)``;
    its confusion matrix, TNR, TPR, TNR x TPR and G-mean are printed. A final
    report is printed for the majority vote over all classifiers. Labels are
    expected to be encoded as 0/1 because votes are counted by summing the
    predictions.

    Args:
        X: Training features.
        y: Training labels (0/1).
        Xt: Test features.
        yt: Test labels (0/1).
        random_state: Seed used for the default classifiers.
        classifiers: Optional mapping of display name to an estimator with
            ``fit`` and ``predict``. When omitted, CatBoost, LightGBM,
            XGBoost, Bagging and RandomForest classifiers are used.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``classifiers`` is given but empty.
    """
    if classifiers is None:
        classifiers = {
            "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
            "LGBMClassifier": LGBMClassifier(random_state=random_state),
            "XGBClassifier": XGBClassifier(random_state=random_state),
            "BaggingClassifier": BaggingClassifier(random_state=random_state),
            "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
        }
    if not classifiers:
        raise ValueError("classifiers must contain at least one estimator")

    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so that every model contributes a 1-D vote vector.
        y_pred = np.asarray(classifier.predict(Xt)).ravel()
        predictions.append(y_pred)
        _report_binary_scores(yt, y_pred)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # A sample is positive only when more than half of the classifiers voted
    # 1 (with the five default models: at least 3 votes); ties count as 0.
    votes = np.sum(predictions, axis=0)
    ensemble_pred = (votes > len(classifiers) / 2).astype(int)
    _report_binary_scores(yt, ensemble_pred)


## Without resampling (baseline) 
What is the starting score?


In [9]:
# Summarize the class distribution before any resampling.
class_counts = Counter(y)
print('Original dataset shape %s' % class_counts)
print('Ratio->', round(class_counts[0]/class_counts[1], 1), ': 1')

Original dataset shape Counter({0: 7197, 1: 600})
Ratio-> 12.0 : 1


In [10]:
# Baseline: fit every classifier on the original, imbalanced training split
# and score it on the test split.
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1432    8]
 [  21   99]]

TNR: 0.99444
TPR: 0.825
TNRxTPR: 0.82042
G-mean: 0.90577
__________________________________________________
Classifier:  LGBMClassifier
[[1430   10]
 [  25   95]]

TNR: 0.99306
TPR: 0.79167
TNRxTPR: 0.78617
G-mean: 0.88666
__________________________________________________
Classifier:  XGBClassifier
[[1430   10]
 [  25   95]]

TNR: 0.99306
TPR: 0.79167
TNRxTPR: 0.78617
G-mean: 0.88666
__________________________________________________
Classifier:  BaggingClassifier
[[1429   11]
 [  57   63]]

TNR: 0.99236
TPR: 0.525
TNRxTPR: 0.52099
G-mean: 0.7218
__________________________________________________
Classifier:  RandomForestClassifier
[[1437    3]
 [  51   69]]

TNR: 0.99792
TPR: 0.575
TNRxTPR: 0.5738
G-mean: 0.7575
__________________________________________________
Ensemble predictions (majority voting):
[[1432    8]
 [  29   91]]

TNR: 0.99444
TPR: 0.75833
TNRxTPR: 0.75412
G-m

## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [None]:
# SMOTE

# sampling_strategy is the minority/majority ratio after resampling
# (N_rm / N_M); 1 oversamples the minority class up to the majority size.
sm = SMOTE(sampling_strategy=1, random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling, and the majority : minority ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({0: 5757, 1: 5757})
Ratio->  1 : 1.0


In [None]:
# Fit on the SMOTE-balanced training data; the test split is not resampled.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1424   16]
 [   8  112]]

TNR: 0.98889
TPR: 0.93333
TNRxTPR: 0.92296
G-mean: 0.96071
__________________________________________________
Classifier:  LGBMClassifier
[[1423   17]
 [  18  102]]

TNR: 0.98819
TPR: 0.85
TNRxTPR: 0.83997
G-mean: 0.9165
__________________________________________________
Classifier:  XGBClassifier
[[1423   17]
 [  13  107]]

TNR: 0.98819
TPR: 0.89167
TNRxTPR: 0.88114
G-mean: 0.93869
__________________________________________________
Classifier:  BaggingClassifier
[[1417   23]
 [  28   92]]

TNR: 0.98403
TPR: 0.76667
TNRxTPR: 0.75442
G-mean: 0.86857
__________________________________________________
Classifier:  RandomForestClassifier
[[1421   19]
 [  15  105]]

TNR: 0.98681
TPR: 0.875
TNRxTPR: 0.86345
G-mean: 0.92922
__________________________________________________
Ensemble predictions (majority voting):
[[1422   18]
 [  12  108]]

TNR: 0.9875
TPR: 0.9
TNRxTPR: 0.88875
G-mea

In [None]:
# ROS

# sampling_strategy is the minority/majority ratio after resampling
# (N_rm / N_M); 1 oversamples the minority class up to the majority size.
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling, and the majority : minority ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({0: 5757, 1: 5757})
Ratio->  1 : 1.0


In [None]:
# Fit on the randomly oversampled training data; the test split is not resampled.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1421   19]
 [   7  113]]

TNR: 0.98681
TPR: 0.94167
TNRxTPR: 0.92924
G-mean: 0.96397
__________________________________________________
Classifier:  LGBMClassifier
[[1422   18]
 [  18  102]]

TNR: 0.9875
TPR: 0.85
TNRxTPR: 0.83938
G-mean: 0.91617
__________________________________________________
Classifier:  XGBClassifier
[[1420   20]
 [  16  104]]

TNR: 0.98611
TPR: 0.86667
TNRxTPR: 0.85463
G-mean: 0.92446
__________________________________________________
Classifier:  BaggingClassifier
[[1410   30]
 [  36   84]]

TNR: 0.97917
TPR: 0.7
TNRxTPR: 0.68542
G-mean: 0.8279
__________________________________________________
Classifier:  RandomForestClassifier
[[1424   16]
 [  32   88]]

TNR: 0.98889
TPR: 0.73333
TNRxTPR: 0.72519
G-mean: 0.85158
__________________________________________________
Ensemble predictions (majority voting):
[[1424   16]
 [  19  101]]

TNR: 0.98889
TPR: 0.84167
TNRxTPR: 0.83231
G-m

In [None]:
# VAEOversampler

# Configure the VAE-based oversampler and resample the training split with it.
vae_sampler = VAEOversampler(
    epochs=500,
    intermediate_dim=512,
    batch_size=64,
    random_state=42,
    verbose=False,
)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [None]:
# Class counts after oversampling, and the majority : minority ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({0.0: 5757, 1.0: 5757})
Ratio->  1 : 1.0


In [None]:
# Fit on the VAE-oversampled training data; the test split is not resampled.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1432    8]
 [  21   99]]

TNR: 0.99444
TPR: 0.825
TNRxTPR: 0.82042
G-mean: 0.90577
__________________________________________________
Classifier:  LGBMClassifier
[[1432    8]
 [  20  100]]

TNR: 0.99444
TPR: 0.83333
TNRxTPR: 0.8287
G-mean: 0.91033
__________________________________________________
Classifier:  XGBClassifier
[[1429   11]
 [  25   95]]

TNR: 0.99236
TPR: 0.79167
TNRxTPR: 0.78562
G-mean: 0.88635
__________________________________________________
Classifier:  BaggingClassifier
[[1431    9]
 [  63   57]]

TNR: 0.99375
TPR: 0.475
TNRxTPR: 0.47203
G-mean: 0.68705
__________________________________________________
Classifier:  RandomForestClassifier
[[1438    2]
 [  50   70]]

TNR: 0.99861
TPR: 0.58333
TNRxTPR: 0.58252
G-mean: 0.76323
__________________________________________________
Ensemble predictions (majority voting):
[[1433    7]
 [  30   90]]

TNR: 0.99514
TPR: 0.75
TNRxTPR: 0.74635
G-

## Under/Oversampling combination  
Now we can tune the number of instances in each class to optimize the metric.  


In [None]:
# SMOTE

# First balance the classes by oversampling (N_rm / N_M = 1); the majority
# class is undersampled in the next step.
sm = SMOTE(sampling_strategy=1, random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# RUS

# Keep half of the majority class; the minority class is kept in full.
X_eq, y_eq = RUS(X_res, y_res, frac=.5)

resampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({1: 5757, 0: 2878})
Ratio->  1 : 2.0


In [None]:
# Fit on the SMOTE-oversampled, then undersampled training data.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1399   41]
 [   5  115]]

TNR: 0.97153
TPR: 0.95833
TNRxTPR: 0.93105
G-mean: 0.96491
__________________________________________________
Classifier:  LGBMClassifier
[[1408   32]
 [  10  110]]

TNR: 0.97778
TPR: 0.91667
TNRxTPR: 0.8963
G-mean: 0.94673
__________________________________________________
Classifier:  XGBClassifier
[[1406   34]
 [   7  113]]

TNR: 0.97639
TPR: 0.94167
TNRxTPR: 0.91943
G-mean: 0.95887
__________________________________________________
Classifier:  BaggingClassifier
[[1390   50]
 [  17  103]]

TNR: 0.96528
TPR: 0.85833
TNRxTPR: 0.82853
G-mean: 0.91024
__________________________________________________
Classifier:  RandomForestClassifier
[[1397   43]
 [   9  111]]

TNR: 0.97014
TPR: 0.925
TNRxTPR: 0.89738
G-mean: 0.9473
__________________________________________________
Ensemble predictions (majority voting):
[[1406   34]
 [   7  113]]

TNR: 0.97639
TPR: 0.94167
TNRxTPR: 0.9194

In [None]:
# ROS

# First balance the classes by oversampling (N_rm / N_M = 1); the majority
# class is undersampled in the next step.
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [None]:
# RUS

# Keep half of the majority class; the minority class is kept in full.
X_eq, y_eq = RUS(X_res, y_res, frac=.5)

resampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({1: 5757, 0: 2878})
Ratio->  1 : 2.0


In [None]:
# Fit on the randomly oversampled, then undersampled training data.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1396   44]
 [   5  115]]

TNR: 0.96944
TPR: 0.95833
TNRxTPR: 0.92905
G-mean: 0.96387
__________________________________________________
Classifier:  LGBMClassifier
[[1409   31]
 [  10  110]]

TNR: 0.97847
TPR: 0.91667
TNRxTPR: 0.89693
G-mean: 0.94707
__________________________________________________
Classifier:  XGBClassifier
[[1409   31]
 [   9  111]]

TNR: 0.97847
TPR: 0.925
TNRxTPR: 0.90509
G-mean: 0.95136
__________________________________________________
Classifier:  BaggingClassifier
[[1376   64]
 [  25   95]]

TNR: 0.95556
TPR: 0.79167
TNRxTPR: 0.75648
G-mean: 0.86976
__________________________________________________
Classifier:  RandomForestClassifier
[[1400   40]
 [  17  103]]

TNR: 0.97222
TPR: 0.85833
TNRxTPR: 0.83449
G-mean: 0.9135
__________________________________________________
Ensemble predictions (majority voting):
[[1406   34]
 [  10  110]]

TNR: 0.97639
TPR: 0.91667
TNRxTPR: 0.895

In [None]:
# VAEOversampler

# Calls resample() on the sampler created in the earlier VAEOversampler cell.
# NOTE(review): presumably this reuses the already-trained model instead of
# fitting again — confirm against the VAEOversampler implementation.
X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1)



In [None]:
# RUS

# Keep 17% of the majority class; the minority class is kept in full.
X_eq, y_eq = RUS(X_res, y_res, frac=.17)

resampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1]/resampled_counts[0], 1))

Resampled dataset shape Counter({1.0: 5757, 0.0: 979})
Ratio->  1 : 5.9


In [None]:
# Fit on the VAE-oversampled, then undersampled training data.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1381   59]
 [   4  116]]

TNR: 0.95903
TPR: 0.96667
TNRxTPR: 0.92706
G-mean: 0.96284
__________________________________________________
Classifier:  LGBMClassifier
[[1378   62]
 [   9  111]]

TNR: 0.95694
TPR: 0.925
TNRxTPR: 0.88517
G-mean: 0.94084
__________________________________________________
Classifier:  XGBClassifier
[[1386   54]
 [   9  111]]

TNR: 0.9625
TPR: 0.925
TNRxTPR: 0.89031
G-mean: 0.94356
__________________________________________________
Classifier:  BaggingClassifier
[[1382   58]
 [  15  105]]

TNR: 0.95972
TPR: 0.875
TNRxTPR: 0.83976
G-mean: 0.91638
__________________________________________________
Classifier:  RandomForestClassifier
[[1390   50]
 [   7  113]]

TNR: 0.96528
TPR: 0.94167
TNRxTPR: 0.90897
G-mean: 0.9534
__________________________________________________
Ensemble predictions (majority voting):
[[1389   51]
 [   7  113]]

TNR: 0.96458
TPR: 0.94167
TNRxTPR: 0.90832
G-

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
