# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import packages, classifiers and etc.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load some dataset from Imbalanced Learn list (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

dset_name = 'scene'
dset = fetch_datasets()[dset_name]

X, y = StandardScaler().fit_transform(dset.data), dset.target

In [5]:
X

array([[-5.70966246e-02, -1.39373285e-01, -1.60650256e-01, ...,
         8.28460395e-01, -6.56686800e-01, -5.38642595e-01],
       [ 5.21683833e-01,  3.67666751e-01,  2.42865593e-01, ...,
         1.27836183e-01, -1.98412296e-01, -4.94390370e-01],
       [ 6.33182679e-01,  3.92012921e-01,  2.46937596e-01, ...,
        -4.27131926e-01,  7.51690015e-04, -1.75746908e-01],
       ...,
       [ 1.37390506e+00,  1.26150964e+00,  1.01003201e+00, ...,
        -5.50180393e-01, -6.33173355e-01, -6.05411846e-01],
       [ 1.05434959e+00,  1.03025371e+00,  9.85945079e-01, ...,
         8.85168297e-01,  7.60629408e-01,  7.55661012e-01],
       [ 1.47981679e+00,  8.66408671e-01,  5.45961709e-01, ...,
        -7.21513143e-01, -5.83024871e-01, -7.10510578e-01]])

In [6]:
y[y == -1] = 0

We split data into train and test partitions.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple function to undersample freely.  

In [8]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    X_res = pd.DataFrame(X_res)
    X_res['Class'] = y_res

    X_neg = X_res[y_res != minority_class_id].sample(frac=frac, random_state=random_state)
    X_pos = X_res[y_res == minority_class_id].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [9]:
def train_val(X, y, Xt, yt, random_state=42):
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    scores = []
    predictions = []
    for key, classifier in classifiers.items():
        print('_' * 50)
        name = key
        classifier.fit(X, y)
        print("Classifier: ", name)
        y_pred = classifier.predict(Xt)
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        tn = cm[0,0]
        fp = cm[0,1]
        fn = cm[1,0]
        tp = cm[1,1]
        tnr = tn / (tn + fp)
        tpr = tp / (tp + fn)
        scores.append(tnr * tpr)
        print('TNR:', round(tnr, 5))
        print('TPR:', round(tpr, 5))
        print('TNRxTPR:', round(tnr * tpr, 5))
        print('G-mean:', round(np.sqrt(tnr * tpr), 5))

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    predictions = np.sum(predictions, axis=0)
    predictions[predictions < 3] = 0
    predictions[predictions >= 3] = 1

    cm = confusion_matrix(yt, predictions)
    print(cm)
    tn = cm[0,0]
    fp = cm[0,1]
    fn = cm[1,0]
    tp = cm[1,1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


## Without resampling (base line)
Which is starting score?


In [10]:
print('Original dataset shape %s' % Counter(y))
print('Ratio->', round(Counter(y)[0]/Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 2230, 1: 177})
Ratio-> 12.6 : 1


In [11]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[445   2]
 [ 31   4]]

TNR: 0.99553
TPR: 0.11429
TNRxTPR: 0.11377
G-mean: 0.3373
__________________________________________________
Classifier:  LGBMClassifier
[[444   3]
 [ 31   4]]

TNR: 0.99329
TPR: 0.11429
TNRxTPR: 0.11352
G-mean: 0.33693
__________________________________________________
Classifier:  XGBClassifier
[[442   5]
 [ 31   4]]

TNR: 0.98881
TPR: 0.11429
TNRxTPR: 0.11301
G-mean: 0.33617
__________________________________________________
Classifier:  BaggingClassifier
[[438   9]
 [ 31   4]]

TNR: 0.97987
TPR: 0.11429
TNRxTPR: 0.11198
G-mean: 0.33464
__________________________________________________
Classifier:  RandomForestClassifier
[[445   2]
 [ 32   3]]

TNR: 0.99553
TPR: 0.08571
TNRxTPR: 0.08533
G-mean: 0.29211
__________________________________________________
Ensemble predictions (majority voting):
[[445   2]
 [ 31   4]]

TNR: 0.99553
TPR: 0.11429
TNRxTPR: 0.11377
G-mean: 0.3373


## Ratio 1:1  
Let's see classifiers scores when dataset is balanced.  


In [12]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [13]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 1783, 1: 1783})
Ratio->  1 : 1.0


In [14]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[422  25]
 [ 25  10]]

TNR: 0.94407
TPR: 0.28571
TNRxTPR: 0.26973
G-mean: 0.51936
__________________________________________________
Classifier:  LGBMClassifier
[[426  21]
 [ 24  11]]

TNR: 0.95302
TPR: 0.31429
TNRxTPR: 0.29952
G-mean: 0.54728
__________________________________________________
Classifier:  XGBClassifier
[[422  25]
 [ 26   9]]

TNR: 0.94407
TPR: 0.25714
TNRxTPR: 0.24276
G-mean: 0.49271
__________________________________________________
Classifier:  BaggingClassifier
[[419  28]
 [ 27   8]]

TNR: 0.93736
TPR: 0.22857
TNRxTPR: 0.21425
G-mean: 0.46288
__________________________________________________
Classifier:  RandomForestClassifier
[[434  13]
 [ 27   8]]

TNR: 0.97092
TPR: 0.22857
TNRxTPR: 0.22192
G-mean: 0.47109
__________________________________________________
Ensemble predictions (majority voting):
[[429  18]
 [ 27   8]]

TNR: 0.95973
TPR: 0.22857
TNRxTPR: 0.21937
G-mean: 0.46837


In [15]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [16]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 1783, 1: 1783})
Ratio->  1 : 1.0


In [17]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[442   5]
 [ 30   5]]

TNR: 0.98881
TPR: 0.14286
TNRxTPR: 0.14126
G-mean: 0.37584
__________________________________________________
Classifier:  LGBMClassifier
[[443   4]
 [ 30   5]]

TNR: 0.99105
TPR: 0.14286
TNRxTPR: 0.14158
G-mean: 0.37627
__________________________________________________
Classifier:  XGBClassifier
[[440   7]
 [ 29   6]]

TNR: 0.98434
TPR: 0.17143
TNRxTPR: 0.16874
G-mean: 0.41078
__________________________________________________
Classifier:  BaggingClassifier
[[424  23]
 [ 28   7]]

TNR: 0.94855
TPR: 0.2
TNRxTPR: 0.18971
G-mean: 0.43556
__________________________________________________
Classifier:  RandomForestClassifier
[[442   5]
 [ 30   5]]

TNR: 0.98881
TPR: 0.14286
TNRxTPR: 0.14126
G-mean: 0.37584
__________________________________________________
Ensemble predictions (majority voting):
[[442   5]
 [ 30   5]]

TNR: 0.98881
TPR: 0.14286
TNRxTPR: 0.14126
G-mean: 0.37584


In [18]:
# VAEOversampler

vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [19]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 1783, 1.0: 1783})
Ratio->  1 : 1.0


In [20]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[445   2]
 [ 32   3]]

TNR: 0.99553
TPR: 0.08571
TNRxTPR: 0.08533
G-mean: 0.29211
__________________________________________________
Classifier:  LGBMClassifier
[[444   3]
 [ 32   3]]

TNR: 0.99329
TPR: 0.08571
TNRxTPR: 0.08514
G-mean: 0.29179
__________________________________________________
Classifier:  XGBClassifier
[[444   3]
 [ 32   3]]

TNR: 0.99329
TPR: 0.08571
TNRxTPR: 0.08514
G-mean: 0.29179
__________________________________________________
Classifier:  BaggingClassifier
[[444   3]
 [ 30   5]]

TNR: 0.99329
TPR: 0.14286
TNRxTPR: 0.1419
G-mean: 0.37669
__________________________________________________
Classifier:  RandomForestClassifier
[[445   2]
 [ 32   3]]

TNR: 0.99553
TPR: 0.08571
TNRxTPR: 0.08533
G-mean: 0.29211
__________________________________________________
Ensemble predictions (majority voting):
[[445   2]
 [ 32   3]]

TNR: 0.99553
TPR: 0.08571
TNRxTPR: 0.08533
G-mean: 0.29211


## Under/Oversampling combination  
Now we can tuning the number of instances for each class to optimize metric.  


In [21]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [24]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.2)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 1783, 0: 357})
Ratio->  1 : 5.0


In [25]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[304 143]
 [  6  29]]

TNR: 0.68009
TPR: 0.82857
TNRxTPR: 0.5635
G-mean: 0.75067
__________________________________________________
Classifier:  LGBMClassifier
[[346 101]
 [ 11  24]]

TNR: 0.77405
TPR: 0.68571
TNRxTPR: 0.53078
G-mean: 0.72854
__________________________________________________
Classifier:  XGBClassifier
[[315 132]
 [  8  27]]

TNR: 0.7047
TPR: 0.77143
TNRxTPR: 0.54362
G-mean: 0.73731
__________________________________________________
Classifier:  BaggingClassifier
[[347 100]
 [ 13  22]]

TNR: 0.77629
TPR: 0.62857
TNRxTPR: 0.48795
G-mean: 0.69854
__________________________________________________
Classifier:  RandomForestClassifier
[[333 114]
 [ 12  23]]

TNR: 0.74497
TPR: 0.65714
TNRxTPR: 0.48955
G-mean: 0.69968
__________________________________________________
Ensemble predictions (majority voting):
[[333 114]
 [ 10  25]]

TNR: 0.74497
TPR: 0.71429
TNRxTPR: 0.53212
G-mean: 0.72946


In [26]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [27]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.2)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 1783, 0: 357})
Ratio->  1 : 5.0


In [28]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[316 131]
 [ 10  25]]

TNR: 0.70694
TPR: 0.71429
TNRxTPR: 0.50495
G-mean: 0.7106
__________________________________________________
Classifier:  LGBMClassifier
[[376  71]
 [ 17  18]]

TNR: 0.84116
TPR: 0.51429
TNRxTPR: 0.4326
G-mean: 0.65772
__________________________________________________
Classifier:  XGBClassifier
[[373  74]
 [ 12  23]]

TNR: 0.83445
TPR: 0.65714
TNRxTPR: 0.54835
G-mean: 0.74051
__________________________________________________
Classifier:  BaggingClassifier
[[361  86]
 [ 14  21]]

TNR: 0.80761
TPR: 0.6
TNRxTPR: 0.48456
G-mean: 0.69611
__________________________________________________
Classifier:  RandomForestClassifier
[[366  81]
 [ 17  18]]

TNR: 0.81879
TPR: 0.51429
TNRxTPR: 0.42109
G-mean: 0.64892
__________________________________________________
Ensemble predictions (majority voting):
[[357  90]
 [ 13  22]]

TNR: 0.79866
TPR: 0.62857
TNRxTPR: 0.50201
G-mean: 0.70853


In [44]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=2)



In [45]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.12)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 3424, 0.0: 214})
Ratio->  1 : 16.0


In [46]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[343 104]
 [  9  26]]

TNR: 0.76734
TPR: 0.74286
TNRxTPR: 0.57002
G-mean: 0.755
__________________________________________________
Classifier:  LGBMClassifier
[[351  96]
 [ 16  19]]

TNR: 0.78523
TPR: 0.54286
TNRxTPR: 0.42627
G-mean: 0.65289
__________________________________________________
Classifier:  XGBClassifier
[[340 107]
 [ 15  20]]

TNR: 0.76063
TPR: 0.57143
TNRxTPR: 0.43464
G-mean: 0.65928
__________________________________________________
Classifier:  BaggingClassifier
[[357  90]
 [ 19  16]]

TNR: 0.79866
TPR: 0.45714
TNRxTPR: 0.3651
G-mean: 0.60424
__________________________________________________
Classifier:  RandomForestClassifier
[[371  76]
 [ 17  18]]

TNR: 0.82998
TPR: 0.51429
TNRxTPR: 0.42685
G-mean: 0.65333
__________________________________________________
Ensemble predictions (majority voting):
[[359  88]
 [ 16  19]]

TNR: 0.80313
TPR: 0.54286
TNRxTPR: 0.43599
G-mean: 0.66029


## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
