# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import the packages, classifiers, metrics and resamplers used below.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

# Name of the dataset to pull from the imbalanced-learn collection.
dset_name = 'webpage'
dset = fetch_datasets()[dset_name]

# Standardize the features; the targets are taken as provided.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting it on the training partition only.
X, y = StandardScaler().fit_transform(dset.data), dset.target

In [5]:
X

array([[-0.18789338, -0.17081211, -0.21451116, ..., -0.40485971,
        -0.44986496, -0.02838501],
       [-0.18789338, -0.17081211, -0.21451116, ..., -0.40485971,
         2.22288928, -0.02838501],
       [-0.18789338, -0.17081211, -0.21451116, ...,  2.4699914 ,
        -0.44986496, -0.02838501],
       ...,
       [-0.18789338, -0.17081211, -0.21451116, ...,  2.4699914 ,
         2.22288928, -0.02838501],
       [-0.18789338, -0.17081211, -0.21451116, ..., -0.40485971,
        -0.44986496, -0.02838501],
       [-0.18789338, -0.17081211, -0.21451116, ..., -0.40485971,
         2.22288928, -0.02838501]])

In [6]:
# Recode the label -1 as 0 so the classes are {0, 1}. A new array is built
# instead of assigning in place, so the array held by `dset.target` is left
# untouched and the cell can be re-run safely.
y = np.where(y == -1, 0, y)

We split data into train and test partitions.

In [7]:
# Hold out 20% of the samples for testing; stratify on y so both partitions
# keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

A simple helper that randomly undersamples the non-minority class, keeping an arbitrary fraction of its rows.  

In [8]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every row that is not in the minority class.

    Parameters
    ----------
    X_res : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix. It is never modified.
    y_res : array-like of length n_samples
        Class labels, matched to the rows of ``X_res`` by position (any
        pandas index on ``y_res`` is ignored).
    frac : float, default 1
        Fraction of the non-minority rows to keep.
    minority_class_id : default 1
        Label of the class whose rows are all kept.
    random_state : int, default 42
        Seed used for both the subsampling and the shuffling.

    Returns
    -------
    X_eq : DataFrame
        Kept non-minority rows followed by all (shuffled) minority rows,
        re-indexed from 0.
    y_eq : Series named ``'Class'``
        Labels aligned with ``X_eq``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` do not have the same number of rows.
    """
    # Work on positional copies: the caller's objects stay untouched and a
    # feature column that happens to be called 'Class' is not overwritten.
    features = pd.DataFrame(X_res).reset_index(drop=True)
    labels = pd.Series(np.asarray(y_res).ravel(), name='Class')

    if len(features) != len(labels):
        raise ValueError(
            'X_res and y_res must have the same number of rows '
            '(%d != %d)' % (len(features), len(labels))
        )

    is_minority = labels == minority_class_id

    # Sampling the labels selects the same row positions as sampling the
    # full frame would, because the draw depends only on length and seed.
    kept_majority = labels[~is_minority].sample(frac=frac, random_state=random_state).index
    kept_minority = labels[is_minority].sample(frac=1, random_state=random_state).index
    order = kept_majority.append(kept_minority)

    X_eq = features.loc[order].reset_index(drop=True)
    y_eq = labels.loc[order].reset_index(drop=True)

    return X_eq, y_eq

In [9]:
def _report_rates(cm):
    """Print TNR, TPR, their product and the G-mean of a 2x2 confusion matrix."""
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit several classifiers and report their scores on a held-out set.

    Each classifier is trained on ``(X, y)`` and evaluated on ``(Xt, yt)``;
    its confusion matrix, TNR, TPR, TNR x TPR and G-mean are printed. A
    majority-vote ensemble of all classifiers is then reported the same way.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : evaluation features and labels.
    random_state : int, default 42
        Seed passed to every classifier.

    Notes
    -----
    The vote count sums the predicted labels, so labels must be 0 and 1.
    Nothing is returned; all results are printed.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        y_pred = classifier.predict(Xt)
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        _report_rates(cm)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # Number of classifiers voting for class 1, per sample.
    votes = np.sum(predictions, axis=0)
    # Strict majority derived from the number of classifiers, so the rule
    # stays correct if the dictionary above is edited (ties go to class 0).
    ensemble_pred = (2 * votes > len(classifiers)).astype(int)

    cm = confusion_matrix(yt, ensemble_pred)
    print(cm)
    print('')
    _report_rates(cm)


## Without resampling (baseline)
What is the starting score?


In [10]:
# Count the classes once and report the class-0 : class-1 imbalance.
original_counts = Counter(y)
print('Original dataset shape %s' % original_counts)
print('Ratio->', round(original_counts[0] / original_counts[1], 1), ': 1')

Original dataset shape Counter({0: 33799, 1: 981})
Ratio-> 34.5 : 1


In [11]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6743   17]
 [  48  148]]

TNR: 0.99749
TPR: 0.7551
TNRxTPR: 0.7532
G-mean: 0.86787
__________________________________________________
Classifier:  LGBMClassifier
[[6747   13]
 [  65  131]]

TNR: 0.99808
TPR: 0.66837
TNRxTPR: 0.66708
G-mean: 0.81675
__________________________________________________
Classifier:  XGBClassifier
[[6742   18]
 [  52  144]]

TNR: 0.99734
TPR: 0.73469
TNRxTPR: 0.73274
G-mean: 0.856
__________________________________________________
Classifier:  BaggingClassifier
[[6716   44]
 [  46  150]]

TNR: 0.99349
TPR: 0.76531
TNRxTPR: 0.76032
G-mean: 0.87197
__________________________________________________
Classifier:  RandomForestClassifier
[[6738   22]
 [  49  147]]

TNR: 0.99675
TPR: 0.75
TNRxTPR: 0.74756
G-mean: 0.86462
__________________________________________________
Ensemble predictions (majority voting):
[[6743   17]
 [  50  146]]

TNR: 0.99749
TPR: 0.7449
TNRxTPR: 0.74302
G-

## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [12]:
# SMOTE

# sampling_strategy=1 asks for N_rm / N_M = 1: after resampling, the minority
# class (N_rm samples) is as large as the majority class (N_M samples).
sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [13]:
# Class counts after resampling and the class-1 / class-0 ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0: 27039, 1: 27039})
Ratio->  1 : 1.0


In [14]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6732   28]
 [  50  146]]

TNR: 0.99586
TPR: 0.7449
TNRxTPR: 0.74181
G-mean: 0.86129
__________________________________________________
Classifier:  LGBMClassifier
[[6731   29]
 [  56  140]]

TNR: 0.99571
TPR: 0.71429
TNRxTPR: 0.71122
G-mean: 0.84334
__________________________________________________
Classifier:  XGBClassifier
[[6727   33]
 [  59  137]]

TNR: 0.99512
TPR: 0.69898
TNRxTPR: 0.69557
G-mean: 0.83401
__________________________________________________
Classifier:  BaggingClassifier
[[6689   71]
 [  87  109]]

TNR: 0.9895
TPR: 0.55612
TNRxTPR: 0.55028
G-mean: 0.74181
__________________________________________________
Classifier:  RandomForestClassifier
[[6739   21]
 [ 117   79]]

TNR: 0.99689
TPR: 0.40306
TNRxTPR: 0.40181
G-mean: 0.63388
__________________________________________________
Ensemble predictions (majority voting):
[[6735   25]
 [  64  132]]

TNR: 0.9963
TPR: 0.67347
TNRxTPR: 0.670

In [15]:
# ROS

# sampling_strategy=1 asks for N_rm / N_M = 1: after resampling, the minority
# class (N_rm samples) is as large as the majority class (N_M samples).
ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [16]:
# Class counts after resampling and the class-1 / class-0 ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0: 27039, 1: 27039})
Ratio->  1 : 1.0


In [17]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6660  100]
 [  32  164]]

TNR: 0.98521
TPR: 0.83673
TNRxTPR: 0.82436
G-mean: 0.90794
__________________________________________________
Classifier:  LGBMClassifier
[[6565  195]
 [  18  178]]

TNR: 0.97115
TPR: 0.90816
TNRxTPR: 0.88197
G-mean: 0.93913
__________________________________________________
Classifier:  XGBClassifier
[[6618  142]
 [  24  172]]

TNR: 0.97899
TPR: 0.87755
TNRxTPR: 0.85912
G-mean: 0.92689
__________________________________________________
Classifier:  BaggingClassifier
[[6668   92]
 [  70  126]]

TNR: 0.98639
TPR: 0.64286
TNRxTPR: 0.63411
G-mean: 0.79631
__________________________________________________
Classifier:  RandomForestClassifier
[[6708   52]
 [  66  130]]

TNR: 0.99231
TPR: 0.66327
TNRxTPR: 0.65816
G-mean: 0.81127
__________________________________________________
Ensemble predictions (majority voting):
[[6667   93]
 [  33  163]]

TNR: 0.98624
TPR: 0.83163
TNRxTPR: 0.

In [31]:
# VAEOversampler

# Configure the VAE-based oversampler and resample the training partition.
# NOTE(review): presumably fit_resample trains the VAE for all 500 epochs
# before generating samples, which would make this the slowest cell of the
# notebook. Confirm, and consider caching the result.
vae_sampler = VAEOversampler(
    epochs=500,
    intermediate_dim=512,
    batch_size=64,
    random_state=42,
    verbose=False,
)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [32]:
# Class counts after resampling and the class-1 / class-0 ratio.
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({0.0: 27039, 1.0: 27039})
Ratio->  1 : 1.0


In [34]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6747   13]
 [  50  146]]

TNR: 0.99808
TPR: 0.7449
TNRxTPR: 0.74347
G-mean: 0.86224
__________________________________________________
Classifier:  LGBMClassifier
[[6748   12]
 [  46  150]]

TNR: 0.99822
TPR: 0.76531
TNRxTPR: 0.76395
G-mean: 0.87404
__________________________________________________
Classifier:  XGBClassifier
[[6743   17]
 [  45  151]]

TNR: 0.99749
TPR: 0.77041
TNRxTPR: 0.76847
G-mean: 0.87662
__________________________________________________
Classifier:  BaggingClassifier
[[6700   60]
 [  43  153]]

TNR: 0.99112
TPR: 0.78061
TNRxTPR: 0.77368
G-mean: 0.87959
__________________________________________________
Classifier:  RandomForestClassifier
[[6738   22]
 [  50  146]]

TNR: 0.99675
TPR: 0.7449
TNRxTPR: 0.74247
G-mean: 0.86167
__________________________________________________
Ensemble predictions (majority voting):
[[6748   12]
 [  45  151]]

TNR: 0.99822
TPR: 0.77041
TNRxTPR: 0.76

## Under/Oversampling combination  
Now we can tune the number of instances in each class to optimize the metric.  


In [35]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [36]:
# RUS

# Keep 6% of the class-0 rows (every class-1 row is retained), then report
# the resulting class counts and the class-1 / class-0 ratio.
X_eq, y_eq = RUS(X_res, y_res, frac=.06)

undersampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % undersampled_counts)
print('Ratio->  1 :', round(undersampled_counts[1] / undersampled_counts[0], 1))

Resampled dataset shape Counter({1: 27039, 0: 1622})
Ratio->  1 : 16.7


In [37]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6046  714]
 [  25  171]]

TNR: 0.89438
TPR: 0.87245
TNRxTPR: 0.7803
G-mean: 0.88335
__________________________________________________
Classifier:  LGBMClassifier
[[6317  443]
 [  15  181]]

TNR: 0.93447
TPR: 0.92347
TNRxTPR: 0.86295
G-mean: 0.92895
__________________________________________________
Classifier:  XGBClassifier
[[6207  553]
 [  17  179]]

TNR: 0.9182
TPR: 0.91327
TNRxTPR: 0.83856
G-mean: 0.91573
__________________________________________________
Classifier:  BaggingClassifier
[[6125  635]
 [  59  137]]

TNR: 0.90607
TPR: 0.69898
TNRxTPR: 0.63332
G-mean: 0.79581
__________________________________________________
Classifier:  RandomForestClassifier
[[6378  382]
 [  63  133]]

TNR: 0.94349
TPR: 0.67857
TNRxTPR: 0.64023
G-mean: 0.80014
__________________________________________________
Ensemble predictions (majority voting):
[[6256  504]
 [  25  171]]

TNR: 0.92544
TPR: 0.87245
TNRxTPR: 0.80

In [38]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [39]:
# RUS

# Keep 20% of the class-0 rows (every class-1 row is retained), then report
# the resulting class counts and the class-1 / class-0 ratio.
X_eq, y_eq = RUS(X_res, y_res, frac=.2)

undersampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % undersampled_counts)
print('Ratio->  1 :', round(undersampled_counts[1] / undersampled_counts[0], 1))

Resampled dataset shape Counter({1: 27039, 0: 5408})
Ratio->  1 : 5.0


In [40]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6307  453]
 [   8  188]]

TNR: 0.93299
TPR: 0.95918
TNRxTPR: 0.89491
G-mean: 0.946
__________________________________________________
Classifier:  LGBMClassifier
[[6234  526]
 [  11  185]]

TNR: 0.92219
TPR: 0.94388
TNRxTPR: 0.87043
G-mean: 0.93297
__________________________________________________
Classifier:  XGBClassifier
[[6129  631]
 [  10  186]]

TNR: 0.90666
TPR: 0.94898
TNRxTPR: 0.8604
G-mean: 0.92758
__________________________________________________
Classifier:  BaggingClassifier
[[6463  297]
 [  72  124]]

TNR: 0.95607
TPR: 0.63265
TNRxTPR: 0.60486
G-mean: 0.77773
__________________________________________________
Classifier:  RandomForestClassifier
[[6608  152]
 [  65  131]]

TNR: 0.97751
TPR: 0.66837
TNRxTPR: 0.65334
G-mean: 0.80829
__________________________________________________
Ensemble predictions (majority voting):
[[6373  387]
 [  10  186]]

TNR: 0.94275
TPR: 0.94898
TNRxTPR: 0.894

In [45]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1)



In [48]:
# RUS

# Keep 6% of the class-0 rows (every class-1 row is retained), then report
# the resulting class counts and the class-1 / class-0 ratio.
X_eq, y_eq = RUS(X_res, y_res, frac=.06)

undersampled_counts = Counter(y_eq)
print('Resampled dataset shape %s' % undersampled_counts)
print('Ratio->  1 :', round(undersampled_counts[1] / undersampled_counts[0], 1))

Resampled dataset shape Counter({1.0: 27039, 0.0: 1622})
Ratio->  1 : 16.7


In [49]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[6547  213]
 [  18  178]]

TNR: 0.96849
TPR: 0.90816
TNRxTPR: 0.87955
G-mean: 0.93784
__________________________________________________
Classifier:  LGBMClassifier
[[6456  304]
 [  11  185]]

TNR: 0.95503
TPR: 0.94388
TNRxTPR: 0.90143
G-mean: 0.94944
__________________________________________________
Classifier:  XGBClassifier
[[6481  279]
 [  16  180]]

TNR: 0.95873
TPR: 0.91837
TNRxTPR: 0.88046
G-mean: 0.93833
__________________________________________________
Classifier:  BaggingClassifier
[[6219  541]
 [  24  172]]

TNR: 0.91997
TPR: 0.87755
TNRxTPR: 0.80732
G-mean: 0.89851
__________________________________________________
Classifier:  RandomForestClassifier
[[6382  378]
 [  17  179]]

TNR: 0.94408
TPR: 0.91327
TNRxTPR: 0.8622
G-mean: 0.92855
__________________________________________________
Ensemble predictions (majority voting):
[[6496  264]
 [  15  181]]

TNR: 0.96095
TPR: 0.92347
TNRxTPR: 0.8

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
