# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import packages, classifiers and etc.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load some dataset from Imbalanced Learn list (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

dset_name = 'coil_2000'
dset = fetch_datasets()[dset_name]

X, y = StandardScaler().fit_transform(dset.data), dset.target

In [5]:
X

array([[ 0.67713262, -0.26387007,  0.41303395, ..., -0.15048346,
        -0.09121535, -0.11762048],
       [ 0.98679246, -0.26387007, -0.86793209, ..., -0.15048346,
        -0.09121535, -0.11762048],
       [ 0.98679246, -0.26387007, -0.86793209, ..., -0.15048346,
        -0.09121535, -0.11762048],
       ...,
       [ 0.9093775 , -0.26387007, -0.86793209, ..., -0.15048346,
        10.70296685, -0.11762048],
       [ 0.67713262, -0.26387007,  0.41303395, ..., -0.15048346,
        -0.09121535, -0.11762048],
       [-1.25824138, -0.26387007, -0.86793209, ..., -0.15048346,
        -0.09121535, -0.11762048]])

In [6]:
y[y == -1] = 0

We split data into train and test partitions.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple function to undersample freely.  

In [8]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    X_res = pd.DataFrame(X_res)
    X_res['Class'] = y_res

    X_neg = X_res[y_res != minority_class_id].sample(frac=frac, random_state=random_state)
    X_pos = X_res[y_res == minority_class_id].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [9]:
def train_val(X, y, Xt, yt, random_state=42):
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    scores = []
    predictions = []
    for key, classifier in classifiers.items():
        print('_' * 50)
        name = key
        classifier.fit(X, y)
        print("Classifier: ", name)
        y_pred = classifier.predict(Xt)
        cm = confusion_matrix(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        tn = cm[0,0]
        fp = cm[0,1]
        fn = cm[1,0]
        tp = cm[1,1]
        tnr = tn / (tn + fp)
        tpr = tp / (tp + fn)
        scores.append(tnr * tpr)
        print('TNR:', round(tnr, 5))
        print('TPR:', round(tpr, 5))
        print('TNRxTPR:', round(tnr * tpr, 5))
        print('G-mean:', round(np.sqrt(tnr * tpr), 5))

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    predictions = np.sum(predictions, axis=0)
    predictions[predictions < 3] = 0
    predictions[predictions >= 3] = 1

    cm = confusion_matrix(yt, predictions)
    print(cm)
    tn = cm[0,0]
    fp = cm[0,1]
    fn = cm[1,0]
    tp = cm[1,1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('')
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


## Without resampling (base line)
Which is starting score?


In [10]:
print('Original dataset shape %s' % Counter(y))
print('Ratio->', round(Counter(y)[0]/Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 9236, 1: 586})
Ratio-> 15.8 : 1


In [11]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1842    6]
 [ 115    2]]

TNR: 0.99675
TPR: 0.01709
TNRxTPR: 0.01704
G-mean: 0.13053
__________________________________________________
Classifier:  LGBMClassifier
[[1833   15]
 [ 112    5]]

TNR: 0.99188
TPR: 0.04274
TNRxTPR: 0.04239
G-mean: 0.20588
__________________________________________________
Classifier:  XGBClassifier
[[1827   21]
 [ 110    7]]

TNR: 0.98864
TPR: 0.05983
TNRxTPR: 0.05915
G-mean: 0.24321
__________________________________________________
Classifier:  BaggingClassifier
[[1799   49]
 [ 112    5]]

TNR: 0.97348
TPR: 0.04274
TNRxTPR: 0.0416
G-mean: 0.20397
__________________________________________________
Classifier:  RandomForestClassifier
[[1813   35]
 [ 113    4]]

TNR: 0.98106
TPR: 0.03419
TNRxTPR: 0.03354
G-mean: 0.18314
__________________________________________________
Ensemble predictions (majority voting):
[[1833   15]
 [ 113    4]]

TNR: 0.99188
TPR: 0.03419
TNRxTPR: 0.0

## Ratio 1:1  
Let's see classifiers scores when dataset is balanced.  


In [12]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [13]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 7388, 1: 7388})
Ratio->  1 : 1.0


In [14]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1823   25]
 [ 114    3]]

TNR: 0.98647
TPR: 0.02564
TNRxTPR: 0.02529
G-mean: 0.15904
__________________________________________________
Classifier:  LGBMClassifier
[[1825   23]
 [ 113    4]]

TNR: 0.98755
TPR: 0.03419
TNRxTPR: 0.03376
G-mean: 0.18375
__________________________________________________
Classifier:  XGBClassifier
[[1816   32]
 [ 112    5]]

TNR: 0.98268
TPR: 0.04274
TNRxTPR: 0.042
G-mean: 0.20493
__________________________________________________
Classifier:  BaggingClassifier
[[1783   65]
 [ 110    7]]

TNR: 0.96483
TPR: 0.05983
TNRxTPR: 0.05772
G-mean: 0.24026
__________________________________________________
Classifier:  RandomForestClassifier
[[1787   61]
 [ 109    8]]

TNR: 0.96699
TPR: 0.06838
TNRxTPR: 0.06612
G-mean: 0.25714
__________________________________________________
Ensemble predictions (majority voting):
[[1822   26]
 [ 113    4]]

TNR: 0.98593
TPR: 0.03419
TNRxTPR: 0.03

In [15]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [16]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 7388, 1: 7388})
Ratio->  1 : 1.0


In [17]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1663  185]
 [  93   24]]

TNR: 0.89989
TPR: 0.20513
TNRxTPR: 0.18459
G-mean: 0.42964
__________________________________________________
Classifier:  LGBMClassifier
[[1601  247]
 [  79   38]]

TNR: 0.86634
TPR: 0.32479
TNRxTPR: 0.28138
G-mean: 0.53045
__________________________________________________
Classifier:  XGBClassifier
[[1663  185]
 [  95   22]]

TNR: 0.89989
TPR: 0.18803
TNRxTPR: 0.16921
G-mean: 0.41135
__________________________________________________
Classifier:  BaggingClassifier
[[1748  100]
 [ 103   14]]

TNR: 0.94589
TPR: 0.11966
TNRxTPR: 0.11318
G-mean: 0.33643
__________________________________________________
Classifier:  RandomForestClassifier
[[1759   89]
 [ 105   12]]

TNR: 0.95184
TPR: 0.10256
TNRxTPR: 0.09762
G-mean: 0.31245
__________________________________________________
Ensemble predictions (majority voting):
[[1692  156]
 [  96   21]]

TNR: 0.91558
TPR: 0.17949
TNRxTPR: 0.

In [18]:
# VAEOversampler

vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [19]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 7388, 1.0: 7388})
Ratio->  1 : 1.0


In [20]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1837   11]
 [ 114    3]]

TNR: 0.99405
TPR: 0.02564
TNRxTPR: 0.02549
G-mean: 0.15965
__________________________________________________
Classifier:  LGBMClassifier
[[1840    8]
 [ 115    2]]

TNR: 0.99567
TPR: 0.01709
TNRxTPR: 0.01702
G-mean: 0.13046
__________________________________________________
Classifier:  XGBClassifier
[[1828   20]
 [ 112    5]]

TNR: 0.98918
TPR: 0.04274
TNRxTPR: 0.04227
G-mean: 0.2056
__________________________________________________
Classifier:  BaggingClassifier
[[1801   47]
 [ 111    6]]

TNR: 0.97457
TPR: 0.05128
TNRxTPR: 0.04998
G-mean: 0.22356
__________________________________________________
Classifier:  RandomForestClassifier
[[1815   33]
 [ 113    4]]

TNR: 0.98214
TPR: 0.03419
TNRxTPR: 0.03358
G-mean: 0.18324
__________________________________________________
Ensemble predictions (majority voting):
[[1835   13]
 [ 114    3]]

TNR: 0.99297
TPR: 0.02564
TNRxTPR: 0.0

## Under/Oversampling combination  
Now we can tuning the number of instances for each class to optimize metric.  


In [54]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

In [55]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.06)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 7388, 0: 443})
Ratio->  1 : 16.7


In [56]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1032  816]
 [  41   76]]

TNR: 0.55844
TPR: 0.64957
TNRxTPR: 0.36275
G-mean: 0.60229
__________________________________________________
Classifier:  LGBMClassifier
[[1173  675]
 [  50   67]]

TNR: 0.63474
TPR: 0.57265
TNRxTPR: 0.36348
G-mean: 0.6029
__________________________________________________
Classifier:  XGBClassifier
[[1154  694]
 [  45   72]]

TNR: 0.62446
TPR: 0.61538
TNRxTPR: 0.38428
G-mean: 0.61991
__________________________________________________
Classifier:  BaggingClassifier
[[1086  762]
 [  44   73]]

TNR: 0.58766
TPR: 0.62393
TNRxTPR: 0.36666
G-mean: 0.60553
__________________________________________________
Classifier:  RandomForestClassifier
[[990 858]
 [ 38  79]]

TNR: 0.53571
TPR: 0.67521
TNRxTPR: 0.36172
G-mean: 0.60143
__________________________________________________
Ensemble predictions (majority voting):
[[1090  758]
 [  43   74]]

TNR: 0.58983
TPR: 0.63248
TNRxTPR: 0.37305

In [49]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [52]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.1)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 7388, 0: 739})
Ratio->  1 : 10.0


In [53]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[920 928]
 [ 28  89]]

TNR: 0.49784
TPR: 0.76068
TNRxTPR: 0.3787
G-mean: 0.61538
__________________________________________________
Classifier:  LGBMClassifier
[[1104  744]
 [  39   78]]

TNR: 0.5974
TPR: 0.66667
TNRxTPR: 0.39827
G-mean: 0.63109
__________________________________________________
Classifier:  XGBClassifier
[[1218  630]
 [  47   70]]

TNR: 0.65909
TPR: 0.59829
TNRxTPR: 0.39433
G-mean: 0.62796
__________________________________________________
Classifier:  BaggingClassifier
[[1339  509]
 [  65   52]]

TNR: 0.72457
TPR: 0.44444
TNRxTPR: 0.32203
G-mean: 0.56748
__________________________________________________
Classifier:  RandomForestClassifier
[[1330  518]
 [  61   56]]

TNR: 0.7197
TPR: 0.47863
TNRxTPR: 0.34447
G-mean: 0.58692
__________________________________________________
Ensemble predictions (majority voting):
[[1201  647]
 [  46   71]]

TNR: 0.64989
TPR: 0.60684
TNRxTPR: 0.39438
G

In [46]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1.5)



In [47]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.05)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 10847, 0.0: 369})
Ratio->  1 : 29.4


In [48]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1098  750]
 [  43   74]]

TNR: 0.59416
TPR: 0.63248
TNRxTPR: 0.37579
G-mean: 0.61302
__________________________________________________
Classifier:  LGBMClassifier
[[1175  673]
 [  41   76]]

TNR: 0.63582
TPR: 0.64957
TNRxTPR: 0.41301
G-mean: 0.64266
__________________________________________________
Classifier:  XGBClassifier
[[1148  700]
 [  43   74]]

TNR: 0.62121
TPR: 0.63248
TNRxTPR: 0.3929
G-mean: 0.62682
__________________________________________________
Classifier:  BaggingClassifier
[[1248  600]
 [  61   56]]

TNR: 0.67532
TPR: 0.47863
TNRxTPR: 0.32323
G-mean: 0.56854
__________________________________________________
Classifier:  RandomForestClassifier
[[1133  715]
 [  42   75]]

TNR: 0.6131
TPR: 0.64103
TNRxTPR: 0.39301
G-mean: 0.6269
__________________________________________________
Ensemble predictions (majority voting):
[[1160  688]
 [  45   72]]

TNR: 0.62771
TPR: 0.61538
TNRxTPR: 0.386

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
