# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


Import packages, classifiers, etc.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

# Pick one of the benchmark datasets by name.
dset_name = 'protein_homo'
dset = fetch_datasets()[dset_name]

# Standardize the features and take the labels as they come.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later, so test rows influence the scaling.
X = StandardScaler().fit_transform(dset.data)
y = dset.target

In [5]:
# Peek at the standardized feature matrix.
X

array([[-4.82492993e-01,  1.36726796e+00,  9.56085932e-02, ...,
        -7.66763046e-01, -2.90876703e-03,  1.13706209e+00],
       [-1.66574940e-01,  1.50849637e+00, -1.45710580e-01, ...,
         6.40459866e-01,  6.90526581e-01,  1.07701990e+00],
       [ 8.33832229e-01,  1.71239795e-01, -8.77712073e-01, ...,
        -1.14885668e+00, -2.90876703e-03,  9.63308956e-02],
       ...,
       [ 2.04485143e+00,  9.98882151e+00,  3.35746219e+01, ...,
         2.73731520e+00,  9.57232484e-01,  1.17709021e+00],
       [ 1.28928076e+00, -7.50242054e-03,  1.34242432e+00, ...,
         2.24338927e+00,  1.49064429e+00,  1.15707615e+00],
       [ 1.38668882e+00,  6.25818764e-01,  4.55196933e+00, ...,
        -9.81108258e-01, -5.36320573e-01,  7.63168342e-02]])

In [6]:
# Map the label -1 to 0 so the classes are {0, 1}. np.where builds a new
# array, so `dset.target` (which `y` aliased) is not modified in place.
y = np.where(y == -1, 0, y)

We split data into train and test partitions.

In [7]:
# Hold out 20% of the samples for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

This is a simple helper that randomly undersamples the majority class by a chosen fraction.  

In [8]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every class except the minority one.

    Parameters
    ----------
    X_res : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix. It is never modified.
    y_res : array-like of shape (n_samples,)
        Labels, matched to the rows of ``X_res`` by position (a pandas
        index on ``y_res`` is ignored).
    frac : float, default=1
        Fraction of the non-minority rows to keep.
    minority_class_id : scalar, default=1
        Label of the class whose rows are all kept.
    random_state : int, default=42
        Seed used for both sampling steps.

    Returns
    -------
    X_eq : DataFrame
        Sampled non-minority rows followed by all minority rows (shuffled),
        with a fresh 0..n-1 index.
    y_eq : Series
        Matching labels, named ``'Class'``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` have different lengths.
    """
    # Copy DataFrame inputs so the helper 'Class' column added below can never
    # show up in the caller's frame.
    if isinstance(X_res, pd.DataFrame):
        frame = X_res.copy()
    else:
        frame = pd.DataFrame(X_res)

    # Work with positional labels: index-aligned assignment and masking would
    # give NaN labels or fail when y_res is a Series with a different index,
    # and a plain list cannot be compared element-wise at all.
    labels = np.asarray(y_res).ravel()
    if len(labels) != len(frame):
        raise ValueError(
            'X_res and y_res must have the same length, got %d and %d'
            % (len(frame), len(labels))
        )
    frame['Class'] = labels

    is_minority = labels == minority_class_id
    X_neg = frame[~is_minority].sample(frac=frac, random_state=random_state)
    # frac=1 keeps every minority row and only shuffles their order.
    X_pos = frame[is_minority].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [9]:
def _binary_rates(y_true, y_pred):
    """Return ``(confusion_matrix, TNR, TPR)`` for labels encoded as 0/1."""
    # Pinning the label order keeps the matrix 2x2 even when one class is
    # absent from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    # A class without test samples has an undefined rate; report NaN rather
    # than dividing by zero.
    tnr = tn / (tn + fp) if (tn + fp) else float('nan')
    tpr = tp / (tp + fn) if (tp + fn) else float('nan')
    return cm, tnr, tpr


def _print_rates(tnr, tpr):
    """Print TNR, TPR, their product and the G-mean, rounded to 5 digits."""
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit five classifiers and print their test metrics plus a majority vote.

    Each classifier (CatBoost, LightGBM, XGBoost, Bagging, RandomForest) is
    fitted on ``(X, y)`` and evaluated on ``(Xt, yt)``. For every model, and
    then for the majority-vote ensemble, the confusion matrix, TNR, TPR,
    TNR x TPR and the G-mean are printed.

    Labels are expected to be binary and encoded as 0 (negative) and
    1 (positive): the votes are counted by summing the predictions.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : test features and labels.
    random_state : int, default=42
        Seed passed to every classifier.

    Returns
    -------
    None
        The results are only printed.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so every model contributes a 1-D vote vector, in case an
        # estimator returns its predictions as a column vector.
        y_pred = np.ravel(classifier.predict(Xt))
        cm, tnr, tpr = _binary_rates(yt, y_pred)
        print(cm)
        print('')
        predictions.append(y_pred)
        _print_rates(tnr, tpr)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # Each prediction is 0 or 1, so the column sums are the number of models
    # voting for the positive class.
    votes = np.sum(predictions, axis=0)
    # Strict majority derived from the number of models (3 out of 5 here)
    # instead of a hard-coded threshold.
    majority = len(classifiers) // 2 + 1
    ensemble = (votes >= majority).astype(int)

    cm, tnr, tpr = _binary_rates(yt, ensemble)
    print(cm)
    print('')
    _print_rates(tnr, tpr)


## Without resampling (baseline)
What is the starting score?


In [10]:
# Class counts of the full dataset and the majority-to-minority ratio.
print(f'Original dataset shape {Counter(y)}')
print('Ratio->', round(Counter(y)[0] / Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 144455, 1: 1296})
Ratio-> 111.5 : 1


In [11]:
# Baseline: train on the original, imbalanced training split.
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28882    10]
 [   47   212]]

TNR: 0.99965
TPR: 0.81853
TNRxTPR: 0.81825
G-mean: 0.90457
__________________________________________________
Classifier:  LGBMClassifier
[[28870    22]
 [   46   213]]

TNR: 0.99924
TPR: 0.82239
TNRxTPR: 0.82177
G-mean: 0.90651
__________________________________________________
Classifier:  XGBClassifier
[[28886     6]
 [   52   207]]

TNR: 0.99979
TPR: 0.79923
TNRxTPR: 0.79906
G-mean: 0.8939
__________________________________________________
Classifier:  BaggingClassifier
[[28884     8]
 [   60   199]]

TNR: 0.99972
TPR: 0.76834
TNRxTPR: 0.76813
G-mean: 0.87643
__________________________________________________
Classifier:  RandomForestClassifier
[[28888     4]
 [   59   200]]

TNR: 0.99986
TPR: 0.7722
TNRxTPR: 0.77209
G-mean: 0.87869
__________________________________________________
Ensemble predictions (majority voting):
[[28886     6]
 [   53   206]]

TNR: 0.99979
TP

## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [12]:
# SMOTE
# sampling_strategy is the ratio N_rm / N_M; a value of 1 asks for a
# fully balanced training set.
sm = SMOTE(
    sampling_strategy=1,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [13]:
# Class counts after resampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_res)}')
print('Ratio->  1 :', round(Counter(y_res)[1] / Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 115563, 1: 115563})
Ratio->  1 : 1.0


In [14]:
# Train on the SMOTE-balanced data, evaluate on the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28838    54]
 [   27   232]]

TNR: 0.99813
TPR: 0.89575
TNRxTPR: 0.89408
G-mean: 0.94556
__________________________________________________
Classifier:  LGBMClassifier
[[28773   119]
 [   25   234]]

TNR: 0.99588
TPR: 0.90347
TNRxTPR: 0.89975
G-mean: 0.94855
__________________________________________________
Classifier:  XGBClassifier
[[28844    48]
 [   33   226]]

TNR: 0.99834
TPR: 0.87259
TNRxTPR: 0.87114
G-mean: 0.93335
__________________________________________________
Classifier:  BaggingClassifier
[[28811    81]
 [   42   217]]

TNR: 0.9972
TPR: 0.83784
TNRxTPR: 0.83549
G-mean: 0.91405
__________________________________________________
Classifier:  RandomForestClassifier
[[28870    22]
 [   38   221]]

TNR: 0.99924
TPR: 0.85328
TNRxTPR: 0.85263
G-mean: 0.92338
__________________________________________________
Ensemble predictions (majority voting):
[[28853    39]
 [   32   227]]

TNR: 0.99865
T

In [15]:
# ROS
# sampling_strategy is the ratio N_rm / N_M; a value of 1 asks for a
# fully balanced training set.
ros = RandomOverSampler(
    sampling_strategy=1,
    random_state=42,
)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [16]:
# Class counts after resampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_res)}')
print('Ratio->  1 :', round(Counter(y_res)[1] / Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 115563, 1: 115563})
Ratio->  1 : 1.0


In [17]:
# Train on the randomly oversampled data, evaluate on the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28850    42]
 [   30   229]]

TNR: 0.99855
TPR: 0.88417
TNRxTPR: 0.88288
G-mean: 0.93962
__________________________________________________
Classifier:  LGBMClassifier
[[28841    51]
 [   27   232]]

TNR: 0.99823
TPR: 0.89575
TNRxTPR: 0.89417
G-mean: 0.94561
__________________________________________________
Classifier:  XGBClassifier
[[28874    18]
 [   36   223]]

TNR: 0.99938
TPR: 0.861
TNRxTPR: 0.86047
G-mean: 0.92761
__________________________________________________
Classifier:  BaggingClassifier
[[28871    21]
 [   60   199]]

TNR: 0.99927
TPR: 0.76834
TNRxTPR: 0.76778
G-mean: 0.87623
__________________________________________________
Classifier:  RandomForestClassifier
[[28885     7]
 [   55   204]]

TNR: 0.99976
TPR: 0.78764
TNRxTPR: 0.78745
G-mean: 0.88739
__________________________________________________
Ensemble predictions (majority voting):
[[28879    13]
 [   34   225]]

TNR: 0.99955
TP

In [18]:
# VAEOversampler
# Configure the VAE-based oversampler, then fit it and resample the
# training split in one call.
vae_sampler = VAEOversampler(
    epochs=500,
    intermediate_dim=512,
    batch_size=64,
    random_state=42,
    verbose=False,
)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)



In [19]:
# Class counts after resampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_res)}')
print('Ratio->  1 :', round(Counter(y_res)[1] / Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 115563, 1.0: 115563})
Ratio->  1 : 1.0


In [20]:
# Train on the VAE-oversampled data, evaluate on the untouched test split.
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28884     8]
 [   53   206]]

TNR: 0.99972
TPR: 0.79537
TNRxTPR: 0.79515
G-mean: 0.89171
__________________________________________________
Classifier:  LGBMClassifier
[[28885     7]
 [   49   210]]

TNR: 0.99976
TPR: 0.81081
TNRxTPR: 0.81061
G-mean: 0.90034
__________________________________________________
Classifier:  XGBClassifier
[[28884     8]
 [   50   209]]

TNR: 0.99972
TPR: 0.80695
TNRxTPR: 0.80673
G-mean: 0.89818
__________________________________________________
Classifier:  BaggingClassifier
[[28884     8]
 [   58   201]]

TNR: 0.99972
TPR: 0.77606
TNRxTPR: 0.77585
G-mean: 0.88082
__________________________________________________
Classifier:  RandomForestClassifier
[[28888     4]
 [   61   198]]

TNR: 0.99986
TPR: 0.76448
TNRxTPR: 0.76437
G-mean: 0.87428
__________________________________________________
Ensemble predictions (majority voting):
[[28887     5]
 [   50   209]]

TNR: 0.99983


## Under/Oversampling combination  
Now we can tune the number of instances in each class to optimize the metric.  


In [21]:
# SMOTE
# sampling_strategy is the ratio N_rm / N_M; 0.6 stops short of a
# fully balanced set.
sm = SMOTE(
    sampling_strategy=.6,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [22]:
# RUS
# Keep 7% of the non-minority rows of the SMOTE output.
X_eq, y_eq = RUS(X_res, y_res, frac=0.07)

# Class counts after undersampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_eq)}')
print('Ratio->  1 :', round(Counter(y_eq)[1] / Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 69337, 0: 8089})
Ratio->  1 : 8.6


In [23]:
# Train on the SMOTE + RUS data, evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28058   834]
 [    7   252]]

TNR: 0.97113
TPR: 0.97297
TNRxTPR: 0.94489
G-mean: 0.97205
__________________________________________________
Classifier:  LGBMClassifier
[[28285   607]
 [   14   245]]

TNR: 0.97899
TPR: 0.94595
TNRxTPR: 0.92607
G-mean: 0.96233
__________________________________________________
Classifier:  XGBClassifier
[[28350   542]
 [    9   250]]

TNR: 0.98124
TPR: 0.96525
TNRxTPR: 0.94714
G-mean: 0.97321
__________________________________________________
Classifier:  BaggingClassifier
[[28164   728]
 [   26   233]]

TNR: 0.9748
TPR: 0.89961
TNRxTPR: 0.87695
G-mean: 0.93645
__________________________________________________
Classifier:  RandomForestClassifier
[[28398   494]
 [   19   240]]

TNR: 0.9829
TPR: 0.92664
TNRxTPR: 0.9108
G-mean: 0.95436
__________________________________________________
Ensemble predictions (majority voting):
[[28328   564]
 [   13   246]]

TNR: 0.98048
TPR

In [29]:
# ROS
# sampling_strategy is the ratio N_rm / N_M; 0.7 stops short of a
# fully balanced set.
ros = RandomOverSampler(
    sampling_strategy=.7,
    random_state=42,
)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [30]:
# RUS
# Keep 5% of the non-minority rows of the ROS output.
X_eq, y_eq = RUS(X_res, y_res, frac=0.05)

# Class counts after undersampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_eq)}')
print('Ratio->  1 :', round(Counter(y_eq)[1] / Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 80894, 0: 5778})
Ratio->  1 : 14.0


In [31]:
# Train on the ROS + RUS data, evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[27920   972]
 [    6   253]]

TNR: 0.96636
TPR: 0.97683
TNRxTPR: 0.94397
G-mean: 0.97158
__________________________________________________
Classifier:  LGBMClassifier
[[28440   452]
 [   15   244]]

TNR: 0.98436
TPR: 0.94208
TNRxTPR: 0.92735
G-mean: 0.96299
__________________________________________________
Classifier:  XGBClassifier
[[28482   410]
 [   13   246]]

TNR: 0.98581
TPR: 0.94981
TNRxTPR: 0.93633
G-mean: 0.96764
__________________________________________________
Classifier:  BaggingClassifier
[[28237   655]
 [   27   232]]

TNR: 0.97733
TPR: 0.89575
TNRxTPR: 0.87545
G-mean: 0.93565
__________________________________________________
Classifier:  RandomForestClassifier
[[28721   171]
 [   27   232]]

TNR: 0.99408
TPR: 0.89575
TNRxTPR: 0.89045
G-mean: 0.94364
__________________________________________________
Ensemble predictions (majority voting):
[[28528   364]
 [   15   244]]

TNR: 0.9874
T

In [67]:
# VAEOversampler

# NOTE(review): this uses the `vae_sampler` instance created and fitted in the
# earlier VAEOversampler cell, so that cell must have been run first.
# Presumably `resample` draws from the already-trained VAE instead of
# retraining it — confirm against the VAEOversampler implementation.
X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=.7)



In [72]:
# RUS
# Keep 1.3% of the non-minority rows of the VAE output.
X_eq, y_eq = RUS(X_res, y_res, frac=0.013)

# Class counts after undersampling and the minority-to-majority ratio.
print(f'Resampled dataset shape {Counter(y_eq)}')
print('Ratio->  1 :', round(Counter(y_eq)[1] / Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 81205, 0.0: 1502})
Ratio->  1 : 54.1


In [73]:
# Train on the VAE + RUS data, evaluate on the untouched test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[28040   852]
 [   10   249]]

TNR: 0.97051
TPR: 0.96139
TNRxTPR: 0.93304
G-mean: 0.96594
__________________________________________________
Classifier:  LGBMClassifier
[[28105   787]
 [   15   244]]

TNR: 0.97276
TPR: 0.94208
TNRxTPR: 0.91642
G-mean: 0.9573
__________________________________________________
Classifier:  XGBClassifier
[[28122   770]
 [    8   251]]

TNR: 0.97335
TPR: 0.96911
TNRxTPR: 0.94328
G-mean: 0.97123
__________________________________________________
Classifier:  BaggingClassifier
[[27839  1053]
 [   15   244]]

TNR: 0.96355
TPR: 0.94208
TNRxTPR: 0.90775
G-mean: 0.95276
__________________________________________________
Classifier:  RandomForestClassifier
[[28383   509]
 [   17   242]]

TNR: 0.98238
TPR: 0.93436
TNRxTPR: 0.9179
G-mean: 0.95807
__________________________________________________
Ensemble predictions (majority voting):
[[28266   626]
 [   12   247]]

TNR: 0.97833
TP

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
