# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

In [None]:
!pip install catboost

Import the required packages, classifiers, etc.

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from collections import Counter

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load one of the datasets from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

# Key of the dataset to pick from the collection returned by fetch_datasets().
dset_name = 'satimage'
dset = fetch_datasets()[dset_name]
# NOTE(review): this expression is not the last statement of the cell, so its
# value is computed and discarded rather than displayed.
dset.data.shape

# Class distribution as sorted (label, count) pairs.
print(sorted(Counter(dset.target).items()))

[(-1, 5809), (1, 626)]


In [5]:
# Display the raw feature matrix of the loaded dataset.
dset.data

array([[ 92., 115., 120., ..., 107., 113.,  87.],
       [ 84., 102., 106., ...,  99., 104.,  79.],
       [ 84., 102., 102., ...,  99., 104.,  79.],
       ...,
       [ 56.,  68.,  91., ...,  83.,  92.,  74.],
       [ 56.,  68.,  87., ...,  83.,  92.,  70.],
       [ 60.,  71.,  91., ...,  79., 108.,  92.]])

In [6]:
# Wrap the features in a DataFrame. `y` is bound to the very same array object
# as `dset.target` (plain assignment, no copy is made here).
X = pd.DataFrame(dset.data)
y = dset.target

In [7]:
# Recode the majority label from -1 to 0 so the classes are {0, 1}.
# np.where builds a new array, so the array `y` was bound to (`dset.target`)
# is not modified in place and the cell can be re-run safely.
y = np.where(y == -1, 0, y)
# Attach the labels as a 'Class' column so features and labels are split together.
X['Class'] = y

We split data into train and test partitions.

In [8]:
# Hold out 20% of the rows as the test partition, stratifying on the 'Class'
# column and fixing the seed so the split is reproducible.
dtrain, dtest = train_test_split(X, test_size=0.2, random_state=42, stratify=X.Class)

This is a simple helper that randomly undersamples the non-minority class by an arbitrary fraction.  

In [9]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every row that does not belong to the minority class.

    Parameters
    ----------
    X_res : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix.
    y_res : array-like of shape (n_samples,)
        Class labels, matched to the rows of ``X_res`` by position.
    frac : float, default=1
        Fraction of the non-minority rows to keep.
    minority_class_id : scalar, default=1
        Label of the class whose rows are all kept.
    random_state : int, default=42
        Seed used for the undersampling and for the shuffle of the minority rows.

    Returns
    -------
    X_eq : DataFrame
        Features of the kept rows (sampled non-minority rows first, then the
        shuffled minority rows), with a fresh 0..n-1 index.
    y_eq : Series
        Labels of the kept rows, named ``'Class'``.
    """
    # Use positional labels and a private copy of the features: the caller's
    # frame never gains a 'Class' column, and a Series whose index differs from
    # the features cannot be silently misaligned.
    labels = np.asarray(y_res)
    data = pd.DataFrame(X_res).copy()
    data['Class'] = labels

    is_minority = labels == minority_class_id
    X_neg = data[~is_minority].sample(frac=frac, random_state=random_state)
    # Seed this shuffle too; unseeded, the row order changed on every call.
    X_pos = data[is_minority].sample(frac=1, random_state=random_state)

    X_rus = pd.concat([X_neg, X_pos], ignore_index=True)

    X_eq = X_rus.drop('Class', axis=1)
    y_eq = X_rus['Class']

    return X_eq, y_eq

In [10]:
def _report_rates(y_true, y_pred):
    """Print the confusion matrix and the TNR/TPR-based scores for 0/1 labels."""
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    print('')
    # Read as: row 0 = true negatives / false positives, row 1 = false negatives / true positives.
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42, classifiers=None):
    """Fit several classifiers, report each one and their majority-vote ensemble.

    Parameters
    ----------
    X, y : training features and 0/1 labels.
    Xt, yt : evaluation features and 0/1 labels.
    random_state : int, default=42
        Seed handed to every default classifier.
    classifiers : dict of {name: estimator}, optional
        Estimators exposing ``fit`` and ``predict``. When omitted, the default
        set is used: CatBoost, LightGBM, XGBoost, Bagging and RandomForest.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If ``classifiers`` is given but empty.
    """
    if classifiers is None:
        classifiers = {
            "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
            "LGBMClassifier": LGBMClassifier(random_state=random_state),
            "XGBClassifier": XGBClassifier(random_state=random_state),
            "BaggingClassifier": BaggingClassifier(random_state=random_state),
            "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
        }
    if not classifiers:
        raise ValueError("train_val needs at least one classifier")

    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so that a column-vector prediction cannot broadcast when the votes are summed.
        y_pred = np.ravel(classifier.predict(Xt))
        _report_rates(yt, y_pred)
        predictions.append(y_pred)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # A sample is positive when strictly more than half of the classifiers vote 1.
    # With the five default classifiers this is the former hard-coded `>= 3`;
    # with an even number of classifiers a tie resolves to class 0.
    votes = np.sum(predictions, axis=0)
    majority = (2 * votes > len(predictions)).astype(int)
    _report_rates(yt, majority)
    

In [11]:
# Separate features and labels for the train and test partitions.
# NOTE: `X` and `y` are rebound here; from this cell on they refer to the
# training partition only, not to the full dataset built earlier.
X = dtrain.drop('Class', axis=1)
y = dtrain['Class']
Xt = dtest.drop('Class', axis=1)
yt = dtest['Class']

## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [12]:
# Configure the VAE oversampler and resample the training partition.
# NOTE(review): the test partition (Xt, yt) is passed as `validation_data`.
# If VAEOversampler uses it for early stopping or model selection, test
# information leaks into the resampled training set — confirm, or carve a
# validation set out of the training data instead.
vae_sampler = VAEOversampler(epochs=50,
                              intermediate_dim=512,
                              batch_size=64,
                              rescale=True,
                              random_state=42,
                              verbose=False)
Xres, yres = vae_sampler.fit_resample(X, y, validation_data=[Xt, yt])

# Class counts after resampling and the minority-to-majority ratio.
print('Resampled dataset shape %s' % Counter(yres))
print('Ratio->  1 :', round(Counter(yres)[1]/Counter(yres)[0], 1))

Resampled dataset shape Counter({1.0: 4647, 0.0: 4647})
Ratio->  1 : 1.0


In [13]:
# Train every classifier on the resampled training set and report the scores
# obtained on the test partition.
train_val(Xres, yres, Xt, yt)

__________________________________________________
Classifier:  CatBoostClassifier
[[1142   20]
 [  46   79]]

TNR: 0.98279
TPR: 0.632
TNRxTPR: 0.62112
G-mean: 0.78811
__________________________________________________
Classifier:  LGBMClassifier
[[1135   27]
 [  46   79]]

TNR: 0.97676
TPR: 0.632
TNRxTPR: 0.61731
G-mean: 0.78569
__________________________________________________
Classifier:  XGBClassifier
[[1138   24]
 [  42   83]]

TNR: 0.97935
TPR: 0.664
TNRxTPR: 0.65029
G-mean: 0.8064
__________________________________________________
Classifier:  BaggingClassifier
[[1141   21]
 [  63   62]]

TNR: 0.98193
TPR: 0.496
TNRxTPR: 0.48704
G-mean: 0.69788
__________________________________________________
Classifier:  RandomForestClassifier
[[1149   13]
 [  58   67]]

TNR: 0.98881
TPR: 0.536
TNRxTPR: 0.53
G-mean: 0.72801
__________________________________________________
Ensemble predictions (majority voting):
[[1143   19]
 [  50   75]]

TNR: 0.98365
TPR: 0.6
TNRxTPR: 0.59019
G-mean: 0.76824

## Under/Oversampling combination  
Now we can tune the number of instances for each class to optimize the metric.  


In [14]:
# Draw a new resampled training set with sampling_strategy=0.6.
# NOTE(review): presumably this reuses the model fitted by fit_resample above
# instead of retraining — confirm against VAEOversampler.
Xres, yres = vae_sampler.resample(X, y, sampling_strategy=.6)



In [15]:
# Random undersampling (RUS): keep 15% of the non-minority rows and every
# minority row of the oversampled set.

X_eq, y_eq = RUS(Xres, yres, frac=.15)

# Class counts after undersampling and the minority-to-majority ratio.
print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 2988, 0.0: 697})
Ratio->  1 : 4.3


In [16]:
# Train every classifier on the over-/undersampled set and report the scores
# obtained on the test partition.
train_val(X_eq, y_eq, Xt, yt)

__________________________________________________
Classifier:  CatBoostClassifier
[[1023  139]
 [  16  109]]

TNR: 0.88038
TPR: 0.872
TNRxTPR: 0.76769
G-mean: 0.87618
__________________________________________________
Classifier:  LGBMClassifier
[[1013  149]
 [  11  114]]

TNR: 0.87177
TPR: 0.912
TNRxTPR: 0.79506
G-mean: 0.89166
__________________________________________________
Classifier:  XGBClassifier
[[1028  134]
 [   8  117]]

TNR: 0.88468
TPR: 0.936
TNRxTPR: 0.82806
G-mean: 0.90998
__________________________________________________
Classifier:  BaggingClassifier
[[1032  130]
 [  24  101]]

TNR: 0.88812
TPR: 0.808
TNRxTPR: 0.7176
G-mean: 0.84712
__________________________________________________
Classifier:  RandomForestClassifier
[[1026  136]
 [  16  109]]

TNR: 0.88296
TPR: 0.872
TNRxTPR: 0.76994
G-mean: 0.87746
__________________________________________________
Ensemble predictions (majority voting):
[[1035  127]
 [  14  111]]

TNR: 0.89071
TPR: 0.888
TNRxTPR: 0.79095
G-mean: 0.88935

XGBClassifier  
  - G-mean: **0.90998**  


https://imbalanced-learn.org/stable/auto_examples/ensemble/plot_comparison_ensemble_classifier.html#sphx-glr-auto-examples-ensemble-plot-comparison-ensemble-classifier-py  

On this page we can compare our results on the 'satimage' dataset with balanced versions of classical algorithms such as **BalancedBaggingClassifier** and **BalancedRandomForestClassifier**.  


https://imbalanced-learn.org/stable/combine.html  
Here we find two variants of SMOTE that combine over- and under-sampling: **SMOTEENN** and **SMOTETomek**.

## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
