# Dealing with imbalanced datasets: combining VAE oversampling with undersampling to improve recognition of all classes  

In [None]:
!pip install catboost

Import packages, classifiers, etc.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

Import VAEOversampler.

In [3]:
from VAEOversampler import VAEOversampler

## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [4]:
from imblearn.datasets import fetch_datasets

# Name of the benchmark dataset to load.
dset_name = 'satimage'
# Restrict the fetch to the requested dataset instead of loading the whole
# benchmark collection just to index a single entry.
dset = fetch_datasets(filter_data=(dset_name,))[dset_name]

# Class distribution of the raw labels.
print(sorted(Counter(dset.target).items()))

[(-1, 5809), (1, 626)]


In [5]:
# Peek at the raw feature matrix of the loaded dataset.
dset.data

array([[ 92., 115., 120., ..., 107., 113.,  87.],
       [ 84., 102., 106., ...,  99., 104.,  79.],
       [ 84., 102., 102., ...,  99., 104.,  79.],
       ...,
       [ 56.,  68.,  91., ...,  83.,  92.,  74.],
       [ 56.,  68.,  87., ...,  83.,  92.,  70.],
       [ 60.,  71.,  91., ...,  79., 108.,  92.]])

In [6]:
# Features as a DataFrame (default integer column names); labels as provided by the loader.
X = pd.DataFrame(dset.data)
y = dset.target  # plain assignment: `y` refers to the same object as `dset.target`

In [7]:
# Relabel the -1 class as 0. Rebinding `y` to a new array (instead of assigning
# in place) leaves the label array held by `dset.target` untouched.
y = np.where(y == -1, 0, y)

In [8]:
# Count the labels once and reuse the tally for both lines of the summary.
class_counts = Counter(y)
print('Original dataset shape %s' % class_counts)
print('Ratio->', round(class_counts[0] / class_counts[1], 1), ': 1')

Original dataset shape Counter({0: 5809, 1: 626})
Ratio-> 9.3 : 1


We split data into train and test partitions.

In [9]:
# Hold out 20% of the rows for testing; stratify on `y` so both partitions
# keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

This is a simple helper for random undersampling (RUS): it keeps a chosen fraction of the majority-class rows and all of the minority-class rows.  

In [10]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample every class except the minority one.

    Parameters
    ----------
    X_res : array-like or pandas.DataFrame, shape (n_samples, n_features)
        Feature matrix. It is never modified.
    y_res : array-like of length n_samples
        Class labels, matched to the rows of ``X_res`` by position.
    frac : float, default 1
        Fraction of the non-minority rows to keep.
    minority_class_id : scalar, default 1
        Label of the class whose rows are all kept.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for both draws.

    Returns
    -------
    X_eq : pandas.DataFrame
        Sampled non-minority rows followed by all minority rows (shuffled),
        with a fresh 0..n-1 index and the same columns as ``X_res``.
    y_eq : pandas.Series
        Labels of the rows in ``X_eq``, named ``'Class'``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` do not have the same number of rows.
    """
    # Work on a re-indexed frame so rows and labels can be paired by position;
    # the labels are kept in a separate Series so that no helper column is ever
    # written into the feature frame (a feature named 'Class' stays intact).
    features = pd.DataFrame(X_res).reset_index(drop=True)
    labels = pd.Series(np.asarray(y_res).ravel(), name='Class')

    if len(features) != len(labels):
        raise ValueError(
            'X_res and y_res must have the same number of rows '
            '(got %d and %d)' % (len(features), len(labels))
        )

    is_minority = (labels == minority_class_id).to_numpy()

    kept_majority = features[~is_minority].sample(frac=frac, random_state=random_state)
    kept_minority = features[is_minority].sample(frac=1, random_state=random_state)

    X_eq = pd.concat([kept_majority, kept_minority], ignore_index=True)
    # Look the labels up through the sampled row indices so they stay aligned.
    y_eq = pd.concat(
        [labels.loc[kept_majority.index], labels.loc[kept_minority.index]],
        ignore_index=True,
    )

    return X_eq, y_eq

In [11]:
def _print_binary_rates(cm):
    """Print TNR, TPR, their product and the G-mean of a 2x2 confusion matrix."""
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit several classifiers and print their scores on a held-out set.

    Each classifier is fitted on ``(X, y)`` and evaluated on ``(Xt, yt)``.
    For every model, and for the majority vote over all of them, the
    confusion matrix, TNR, TPR, TNR x TPR and G-mean are printed.

    Labels are expected to be 0 (negative) and 1 (positive): the confusion
    matrix is indexed as ``[[tn, fp], [fn, tp]]`` and the vote is obtained by
    summing the 0/1 predictions.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : evaluation features and labels.
    random_state : int, default 42
        Seed given to every classifier.

    Returns
    -------
    None
        Results are only printed.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }
    # Fix the label order so the matrix is always 2x2, even if one class is
    # absent from both the targets and the predictions.
    class_labels = [0, 1]

    predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten so that every model contributes a 1-D vote vector.
        y_pred = np.asarray(classifier.predict(Xt)).ravel()
        cm = confusion_matrix(yt, y_pred, labels=class_labels)
        print(cm)
        print('')
        predictions.append(y_pred)
        _print_binary_rates(cm)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    # Strict majority derived from the number of models (3 of 5 here) rather
    # than a hard-coded threshold.
    majority = len(classifiers) // 2 + 1
    votes = np.sum(predictions, axis=0)
    ensemble_pred = (votes >= majority).astype(int)

    cm = confusion_matrix(yt, ensemble_pred, labels=class_labels)
    print(cm)
    print('')
    _print_binary_rates(cm)
    

## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [13]:
# Configure the VAE-based oversampler with a fixed seed.
vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              rescale=True,
                              random_state=42,
                              verbose=False)
# NOTE(review): validation_data is the held-out test split. If the sampler uses
# it for anything beyond monitoring (e.g. early stopping or model selection),
# the test scores reported later are optimistic — confirm against VAEOversampler.
Xres, yres = vae_sampler.fit_resample(X_train, y_train, validation_data=[X_test, y_test])

In [14]:
# Tally the resampled labels once and reuse the counts for both lines.
resampled_counts = Counter(yres)
print('Resampled dataset shape %s' % resampled_counts)
print('Ratio->  1 :', round(resampled_counts[1] / resampled_counts[0], 1))

Resampled dataset shape Counter({1.0: 4647, 0.0: 4647})
Ratio->  1 : 1.0


In [15]:
# Fit every classifier on the resampled training set and report scores on the test split.
train_val(Xres, yres, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1143   19]
 [  45   80]]

TNR: 0.98365
TPR: 0.64
TNRxTPR: 0.62954
G-mean: 0.79343
__________________________________________________
Classifier:  LGBMClassifier
[[1137   25]
 [  45   80]]

TNR: 0.97849
TPR: 0.64
TNRxTPR: 0.62623
G-mean: 0.79135
__________________________________________________
Classifier:  XGBClassifier
[[1142   20]
 [  43   82]]

TNR: 0.98279
TPR: 0.656
TNRxTPR: 0.64471
G-mean: 0.80294
__________________________________________________
Classifier:  BaggingClassifier
[[1144   18]
 [  62   63]]

TNR: 0.98451
TPR: 0.504
TNRxTPR: 0.49619
G-mean: 0.70441
__________________________________________________
Classifier:  RandomForestClassifier
[[1149   13]
 [  59   66]]

TNR: 0.98881
TPR: 0.528
TNRxTPR: 0.52209
G-mean: 0.72256
__________________________________________________
Ensemble predictions (majority voting):
[[1143   19]
 [  51   74]]

TNR: 0.98365
TPR: 0.592
TNRxTPR: 0.58232
G-mean: 

## Under/Oversampling combination  
Now we can tune the number of instances of each class to optimize the metric.  


In [60]:
# Draw a new resampled training set from the sampler created earlier; this
# rebinds Xres / yres, replacing the 1:1 set used in the previous section.
# NOTE(review): presumably sampling_strategy=.7 targets a 0.7 minority/majority
# ratio — confirm against VAEOversampler.resample.
Xres, yres = vae_sampler.resample(X_train, y_train, sampling_strategy=.7)

In [65]:
# RUS: keep 14.5% of the majority rows and every minority row.

X_eq, y_eq = RUS(Xres, yres, frac=.145)

# Tally the labels once and reuse the counts for both lines of the summary.
eq_counts = Counter(y_eq)
print('Resampled dataset shape %s' % eq_counts)
print('Ratio->  1 :', round(eq_counts[1] / eq_counts[0], 1))

Resampled dataset shape Counter({1.0: 3403, 0.0: 674})
Ratio->  1 : 5.0


In [66]:
# Fit every classifier on the over-/under-sampled training set and report scores on the test split.
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[1014  148]
 [  16  109]]

TNR: 0.87263
TPR: 0.872
TNRxTPR: 0.76094
G-mean: 0.87232
__________________________________________________
Classifier:  LGBMClassifier
[[1008  154]
 [   7  118]]

TNR: 0.86747
TPR: 0.944
TNRxTPR: 0.81889
G-mean: 0.90493
__________________________________________________
Classifier:  XGBClassifier
[[1021  141]
 [  10  115]]

TNR: 0.87866
TPR: 0.92
TNRxTPR: 0.80836
G-mean: 0.89909
__________________________________________________
Classifier:  BaggingClassifier
[[1034  128]
 [  23  102]]

TNR: 0.88985
TPR: 0.816
TNRxTPR: 0.72611
G-mean: 0.85212
__________________________________________________
Classifier:  RandomForestClassifier
[[1033  129]
 [  14  111]]

TNR: 0.88898
TPR: 0.888
TNRxTPR: 0.78942
G-mean: 0.88849
__________________________________________________
Ensemble predictions (majority voting):
[[1027  135]
 [  11  114]]

TNR: 0.88382
TPR: 0.912
TNRxTPR: 0.80604
G-mean:

LGBMClassifier  
  - G-mean: **0.90493**  


https://imbalanced-learn.org/stable/auto_examples/ensemble/plot_comparison_ensemble_classifier.html#sphx-glr-auto-examples-ensemble-plot-comparison-ensemble-classifier-py  

On this page we can compare our results on the 'satimage' dataset with balanced versions of classical algorithms such as **BalancedBaggingClassifier** and **BalancedRandomForestClassifier**.  


https://imbalanced-learn.org/stable/combine.html  
Here we find two variants of SMOTE that combine over- and under-sampling: **SMOTEENN** and **SMOTETomek**.

## Cross Validation  

In [67]:
# Wrap the G-mean metric so cross_validate can use it as a scorer.
g_mean = make_scorer(geometric_mean_score)

clf = LGBMClassifier(random_state=42)

# NOTE(review): the folds are drawn from X_eq / y_eq, i.e. from data that has
# already been over- and under-sampled, so validation folds contain resampled
# rows; the CV score may therefore be optimistic compared with the untouched
# test split.
cv_results = cross_validate(
    clf,
    X_eq,
    y_eq,
    scoring=g_mean,
    cv=10,
    return_estimator=True,  # keep the fitted model of every fold
    n_jobs=-1,
)

print(f"G-mean CV: {cv_results['test_score'].mean():.3f} (+/-{cv_results['test_score'].std():.3f})")

G-mean CV: 0.927 (+/-0.019)


In [68]:
# Score each fold's fitted estimator on the held-out test split.
scores = [
    geometric_mean_score(y_test, fold_model.predict(X_test))
    for fold_model in cv_results['estimator']
]

print(f"G-mean CV (test): {np.mean(scores):.3f} (+/-{np.std(scores):.3f})")

G-mean CV (test): 0.891 (+/-0.005)


Classification report  

In [69]:
# Refit a fresh LGBM model on the full resampled training set, then predict the test split.
final_clf = LGBMClassifier(random_state=42)
final_clf.fit(X_eq, y_eq)
y_pred = final_clf.predict(X_test)

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.87      0.94      0.93      0.90      0.81      1162
          1       0.43      0.94      0.87      0.59      0.90      0.83       125

avg / total       0.94      0.87      0.94      0.89      0.90      0.81      1287



## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
