[Reference](https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/)

**Parts**
1. Random Resampling Imbalanced Datasets
2. Imbalanced-Learn Library
3. Random Oversampling Imbalanced Datasets
4. Random Undersampling Imbalanced Datasets
5. Combining Random Oversampling and Undersampling

[Random resampling]

Random Oversampling: Randomly duplicate examples in the minority class.

Random Undersampling: Randomly delete examples in the majority class.

[Advantages]

Both techniques are referred to as “naive resampling” methods because they assume nothing about the data and use no heuristics.

This makes them simple to implement and fast to execute, which is desirable for very large and complex datasets.

### Resampling - Oversampling

In [58]:
#python -m pip install imbalanced-learn
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

In [59]:
# Load the breast-cancer dataset that ships with scikit-learn and pull out
# the feature matrix (X) and the class labels (target).
data_cancer = load_breast_cancer()
X, target = data_cancer['data'], data_cancer['target']

# Show the array shapes, then how many samples carry label 1 and label 0.
print("[*]data: ", X.shape)
print("[*]target: ", target.shape)
print(">>1: ", (target == 1).sum())
print(">>0: ", (target == 0).sum())

[*]data:  (569, 30)
[*]target:  (569,)
>>1:  357
>>0:  212


In [60]:
# Turn the dataset into an imbalanced one: keep only the first 30 class-0
# samples and drop every remaining class-0 row from features and labels.
print("[*]Delete part of 0 data to make imbalanced data")
zero_idx = np.where(target == 0)
# Row indices of the class-0 samples beyond the first 30 (these get removed).
zero_idx_deleted = list(zero_idx[0][30:])
new_X, new_target = (
    np.delete(X, zero_idx_deleted, axis=0),
    np.delete(target, zero_idx_deleted),
)

# Report the reduced shapes and the per-class counts.
print("[*]data: ", new_X.shape)
print("[*]target: ", new_target.shape)
print(">>1: ", (new_target == 1).sum())
print(">>0: ", (new_target == 0).sum())

[*]Delete part of 0 data to make imbalanced data
[*]data:  (387, 30)
[*]target:  (387,)
>>1:  357
>>0:  30


In [61]:
from imblearn import over_sampling


# Randomly oversample with the 'minority' strategy and resample the
# imbalanced breast-cancer subset built above.
oversample = over_sampling.RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(new_X, new_target)

# Report the resampled shapes and the per-class counts.
print("[*]After oversampling strategy 'minority'")
print("X_over: ", X_over.shape)
print("y_over: ", y_over.shape)
print(">>1: ", (y_over == 1).sum())
print(">>0: ", (y_over == 0).sum())

[*]After oversampling strategy 'minority'
X_over:  (714, 30)
y_over:  (714,)
>>1:  357
>>0:  357


In [62]:
# Oversample with a float strategy instead of 'minority'. Per the
# imbalanced-learn docs a float is the desired minority/majority ratio after
# resampling, so 0.5 grows the minority class to about half the majority.
oversample = over_sampling.RandomOverSampler(sampling_strategy=0.5)
X_over, y_over = oversample.fit_resample(new_X, new_target)

# The banner names the strategy actually used in this cell (it previously
# repeated the 'minority' label from the cell above).
print("[*]After oversampling strategy 0.5")
print("X_over: ", X_over.shape)
print("y_over: ", y_over.shape)
print(">>1: ", sum(y_over==1))
print(">>0: ", sum(y_over==0))

[*]After oversampling strategy 'minority'
X_over:  (535, 30)
y_over:  (535,)
>>1:  357
>>0:  178


In [53]:
# example of random oversampling to balance the class distribution
# (make_classification was imported twice in this cell; one import suffices)
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler


# define a synthetic binary dataset with a heavily skewed class distribution
# (weights=[0.99]; the recorded run shows 9900 vs 100 samples)
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)
print("[*]dataset")
print(">>X: ", X.shape)
print(">>y: ")

# summarize class distribution before resampling
print(Counter(y))

# define oversampling strategy: resample only the minority class
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
print("[*]dataset - after fit_resample")
X_over, y_over = oversample.fit_resample(X, y)

# summarize class distribution after resampling
print(">>y: ")
print(Counter(y_over))

[*]dataset
>>X:  (10000, 20)
>>y: 
Counter({0: 9900, 1: 100})
[*]dataset - after fit_resample
>>y: 
Counter({0: 9900, 1: 9900})


In [54]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline


# Pipeline: random oversampling followed by a decision tree classifier.
steps = [
    ('over', RandomOverSampler()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# Evaluate with stratified 10-fold cross-validation repeated 3 times,
# scoring each fold with 'f1_micro' and averaging the fold scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring='f1_micro', cv=cv, n_jobs=-1
)
score = scores.mean()
print('F1 Score: %.3f' % score)

F1 Score: 0.988


### Resampling - Undersampling

In [63]:
# Rebuild the imbalanced breast-cancer subset for the undersampling demo:
# keep only the first 30 class-0 samples and drop the other class-0 rows.
#
# The features are read from data_cancer rather than from X, because X is
# rebound to the synthetic make_classification data in the oversampling
# section. Running the notebook top to bottom would otherwise delete rows
# from the wrong dataset.
print("[*]Delete part of 0 data to make imbalanced data")
zero_idx = np.where(target==0)
zero_idx_deleted = list(zero_idx[0])[30:]
new_X = np.delete(data_cancer['data'], zero_idx_deleted, axis=0)
new_target = np.delete(target, zero_idx_deleted)

# Report the reduced shapes and the per-class counts.
print("[*]data: ", new_X.shape)
print("[*]target: ", new_target.shape)
print(">>1: ", sum(new_target==1))
print(">>0: ", sum(new_target==0))

[*]Delete part of 0 data to make imbalanced data
[*]data:  (387, 30)
[*]target:  (387,)
>>1:  357
>>0:  30


In [66]:
from imblearn import under_sampling


# Randomly undersample with the 'majority' strategy and resample the
# imbalanced breast-cancer subset.
# NOTE: the variable names and printed labels keep the "over" prefix used by
# the oversampling cells, although this cell performs undersampling.
oversample = under_sampling.RandomUnderSampler(sampling_strategy='majority')
X_over, y_over = oversample.fit_resample(new_X, new_target)

# Report the resampled shapes and the per-class counts.
print("[*]After undersampling strategy 'majority'")
print("X_over: ", X_over.shape)
print("y_over: ", y_over.shape)
print(">>1: ", (y_over == 1).sum())
print(">>0: ", (y_over == 0).sum())

[*]After undersampling strategy 'majority'
X_over:  (60, 30)
y_over:  (60,)
>>1:  30
>>0:  30


In [69]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler


# Synthetic binary dataset with a heavily skewed class distribution
# (weights=[0.99]).
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)
print("[*]dataset")
print(">>X: ", X.shape)

# Class distribution before resampling.
print(">>y: ")
print(Counter(y))

# Undersampling strategy: resample only the majority class.
undersample = RandomUnderSampler(sampling_strategy='majority')

# Fit the sampler and apply it in one step.
# NOTE: the results keep the X_over / y_over names used by the oversampling
# cells, although they hold undersampled data here.
X_over, y_over = undersample.fit_resample(X, y)

# Class distribution after resampling.
print("[*]dataset - after fit_resample")
print(">>y: ")
print(Counter(y_over))

[*]dataset
>>X:  (10000, 20)
>>y: 
Counter({0: 9900, 1: 100})
[*]dataset - after fit_resample
>>y: 
Counter({0: 100, 1: 100})


In [70]:
# example of evaluating a decision tree with random undersampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler


# Fresh synthetic dataset with a heavily skewed class distribution.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# Pipeline: random undersampling followed by a decision tree classifier.
steps = [
    ('under', RandomUnderSampler()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# Evaluate with stratified 10-fold cross-validation repeated 3 times,
# scoring each fold with 'f1_micro' and averaging the fold scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring='f1_micro', cv=cv, n_jobs=-1
)
score = scores.mean()
print('F1 Score: %.3f' % score)

F1 Score: 0.870


### Oversampling + Undersampling

In [71]:
# Fresh synthetic dataset with a heavily skewed class distribution.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# Pipeline: oversample first (sampling_strategy=0.1), then undersample
# (sampling_strategy=0.5), then fit a decision tree on the resampled data.
over = RandomOverSampler(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
    ('m', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# Evaluate with stratified 10-fold cross-validation repeated 3 times,
# scoring each fold with 'f1_micro' and averaging the fold scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring='f1_micro', cv=cv, n_jobs=-1
)
score = scores.mean()
print('F1 Score: %.3f' % score)

F1 Score: 0.993
