In [None]:
import lib._util.visualplot as vp

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [None]:
def custom_blobs(n_samples, n_classes=2, n_features=2, center_box=(-10.0, 10.0), weights=None, random_state=None):
    """Generate a Gaussian-blob classification dataset, optionally with imbalanced classes.

    Parameters
    ----------
    n_samples : int
        Target size of the imbalanced dataset when ``weights`` is given.
        NOTE: when ``weights`` is None the balanced pool is returned unchanged,
        i.e. with ``n_samples * 2`` rows (historical behaviour, kept so existing
        callers see the same data).
    n_classes : int, default 2
        Number of blob centers / class labels (``0 .. n_classes - 1``).
    n_features : int, default 2
        Number of feature columns.
    center_box : tuple of float, default (-10.0, 10.0)
        Forwarded to ``make_blobs``.
    weights : sequence of float or None, default None
        Desired fraction of ``n_samples`` for each class label, in label order.
        Must have ``n_classes`` entries and sum to 1 (within float tolerance).
    random_state : int or None, default None
        Forwarded to ``make_blobs``.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns ``feature_1 .. feature_<n_features>``.
    y : pandas.Series
        Class labels, named ``target``.
        With ``weights`` the rows are grouped by class label, and each class count
        is ``round(n_samples * weight)``; because every class is rounded
        independently, the total can differ slightly from ``n_samples``.

    Raises
    ------
    AssertionError
        If ``weights`` does not sum to 1 or its length differs from ``n_classes``.
    """
    if weights is not None:
        # Compare with a tolerance: valid float weights such as [.7, .2, .1] sum to
        # 0.9999999999999999, which an exact `== 1` check would reject.
        assert np.isclose(np.sum(weights), 1), 'sum of weights must be 1.'
        assert len(weights) == n_classes, 'no. of weights element must be same as n_classes.'

    # Number of examples wanted per class label (stays empty for the balanced case).
    # Reference: https://machinelearningmastery.com/how-to-develop-an-intuition-skewed-class-distributions/
    proportions = {}
    if weights is not None:
        proportions = {label: int(np.round(n_samples * weights[label])) for label in range(n_classes)}

    # make_blobs splits an integer n_samples evenly across the centers, so the pool
    # must hold at least `max(proportions)` examples per class. The original fixed
    # pool of `n_samples * 2` silently truncated any class weighted above
    # 2 / n_classes; it is still used whenever it is already large enough.
    n_generated = n_samples * 2
    if proportions:
        n_generated = max(n_generated, n_classes * max(proportions.values()))

    # Generate the balanced pool.
    X, y = make_blobs(n_samples=n_generated, centers=n_classes, n_features=n_features,
                      center_box=center_box, random_state=random_state)

    if weights is not None:
        # Keep the first `count` generated examples of every class, grouped by label.
        keep = np.concatenate([np.flatnonzero(y == label)[:count]
                               for label, count in proportions.items()])
        X, y = X[keep], y[keep]

    data_df = to_frame(X, y)
    return data_df.drop(columns=['target']).copy(), data_df['target'].copy()

def to_frame(X, y):
    """Combine features and labels into a single DataFrame.

    The feature columns are named ``feature_1 .. feature_<n>`` (``n`` taken from
    ``X.shape[1]``) and the labels are stored in a ``target`` column.
    """
    n_columns = X.shape[1]
    feature_names = [f'feature_{position}' for position in range(1, n_columns + 1)]

    frame = pd.DataFrame(X, columns=feature_names)
    frame['target'] = y
    return frame

In [None]:
# Build a 3-class, 2-feature blob dataset with requested class weights of 75% / 15% / 10%.
X, y = custom_blobs(n_samples=1000, n_classes=3, n_features=2,
                    center_box=(-10, 5), weights=[.75, .15, .1], random_state=0)
# Features and target recombined into one frame (used by the scatter plot below).
data_df = to_frame(X, y)

# Shapes of the features and the target.
X.shape, y.shape

In [None]:
# Class balance of the original target: relative frequencies, then absolute counts.
print(y.value_counts(normalize=True), y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

# 1. Under-Sampling

### 1.1. Condensed Nearest Neighbour

In [None]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [None]:
# Under-sample X, y with CondensedNearestNeighbour (seeded via random_state=0).
sampling = CondensedNearestNeighbour(n_jobs=-1, random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.2. Edited Nearest Neighbours

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Under-sample X, y with EditedNearestNeighbours.
sampling = EditedNearestNeighbours(n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.3. Repeated Edited Nearest Neighbours

In [None]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [None]:
# Under-sample X, y with RepeatedEditedNearestNeighbours.
sampling = RepeatedEditedNearestNeighbours(n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.4. All KNN

In [None]:
from imblearn.under_sampling import AllKNN

In [None]:
# Under-sample X, y with AllKNN.
sampling = AllKNN(n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.5. Instance Hardness Threshold

In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Under-sample X, y with InstanceHardnessThreshold, using a default-configured
# RandomForestClassifier as its estimator (sampler seeded via random_state=0).
sampling = InstanceHardnessThreshold(estimator=RandomForestClassifier(), n_jobs=-1, random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.6. Near Miss

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
# Under-sample X, y with NearMiss, variant 1.
sampling = NearMiss(version=1, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

In [None]:
# Under-sample X, y with NearMiss, variant 2.
sampling = NearMiss(version=2, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

In [None]:
# Under-sample X, y with NearMiss, variant 3.
sampling = NearMiss(version=3, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.7. Neighbourhood Cleaning Rule

In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [None]:
# Under-sample X, y with NeighbourhoodCleaningRule.
sampling = NeighbourhoodCleaningRule(n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.8. One Sided Selection

In [None]:
from imblearn.under_sampling import OneSidedSelection

In [None]:
# Under-sample X, y with OneSidedSelection (seeded via random_state=0).
sampling = OneSidedSelection(random_state=0, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.9. Random Under Sampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Under-sample X, y with RandomUnderSampler (seeded via random_state=0).
sampling = RandomUnderSampler(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 1.10. Tomek Links

In [None]:
from imblearn.under_sampling import TomekLinks

In [None]:
# Under-sample X, y with TomekLinks.
sampling = TomekLinks(n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# To inspect the removed samples, select the rows not listed in sample_indices_:
# X[~X.index.isin(sampling.sample_indices_)]
# y[~y.index.isin(sampling.sample_indices_)]

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

# 2. Over-Sampling

### 2.1. Adaptive Synthetic (ADASYN)

In [None]:
from imblearn.over_sampling import ADASYN

In [None]:
# Over-sample X, y with ADASYN (seeded via random_state=0).
# `n_jobs` is intentionally not passed: ADASYN deprecated it in imbalanced-learn 0.10
# and newer releases reject it. It never influenced the resampled data; to parallelise
# the neighbour search, configure n_jobs on a nearest-neighbours estimator passed to
# the sampler instead.
sampling = ADASYN(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 2.2. Borderline SMOTE

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

In [None]:
# Over-sample X, y with BorderlineSMOTE (seeded via random_state=0).
# `n_jobs` is intentionally not passed: BorderlineSMOTE deprecated it in
# imbalanced-learn 0.10 and newer releases reject it. It never influenced the
# resampled data; to parallelise the neighbour search, configure n_jobs on a
# nearest-neighbours estimator passed to the sampler instead.
sampling = BorderlineSMOTE(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 2.3. K-Means SMOTE

In [None]:
from imblearn.over_sampling import KMeansSMOTE

In [None]:
# Over-sample X, y with KMeansSMOTE (seeded via random_state=0).
sampling = KMeansSMOTE(random_state=0, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 2.4. Random Over Sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Over-sample X, y with RandomOverSampler (seeded via random_state=0).
sampling = RandomOverSampler(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 2.5. SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Over-sample X, y with SMOTE (seeded via random_state=0).
# `n_jobs` is intentionally not passed: SMOTE deprecated it in imbalanced-learn 0.10
# and newer releases reject it. It never influenced the resampled data; to parallelise
# the neighbour search, configure n_jobs on a nearest-neighbours estimator passed to
# the sampler instead.
sampling = SMOTE(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 2.6. SVM SMOTE

In [None]:
from imblearn.over_sampling import SVMSMOTE

In [None]:
# Over-sample X, y with SVMSMOTE (seeded via random_state=0).
# `n_jobs` is intentionally not passed: SVMSMOTE deprecated it in imbalanced-learn 0.10
# and newer releases reject it. It never influenced the resampled data; to parallelise
# the neighbour search, configure n_jobs on a nearest-neighbours estimator passed to
# the sampler instead.
sampling = SVMSMOTE(random_state=0)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

# 3. Combination (Over+Under-Sampling)

### 3.1. SMOTE+ENN

In [None]:
from imblearn.combine import SMOTEENN

In [None]:
# Resample X, y with the combined SMOTEENN sampler (seeded via random_state=0).
sampling = SMOTEENN(random_state=0, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

### 3.2. SMOTE+Tomek

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Resample X, y with the combined SMOTETomek sampler (seeded via random_state=0).
sampling = SMOTETomek(random_state=0, n_jobs=-1)
new_X, new_y = sampling.fit_resample(X, y)
# Resampled features and target combined into one frame for plotting.
new_data_df = to_frame(new_X, new_y)

# Shapes after resampling.
new_X.shape, new_y.shape

In [None]:
# Class balance after resampling: relative frequencies, then absolute counts.
print(new_y.value_counts(normalize=True), new_y.value_counts(), sep='\n')

# Scatter plot of feature_1 vs feature_2 with 'target' as the category column.
vp.scatter(
    new_data_df,
    x_col='feature_1',
    y_col='feature_2',
    category_col='target',
    layout_width=500,
    layout_height=350,
)

# 4. Class Weights