In [None]:
import lib._util.visualplot as vp

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler

In [None]:
import numpy as np
import pandas as pd

# Imbalanced-Learn
from imblearn.pipeline import Pipeline

# Scikit-Learn
from sklearn.datasets import make_blobs

# Plotly
import plotly.express as px

In [None]:
# Directory handed to vp.datagroups_subplots as out_path by the three summary-figure cells.
OUT_PATH_GRAPH = 'resources/output/graph/'

def custom_blobs(n_samples, n_classes=2, n_features=2, center_box=(-10.0, 10.0), weights=None, random_state=None):
    """Generate a blob dataset, optionally thinned to a requested class balance.

    ``make_blobs`` is asked for ``n_samples * n_classes`` points around
    ``n_classes`` centres. Without ``weights`` that full draw is returned.
    With ``weights``, class ``k`` keeps only its first
    ``round(n_samples * weights[k])`` rows, so the result holds roughly
    ``n_samples`` rows in total, grouped by class in ascending label order
    and re-indexed from 0.

    Parameters
    ----------
    n_samples : int
        Size multiplier described above.
    n_classes : int, default 2
        Number of centres (classes) passed to ``make_blobs``.
    n_features : int, default 2
        Number of feature columns passed to ``make_blobs``.
    center_box : tuple of float, default (-10.0, 10.0)
        Forwarded unchanged to ``make_blobs``.
    weights : sequence of float or None, default None
        One fraction per class; must sum to 1 (within floating-point
        tolerance) and have ``n_classes`` entries.
    random_state : int or None, default None
        Forwarded unchanged to ``make_blobs``.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns named ``feature_1`` .. ``feature_<n_features>``.
    y : pandas.Series
        Class labels, named ``target``.

    Raises
    ------
    AssertionError
        If ``weights`` does not sum to 1 or its length differs from ``n_classes``.
    """
    if weights is not None:
        # Compare with a tolerance: exact equality rejects valid inputs such as
        # [.7, .2, .1], whose floating-point sum is 0.9999999999999999.
        assert np.isclose(np.sum(weights), 1.0), 'sum of weights must be 1.'
        assert len(weights) == n_classes, 'no. of weights element must be same as n_classes.'

    # Generate balanced dataset
    X, y = make_blobs(n_samples=n_samples * n_classes, centers=n_classes, n_features=n_features,
                      center_box=center_box, random_state=random_state)
    data_df = to_frame(X, y)

    if weights is not None:
        # Generate imbalance dataset
        # Reference: https://machinelearningmastery.com/how-to-develop-an-intuition-skewed-class-distributions/
        labels = data_df['target'].to_numpy()
        # Positions of the first `quota` rows of each class, classes in ascending order.
        kept_rows = np.concatenate([
            np.flatnonzero(labels == label)[:int(np.round(n_samples * weights[label]))]
            for label in range(n_classes)
        ])
        # Fresh 0..n-1 index, matching what rebuilding the frame from stacked arrays gave.
        data_df = data_df.iloc[kept_rows].reset_index(drop=True)

    X = data_df.drop(columns=['target']).copy()
    y = data_df['target'].copy()

    return X, y

def to_frame(X, y):
    """Wrap a feature matrix and its labels in a single DataFrame.

    The feature columns are named ``feature_1`` .. ``feature_<n>`` (``n`` being
    ``X.shape[1]``) and the labels are stored in a final ``target`` column.
    """
    n_columns = X.shape[1]
    feature_names = ['feature_' + str(position) for position in range(1, n_columns + 1)]

    frame = pd.DataFrame(X, columns=feature_names)
    frame['target'] = y
    return frame

def class_ratio(y):
    """Return each label's share of ``y`` as a dict, rounded to 2 decimals and ordered by label."""
    shares = y.value_counts(normalize=True).to_dict()

    ratios = {}
    for label in sorted(shares):
        ratios[label] = round(shares[label], 2)
    return ratios

In [None]:
# Accumulators for the comparison figure: one trace group and one title per panel.
data_groups    = []
subplot_titles = []

# Two-class, two-feature blob data with requested class weights 0.9 / 0.1 (seeded).
X, y = custom_blobs(n_samples=1000, n_classes=2, n_features=2,
                    center_box=(-10, 5), weights=[.9, .1], random_state=0)

# Cell output: shapes of the features and labels.
X.shape, y.shape

In [None]:
# Summarise the 'target' column of the original labels with the project's vp.value_count helper.
vp.value_count(y.to_frame(), 'target')

In [None]:
# Scatter of the original data, scaled with DFStandardScaler like the pipelines' first step;
# labels are cast to str and rows are ordered by 'target'.
fig = px.scatter(pd.concat([DFStandardScaler().fit_transform(X), y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Original Dataset {class_ratio(y)}')

# 1. Under-Sampling

### 1.1. Condensed Nearest Neighbour

In [None]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = CondensedNearestNeighbour(n_jobs=-1, random_state=0)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Condensed Nearest Neighbour {class_ratio(new_y)}')

In [None]:
# Cell output: the panel titles collected so far.
subplot_titles

### 1.2. Edited Nearest Neighbour

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = EditedNearestNeighbours(n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Edited Nearest Neighbours {class_ratio(new_y)}')

### 1.3. Repeated Edited Nearest Neighbour

In [None]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = RepeatedEditedNearestNeighbours(n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Repeated Edited Nearest Neighbours {class_ratio(new_y)}')

### 1.4. All KNN

In [None]:
from imblearn.under_sampling import AllKNN

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = AllKNN(n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'All KNN {class_ratio(new_y)}')

### 1.5. Instance Hardness Threshold

In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
# The forest is seeded explicitly as well: whether InstanceHardnessThreshold forwards its
# own random_state to a user-supplied estimator may depend on the installed
# imbalanced-learn version, and an unseeded forest can change which samples are kept
# from one run to the next.
model = InstanceHardnessThreshold(estimator=RandomForestClassifier(random_state=0), n_jobs=-1, random_state=0)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Instance Hardness Threshold {class_ratio(new_y)}')

### 1.6. Near Miss - V1

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
# Sampler for this section (NearMiss variant 1); it becomes the final step of the pipeline below.
model = NearMiss(version=1, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Near Miss - V1 {class_ratio(new_y)}')

### 1.6. Near Miss - V2

In [None]:
# Sampler for this section (NearMiss variant 2); it becomes the final step of the pipeline below.
model = NearMiss(version=2, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Near Miss - V2 {class_ratio(new_y)}')

### 1.6. Near Miss - V3

In [None]:
# Sampler for this section (NearMiss variant 3); it becomes the final step of the pipeline below.
model = NearMiss(version=3, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Near Miss - V3 {class_ratio(new_y)}')

### 1.7. Neighbourhood Cleaning Rule

In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = NeighbourhoodCleaningRule(n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Neighbourhood Cleaning Rule {class_ratio(new_y)}')

### 1.8. One Sided Selection

In [None]:
from imblearn.under_sampling import OneSidedSelection

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = OneSidedSelection(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'One Sided Selection {class_ratio(new_y)}')

### 1.9. Random Under Sampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = RandomUnderSampler(random_state=0)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Random Under Sampler {class_ratio(new_y)}')

### 1.10. Tomek Links

In [None]:
from imblearn.under_sampling import TomekLinks

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = TomekLinks(n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Removed samples
# X[X.index.isin(model.sample_indices_) == False]
# y[y.index.isin(model.sample_indices_) == False]

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Tomek Links {class_ratio(new_y)}')

In [None]:
# Scatter plot
# Hand every collected trace group and title to the project's vp.datagroups_subplots helper.
# NOTE(review): presumably it lays the panels out max_col per row and writes the figure
# under out_path - confirm in lib._util.visualplot.
vp.datagroups_subplots(
    data_groups,
    max_col=3,
    title='Under-Sampling',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .05
    },
    layout_kwargs={'height': 1500})

# 2. Over-Sampling

In [None]:
# Start a fresh set of panels for the over-sampling section.
data_groups    = []
subplot_titles = []

# First panel: the original data, scaled with DFStandardScaler like the pipelines' first step.
fig = px.scatter(pd.concat([DFStandardScaler().fit_transform(X), y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Original Dataset {class_ratio(y)}')

### 2.1. Adaptive Synthetic (ADASYN)

In [None]:
from imblearn.over_sampling import ADASYN

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = ADASYN(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'ADASYN {class_ratio(new_y)}')

### 2.2. Borderline SMOTE

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = BorderlineSMOTE(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Borderline SMOTE {class_ratio(new_y)}')

### 2.3. K-Means SMOTE

In [None]:
from imblearn.over_sampling import KMeansSMOTE

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = KMeansSMOTE(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'K-Means SMOTE {class_ratio(new_y)}')

### 2.4. Random Over Sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = RandomOverSampler(random_state=0)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Random Over Sampler {class_ratio(new_y)}')

### 2.5. SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = SMOTE(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SMOTE {class_ratio(new_y)}')

### 2.6. SVM SMOTE

In [None]:
from imblearn.over_sampling import SVMSMOTE

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = SVMSMOTE(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SVM SMOTE {class_ratio(new_y)}')

In [None]:
# Scatter plot
# Hand every collected trace group and title to the project's vp.datagroups_subplots helper.
# NOTE(review): presumably it lays the panels out max_col per row and writes the figure
# under out_path - confirm in lib._util.visualplot.
vp.datagroups_subplots(
    data_groups,
    max_col=3,
    title='Over-Sampling',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .09
    },
    layout_kwargs={'height': 950})

# 3. Combination (Over+Under-Sampling)

In [None]:
# Start a fresh set of panels for the combination-sampling section.
data_groups    = []
subplot_titles = []

# First panel: the original data, scaled with DFStandardScaler like the pipelines' first step.
fig = px.scatter(pd.concat([DFStandardScaler().fit_transform(X), y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Original Dataset {class_ratio(y)}')

### 3.1. SMOTE+ENN

In [None]:
from imblearn.combine import SMOTEENN

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = SMOTEENN(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SMOTE+ENN {class_ratio(new_y)}')

### 3.2. SMOTE+Tomek

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Sampler for this section; it becomes the final step of the pipeline below.
model = SMOTETomek(random_state=0, n_jobs=-1)

# Pipeline order: scale the features, then resample.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('model', model),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SMOTE+Tomek {class_ratio(new_y)}')

### 3.3. ADASYN+ENN

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Pipeline order: scale the features, over-sample with ADASYN, then clean with
# EditedNearestNeighbours (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('ADASYN', ADASYN(random_state=0, n_jobs=-1)),
    ('ENN', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'ADASYN+ENN {class_ratio(new_y)}')

### 3.4. ADASYN+Tomek

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks

In [None]:
# Pipeline order: scale the features, over-sample with ADASYN, then clean with
# TomekLinks (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('ADASYN', ADASYN(random_state=0, n_jobs=-1)),
    ('Tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'ADASYN+Tomek {class_ratio(new_y)}')

### 3.5. Borderline SMOTE+ENN

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Pipeline order: scale the features, over-sample with BorderlineSMOTE, then clean with
# EditedNearestNeighbours (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('BorderlineSMOTE', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('ENN', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Borderline SMOTE+ENN {class_ratio(new_y)}')

### 3.6. Borderline SMOTE+Tomek

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Pipeline order: scale the features, over-sample with BorderlineSMOTE, then clean with
# TomekLinks (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('BorderlineSMOTE', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('Tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'Borderline SMOTE+Tomek {class_ratio(new_y)}')

### 3.7. K-Means SMOTE+ENN

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Pipeline order: scale the features, over-sample with KMeansSMOTE, then clean with
# EditedNearestNeighbours (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('KMeansSMOTE', KMeansSMOTE(random_state=0, n_jobs=-1)),
    ('ENN', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'K-Means SMOTE+ENN {class_ratio(new_y)}')

### 3.8. K-Means SMOTE+Tomek

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Pipeline order: scale the features, over-sample with KMeansSMOTE, then clean with
# TomekLinks (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('KMeansSMOTE', KMeansSMOTE(random_state=0, n_jobs=-1)),
    ('Tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'K-Means SMOTE+Tomek {class_ratio(new_y)}')

### 3.9. SVM SMOTE+ENN

In [None]:
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Pipeline order: scale the features, over-sample with SVMSMOTE, then clean with
# EditedNearestNeighbours (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('SVMSMOTE', SVMSMOTE(random_state=0, n_jobs=-1)),
    ('ENN', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SVM SMOTE+ENN {class_ratio(new_y)}')

### 3.10. SVM SMOTE+Tomek

In [None]:
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Pipeline order: scale the features, over-sample with SVMSMOTE, then clean with
# TomekLinks (configured with sampling_strategy='all').
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('SVMSMOTE', SVMSMOTE(random_state=0, n_jobs=-1)),
    ('Tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)

# Cell output: shapes of the resampled features and labels.
new_X.shape, new_y.shape

In [None]:
# Summarise the 'target' column of the resampled labels with the project's vp.value_count helper.
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Scatter of the resampled points coloured by class (labels cast to str, rows ordered by 'target').
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Collect this panel's traces and its title (name + class ratios) for the summary figure.
data_groups.append(fig['data'])
subplot_titles.append(f'SVM SMOTE+Tomek {class_ratio(new_y)}')

In [None]:
# Scatter plot
# Hand every collected trace group and title to the project's vp.datagroups_subplots helper.
# NOTE(review): presumably it lays the panels out max_col per row and writes the figure
# under out_path - confirm in lib._util.visualplot.
vp.datagroups_subplots(
    data_groups,
    max_col=3,
    title='Combination-Sampling',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .07
    },
    layout_kwargs={'height': 1250})