In [None]:
import lib._util.visualplot as vp

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

# Feature extraction
from lib._class.DFPCA import DFPCA
from lib._class.DFIvis import DFIvis

In [None]:
import numpy as np
import pandas as pd

# Imbalanced-Learn
from imblearn.pipeline import Pipeline

# Scikit-Learn
from sklearn.datasets import make_blobs

# Plotly
import plotly.express as px

In [None]:
# Directory passed as `out_path` to vp.datagroups_subplots for the comparison figures.
OUT_PATH_GRAPH = 'resources/output/graph/'

def custom_blobs(n_samples, n_features=2, weights=[.5, .5], **kwargs):
    """Generate blobs with a chosen share of samples per blob, as pandas objects.

    Parameters
    ----------
    n_samples : int
        Total number of samples, split across the blobs according to ``weights``.
    n_features : int, default 2
        Forwarded to ``make_blobs``.
    weights : sequence of float or None, default [.5, .5]
        Share of ``n_samples`` assigned to each blob. Must sum to 1 (within
        floating-point tolerance). ``None`` puts every sample in a single blob.
        The default list is only read, never mutated.
    **kwargs
        Forwarded unchanged to ``make_blobs`` (e.g. ``centers``, ``random_state``).

    Returns
    -------
    X : pandas.DataFrame
        Feature columns named ``feature_1`` .. ``feature_k``.
    y : pandas.Series
        Blob labels, named ``target``.

    Raises
    ------
    AssertionError
        If ``weights`` is given and does not sum to 1.
    """
    if weights is not None:
        # Tolerant comparison: an exact `== 1` rejects valid inputs such as
        # [.7, .2, .1], whose floating-point sum is 0.9999999999999999.
        assert np.isclose(np.sum(weights), 1), 'sum of weights must be 1.'
    else:
        weights = [1]

    # Each share is rounded independently, so the per-blob counts are not
    # guaranteed to add up to exactly n_samples.
    blob_sizes = [int(np.round(x * n_samples)) for x in weights]
    X, y = make_blobs(n_samples=blob_sizes,
                      n_features=n_features,
                      **kwargs)

    # Round-trip through to_frame to get named feature columns and a 'target' column.
    data_df = to_frame(X, y)
    X = data_df.drop(columns=['target']).copy()
    y = data_df['target'].copy()

    return X, y

def to_frame(X, y):
    """Bundle a 2-D feature array and its labels into one DataFrame.

    The feature columns are named ``feature_1`` .. ``feature_k`` where
    ``k = X.shape[1]``; the labels are stored in a ``target`` column.
    """
    width = X.shape[1]
    feature_names = [f'feature_{number}' for number in range(1, width + 1)]

    frame = pd.DataFrame(X, columns=feature_names)
    frame['target'] = y

    return frame

def class_ratio(y):
    """Return ``{label: share}`` for a label Series, ordered by label.

    Shares are the normalized value counts of ``y``, rounded to 2 decimals.
    """
    proportions = y.value_counts(normalize=True).to_dict()

    ratios = {}
    for label, share in sorted(proportions.items()):
        ratios[label] = round(share, 2)

    return ratios

In [None]:
# Accumulators for the comparison figure: one trace group and one title per panel.
data_groups    = []
subplot_titles = []

# Generate an imbalanced dataset (99% / 1% class split).
# make_blobs takes the number of features from the supplied `centers`
# (2 columns here), so n_features is set to match; a larger value would be
# silently ignored. For higher-dimensional data, supply wider `centers`.
X, y = custom_blobs(n_samples=10_000,
                    n_features=2,
                    weights=[.99, .01],
                    centers=[[0,0],[1,1]],
                    random_state=0)

X.shape, y.shape

In [None]:
# Class distribution of the generated target.
# vp.value_count is a project helper; presumably it tabulates/plots the 'target' column.
vp.value_count(y.to_frame(), 'target')

In [None]:
# Baseline panel: standardized feature_1 vs feature_2, coloured by class.
# The labels are cast to str so plotly treats `target` as a discrete colour.
# NOTE(review): pd.concat(axis=1) aligns on index, so DFStandardScaler is assumed
# to return a DataFrame indexed like X — confirm.
fig = px.scatter(pd.concat([DFStandardScaler().fit_transform(X), y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

# Keep only the traces and a title; the panels are combined later by vp.datagroups_subplots.
data_groups.append(fig['data'])
subplot_titles.append(f'Original Dataset {class_ratio(y)}')

# 1. Feature Extraction
- Original dataset

### 1.1. PCA

In [None]:
# Standardize the features, then project them onto 2 principal components.
# NOTE(review): the plotting cell reads columns 'pca_0' / 'pca_1', so DFPCA is assumed
# to return a DataFrame with those names — confirm in lib._class.DFPCA.
# NOTE(review): X has as many features as `centers` has columns (2), so a
# 2-component PCA rotates the data rather than reducing it — confirm this is intended.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('pca', DFPCA(n_components=2,
                  rescale_with_mean=False, rescale_with_std=False)),
]
new_X = Pipeline(steps, verbose=True).fit_transform(X)

new_X.shape

In [None]:
# PCA panel: the two components coloured by the original labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X is assumed to keep y's index — confirm.
fig = px.scatter(pd.concat([new_X, y.astype(str)], axis=1).sort_values(by='target'),
                 x='pca_0', y='pca_1', color='target')

# Store the traces and the panel title (with the class ratio of y).
data_groups.append(fig['data'])
subplot_titles.append(f'PCA {class_ratio(y)}')

### 1.2. Ivis

In [None]:
# Scale the features and embed them in 2 dimensions with Ivis.
# y is passed to fit_transform so the labels are available to the 'ivis' step
# (supervision_weight / supervision_metric are set; presumably forwarded to ivis).
# NOTE(review): min-max scaling after standardization is an affine rescale per feature,
# so 'standard_scaler' looks redundant here if DFMinMaxScaler uses default settings — confirm.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(X, y)

new_X.shape

In [None]:
# Ivis panel: the two embedding dimensions coloured by the original labels.
# NOTE(review): relies on DFIvis emitting columns 'ivis_0' / 'ivis_1' and keeping y's index — confirm.
fig = px.scatter(pd.concat([new_X, y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and the panel title (with the class ratio of y).
data_groups.append(fig['data'])
subplot_titles.append(f'Ivis {class_ratio(y)}')

### 1.3. PCA + Ivis

In [None]:
# Chain both extractors: standardize -> PCA (2 components) -> min-max scale -> Ivis (2 dims).
# y is passed to fit_transform so the labels are available to the 'ivis' step.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('pca', DFPCA(n_components=2,
                  rescale_with_mean=False, rescale_with_std=False)),
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(X, y)

new_X.shape

In [None]:
# PCA + Ivis panel: the final Ivis embedding coloured by the original labels.
# NOTE(review): relies on the pipeline output having columns 'ivis_0' / 'ivis_1' and y's index — confirm.
fig = px.scatter(pd.concat([new_X, y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and the panel title (with the class ratio of y).
data_groups.append(fig['data'])
subplot_titles.append(f'PCA + Ivis {class_ratio(y)}')

In [None]:
# Scatter plot
# Combine every panel collected in data_groups into one figure, at most 3 per row,
# titled with the matching entries of subplot_titles.
# out_path is presumably where the project helper saves the figure — confirm in lib._util.visualplot.
vp.datagroups_subplots(
    data_groups,
    max_col=3,
    title='Original Dataset',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .05
    },
    layout_kwargs={'height': 1500})

# 2. Feature Extraction
- Balanced dataset

In [None]:
# Start a fresh set of panels for the re-sampling comparison.
data_groups    = []
subplot_titles = []

# First panel is again the un-resampled data (standardized feature_1 vs feature_2),
# so the resampled variants can be compared against it.
fig = px.scatter(pd.concat([DFStandardScaler().fit_transform(X), y.astype(str)], axis=1).sort_values(by='target'),
                 x='feature_1', y='feature_2', color='target')

data_groups.append(fig['data'])
subplot_titles.append(f'Original Dataset {class_ratio(y)}')

### 2.1. SMOTE + ENN + Ivis

In [None]:
from imblearn.combine import SMOTEENN

In [None]:
# Stage 1: standardize, then rebalance with SMOTEENN
# (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning).
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('smote_enn', SMOTEENN(random_state=0, n_jobs=-1)),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after SMOTE + ENN (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for SMOTE + ENN: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'SMOTE + ENN + Ivis {class_ratio(new_y)}')

### 2.2. SMOTE + Tomek + Ivis

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Stage 1: standardize, then rebalance with SMOTETomek
# (SMOTE over-sampling followed by Tomek-link removal).
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('smote_tomek', SMOTETomek(random_state=0, n_jobs=-1)),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after SMOTE + Tomek (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for SMOTE + Tomek: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'SMOTE + Tomek + Ivis {class_ratio(new_y)}')

### 2.3. ADASYN + ENN + Ivis

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Stage 1: standardize, over-sample with ADASYN, then clean with Edited Nearest Neighbours.
# sampling_strategy='all' makes the ENN step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
    ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after ADASYN + ENN (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for ADASYN + ENN: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'ADASYN + ENN + Ivis {class_ratio(new_y)}')

### 2.4. ADASYN + Tomek + Ivis

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks

In [None]:
# Stage 1: standardize, over-sample with ADASYN, then remove Tomek links.
# sampling_strategy='all' makes the Tomek step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after ADASYN + Tomek (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for ADASYN + Tomek: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'ADASYN + Tomek + Ivis {class_ratio(new_y)}')

### 2.5. Borderline SMOTE + ENN + Ivis

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Stage 1: standardize, over-sample with Borderline SMOTE, then clean with Edited Nearest Neighbours.
# sampling_strategy='all' makes the ENN step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after Borderline SMOTE + ENN (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for Borderline SMOTE + ENN: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'Borderline SMOTE + ENN + Ivis {class_ratio(new_y)}')

### 2.6. Borderline SMOTE + Tomek + Ivis

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Stage 1: standardize, over-sample with Borderline SMOTE, then remove Tomek links.
# sampling_strategy='all' makes the Tomek step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after Borderline SMOTE + Tomek (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for Borderline SMOTE + Tomek: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'Borderline SMOTE + Tomek + Ivis {class_ratio(new_y)}')

### 2.7. K-Means SMOTE + ENN + Ivis

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Stage 1: standardize, over-sample with K-Means SMOTE, then clean with Edited Nearest Neighbours.
# sampling_strategy='all' makes the ENN step consider every class, not only the majority.
# NOTE(review): cluster_balance_threshold=.05 is set explicitly, presumably because the
# minority class is only ~1% of the data — confirm the intent.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('kmeans_smote', KMeansSMOTE(random_state=0, n_jobs=-1, cluster_balance_threshold=.05)),
    ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after K-Means SMOTE + ENN (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for K-Means SMOTE + ENN: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'K-Means SMOTE + ENN + Ivis {class_ratio(new_y)}')

### 2.8. K-Means SMOTE + Tomek + Ivis

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Stage 1: standardize, over-sample with K-Means SMOTE, then remove Tomek links.
# sampling_strategy='all' makes the Tomek step consider every class, not only the majority.
# NOTE(review): cluster_balance_threshold=.05 is set explicitly, presumably because the
# minority class is only ~1% of the data — confirm the intent.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('kmeans_smote', KMeansSMOTE(random_state=0, n_jobs=-1, cluster_balance_threshold=.05)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after K-Means SMOTE + Tomek (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for K-Means SMOTE + Tomek: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'K-Means SMOTE + Tomek + Ivis {class_ratio(new_y)}')

### 2.9. SVM SMOTE + ENN + Ivis

In [None]:
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [None]:
# Stage 1: standardize, over-sample with SVM SMOTE, then clean with Edited Nearest Neighbours.
# sampling_strategy='all' makes the ENN step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('svm_smote', SVMSMOTE(random_state=0, n_jobs=-1)),
    ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after SVM SMOTE + ENN (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for SVM SMOTE + ENN: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'SVM SMOTE + ENN + Ivis {class_ratio(new_y)}')

### 2.10. SVM SMOTE + Tomek + Ivis

In [None]:
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
# Stage 1: standardize, over-sample with SVM SMOTE, then remove Tomek links.
# sampling_strategy='all' makes the Tomek step consider every class, not only the majority.
steps = [
    ('standard_scaler', DFStandardScaler()),
    ('svm_smote', SVMSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
]
new_X, new_y = Pipeline(steps, verbose=True).fit_resample(X, y)
print(f'Balanced: {new_X.shape, new_y.shape}')

# Stage 2: min-max scale the resampled features and embed them in 2 dimensions with Ivis,
# passing the resampled labels. `steps` and `new_X` are rebound here: after this cell
# new_X holds the embedding, not the resampled features.
steps = [
    ('minmax_scaler', DFMinMaxScaler()),
    ('ivis', DFIvis(embedding_dims=2,
                    epochs=500, verbose=2,
                    supervision_weight=.5, supervision_metric='binary_crossentropy',
                    k=15, n_epochs_without_progress=15, model='szubert')),
]
new_X = Pipeline(steps, verbose=True).fit_transform(new_X, new_y)

new_X.shape

In [None]:
# Class distribution after SVM SMOTE + Tomek (vp.value_count is a project helper; output format not visible here).
vp.value_count(new_y.to_frame(), 'target')

In [None]:
# Panel for SVM SMOTE + Tomek: Ivis embedding coloured by the resampled labels (cast to str for discrete colours).
# NOTE(review): pd.concat(axis=1) aligns on index; new_X and new_y are assumed to share one — confirm DFIvis keeps it.
fig = px.scatter(pd.concat([new_X, new_y.astype(str)], axis=1).sort_values(by='target'),
                 x='ivis_0', y='ivis_1', color='target')

# Store the traces and a title that includes the post-resampling class ratio.
data_groups.append(fig['data'])
subplot_titles.append(f'SVM SMOTE + Tomek + Ivis {class_ratio(new_y)}')

In [None]:
# Scatter plot
# Combine the original panel and every re-sampling variant collected in data_groups
# into one figure, at most 3 per row, titled with the matching entries of subplot_titles.
# out_path is presumably where the project helper saves the figure — confirm in lib._util.visualplot.
vp.datagroups_subplots(
    data_groups,
    max_col=3,
    title='Re-Sampling Dataset',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .08
    },
    layout_kwargs={'height': 1250})