### Exploring the impact of clustering on the quality of SMOTE preprocessing. Comparative analysis
##### Maksym Malicki, Jacek Glapiński
###### Wrocław University of Technology
In this notebook we present a comparative analysis of the impact of clustering using various methods on the quality of SMOTE preprocessing.

#### load_dataset()
This method allows us to load datasets listed in the paper.

In [3]:
import numpy as np

def load_dataset(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('@'):
                continue
            line_data = line.strip().split(',')
            sample_class = line_data[-1].strip().lower().replace(" ", "")
            label = 1 if sample_class == 'positive' else 0
            converted_data = []
            for x in line_data[:-1]:
                try:
                    converted_data.append(float(x))
                except ValueError:
                    converted_data.append(ord(x))
            data.append(converted_data)
            labels.append(label)
    X = np.array(data)
    y = np.array(labels)

    return X, y

#### Clustering with SMOTE

In [4]:
from sklearn.cluster import KMeans, MeanShift

def oversample_clustered_data(X_minority, y_minority, minority_indices, cluster_labeled_data, X, y):
    imbalance_ratios = []
    cluster_labels = np.unique(cluster_labeled_data)
    for cluster in cluster_labels:
        cluster_samples_indices = np.where(cluster_labeled_data == cluster)[0]
        samples_labels_in_cluster = y_minority[cluster_samples_indices]
        imbalance_ratios.append((cluster, len(samples_labels_in_cluster)))
    sorted_clusters = sorted(imbalance_ratios, key=lambda x: x[1])
    cluster_to_oversample = sorted_clusters[-1][0]
    indexes_of_minority_samples_with_given_cluster = np.where(cluster_labeled_data == cluster_to_oversample)[0]
    indexes_of_samples_with_given_cluster_in_dataset = minority_indices[indexes_of_minority_samples_with_given_cluster]
    majority_indices = np.where(y == 0)[0]
    cluster_indices_to_oversample = np.concatenate((indexes_of_samples_with_given_cluster_in_dataset, majority_indices), axis=None)
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X[cluster_indices_to_oversample], y[cluster_indices_to_oversample])
    _, label_resampled_counts_whole = np.unique(y_resampled, return_counts=True)
    return X_resampled, y_resampled

def KMeans_SMOTE(X, y, num_clusters):
    minority_indices = np.where(y == 1)[0]
    X_minority = X[minority_indices]
    y_minority = y[minority_indices]
    kmeans_labels_minority = KMeans(n_clusters=2, random_state=0, n_init="auto").fit_predict(X_minority)
    return oversample_clustered_data(X_minority, y_minority, minority_indices, kmeans_labels_minority, X, y)


def MeanShift_SMOTE(X, y):
    minority_indices = np.where(y == 1)[0]
    X_minority = X[minority_indices]
    y_minority = y[minority_indices]
    mean_shift_labels = MeanShift().fit_predict(X_minority)
    return oversample_clustered_data(X_minority, y_minority, minority_indices, mean_shift_labels, X, y)

#### Experiment for single dataset

In [47]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score
from imblearn.metrics import specificity_score

def experiment(X, y):
    preprocessings = {
        "KMeansSMOTE": True,
        "MeansShiftSMOTE": True,
        "SMOTE": SMOTE(),
        "ROS": RandomOverSampler(),
        "BorderlineSMOTE": BorderlineSMOTE(),
    }
    classifier = svm.SVC(),
    classifier = classifier[0]
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1234)
    result = {}
    for key in preprocessings:
        precision_scores = []
        recall_scores = []
        specifity_scores = []
        for train_index, test_index in rskf.split(X,y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if key == "KMeansSMOTE":
                X_train_oversampled, y_train_oversampled = KMeans_SMOTE(X_train, y_train, 2)
            elif key == "MeansShiftSMOTE":
                X_train_oversampled, y_train_oversampled = MeanShift_SMOTE(X_train, y_train)
            else:
                X_train_oversampled, y_train_oversampled = preprocessings[key].fit_resample(X_train, y_train)
            classifier.fit(X_train_oversampled, y_train_oversampled)
            predict = classifier.predict(X_test)
            precision_scores.append(precision_score(y_test, predict))
            recall_scores.append(recall_score(y_test, predict))
            specifity_scores.append(specificity_score(y_test, predict))
        mean_precision_score = np.mean(precision_scores)
        std_precision_score = np.std(precision_scores)
        mean_recall_score = np.mean(recall_scores)
        std_recall_score = np.std(recall_scores)
        mean_specifity_score = np.mean(specifity_scores)
        std_specifity_score = np.std(specifity_scores)
#         print(f"Precission score {key}: %.3f (%.3f)" % (mean_precision_score, std_precision_score))
#         print(f"Specifity score {key}: %.3f (%.3f)" % (mean_specifity_score, std_specifity_score))
#         print(f"Recall score {key}: %.3f (%.3f)" % (mean_recall_score, std_recall_score))
        result[key] = {
            "precission_scores": precision_scores,
            "recall_scores": recall_scores,
            "specifity_scores": specifity_scores,
            "mean_precission_score": mean_precision_score,
            "mean_recall_scores": mean_recall_score,
            "mean_specifity_scores": mean_specifity_score,
        }
    return result

#### Running experiments on the datasets

In [48]:
import os

directories = ['mild-imbalance', 'high-imbalance']
results = {}
for directory in directories:
    print(f"Processing files in directory: {directory}")
    files = os.listdir(directory)
    results = {}
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        print(f"File: {file_path}")
        X, y = load_dataset(file_path)
        experiment_result = experiment(X, y)
        results[file_name] = experiment_result
# print(results)

Processing files in directory: mild-imbalance
File: mild-imbalance/vehicle1.dat
File: mild-imbalance/vehicle0.dat
File: mild-imbalance/vehicle2.dat
File: mild-imbalance/vehicle3.dat
File: mild-imbalance/yeast3.dat
File: mild-imbalance/page-blocks0.dat
File: mild-imbalance/yeast1.dat
File: mild-imbalance/pima.dat
File: mild-imbalance/segment0.dat
File: mild-imbalance/wisconsin.dat
Processing files in directory: high-imbalance
File: high-imbalance/yeast4.dat
File: high-imbalance/yeast5.dat
File: high-imbalance/yeast-0-2-5-7-9_vs_3-6-8.dat
File: high-imbalance/shuttle-c0-vs-c4.dat


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


File: high-imbalance/yeast-0-2-5-6_vs_3-7-8-9.dat
File: high-imbalance/kr-vs-k-zero-one_vs_draw.dat
File: high-imbalance/shuttle-2_vs_5.dat
File: high-imbalance/abalone-17_vs_7-8-9-10.dat
File: high-imbalance/abalone19.dat
File: high-imbalance/kr-vs-k-three_vs_eleven.dat


### Statistically significantly better preprocessings in given datasets with given metrics

In [49]:
from scipy.stats import ttest_rel, wilcoxon, shapiro
from tabulate import tabulate

alfa = .05
methods = ["KMeansSMOTE", "MeansShiftSMOTE", "SMOTE", "ROS", "BorderlineSMOTE"]
metrics = ["precission_scores","recall_scores","specifity_scores"]

for directory in results:
    for metric in metrics:
        t_statistic = np.zeros((len(methods), len(methods)))
        p_value = np.zeros((len(methods), len(methods)))
        test_used = np.empty((len(methods), len(methods)), dtype=object)
        for i, preprocessing_method in enumerate(methods):
            for j, comparison_preprocessing_method in enumerate(methods):
                metric_results_one = results[directory][preprocessing_method][metric]
                metric_results_two = results[directory][comparison_preprocessing_method][metric]
                normal_one = shapiro(metric_results_one).pvalue > 0.05
                normal_two = shapiro(metric_results_two).pvalue > 0.05
                if normal_one and normal_two:
                    try:
                        t_statistic[i,j], p_value[i,j] = ttest_rel(metric_results_one, metric_results_two)
                    except:
                        t_statistic[i,j], p_value[i,j] = 0,0
                    test_used[i,j] = "t-test"
                else:
                    try:
                        t_statistic[i,j], p_value[i,j] = wilcoxon(metric_results_one, metric_results_two)
                    except: 
                        t_statistic[i,j], p_value[i,j] = 0, 0
                    test_used[i,j] = "Wilcoxon"
        advantage = np.zeros((len(methods), len(methods)))
        advantage[t_statistic > 0] = 1
        significance = np.zeros((len(methods), len(methods)))
        significance[p_value <= alfa] = 1
        stat_better = significance * advantage
        stat_better_table = tabulate(stat_better, methods)
        print(f"Statistically significantly better {metric}:")
        print(stat_better_table)
        print()
        

Statistically significantly better precission_scores:
  KMeansSMOTE    MeansShiftSMOTE    SMOTE    ROS    BorderlineSMOTE
-------------  -----------------  -------  -----  -----------------
            0                  0        1      1                  0
            0                  0        0      1                  0
            0                  0        0      0                  0
            0                  0        0      0                  0
            0                  0        1      1                  0

Statistically significantly better recall_scores:
  KMeansSMOTE    MeansShiftSMOTE    SMOTE    ROS    BorderlineSMOTE
-------------  -----------------  -------  -----  -----------------
            0                  0        0      0                  0
            0                  0        0      0                  0
            0                  0        0      0                  1
            0                  0        0      0                  1
           



### Statistically significantly better preprocessings for all datasets

In [62]:
from scipy.stats import rankdata, ranksums

methods = ["KMeansSMOTE", "MeansShiftSMOTE", "SMOTE", "ROS", "BorderlineSMOTE"]
metrics = ["mean_precission_score", "mean_recall_scores", "mean_specifity_scores"]
for metric in metrics:
    mean = []
    for directory in results:
        preprocessing_mean = []
        for i, preprocessing_method in enumerate(methods):
            preprocessing_mean.append(results[directory][preprocessing_method][metric])
        mean.append(preprocessing_mean)

    ranks = []
    for mean_score in mean:
        ranks.append(rankdata(mean_score).tolist())
    ranks = np.array(ranks)

    alfa = .05
    w_statistic = np.zeros((len(methods), len(methods)))
    p_value = np.zeros((len(methods), len(methods)))
    for i in range(len(methods)):
        for j in range(len(methods)):
            w_statistic[i, j], p_value[i, j] = ranksums(ranks.T[i], ranks.T[j])
    names_column = np.expand_dims(np.array(list(methods)), axis=1)
    w_statistic_table = np.concatenate((names_column, w_statistic), axis=1)
    w_statistic_table = tabulate(w_statistic_table, methods, floatfmt=".2f")
    p_value_table = np.concatenate((names_column, p_value), axis=1)
    p_value_table = tabulate(p_value_table, methods, floatfmt=".2f")
    advantage = np.zeros((len(methods), len(methods)))
    advantage[w_statistic > 0] = 1
    advantage_table = tabulate(np.concatenate(
    (names_column, advantage), axis=1), methods)
    significance = np.zeros((len(methods), len(methods)))
    significance[p_value <= alfa] = 1
    significance_table = tabulate(np.concatenate(
    (names_column, significance), axis=1), methods)
    print(f"Metric: {metric}")
    print("Statistical significance (alpha = 0.05):")
    print(significance_table)
    print()
    print()

Metric: mean_precission_score
Statistical significance (alpha = 0.05):
                   KMeansSMOTE    MeansShiftSMOTE    SMOTE    ROS    BorderlineSMOTE
---------------  -------------  -----------------  -------  -----  -----------------
KMeansSMOTE                  0                  0        0      0                  0
MeansShiftSMOTE              0                  0        0      0                  0
SMOTE                        0                  0        0      0                  0
ROS                          0                  0        0      0                  0
BorderlineSMOTE              0                  0        0      0                  0


Metric: mean_recall_scores
Statistical significance (alpha = 0.05):
                   KMeansSMOTE    MeansShiftSMOTE    SMOTE    ROS    BorderlineSMOTE
---------------  -------------  -----------------  -------  -----  -----------------
KMeansSMOTE                  0                  0        1      1                  1
MeansShif