### Exploring the impact of clustering on the quality of SMOTE preprocessing. Comparative analysis
##### Maksym Malicki, Jacek Glapiński
###### Wrocław University of Technology
In this notebook we present a comparative analysis of the impact of clustering using various methods on the quality of SMOTE preprocessing.

#### load_dataset()
This method allows us to load datasets listed in the paper.

In [1]:
import numpy as np

def load_dataset(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('@'):
                continue
            line_data = line.strip().split(',')
            sample_class = line_data[-1].strip().lower().replace(" ", "")
            label = 1 if sample_class == 'positive' else 0
            converted_data = []
            for x in line_data[:-1]:
                try:
                    converted_data.append(float(x))
                except ValueError:
                    converted_data.append(ord(x))
            data.append(converted_data)
            labels.append(label)
    X = np.array(data)
    y = np.array(labels)

    return X, y

#### Clustering with SMOTE

In [41]:
from sklearn.cluster import KMeans, MeanShift

def imbalance_ratio(labels):
    _, label_counts = np.unique(labels, return_counts=True)
    minority_class_count = np.min(label_counts)
    majority_class_count = np.max(label_counts)
    if majority_class_count == 0:
        return 0
    return minority_class_count / majority_class_count

def oversample_clustered_data(X, y, cluster_labeled_data):
    imbalance_ratios = []
    ### find best cluster (with highest imbalance ratio)
    cluster_labels = np.unique(cluster_labeled_data)
    for cluster in cluster_labels:
        cluster_samples_indices = np.where(cluster_labeled_data == cluster)[0]
        samples_labels_in_cluster = y[cluster_samples_indices]
        imbalance_ratios.append((cluster, imbalance_ratio(samples_labels_in_cluster)))
    sorted_clusters = sorted(imbalance_ratios, key=lambda x: x[1])
    # Search for clusters with imbalance ratio different than 1
    cluster_to_oversample = next((cluster for cluster, imbalance_ratio in reversed(sorted_clusters) if imbalance_ratio != 1), None)
    if cluster_to_oversample is None:
        return X, y
    
    ### oversample minority class in given cluster until it has the size of majority class
    cluster_indices_to_oversample_labels = np.where(cluster_labeled_data == cluster_to_oversample)[0]
    cluster_indices_to_oversample_minority = np.where((cluster_labeled_data == cluster_to_oversample) & (y == 1))[0]
    majority_indices = np.where(y == 0)[0]
    cluster_indices_to_oversample = np.concatenate((cluster_indices_to_oversample_minority, majority_indices))
    smote = SMOTE()
    _, label_counts_whole = np.unique(y, return_counts=True)
    _, label_counts_best_cluster = np.unique(y[cluster_indices_to_oversample_labels], return_counts=True)
    
    ### print results
    print("whole: ", label_counts_whole, "best cluster:", label_counts_best_cluster)
    X_resampled, y_resampled = smote.fit_resample(X[cluster_indices_to_oversample], y[cluster_indices_to_oversample])
    print("resampled: ", len(X_resampled), len(y_resampled))
    return X_resampled, y_resampled

def KMeans_SMOTE(X, y, num_clusters):
    kmeans_labels = KMeans(n_clusters=2, random_state=0, n_init="auto").fit_predict(X)
    return oversample_clustered_data(X, y, kmeans_labels)


def MeanShift_SMOTE(X, y):
    mean_shift_labels = MeanShift().fit_predict(X)
    return oversample_clustered_data(X, y, mean_shift_labels)

#### Experiment for single dataset

In [42]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score
from imblearn.metrics import specificity_score

def experiment(X, y):
    preprocessings = {
        "KMeansSMOTE": True,
        "MeansShiftSMOTE": True,
        "SMOTE": SMOTE(),
        "ROS": RandomOverSampler(),
        "BorderlineSMOTE": BorderlineSMOTE(),
    }
    classifier = svm.SVC(),
    classifier = classifier[0]
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1234)
    precision_scores = []
    recall_scores = []
    specifity_scores = []
    for key in preprocessings:
        for train_index, test_index in rskf.split(X,y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if key == "KMeansSMOTE":
                X_train_oversampled, y_train_oversampled = KMeans_SMOTE(X_train, y_train, 2)
            elif key == "MeansShiftSMOTE":
                X_train_oversampled, y_train_oversampled = MeanShift_SMOTE(X_train, y_train)
            else:
                X_train_oversampled, y_train_oversampled = preprocessings[key].fit_resample(X_train, y_train)
            classifier.fit(X_train_oversampled, y_train_oversampled)
            predict = classifier.predict(X_test)
            precision_scores.append(precision_score(y_test, predict))
            recall_scores.append(recall_score(y_test, predict))
            specifity_scores.append(specificity_score(y_test, predict))
        mean_precision_score = np.mean(precision_scores)
        std_precision_score = np.std(precision_scores)
        mean_recall_scores = np.mean(recall_scores)
        std_recall_scores = np.std(recall_scores)
        mean_specifity_scores = np.mean(specifity_scores)
        std_specifity_scores = np.std(specifity_scores)
        print(f"Precission score {key}: %.3f (%.3f)" % (mean_precision_score, std_precision_score))
        print(f"Specifity score {key}: %.3f (%.3f)" % (mean_specifity_scores, std_specifity_scores))
        print(f"Recall score {key}: %.3f (%.3f)" % (mean_recall_scores, std_recall_scores))
    

#### Running experiments on the datasets

In [43]:
import os

directories = ['mild-imbalance', 'high-imbalance']

for directory in directories:
    print(f"Processing files in directory: {directory}")
    files = os.listdir(directory)
    print(files)
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        print(f"File: {file_path}")
        X, y = load_dataset(file_path)
        experiment(X, y)

Processing files in directory: mild-imbalance
['vehicle1.dat', 'vehicle0.dat', 'vehicle2.dat', 'vehicle3.dat', 'yeast3.dat', 'page-blocks0.dat', 'yeast1.dat', 'pima.dat', 'segment0.dat', 'wisconsin.dat']
File: mild-imbalance/vehicle1.dat
whole:  [503 173] best cluster: [133  82]
resampled:  1006 1006
whole:  [503 174] best cluster: [135  90]
resampled:  1006 1006
whole:  [503 174] best cluster: [140  88]
resampled:  1006 1006
whole:  [503 174] best cluster: [134  91]
resampled:  1006 1006
whole:  [504 173] best cluster: [133  89]
resampled:  1008 1008
whole:  [503 173] best cluster: [127  90]
resampled:  1006 1006
whole:  [503 174] best cluster: [134  92]
resampled:  1006 1006
whole:  [503 174] best cluster: [138  87]
resampled:  1006 1006
whole:  [503 174] best cluster: [133  84]
resampled:  1006 1006
whole:  [504 173] best cluster: [142  87]
resampled:  1008 1008
Precission score KMeansSMOTE: 0.426 (0.043)
Specifity score KMeansSMOTE: 0.750 (0.027)
Recall score KMeansSMOTE: 0.537 (0.

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 6