### Exploring the impact of clustering on the quality of SMOTE preprocessing. Comparative analysis
##### Maksym Malicki, Jacek Glapiński
##### Wrocław University of Technology
In this notebook we present a comparative analysis of the impact of clustering using various methods on the quality of SMOTE preprocessing.

#### load_dataset()
This method allows us to load datasets listed in the paper.

In [50]:
import numpy as np

def load_dataset(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('@'):
                continue
            line_data = line.strip().split(',')
            sample_class = line_data[-1].strip().lower().replace(" ", "")
            label = 1 if sample_class == 'positive' else 0
            converted_data = []
            for x in line_data[:-1]:
                try:
                    converted_data.append(float(x))
                except ValueError:
                    converted_data.append(ord(x))
            data.append(converted_data)
            labels.append(label)
    X = np.array(data)
    y = np.array(labels)

    return X, y

#### Clustering with SMOTE

In [2]:
from sklearn.cluster import KMeans, MeanShift

def KMeans_SMOTE(X, num_clusters):
    kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X)

def MeanShift_SMOTE(X):
    mean_shift = MeanShift().fit(X)

#### Experiment for single dataset

In [73]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score

def experiment(X, y):
    preprocessings = {
        "SMOTE": SMOTE(),
        "ROS": RandomOverSampler(),
        "BorderlineSMOTE": BorderlineSMOTE(),
    }
    classifier = svm.SVC(),
    classifier = classifier[0]
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1234)
    precision_scores = []
    recall_scores = []
    for key in preprocessings:
        for train_index, test_index in rskf.split(X,y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train_oversampled, y_train_oversampled = preprocessings[key].fit_resample(X_train, y_train)
            classifier.fit(X_train_oversampled, y_train_oversampled)
            predict = classifier.predict(X_test)
            precision_scores.append(precision_score(y_test, predict))
            recall_scores.append(recall_score(y_test, predict))
        mean_precision_score = np.mean(precision_scores)
        std_precision_score = np.std(precision_scores)
        mean_recall_scores = np.mean(recall_scores)
        std_recall_scores = np.std(recall_scores)
        print(f"Precission score {key}: %.3f (%.3f)" % (mean_precision_score, std_precision_score))
        print(f"Recall score {key}: %.3f (%.3f)" % (mean_recall_scores, std_recall_scores))
    

#### Running experiments on the datasets

In [74]:
import os

directories = ['mild-imbalance', 'high-imbalance']

for directory in directories:
    print(f"Processing files in directory: {directory}")
    files = os.listdir(directory)
    print(files)
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        print(f"File: {file_path}")
        X, y = load_dataset(file_path)
        experiment(X, y)

Processing files in directory: mild-imbalance
['vehicle1.dat', 'vehicle0.dat', 'vehicle2.dat', 'vehicle3.dat', 'yeast3.dat', 'page-blocks0.dat', 'yeast1.dat', 'pima.dat', 'segment0.dat', 'wisconsin.dat']
File: mild-imbalance/vehicle1.dat
Precission score SMOTE: 0.425 (0.035)
Recall score SMOTE: 0.629 (0.052)
Precission score ROS: 0.423 (0.034)
Recall score ROS: 0.626 (0.053)
Precission score BorderlineSMOTE: 0.422 (0.032)
Recall score BorderlineSMOTE: 0.641 (0.055)
File: mild-imbalance/vehicle0.dat
Precission score SMOTE: 0.449 (0.015)
Recall score SMOTE: 1.000 (0.000)
Precission score ROS: 0.449 (0.016)
Recall score ROS: 1.000 (0.000)
Precission score BorderlineSMOTE: 0.451 (0.018)
Recall score BorderlineSMOTE: 1.000 (0.000)
File: mild-imbalance/vehicle2.dat
Precission score SMOTE: 0.458 (0.030)
Recall score SMOTE: 0.913 (0.020)
Precission score ROS: 0.459 (0.029)
Recall score ROS: 0.911 (0.022)
Precission score BorderlineSMOTE: 0.473 (0.038)
Recall score BorderlineSMOTE: 0.890 (0.071

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precission score BorderlineSMOTE: 0.850 (0.334)
Recall score BorderlineSMOTE: 0.819 (0.336)
File: high-imbalance/yeast-0-2-5-6_vs_3-7-8-9.dat
Precission score SMOTE: 0.544 (0.072)
Recall score SMOTE: 0.692 (0.116)
Precission score ROS: 0.533 (0.068)
Recall score ROS: 0.699 (0.116)
Precission score BorderlineSMOTE: 0.483 (0.101)
Recall score BorderlineSMOTE: 0.703 (0.119)
File: high-imbalance/kr-vs-k-zero-one_vs_draw.dat
Precission score SMOTE: 0.298 (0.032)
Recall score SMOTE: 1.000 (0.000)
Precission score ROS: 0.294 (0.031)
Recall score ROS: 1.000 (0.000)
Precission score BorderlineSMOTE: 0.262 (0.053)
Recall score BorderlineSMOTE: 1.000 (0.000)
File: high-imbalance/shuttle-2_vs_5.dat
Precission score SMOTE: 0.880 (0.084)
Recall score SMOTE: 1.000 (0.000)
Precission score ROS: 0.880 (0.084)
Recall score ROS: 1.000 (0.000)
Precission score BorderlineSMOTE: 0.880 (0.084)
Recall score BorderlineSMOTE: 1.000 (0.000)
File: high-imbalance/abalone-17_vs_7-8-9-10.dat
Precission score SMOTE: 