In [30]:
import numpy as np

def load_dataset(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('@'):
                continue
            line_data = line.strip().split(',')
            print(line_data[-1] == 'positive', line_data[-1] == 'negative', 'positive', type('positive'), line_data[-1], type(line_data[-1]))
            label = 1 if line_data[-1] == 'positive' else 0
            data.append([float(x) for x in line_data[:-1]])
            labels.append(label)
    X = np.array(data)
    y = np.array(labels)

    return X, y

In [31]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold


def experiment(X, y):
    preprocessings = {
        "SMOTE": SMOTE(),
        "ROS": RandomOverSampler(),
        "BorderlineSMOTE": BorderlineSMOTE(),
    }
    classifier = svm.SVC(),
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1234)
    scores = []
    for key in preprocessings:
        for train_index, test_index in rskf.split(X,y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            print(X_train, y_train)
            X_train_oversampled, y_train_oversampled = preprocessings[key].fit_resample(X_train, y_train)
            clf.fit(X_train_oversampled, y_train_oversampled)
            predict = clf.predict(X_test)
            scores.append(accuracy_score(y_test, predict))
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("Accuracy score: %.3f (%.3f)" % (mean_score, std_score))
    

In [32]:
import os

directories = ['mild-imbalance', 'high-imbalance']

for directory in directories:
    print(f"Processing files in directory: {directory}")
    files = os.listdir(directory)
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        print(f"File: {file_path}")
        X, y = load_dataset(file_path)
        experiment(X, y)
        print("X:")
        print(X)
        print("y:")
        print(y)
        print()

Processing files in directory: mild-imbalance
File: mild-imbalance/vehicle1.dat
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  positive <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  positive <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  positive <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'>  negative <class 'str'>
False False positive <class 'str'> 

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead