In [50]:
import numpy as np

def load_dataset(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('@'):
                continue
            line_data = line.strip().split(',')
            sample_class = line_data[-1].strip().lower().replace(" ", "")
            label = 1 if sample_class == 'positive' else 0
            converted_data = []
            for x in line_data[:-1]:
                try:
                    converted_data.append(float(x))
                except ValueError:
                    converted_data.append(ord(x))
            data.append(converted_data)
            labels.append(label)
    X = np.array(data)
    y = np.array(labels)

    return X, y

In [60]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score

def experiment(X, y):
    preprocessings = {
        "SMOTE": SMOTE(),
        "ROS": RandomOverSampler(),
        "BorderlineSMOTE": BorderlineSMOTE(),
    }
    classifier = svm.SVC(),
    classifier = classifier[0]
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1234)
    precision_scores = []
    recall_scores = []
    for key in preprocessings:
        for train_index, test_index in rskf.split(X,y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train_oversampled, y_train_oversampled = preprocessings[key].fit_resample(X_train, y_train)
            classifier.fit(X_train_oversampled, y_train_oversampled)
            predict = classifier.predict(X_test)
            precision_scores.append(precision_score(y_test, predict))
            recall_scores.append(recall_score(y_test, predict))
        mean_precision_score = np.mean(precision_scores)
        std_precision_score = np.std(precision_scores)
        recall_scores = np.mean(recall_scores)
        std_recall_scores = np.std(recall_scores)
        print("Precission score: %.3f (%.3f)" % (mean_precision_score, std_precision_score))
        print("Recall score: %.3f (%.3f)" % (recall_scores, std_recall_scores))
    

In [58]:
import os

directories = ['mild-imbalance', 'high-imbalance']

for directory in directories:
    print(f"Processing files in directory: {directory}")
    files = os.listdir(directory)
    print(files)
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        print(f"File: {file_path}")
        X, y = load_dataset(file_path)
        experiment(X, y)

Processing files in directory: mild-imbalance
['vehicle1.dat', 'vehicle0.dat', 'vehicle2.dat', 'vehicle3.dat', 'yeast3.dat', 'page-blocks0.dat', 'yeast1.dat', 'pima.dat', 'segment0.dat', 'wisconsin.dat']
File: mild-imbalance/vehicle1.dat


NameError: name 'recalls_scores' is not defined

## 