In [1]:
import generate_datasets
import pandas as pd
import random
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn_lvq import GlvqModel
from sklearn.svm import SVC

In [2]:
# Root folder scanned for datasets; each entry is expected to be a sub-folder
# holding one CSV per imbalance weight.
DATASETS_DIR = "datasets"
# Single seed reused for Python's `random`, the SVC models and the CV splitter.
RANDOM_STATE = 10
# Number of stratified cross-validation folds; also the divisor for the mean accuracy.
N_FOLDS = 3

In [3]:
# Seeds only Python's built-in `random` module; the SVC models and the
# StratifiedKFold splitter are given RANDOM_STATE explicitly via `random_state`.
random.seed(RANDOM_STATE)

In [4]:
# Load every dataset as a list of DataFrames, one per imbalance weight.
# Expected layout on disk: DATASETS_DIR/<name>/<name>_w_<weight:.3f>.csv
# Result: datasets[dataset_idx][weight_idx] -> DataFrame.
datasets = []
ratios, weights = generate_datasets.get_imbalance()
# os.listdir() returns entries in arbitrary, platform-dependent order. The
# datasets are addressed by position later on, so sort to keep that index
# (and any subset selected from it) reproducible across runs and machines.
for dataset in sorted(os.listdir(DATASETS_DIR)):
  dataset_path = os.path.join(DATASETS_DIR, dataset)
  # Only sub-directories are datasets; skip stray files (e.g. .DS_Store) that
  # would otherwise make read_csv fail on a nonsensical path.
  if not os.path.isdir(dataset_path):
    continue
  dataset_names = [dataset + f'_w_{w:.3f}.csv' for w in weights]
  datasets_df = [pd.read_csv(os.path.join(dataset_path, dataset_name)) for dataset_name in dataset_names]
  datasets.append(datasets_df)

In [5]:
# Classifiers under comparison, keyed by the label used to index `results`.
# Every stochastic estimator receives the shared RANDOM_STATE.
models = dict(
  SVM_lin=SVC(kernel='linear', random_state=RANDOM_STATE),
  SVM_RBF=SVC(kernel='rbf', random_state=RANDOM_STATE),
  # Currently disabled candidates, kept for reference:
  # KNN=KNeighborsClassifier(n_neighbors=5),
  # GLQV=GlvqModel(prototypes_per_class=1, max_iter=2500, gtol=1e-5, beta=5, random_state=RANDOM_STATE)
)

In [None]:
# Stratified k-fold accuracy of every model on every dataset variant.
# results[model_name][dataset_idx][weight_idx] -> accuracy averaged over N_FOLDS.
results = {}
folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# NOTE(review): the last 50 loaded datasets are never evaluated, and with 50 or
# fewer datasets nothing is evaluated at all. This looks like a leftover
# runtime limiter; confirm whether it is intentional.
n_evaluated = max(len(datasets) - 50, 0)

for name in models:
  classifier = models[name]
  per_dataset_scores = []
  results[name] = per_dataset_scores

  for dataset_idx in range(n_evaluated):
    per_weight_scores = []
    for weight_idx in range(len(weights)):
      frame = datasets[dataset_idx][weight_idx]
      # The last column is used as the target; all preceding columns are features.
      features = frame.iloc[:, :-1]
      labels = frame.iloc[:, -1]

      # Running sum (rather than sum()) keeps the float result bit-identical.
      acc_total = 0
      for train_idx, test_idx in folds.split(features, labels):
        # fit() is called on the same estimator object for every fold.
        classifier.fit(features.iloc[train_idx], labels.iloc[train_idx])
        predictions = classifier.predict(features.iloc[test_idx])
        acc_total += accuracy_score(labels.iloc[test_idx], predictions)

      per_weight_scores.append(acc_total / N_FOLDS)
    per_dataset_scores.append(per_weight_scores)