In [None]:
from joblib import Parallel, delayed
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from glob import glob
import numpy as np
import os
from xgboost import XGBRegressor
import itertools as it

import warnings
warnings.filterwarnings('ignore')

In [None]:
def repeatedKfold(X, y, dataset_name, output_root='/project/def-menelau/jga/SURVEY/NONE/'):
    """Fit each baseline regressor under repeated k-fold CV and save per-fold predictions.

    For every regressor and every split produced by a 10-split, 2-repeat
    ``RepeatedKFold`` (20 folds, seeded with ``random_state=42``), the regressor
    is fitted on the training rows and its predictions for the held-out rows
    are written to::

        <output_root>/<dataset_name>.csv/<ModelClass>/Pred<fold>_<ModelClass>.csv

    Each file holds two columns: the held-out row position and the prediction.

    Parameters
    ----------
    X : array-like
        Feature matrix supporting integer-array row indexing (the caller in
        this notebook passes a NumPy array).
    y : array-like
        Target values aligned with ``X``, indexed the same way.
    dataset_name : str
        Name of the dataset; used to build the output folder name.
    output_root : str, optional
        Directory under which the prediction folders are created. Defaults to
        the location that was previously hard-coded.

    Returns
    -------
    None
        Results are only written to disk.
    """
    outer = RepeatedKFold(n_splits=10, n_repeats=2, random_state=42)

    regressors = {
        'BG': BaggingRegressor(),
        'DT': DecisionTreeRegressor(),
        'MLP': MLPRegressor(),
        'RF': RandomForestRegressor(),
        'SVM': SVR(),
        'XG': XGBRegressor()
    }

    for regressor_name, regressor in regressors.items():
        print(regressor_name)

        # The class name does not change between folds, so resolve it (and the
        # output folder) once per regressor.
        model_name = type(regressor).__name__
        # The '.csv' suffix on the folder name is kept from the original layout.
        output_dir = os.path.join(output_root, dataset_name + '.csv', model_name)
        # Create the folder up front so to_csv does not fail on a missing path.
        os.makedirs(output_dir, exist_ok=True)

        for fold, (train_index, test_index) in enumerate(outer.split(X, y)):
            print("outer")
            print("Fold:", fold)

            model = regressor.fit(X[train_index], y[train_index])
            y_pred = model.predict(X[test_index])

            # Keep the row positions next to the predictions so the folds can
            # be matched back to the original rows later.
            pred = np.column_stack((test_index, y_pred))
            output_path = os.path.join(output_dir, 'Pred{}_{}.csv'.format(fold, model_name))
            # index=False belongs to to_csv; it used to be passed to str.format,
            # which ignored it, so an extra index column was written.
            pd.DataFrame(pred).to_csv(output_path, index=False)

In [None]:
def processar_dataset(dataset):
    """Load one CSV dataset and run the repeated k-fold evaluation on it.

    The first column of the file is used as the target; all remaining columns
    are used as features. Both are converted to NumPy arrays before being
    handed to ``repeatedKfold``.

    Parameters
    ----------
    dataset : str
        Path to the CSV file to process.

    Returns
    -------
    None
    """
    ds = pd.read_csv(dataset)

    # Get the dataset name from the file name. os.path.basename handles the
    # platform's path separator, and only a trailing '.csv' is stripped so a
    # '.csv' appearing elsewhere in the name is left alone.
    file_name = os.path.basename(dataset)
    if file_name.endswith('.csv'):
        dataset_name = file_name[:-len('.csv')]
    else:
        dataset_name = file_name

    target_column = ds.columns[0]
    X = ds.drop([target_column], axis=1).to_numpy()
    y = ds[target_column].to_numpy()

    repeatedKfold(X, y, dataset_name)

# Collect every CSV file under ds/ in a stable, alphabetical order.
data_sets = sorted(glob(r'ds/*.csv'))

# -1 uses all available cores; adjust as needed.
num_jobs = -1

# Run in parallel to process multiple datasets, one task per dataset.
Parallel(n_jobs=num_jobs)(
    delayed(processar_dataset)(csv_path) for csv_path in data_sets
)