In [None]:
!pip install ImbalancedLearningRegression

In [None]:
!pip install smogn
!pip install resreg

In [11]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, accuracy_score
from glob import glob
import xgboost as xgb
import numpy as np
import os
import itertools as it
import smogn
import resreg
from xgboost import XGBRegressor

import ImbalancedLearningRegression as iblr

import warnings
warnings.filterwarnings('ignore')

In [26]:
def balance(train, strategy, c):
  """Resample a training frame with the requested imbalanced-regression strategy.

  Parameters
  ----------
  train : pandas.DataFrame or None
      Training data; the first column is used as the target and the remaining
      columns as features. When None, the frame is loaded from "train.csv" in
      the working directory (kept for callers that relied on the file).
  strategy : str
      "GN", "RO", "RU" and "SMT" call the matching ImbalancedLearningRegression
      function, "SG" calls smogn.smoter and "WC" calls resreg.wercs. Any other
      value returns the frame unchanged.
  c : sequence
      Strategy parameters. c[0] is passed as samp_method (GN, RO, RU, SG, SMT)
      or as over (WC); c[1] is passed as pert (GN) or as under (WC).

  Returns
  -------
  pandas.DataFrame
      The resampled frame, target in the first column.

  Raises
  ------
  Whatever the underlying resampling library raises for the given data.
  """
  # Use the frame the caller handed in; previously it was discarded and the
  # data was always re-read from disk, hiding a dependency on "train.csv".
  if train is None:
    train = pd.read_csv("train.csv")

  # Address the target by position so every branch agrees on which column it is
  # (frames reloaded from a header-less array dump label it "0").
  target = train.columns[0]

  if strategy == "GN":
    train = iblr.gn(data = train, y = target, samp_method=c[0], pert=c[1],  rel_thres = 0.8)
  elif strategy == "RO":
    train = iblr.ro(data = train, y = target, samp_method=c[0], rel_thres = 0.8)
  elif strategy == "RU":
    train = iblr.random_under(data = train, y = target, samp_method=c[0], rel_thres = 0.8)
  elif strategy == "SG":
    # Rows with missing values are dropped both before and after resampling.
    train = train.dropna()
    train = smogn.smoter(data = train, y = target, samp_method=c[0], rel_xtrm_type = 'high', rel_thres = 0.8)
    train = train.dropna()
  elif strategy == "SMT":
    train = iblr.smote(data = train, y = target, samp_method=c[0], rel_thres = 0.8)
  elif strategy == "WC":
    X_train = train.drop([target], axis = 1)
    y_train = train[target]
    relevance = resreg.pdf_relevance(y_train)
    X_wercs, y_wercs = resreg.wercs(X_train, y_train, relevance, over=c[0], under=c[1])
    # Rebuild the frame in memory with the target first. Column labels are
    # converted to strings ("0", "1", ...) to match what a CSV round-trip of
    # the stacked array would have produced.
    train = pd.DataFrame(np.column_stack((y_wercs, X_wercs)))
    train.columns = train.columns.astype(str)
  return train

In [28]:
def repeatedKfold(n_splits=10, n_repeats=2, random_state=42, pipeline=None, param_grid=None,
                  X=None, y=None, output_dir="/"):
  """Evaluate one pipeline under every balancing strategy with repeated K-fold CV.

  For each strategy and each combination of its parameters, every outer fold
  is (optionally) resampled with balance(), a GridSearchCV is fitted on the
  resulting training data, and the predictions for the held-out indices are
  written to a CSV file.

  Parameters
  ----------
  n_splits, n_repeats, random_state :
      Forwarded to RepeatedKFold. The same splitter is used for the outer loop
      and as the cv argument of GridSearchCV.
  pipeline :
      Pipeline to tune; its first step is used to name the output files.
  param_grid : dict
      Grid forwarded to GridSearchCV.
  X, y : array-like, optional
      Features and target, indexed with the integer index arrays produced by
      the splitter. When omitted, the notebook-level globals X and y are used
      (the original behaviour).
  output_dir : str
      Directory receiving the prediction files. Defaults to "/", the location
      that was previously hard-coded.

  Notes
  -----
  * Output names are "Pred{fold}_{strategy}_{params}_{regressor}.csv". They do
    not contain the dataset, so runs on different datasets overwrite each
    other unless a different output_dir is used per dataset.
  * The fold counter only advances for folds that were actually trained
    (training frames with 10 rows or fewer are skipped).
  * "train.csv" is (re)written in the working directory for every fold.

  Raises
  ------
  ValueError
      If X or y is neither passed nor available as a global.
  """
  # Fall back to the notebook globals so existing calls keep working.
  if X is None:
    X = globals().get("X")
  if y is None:
    y = globals().get("y")
  if X is None or y is None:
    raise ValueError("repeatedKfold needs X and y, either as arguments or as notebook globals")

  rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

  strategys = {"SG":{"samp_method":["balance", "extreme"]},
               "RU":{"C.perc":["balance", "extreme"]},
               "RO":{"C.perc":["balance", "extreme"]},
               "SMT":{"C.perc":["balance", "extreme"]},
               "GN":{"C.perc":["balance", "extreme"], "pert":[0.05, 0.1, 0.5]},
               "WC":{"over":[0.5, 0.8], "under":[0.5, 0.8]},
               'None': {None}}

  for strategy, params in strategys.items():
    if strategy != "None":
      # Parameter names are sorted so the position of each value inside a
      # combination tuple is deterministic (balance() reads c[0] and c[1]).
      keys = sorted(params)
      combinations = list(it.product(*(params[name] for name in keys)))
    else:
      combinations = ['None']

    for c in combinations:
      i = 1
      for train_index, test_index in rkf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]

        # Round-trip through CSV: the reloaded frame has string column labels
        # ("0", "1", ...) and train.csv stays available on disk for balance().
        train = np.column_stack((y_train, X_train))
        pd.DataFrame(train).to_csv("train.csv", index=False)
        train = pd.read_csv("train.csv")

        if c != 'None':
          try:
            train = balance(train, strategy, c)
          except ValueError as err:
            # Best effort: keep going with the unbalanced fold, but say so
            # instead of failing silently (warnings are filtered in this
            # notebook, hence print).
            print("[repeatedKfold] balance failed for strategy={}, params={}, fold={}: {} "
                  "-- training on the unbalanced fold".format(strategy, c, i, err))

        X_train = train.drop([train.columns[0]], axis = 1)
        y_train = train[train.columns[0]]

        if len(X_train) > 10:
          grid_search = GridSearchCV(pipeline, cv=rkf, param_grid=param_grid, n_jobs=-1)
          grid_search.fit(X_train.values, y_train.values)

          y_pred = grid_search.predict(X_test)

          # Store the original row index next to each prediction.
          pred = np.column_stack((test_index, y_pred))
          regressor_name = str(pipeline.steps[0][1]).split('(')[0]
          file_name = 'Pred{}_{}_{}_{}.csv'.format(i, strategy, c, regressor_name)
          pd.DataFrame(pred).to_csv(os.path.join(output_dir, file_name), index = False)

          i = i+1

In [24]:
def pipe_generation():
  """Build the (pipeline, parameter grid) pairs evaluated by the experiment.

  Returns
  -------
  list
      One [Pipeline, dict] entry per regressor, in this order: BaggingRegressor,
      DecisionTreeRegressor, MLPRegressor, RandomForestRegressor, SVR,
      XGBRegressor. Each pipeline has a single step named 'clf', so grid keys
      are prefixed with 'clf__'. Grid entries holding one candidate are lists,
      entries holding several candidates are tuples.
  """
  bagging = BaggingRegressor(DecisionTreeRegressor())
  # The wrapped-estimator parameter of BaggingRegressor is called 'estimator'
  # in newer scikit-learn releases and 'base_estimator' in older ones; pick
  # whichever this installation exposes so the grid key is valid.
  if 'estimator' in bagging.get_params(deep=False):
    nested = 'estimator'
  else:
    nested = 'base_estimator'

  candidates = [
      (bagging,
       {'clf__' + nested + '__min_samples_split': [20], 'clf__max_samples': [0.5]}),
      (DecisionTreeRegressor(),
       {'clf__min_samples_split': [20]}),
      (MLPRegressor(max_iter=200),
       {'clf__learning_rate_init': [0.1], 'clf__momentum': (0.2, 0.7), 'clf__tol': (0.01, 0.05)}),
      (RandomForestRegressor(),
       {'clf__n_estimators': [550, 1500], 'clf__max_features': [5]}),
      (SVR(),
       {'clf__gamma': [0.01, 0.001], 'clf__C': [10, 300]}),
      # NOTE(review): 'num_round' does not look like an XGBRegressor constructor
      # parameter (the scikit-learn wrapper documents n_estimators) -- confirm
      # this entry has the intended effect. Left unchanged to keep results
      # comparable.
      (XGBRegressor(verbosity=0),
       {'clf__eta': [0.01], 'clf__max_depth': (10, 15),
        'clf__colsample_bytree': (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9), 'clf__num_round': [25]}),
  ]

  pipes_params = []
  for estimator, grid in candidates:
    pipeline = Pipeline([('clf', estimator)])
    # Same container convention as before: a single candidate is stored in a
    # list, several candidates in a tuple. Values are used as written instead
    # of being converted to text and parsed back.
    param_grid = {}
    for name, values in grid.items():
      param_grid[name] = tuple(values) if len(values) > 1 else list(values)
    pipes_params.append([pipeline, param_grid])
  return pipes_params

In [18]:
# Paths of every CSV file directly under /ds, in sorted order.
data_sets = glob(r'/ds/*.csv')
data_sets.sort()

In [None]:
# Run every (pipeline, grid) pair returned by pipe_generation() on every dataset.
pipes_params = pipe_generation()

for j in pipes_params:
    score_ap = []
    for i, dataset in enumerate(data_sets):
        ds = pd.read_csv(dataset)

        path = dataset
        head, tail = os.path.split(path)
        print("=====================")
        print(path)

        # The first column is the target, every other column is a feature.
        # X, y and dataset are left as notebook globals because
        # repeatedKfold is called below without being handed the data.
        target_column = ds.columns[0]
        X = ds.drop(columns=[target_column]).to_numpy()
        y = ds[target_column].to_numpy()

        pipeline, param_grid = j[0], j[1]
        print(str(pipeline.steps[0][1]).split('(')[0])
        repeatedKfold(pipeline=pipeline, param_grid=param_grid)
