In [None]:
pip install rpy2

In [None]:
pip install smogn

In [None]:
pip install resreg

In [5]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import json
from urllib.request import urlopen as urlopen
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, accuracy_score
from glob import glob
from rpy2.robjects import default_converter
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import Converter, localconverter
import rpy2.robjects.numpy2ri
from xgboost import XGBRegressor
import numpy as np
from glob import glob
import os
import itertools as it
import smogn
import resreg


import warnings
# Suppress every Python warning for the rest of the session (this also hides
# deprecation and convergence warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

# Activate rpy2's NumPy and pandas converters so arrays / frames can be passed
# to and returned from R calls without explicit conversion.
rpy2.robjects.numpy2ri.activate()
pandas2ri.activate()

In [None]:
# R source defining U1(), a one-off bootstrap that installs devtools, a set of
# CRAN packages, an archived DMwR 0.4.1 tarball and the GitHub packages IRon,
# UBL and uba, then loads uba, UBL and IRon into the embedded R session.
string = """

U1 <- function(){

  install.packages("devtools")
  library(devtools)

  
  install.packages(c("operators", "class", "fields", "ROCR", "Hmisc", "performanceEstimation"))

  install.packages(c("zoo","xts","quantmod"))

  install.packages( "https://cran.r-project.org/src/contrib/Archive/DMwR/DMwR_0.4.1.tar.gz", repos=NULL, type="source" )

  install_github("nunompmoniz/IRon")
  install_github("paobranco/UBL")
  install_github("rpribeiro/uba")

  library(uba)
  library(UBL)
  library(IRon)
  
}

"""
# Evaluate the R source above as an anonymous rpy2 package so that U1 becomes
# callable from Python.
powerpack = SignatureTranslatedAnonymousPackage(string, "powerpack")

# Run the installation (requires network access; versions are not pinned
# except for the DMwR archive tarball).
powerpack.U1()

In [8]:
# Handles to the R packages installed by U1(); scores() calls into both.
uba = importr("uba")
iron = importr("IRon")

In [9]:
def scores(y_test, y_pred):
    """Score predictions with MSE and the relevance-based metrics from uba / IRon.

    Precision, recall and F-measure (beta=1) come from ``uba.util`` with an
    event threshold of 0.8; SERA comes from ``iron.sera`` using the relevance
    values ``uba.phi`` assigns to ``y_test``.

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values.

    Returns:
        A one-column DataFrame (the column label is the empty string) indexed
        by 'MSE', 'precision', 'recall', 'fscore' and 'sera'.
    """
    relevance_ctrl = uba.phi_control(y_test)
    loss_ctrl = uba.loss_control(y_test)

    def utility(**control_kwargs):
        # Every utility metric shares the same relevance / loss controls and
        # only differs in the util_control settings.
        control = uba.util_control(**control_kwargs)
        return uba.util(y_pred, y_test, relevance_ctrl, loss_ctrl, control)

    precision = utility(umetric="P", event_thr=0.8)
    recall = utility(umetric="R", event_thr=0.8)
    sera = iron.sera(y_test, y_pred, phi_trues=uba.phi(y_test, relevance_ctrl))
    f_measure = utility(umetric="Fm", beta=1, event_thr=0.8)

    metrics = {
        'MSE': mean_squared_error(y_test, y_pred),
        'precision': precision,
        'recall': recall,
        'fscore': f_measure,
        'sera': sera,
    }
    return pd.DataFrame(list(metrics.values()),
                        columns=[''],
                        index=list(metrics))

In [30]:
def balance(train, strategy, c):
  """Resample the current training fold with the requested balancing strategy.

  Every strategy that actually resamples ignores the ``train`` argument and
  re-reads the fold from "/content/train.csv", so the caller must have written
  the fold to that file beforehand. The target is the first column of that
  file (the R formulas refer to it as ``X0``).

  Args:
    train: DataFrame holding the current fold. It is returned unchanged when
      ``strategy`` is not one of the names listed below.
    strategy: One of "RU", "RO", "SMT", "GN" (UBL functions executed in R),
      "SG" (``smogn.smoter``) or "WC" (``resreg.wercs``).
    c: Sequence of strategy parameters. ``c[0]`` is ``C.perc`` for the R
      strategies, ``samp_method`` for "SG" and ``over`` for "WC"; ``c[1]`` is
      ``pert`` for "GN" and ``under`` for "WC".

  Returns:
    A DataFrame with the resampled fold, target in the first column.
  """
  train_path = "/content/train.csv"

  # Only build the R helper package for strategies that need it: compiling it
  # makes R re-read train.csv, which is wasted work for the Python strategies.
  if strategy in ("GN", "RO", "RU", "SMT"):
    string = """

  library(uba)
  library(UBL)

  train <- read.csv("/content/train.csv")

  RU <- function(perc){
    resample <- RandUnderRegress(X0~., train, thr.rel=0.8, C.perc=perc)
  }

  RO <- function(perc){
    resample <- RandOverRegress(X0~., train, thr.rel=0.8, C.perc=perc)
  }

  SMT <- function(perc){
    resample <- SmoteRegress(X0~., train, thr.rel=0.8, C.perc=perc)
  }

  GN <- function(perc, pert){
    resample <-GaussNoiseRegress(X0~., train, thr.rel=0.8, C.perc=perc, pert=pert)
  }

  """
    powerpack = SignatureTranslatedAnonymousPackage(string, "powerpack")
    if strategy == "GN":
      resampled = powerpack.GN(c[0], c[1])
    elif strategy == "RO":
      resampled = powerpack.RO(c[0])
    elif strategy == "RU":
      resampled = powerpack.RU(c[0])
    else:
      resampled = powerpack.SMT(c[0])
    return pd.DataFrame(resampled)

  if strategy == "SG":
    train = pd.read_csv(train_path)
    train = train.dropna(axis=0)
    train = smogn.smoter(data = train, y = train.columns[0], samp_method=c[0], rel_thres = 0.8)
    # Drop any rows with missing values left after resampling.
    train = train.dropna(axis=0)
  elif strategy == "WC":
    train = pd.read_csv(train_path)
    X_train = train.drop([train.columns[0]], axis = 1)
    y_train  = train[train.columns[0]]
    relevance = resreg.pdf_relevance(y_train)
    X_wercs, y_wercs = resreg.wercs(X_train, y_train, relevance, over=c[0], under=c[1])
    trainWC = np.column_stack((y_wercs, X_wercs))
    # Write and read the SAME file; the original wrote relative to the working
    # directory but read from /content, which only matched when cwd == /content.
    wc_path = "/content/trainWC.csv"
    pd.DataFrame(trainWC).to_csv(wc_path, index=False)
    train = pd.read_csv(wc_path)
  return train

In [33]:
def repeatedKfold(n_splits=10, n_repeats=2, random_state=42, pipeline=None, param_grid=None) :
  """Evaluate one pipeline under every resampling strategy with repeated K-fold CV.

  For each strategy and each combination of its parameters, the data are split
  with ``RepeatedKFold``; every training fold is written to
  "/content/train.csv", optionally resampled through ``balance``, and used to
  fit a ``GridSearchCV`` (which re-uses the same splitter for its inner CV).
  The held-out fold is scored with ``scores`` and the per-fold scores are
  summarised as "mean(std)" strings.

  Reads the module-level globals ``X`` (feature array), ``y`` (target array)
  and ``dataset`` (path of the dataset, used for the "Dataset" column).

  Side effects: overwrites "/content/train.csv" on every fold and writes
  "/content/result_<strategy>_<model>.csv" once per strategy.
  NOTE(review): the result file name does not include the dataset, so calls
  for different datasets with the same model overwrite each other's files.

  Args:
    n_splits: Number of folds per repetition.
    n_repeats: Number of repetitions.
    random_state: Seed passed to ``RepeatedKFold``.
    pipeline: sklearn ``Pipeline`` whose first step is the estimator.
    param_grid: Parameter grid passed to ``GridSearchCV``.

  Returns:
    A list with one summary DataFrame per strategy.
  """
  rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
  all_result = []

  # balance() and its R helpers read the fold from exactly this location, so
  # it must also be where the fold is written (not the working directory).
  train_path = "/content/train.csv"

  # The dataset file name labels every summary row; it is loop-invariant.
  tail = os.path.split(dataset)[1]

  strategys = {"RU":{"C.perc":["balance", "extreme"]},
               "RO":{"C.perc":["balance", "extreme"]},
               "SMT":{"C.perc":["balance", "extreme"]},
               "GN":{"C.perc":["balance", "extreme"], "pert":[0.05, 0.1, 0.5]},
               "SG":{"samp_method":["balance", "extreme"]},
               "WC":{"over":[0.5, 0.8], "under":[0.5, 0.8]},
               'None': {}}

  for strategy in strategys:

    data_frame = []
    params = strategys[strategy]
    is_baseline = strategy == "None"

    # Sorted keys fix the order of the values inside each combination tuple
    # (balance() indexes them positionally as c[0], c[1]).
    keys = sorted(params)

    if is_baseline:
      # A single run without resampling and without parameter columns.
      combinations = ['None']
    else:
      combinations = list(it.product(*(params[name] for name in keys)))

    for c in combinations:
      score_perc = []
      for train_index, test_index in rkf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Target goes in the first column; the CSV round trip is what the
        # resampling helpers consume.
        train = np.column_stack((y_train, X_train))
        pd.DataFrame(train).to_csv(train_path, index=False)
        train = pd.read_csv(train_path)

        if not is_baseline:
          train = balance(train, strategy, c)

        X_train = train.drop([train.columns[0]], axis = 1).to_numpy()
        y_train = train[train.columns[0]].to_numpy()

        grid_search = GridSearchCV(pipeline, cv=rkf, param_grid=param_grid)
        grid_search.fit(X_train, y_train)
        y_pred  = grid_search.predict(X_test)

        score_perc.append(scores(y_test, y_pred).T)

      df = pd.concat(score_perc)
      values = [tail,
                str(df.precision.mean().round(3)[0])+ "({})".format(df.precision.std().round(3)),
                str(df.recall.mean().round(3)[0])+ "({})".format(df.recall.std().round(3)),
                str(df.MSE.mean().round(3))+ "({})".format(df.MSE.std().round(3)),
                str(df.fscore.mean().round(3)[0])+ "({})".format(df.fscore.std().round(3)),
                str(df.sera.mean().round(3)[0])+ "({})".format(df.sera.std().round(3))]

      scores_df = pd.DataFrame([values], columns=["Dataset", "Precision", "Recall", "MSE", "Fscore", "SERA"])

      # One column per strategy parameter; the baseline has none (previously
      # it produced a junk column labelled None holding the letter 'N').
      for key, value in zip(keys, c):
        scores_df[key] = value
      scores_df['strategy'] = strategy

      data_frame.append(scores_df)
    data_frame = pd.concat(data_frame)
    data_frame.to_csv('/content/result_{}_{}.csv'.format(strategy, str(pipeline.steps[0][1]).split('(')[0]), index = False)
    all_result.append(data_frame)
  return all_result

In [11]:
def pipe_generation():
  """Build the candidate pipelines and their hyper-parameter grids.

  Each regressor is wrapped in a single-step ``Pipeline`` whose step is named
  'clf', which is why every grid key carries the ``clf__`` prefix.

  The previous implementation serialised each grid to a string and parsed it
  back, looking the estimator up in a registry that only contained
  ``DecisionTreeRegressor``; it therefore raised ``KeyError`` on the second
  regressor and would have dropped constructor arguments such as
  ``max_iter=200``. The estimators and grids are now used directly.

  Returns:
    A list of ``[pipeline, param_grid]`` pairs, one per regressor, where
    ``param_grid`` maps parameter names to lists of candidate values.
  """
  candidates = [
      (DecisionTreeRegressor(),
       {'clf__min_samples_split': [20]}),
      (MLPRegressor(max_iter=200),
       {'clf__learning_rate_init': [0.1], 'clf__momentum': (0.2, 0.7), 'clf__tol': (0.01, 0.05)}),
      (RandomForestRegressor(),
       {'clf__n_estimators': [550, 1500], 'clf__max_features': [5]}),
      (SVR(),
       {'clf__gamma': [0.01, 0.001], 'clf__C': [10, 300]}),
      (XGBRegressor(silent=True),
       {'clf__eta': [0.01], 'clf__max_depth': (10, 15),
        'clf__colsample_bytree': (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
        'clf__num_round': [25]}),
  ]

  pipes_params = []
  for estimator, grid in candidates:
    pipeline = Pipeline([('clf', estimator)])
    # Normalise every value collection to a list so callers always see the
    # same container type regardless of how the grid was written above.
    param_grid = {name: list(values) for name, values in grid.items()}
    pipes_params.append([pipeline, param_grid])
  return pipes_params

In [None]:
# Fetch the repository whose data/ directory is globbed for datasets in the next cell.
!git clone 'https://github.com/JusciAvelino/imbalancedRegression.git'

In [None]:
# Every entry under the cloned data/ directory, sorted so the run order is deterministic.
data_sets = sorted(glob(r'/content/imbalancedRegression/data/*'))

In [None]:
# Run every pipeline on every dataset and keep all summaries in score_ap
# (one concatenated DataFrame per dataset/pipeline pair). Previously the list
# was never filled and score_all was overwritten on each iteration.
score_ap = []
for dataset in data_sets:
  ds = pd.read_csv(dataset)
  print("=====================")

  # repeatedKfold() reads X, y and dataset as module-level globals, so these
  # names must be (re)bound here before it is called.
  # Column 'Unnamed: 0' is dropped and the second column is taken as the target.
  X = ds.drop(['Unnamed: 0', ds.columns[1]], axis = 1).to_numpy()
  y = ds[ds.columns[1]].to_numpy()

  pipes_params = pipe_generation()
  for pipeline, param_grid in pipes_params:
    print(str(pipeline.steps[0][1]).split('(')[0])
    score_all = pd.concat(repeatedKfold(pipeline=pipeline, param_grid=param_grid))
    score_ap.append(score_all)