In [1]:
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from time import time
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import os

from RTER import RegressionTree
from distribution import TestDistribution
from ensemble import RegressionTreeBoosting, RegressionTreeEnsemble



from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor


In [2]:
sample_generator=TestDistribution(2).returnDistribution()
n_test, n_train = 4000,2000
X_train, Y_train = sample_generator.generate(n_train)
X_test, Y_test = sample_generator.generate(n_test)



In [3]:



parameters= {"min_samples_split":[5,10], "max_depth":[0,1,3],
             "order":[0,2],"splitter":["maxedge","varreduction"],
             "estimator":["naive_estimator","pointwise_extrapolation_estimator"],
             "r_range_up":[0.6,1],"lamda":[0.0001,0.1],"V":[5,25]}



cv_model_RTER=GridSearchCV(estimator=RegressionTree(),param_grid=parameters, cv=3, n_jobs=-1)
cv_model_RTER.fit(X_train, Y_train) ##############

cv_model_RTER.best_params_

RTER_model = cv_model_RTER.best_estimator_
mse_score= - RTER_model.score(X_test, Y_test)
time_end=time()

mse_score


GridSearchCV(cv=3,
             estimator=<RTER.tree.RegressionTree object at 0x7f4be96e8160>,
             n_jobs=-1,
             param_grid={'V': [5, 25],
                         'estimator': ['naive_estimator',
                                       'pointwise_extrapolation_estimator'],
                         'lamda': [0.0001, 0.1], 'max_depth': [0, 1, 3],
                         'min_samples_split': [5, 10], 'order': [0, 2],
                         'r_range_up': [0.6, 1],
                         'splitter': ['maxedge', 'varreduction']})

{'V': 25,
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.0001,
 'max_depth': 3,
 'min_samples_split': 5,
 'order': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}

3.9677594319400695

In [4]:
# ensemble
time_start=time()
parameters= {"min_samples_split":[10], "max_depth":[1,3],
             "order":[0,2],"splitter":["maxedge","varreduction"],
             "estimator":["naive_estimator","pointwise_extrapolation_estimator"],
             "r_range_up":[1],"lamda":[0.1],"V":[10],
            "n_estimators":[5],"max_samples":[0.7,1],"max_features":[0.7,1]}


cv_model_ensemble=GridSearchCV(estimator=RegressionTreeEnsemble(),param_grid=parameters, cv=5, n_jobs=-1)
cv_model_ensemble.fit(X_train, Y_train)
ensemble_model = cv_model_ensemble.best_estimator_
mse_score= - ensemble_model.score(X_test, Y_test)
time_end=time()

cv_model_ensemble.best_params_

mse_score


GridSearchCV(cv=5,
             estimator=<ensemble.ensemble.RegressionTreeEnsemble object at 0x7f4d3d28d730>,
             n_jobs=-1,
             param_grid={'V': [10],
                         'estimator': ['naive_estimator',
                                       'pointwise_extrapolation_estimator'],
                         'lamda': [0.1], 'max_depth': [1, 3],
                         'max_features': [0.7, 1], 'max_samples': [0.7, 1],
                         'min_samples_split': [10], 'n_estimators': [5],
                         'order': [0, 2], 'r_range_up': [1],
                         'splitter': ['maxedge', 'varreduction']})

{'V': 10,
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.1,
 'max_depth': 3,
 'max_features': 0.7,
 'max_samples': 0.7,
 'min_samples_split': 10,
 'n_estimators': 5,
 'order': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}

3.8785718538985328

In [7]:
time_start=time()
parameters= {"min_samples_split":[10], "max_depth":[1,3],
             "order":[0,2],"splitter":["maxedge","varreduction"],
             "estimator":["naive_estimator","pointwise_extrapolation_estimator"],
             "r_range_up":[1],"lamda":[0.1],"V":[10],
            "n_estimators":[20],"max_samples":[0.7,1],"max_features":[0.7,1],
            "rho":[0.1]}
cv_model_boosting=GridSearchCV(estimator=RegressionTreeBoosting(),param_grid=parameters, cv=5, n_jobs=-1)
cv_model_boosting.fit(X_train, Y_train)
boosting_model = cv_model_boosting.best_estimator_
mse_score= - boosting_model.score(X_test, Y_test)
mse_score
time_end=time()

GridSearchCV(cv=10,
             estimator=<ensemble.boosting.RegressionTreeBoosting object at 0x7f4d3c237d90>,
             n_jobs=-1,
             param_grid={'V': [10],
                         'estimator': ['naive_estimator',
                                       'pointwise_extrapolation_estimator'],
                         'lamda': [0.1], 'max_depth': [1, 3],
                         'max_features': [0.7, 1], 'max_samples': [0.7, 1],
                         'min_samples_split': [10], 'n_estimators': [20],
                         'order': [0, 2], 'r_range_up': [1], 'rho': [0.1],
                         'splitter': ['maxedge', 'varreduction']})

In [23]:
a = np.zeros((5,3))
b = np.zeros(5)

In [29]:
np.hstack([a,b.reshape(-1,1)])

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [32]:
np.savetxt("example.csv", np.ones(10), delimiter=",")

In [20]:
import glob
data_seq = glob.glob("{}/*.csv".format("./data/real_data_cleaned"))
data_file_name_seq = [os.path.split(data)[1] for data in data_seq]
data_file_name_seq

['housing_scale.csv',
 'mpg_scale.csv',
 'space_ga_scale.csv',
 'mg_scale.csv',
 'cpusmall_scale.csv',
 'triazines_scale.csv',
 'pyrim_scale.csv',
 'abalone.csv',
 'bodyfat_scale.csv']

In [9]:
a = RandomForestRegressor()

In [11]:
a.fit(np.random.rand(10000).reshape(-1,10),np.random.rand(1000))

RandomForestRegressor()

In [17]:
a.estimators_

[DecisionTreeRegressor(max_features='auto', random_state=11082189),
 DecisionTreeRegressor(max_features='auto', random_state=1726073905),
 DecisionTreeRegressor(max_features='auto', random_state=210129700),
 DecisionTreeRegressor(max_features='auto', random_state=514490936),
 DecisionTreeRegressor(max_features='auto', random_state=2058345684),
 DecisionTreeRegressor(max_features='auto', random_state=1994282266),
 DecisionTreeRegressor(max_features='auto', random_state=1495550170),
 DecisionTreeRegressor(max_features='auto', random_state=1644891510),
 DecisionTreeRegressor(max_features='auto', random_state=1896231031),
 DecisionTreeRegressor(max_features='auto', random_state=1261533965),
 DecisionTreeRegressor(max_features='auto', random_state=347911791),
 DecisionTreeRegressor(max_features='auto', random_state=1043156172),
 DecisionTreeRegressor(max_features='auto', random_state=1605560061),
 DecisionTreeRegressor(max_features='auto', random_state=885513470),
 DecisionTreeRegressor(max

In [None]:
'housing_scale.csv',
 'mpg_scale.csv',
 'airfoil.csv',
 'space_ga_scale.csv',
 'whitewine.csv',
 'dakbilgic.csv',
 'mg_scale.csv',
 'bias.csv',
 'cpusmall_scale.csv',                     
 'aquatic.csv',
 'music.csv',
 'redwine.csv',
 'ccpp.csv',
 'concrete.csv',
 'portfolio.csv',
 'building.csv',
 'yacht.csv',
 'abalone.csv',
 'algerian.csv',
 'fish.csv',
 'communities.csv',
 'forestfires.csv',
 'cbm.csv'

In [42]:
import os
import numpy as np 
import pandas as pd
from time import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE

from RTER import RegressionTree
from ensemble import RegressionTreeBoosting, RegressionTreeEnsemble


from sklearn.tree import DecisionTreeRegressor





data_file_dir = "./data/real_data_cleaned/"
data_file_name_seq = ['airfoil.csv'#'mpg_scale.csv','space_ga_scale.csv','mg_scale.csv',
                     #'cpusmall_scale.csv','triazines_scale.csv',#'pyrim_scale.csv',
                      #'abalone.csv','bodyfat_scale.csv'
                     ]

#data_seq = glob.glob("{}/*.csv".format(log_file_dir))
#data_file_name_seq = [os.path.split(data)[1] for data in data_seq]

for data_file_name in data_file_name_seq:
    # load dataset
    data_name = os.path.splitext(data_file_name)[0]
    data_file_path = os.path.join(data_file_dir, data_file_name)
    data = pd.read_csv(data_file_path)
    data = np.array(data)
    
    X = data[:,1:]
    y = data[:,0]
    
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    

    repeat_times = 5
        
    
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)



    parameters={"min_samples_split":[2,5,10], "max_depth":[1,2,3,4,5,6,7,8],
       "order":[0,1],"splitter":["maxedge"],
        "estimator":["pointwise_extrapolation_estimator"],
       "r_range_low":[0],"r_range_up":[0.6,1],
       "lamda":[0.0001,0.001,0.01,0.1],"V":[2,"auto"]}

    cv_model_RTER=GridSearchCV(estimator=RegressionTree(),param_grid=parameters, cv=3, n_jobs=30)
    cv_model_RTER.fit(X_train, y_train)


    time_start=time()
    RTER_model = cv_model_RTER.best_estimator_
    mse_score= -RTER_model.score(X_test, y_test)
    time_end=time()


cv_model_RTER.best_params_

GridSearchCV(cv=3,
             estimator=<RTER.tree.RegressionTree object at 0x7f94bc7bfdd0>,
             n_jobs=30,
             param_grid={'V': [2, 'auto'],
                         'estimator': ['pointwise_extrapolation_estimator'],
                         'lamda': [0.0001, 0.001, 0.01, 0.1],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
                         'min_samples_split': [2, 5, 10], 'order': [0, 1],
                         'r_range_low': [0], 'r_range_up': [0.6, 1],
                         'splitter': ['maxedge']})

{'V': 'auto',
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.0001,
 'max_depth': 8,
 'min_samples_split': 5,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 1,
 'splitter': 'maxedge'}

In [43]:
mse_score

33.42559041033197

In [56]:
check_model = RegressionTree(V = 20, 
          estimator = "pointwise_extrapolation_estimator",
          lamda = 0.0001,
          max_depth = 8,
          min_samples_split = 5,
          order = 0,
          r_range_up =1,
          splitter = "maxedge")
check_model.fit(X_train, y_train)
-check_model.score(X_test,y_test)

<RTER.tree.RegressionTree at 0x7f9565fc7e10>

28.730758650350232

In [None]:
{'V': 'auto',
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.01,
 'max_depth': 1,
 'min_samples_split': 2,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 0.6,
 'splitter': 'varreduction'}


{'V': 'auto',
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.001,
 'max_depth': 4,
 'min_samples_split': 5,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}

{'V': "auto",
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.1,
 'max_depth': 6,
 'min_samples_split': 10,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}

{'V': "auto",
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.001,
 'max_depth': 3,
 'min_samples_split': 2,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}


In [35]:
check_model = RegressionTreeEnsemble(n_estimators = 50,
                                     max_features = 0.55,
                                     max_samples = 1,
            V = "auto", 
          estimator = "pointwise_extrapolation_estimator",
          lamda = 0.001,
          max_depth = 8,
          min_samples_split = 5,
          order = 0,
          r_range_up =1,
          splitter = "varreduction",
          ensemble_parallel=1)
check_model.fit(X_train, y_train)
-check_model.score(X_test,y_test)

4.603269253647042

In [40]:
import glob
import os
data_file_dir = "./data/real_data_cleaned/"

data_seq = glob.glob("{}/*.csv".format(data_file_dir))
data_file_name_seq = [os.path.split(data)[1] for data in data_seq]


In [41]:
data_file_name_seq

['housing_scale.csv',
 'mpg_scale.csv',
 'airfoil.csv',
 'space_ga_scale.csv',
 'whitewine.csv',
 'dakbilgic.csv',
 'mg_scale.csv',
 'bias.csv',
 'cpusmall_scale.csv',
 'triazines_scale.csv',
 'aquatic.csv',
 'music.csv',
 'redwine.csv',
 'ccpp.csv',
 'concrete.csv',
 'portfolio.csv',
 'building.csv',
 'yacht.csv',
 'pyrim_scale.csv',
 'abalone.csv',
 'bodyfat_scale.csv',
 'facebook.csv',
 'algerian.csv',
 'fish.csv',
 'communities.csv',
 'forestfires.csv',
 'cbm.csv']

In [77]:
import os
import numpy as np 
import pandas as pd
from time import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE

from RTER import RegressionTree
from ensemble import RegressionTreeBoosting, RegressionTreeEnsemble



from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor






data_file_dir = "./data/real_data_cleaned/"

#data_file_name_seq = ['airfoil.csv','space_ga_scale.csv','whitewine.csv', 'dakbilgic.csv','mg_scale.csv','bias.csv','cpusmall_scale.csv','aquatic.csv','yacht.csv', 'abalone.csv','cbm.csv']
data_file_name_seq = ['aquatic.csv']
#data_seq = glob.glob("{}/*.csv".format(log_file_dir))
#data_file_name_seq = [os.path.split(data)[1] for data in data_seq]

log_file_dir = "./results/realdata_forest/"


for data_file_name in data_file_name_seq:
    # load dataset
    data_name = os.path.splitext(data_file_name)[0]
    data_file_path = os.path.join(data_file_dir, data_file_name)
    data = pd.read_csv(data_file_path,header = None)
    data = np.array(data)
    
    X = data[:,1:]
    y = data[:,0]
    
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    

  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)


    # RTER ensemble

    parameters={"n_estimators":[50], "max_features":[0.5,0.75,1],
                "max_samples":[0.8,1,1.2],
   "min_samples_split":[2,5,10], "max_depth":[2,3,4,5,6,7,8,9],
   "order":[0,1],"splitter":["varreduction"],
    "estimator":["pointwise_extrapolation_estimator"],
   "r_range_low":[0],"r_range_up":[0.6,1],
   "lamda":[0.0001,0.001,0.01,0.1],"V":[2,"auto"]}
    cv_model_ensemble=GridSearchCV(estimator=RegressionTreeEnsemble(),param_grid=parameters, cv=3, n_jobs=50)
    cv_model_ensemble.fit(X_train, y_train)
    time_start=time()
    ensemble_model = cv_model_ensemble.best_estimator_
    ensemble_model.ensemble_parallel = 1
    mse_score= - ensemble_model.score(X_test, y_test)
    time_end=time()
        
       
     
    
        

3424 fits failed out of a total of 20736.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3424 fits failed with the following error:
Traceback (most recent call last):
  File "/home/karl/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karl/RTER/ensemble/ensemble.py", line 102, in fit
    self.trees[i].fit(X[bootstrap_idx] , y[bootstrap_idx])
  File "/home/karl/RTER/RTER/tree.py", line 125, in fit
    super(RegressionTree, self).fit(X,Y,self.X_range)
  File "/home/karl/RTER/RTER/tree.py", line 87, in fit
    builder.build(self.tree_, X, Y,X_range)
  File "/home/karl/RTER/RTER/_tree.py", line 175, in build
    if (dt_X

GridSearchCV(cv=3,
             estimator=<ensemble.ensemble.RegressionTreeEnsemble object at 0x7f9562c6bfd0>,
             n_jobs=50,
             param_grid={'V': [2, 'auto'],
                         'estimator': ['pointwise_extrapolation_estimator'],
                         'lamda': [0.0001, 0.001, 0.01, 0.1],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                         'max_features': [0.5, 0.75, 1],
                         'max_samples': [0.8, 1, 1.2],
                         'min_samples_split': [2, 5, 10], 'n_estimators': [50],
                         'order': [0, 1], 'r_range_low': [0],
                         'r_range_up': [0.6, 1], 'splitter': ['varreduction']})

In [78]:
mse_score

1.1091052082704789

In [79]:
time_end - time_start

0.22292017936706543

In [80]:
cv_model_ensemble.best_params_

{'V': 'auto',
 'estimator': 'pointwise_extrapolation_estimator',
 'lamda': 0.0001,
 'max_depth': 9,
 'max_features': 0.75,
 'max_samples': 1.2,
 'min_samples_split': 5,
 'n_estimators': 50,
 'order': 0,
 'r_range_low': 0,
 'r_range_up': 1,
 'splitter': 'varreduction'}

In [92]:
check_model = RegressionTreeEnsemble(
          n_estimators = 50,
          max_features = 0.75,
          max_samples = 1.2,
          V = "auto", 
          estimator = "pointwise_extrapolation_estimator",
          lamda = 0.0001,
          max_depth = 7,
          min_samples_split = 10,
          order = 0,
          r_range_up =1,
          splitter = "varreduction")
check_model.fit(X_train, y_train)
-check_model.score(X_test,y_test)

1.1865963494728193