In [19]:
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.dummy import DummyRegressor
import joblib

In [23]:
# ---------------------------------------------------------------------------
# Grid-search a DummyRegressor baseline and a RandomForestRegressor on every
# processed data file, using leave-one-group-out CV over the 'instance' column.
# For each (file, model) pair the refitted best estimator and its parameters
# are written under models_dir/<model name>/, and the mean CV scores of the
# best candidate are collected into a single performance table.
# ---------------------------------------------------------------------------
random_state = 42
# NOTE: not used in this cell -- the number of CV splits is decided by
# LeaveOneGroupOut (one split per distinct value in `groups`).
num_folds = 5

# Set to a positive int to process only the first N data files (quick smoke
# run); None processes every file found in data_dir.
max_files = None

data_dir = "./Data/processed/"
models_dir = "./results/models/"
performance_path = "./results/performance_table.csv"

RF_parameters = {
    'n_estimators': [100, 500, 1000],
    # 1.0 (= use all features) replaces the former 'auto' option, which meant
    # the same thing for regressors but is no longer accepted by recent
    # scikit-learn releases.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [4, 8, 15, None],
    'min_samples_split': [2, 5, 10],
    # NOTE(review): scikit-learn documents 'poisson' as requiring non-negative
    # targets -- confirm the 'target' column satisfies this.
    'criterion': ['squared_error', 'absolute_error', 'poisson'],
    'random_state': [random_state]
}

dummy_parameters = {
    'strategy': ['mean']
}

# Metrics computed for every candidate; refit_metric selects the best one.
evaluation_metrics = ['neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2', 'explained_variance']
refit_metric = 'neg_root_mean_squared_error'

performance_data = []
estimators = [DummyRegressor(), RandomForestRegressor()]
parameters = [dummy_parameters, RF_parameters]
names = ["dummy", "RF"]

# Make sure the output directories exist before anything is written to them.
for name in names:
    os.makedirs(os.path.join(models_dir, name), exist_ok=True)

# Sorted for a reproducible processing order; sub-directories (for example
# notebook checkpoint folders) are skipped because read_csv cannot open them.
data_files = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
)

for file in data_files[:max_files]:
    df = pd.read_csv(os.path.join(data_dir, file), index_col=0)
    groups = df['instance']
    X = df.drop(['target', 'function', 'instance'], axis=1)
    # Select the target by name so it always matches the column removed from X
    # (a positional lookup silently breaks if the column order changes).
    y = df['target']
    # splitext keeps dotted stems intact, so distinct files cannot collide.
    filename = os.path.splitext(file)[0]
    print(filename)
    for name, estimator, param_grid in zip(names, estimators, parameters):
        clf = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=LeaveOneGroupOut(),
            scoring=evaluation_metrics,
            refit=refit_metric,
            verbose=True,
            n_jobs=-1,
        )
        clf.fit(X, y.values.ravel(), groups=groups)

        # Persist the refitted best model and the parameters that produced it.
        model_dir = os.path.join(models_dir, name)
        joblib.dump(clf.best_estimator_, os.path.join(model_dir, filename + '.pkl'), compress=1)
        with open(os.path.join(model_dir, filename + '.params'), 'w') as f:
            f.write(str(clf.best_params_))

        # One table row per (model, file): mean CV score of the best candidate
        # for every metric, in the order of evaluation_metrics.
        row = [estimator, filename, clf.best_params_]
        row.extend(
            clf.cv_results_["mean_test_" + metric][clf.best_index_]
            for metric in evaluation_metrics
        )
        performance_data.append(row)

# Column names are derived from evaluation_metrics so they cannot drift apart
# from the values appended to each row above.
performance_df = pd.DataFrame(performance_data, columns=['model', 'config', 'parameters'] + evaluation_metrics)
performance_df.to_csv(performance_path)





dim_5_budget_10000_conf_19
Fitting 5 folds for each of 1 candidates, totalling 5 fits
DummyRegressor()
{'mean_fit_time': array([0.00328097]), 'std_fit_time': array([0.00170246]), 'mean_score_time': array([0.00476041]), 'std_score_time': array([0.00247002]), 'param_strategy': masked_array(data=['mean'],
             mask=[False],
       fill_value='?',
            dtype=object), 'params': [{'strategy': 'mean'}], 'split0_test_neg_mean_squared_error': array([-2903.60583396]), 'split1_test_neg_mean_squared_error': array([-8207.17440931]), 'split2_test_neg_mean_squared_error': array([-8811.76830649]), 'split3_test_neg_mean_squared_error': array([-148508.57025882]), 'split4_test_neg_mean_squared_error': array([-18377.8490015]), 'mean_test_neg_mean_squared_error': array([-37361.79356202]), 'std_test_neg_mean_squared_error': array([55796.87909319]), 'rank_test_neg_mean_squared_error': array([1], dtype=int32), 'split0_test_neg_root_mean_squared_error': array([-53.885117]), 'split1_test_neg_root

TT
4624.047771655277
TT
9542.476853578382
TT
29237.003803919157
TT
122600.8264851696
TT
8778.91727858269
