In [None]:
# !pip install gpy
# !pip install gpyopt

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import uniform
import GPy
import GPyOpt
from GPyOpt.methods import BayesianOptimization
import matplotlib.pyplot as plt
import time

In [None]:
def measure_runtime(func):
    """Call ``func`` with no arguments and measure how long the call takes.

    Args:
        func: Zero-argument callable to execute.

    Returns:
        tuple: ``(result, runtime)`` where ``result`` is whatever ``func``
        returned and ``runtime`` is the elapsed time in seconds (float).
    """
    # perf_counter() is monotonic; time.time() follows the wall clock, which
    # can be adjusted mid-run and produce a wrong (even negative) duration.
    start_time = time.perf_counter()
    result = func()
    runtime = time.perf_counter() - start_time
    return result, runtime

In [None]:
def split_data(df):
  """Separate the ``logerror`` target from the features and split 90/10.

  Args:
    df: DataFrame containing a ``logerror`` column; every other column is
      treated as a feature.

  Returns:
    tuple: ``(X_train, y_train, X_test, y_test)`` -- note the order differs
    from what ``train_test_split`` itself returns.
  """
  target = df['logerror']
  features = df.drop(columns=['logerror'])
  # Fixed seed so every call on the same frame yields the same partition.
  parts = train_test_split(features, target, test_size=0.1, random_state=1)
  features_train, features_test, target_train, target_test = parts
  return features_train, target_train, features_test, target_test

In [None]:
def random_search(X_train, y_train, X_test, y_test, param_range, n_iter=25, random_state=None):
  """Tune an XGBRegressor with randomized search and score it on the test split.

  Args:
    X_train, y_train: Data the randomized search is fitted on.
    X_test, y_test: Data used for the final score.
    param_range: Mapping with the keys 'learning_rate', 'gamma', 'max_depth',
      'n_estimators' and 'min_child_weight'; each value is a ``[low, high]``
      pair. 'max_depth' and 'n_estimators' need integer bounds.
    n_iter: Number of sampled configurations (default 25, as before).
    random_state: Optional seed forwarded to the search and to the model.
      ``None`` (default) keeps the previous unseeded behaviour.

  Returns:
    dict: ``{'runtime': ..., 'score': ...}`` -- the duration reported by
    ``measure_runtime`` for the fit, and ``rs.score`` on the test split
    (the search is configured with ``scoring='r2'``).
  """

  def _uniform_between(bounds):
    # scipy's uniform(loc, scale) covers [loc, loc + scale]; the scale must be
    # the interval WIDTH. Passing the upper bound as scale overshoots whenever
    # low != 0 (e.g. [1, 10] became [1, 11]).
    low, high = bounds
    return uniform(loc=low, scale=high - low)

  def _int_range_inclusive(bounds):
    # range() drops its stop value; +1 keeps `high` reachable so the space
    # matches the inclusive [low, high] pairs in param_range.
    low, high = bounds
    return range(low, high + 1)

  param_dist = {"learning_rate":    _uniform_between(param_range['learning_rate']),
                "gamma":            _uniform_between(param_range['gamma']),
                "max_depth":        _int_range_inclusive(param_range['max_depth']),
                "n_estimators":     _int_range_inclusive(param_range['n_estimators']),
                "min_child_weight": _uniform_between(param_range['min_child_weight'])}

  rs = RandomizedSearchCV(XGBRegressor(random_state=random_state),
                          param_distributions=param_dist,
                          scoring='r2', n_iter=n_iter,
                          random_state=random_state)

  random_search_result = {}
  _, random_search_result['runtime'] = measure_runtime(lambda: rs.fit(X_train, y_train))
  random_search_result['score'] = rs.score(X_test, y_test)

  return random_search_result

In [None]:
# Optimization objective
def cv_score(params, X=None, y=None):
    """Cross-validated R^2 of an XGBRegressor for one hyper-parameter vector.

    Args:
        params: 2-D array-like; only the first row is used. Its columns are,
            in order: learning_rate, gamma, max_depth, n_estimators,
            min_child_weight.
        X: Training features. When omitted, the module-level ``X_train`` is
            used (the original behaviour).
        y: Training target. When omitted, the module-level ``y_train`` is
            used (the original behaviour).

    Returns:
        numpy.ndarray: 0-d array holding the mean ``'r2'`` score returned by
        ``cross_val_score``.
    """
    # Fallback keeps old call sites working; they rely on X_train / y_train
    # existing as globals at call time.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    candidate = params[0]
    model = XGBRegressor(learning_rate    = candidate[0],
                         gamma            = candidate[1],
                         # round() on a NumPy scalar may not return a plain
                         # int on every NumPy version; cast explicitly.
                         max_depth        = int(round(candidate[2])),
                         n_estimators     = int(round(candidate[3])),
                         min_child_weight = candidate[4])
    score = cross_val_score(model, X, y, scoring='r2').mean()
    return np.array(score)

In [None]:
def bayesian_opt(X_train, y_train, X_test, y_test, param_range, max_iter=20):
  """Tune an XGBRegressor with GPyOpt Bayesian optimisation and score it.

  Args:
    X_train, y_train: Data used for the cross-validated objective and for
      fitting the final model.
    X_test, y_test: Data used for the final score.
    param_range: Mapping with the keys 'learning_rate', 'gamma', 'max_depth',
      'n_estimators' and 'min_child_weight'; each value is a ``[low, high]``
      pair.
    max_iter: Iterations passed to ``run_optimization`` (default 20, as
      before).

  Returns:
    dict: ``{'runtime': ..., 'score': ...}`` -- the duration reported by
    ``measure_runtime`` for the optimisation, and ``best_xgb.score`` on the
    test split.
  """

  def _build_model(params):
    # max_depth / n_estimators are searched as continuous values; round and
    # cast so the estimator always receives plain Python ints.
    return XGBRegressor(learning_rate    = params[0],
                        gamma            = params[1],
                        max_depth        = int(round(params[2])),
                        n_estimators     = int(round(params[3])),
                        min_child_weight = params[4])

  def _objective(params):
    # Bound to THIS call's training data. The module-level cv_score reads
    # global X_train / y_train instead, which only matched by coincidence of
    # naming in the calling cell.
    score = cross_val_score(_build_model(params[0]),
                            X_train, y_train, scoring='r2').mean()
    return np.array(score)

  names = ('learning_rate', 'gamma', 'max_depth', 'n_estimators', 'min_child_weight')
  domain = [{'name': name,
             'type': 'continuous',
             'domain': (param_range[name][0], param_range[name][1])}
            for name in names]

  optimizer = BayesianOptimization(f=_objective,
                                   domain=domain,
                                   model_type='GP',
                                   acquisition_type='EI',
                                   acquisition_jitter=0.05,
                                   exact_feval=True,
                                   maximize=True)

  bayesian_opt_result = {}

  # Only 20 iterations because we have 5 initial random points
  _, bayesian_opt_result['runtime'] = measure_runtime(
      lambda: optimizer.run_optimization(max_iter=max_iter))

  # Same selection rule as before: the row whose negated Y is largest.
  best_params_index = (-optimizer.Y).argmax()
  best_params = optimizer.X[best_params_index]

  best_xgb = _build_model(best_params)
  best_xgb.fit(X_train, y_train)
  bayesian_opt_result['score'] = best_xgb.score(X_test, y_test)

  return bayesian_opt_result


In [None]:
# One dict per dimensionality-reduction method, keyed by '10%' / '50%' / '75%'
# (matching the numeric suffix of each CSV file name).
pca_dataframes = {
    '10%': pd.read_csv('pca_10.csv'),
    '50%': pd.read_csv('pca_50.csv'),
    '75%': pd.read_csv('pca_75.csv'),
}

umap_dataframes = {
    '10%': pd.read_csv('umap_10.csv'),
    '50%': pd.read_csv('umap_50.csv'),
    '75%': pd.read_csv('umap_75.csv'),
}

autoencoder_dataframes = {
    '10%': pd.read_csv('autoencoder_10.csv'),
    '50%': pd.read_csv('autoencoder_50.csv'),
    '75%': pd.read_csv('autoencoder_75.csv'),
}

ikpca_dataframes = {
    '10%': pd.read_csv('ikpca_10.csv'),
    '50%': pd.read_csv('ikpca_50.csv'),
    '75%': pd.read_csv('ikpca_75.csv'),
}

In [None]:
# Search space shared by both tuners: one [low, high] pair per hyper-parameter.
param_range = {
    "learning_rate": [0, 1],
    "gamma": [0, 5],
    "max_depth": [1, 10],
    "n_estimators": [1, 50],
    "min_child_weight": [1, 10],
}

In [None]:
# Method name -> {percentage label -> DataFrame}
dimension_reduction_methods = {
    'pca': pca_dataframes,
    'umap': umap_dataframes,
    'autoencoder': autoencoder_dataframes,
    'ikpca': ikpca_dataframes,
}

# Filled as method_results[method][percentage][tuner] = {'runtime', 'score'}
method_results = {}

for method, percentages in dimension_reduction_methods.items():

  percentage_results = {}
  method_results[method] = percentage_results

  for percentage, df in percentages.items():

    # Do not rename X_train / y_train: they are module-level here, and
    # cv_score looks both names up as globals.
    X_train, y_train, X_test, y_test = split_data(df)

    # Register the entry first so a partial result survives an interruption.
    tuner_results = {}
    percentage_results[percentage] = tuner_results
    tuner_results['random_search'] = random_search(X_train, y_train, X_test, y_test, param_range)
    tuner_results['bayesian_opt'] = bayesian_opt(X_train, y_train, X_test, y_test, param_range)