In [None]:
!pip install gpy
!pip install gpyopt

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import uniform
import GPy
import GPyOpt
from GPyOpt.methods import BayesianOptimization
import matplotlib.pyplot as plt
import time
from sklearn.linear_model import LinearRegression
import statistics

In [None]:
def measure_runtime(func):
    """Call ``func`` with no arguments and report how long the call took.

    Parameters
    ----------
    func : callable
        Zero-argument callable to execute once.

    Returns
    -------
    (result, runtime) : tuple
        ``result`` is whatever ``func()`` returned; ``runtime`` is the elapsed
        time in seconds as a float.
    """
    # perf_counter is a monotonic, high-resolution clock intended for interval
    # timing; time.time() follows the system wall clock, which can be adjusted
    # mid-run and would then yield a wrong (even negative) duration.
    start_time = time.perf_counter()
    result = func()
    runtime = time.perf_counter() - start_time
    return result, runtime

In [None]:
def split_data(df):
  """Separate the 'logerror' target from the features and make a 90/10 train/test split.

  The split uses a fixed ``random_state=1``. Note the return order groups the
  training pair first: ``(X_train, y_train, X_test, y_test)``.
  """
  target = df['logerror']
  features = df.drop(columns=['logerror'])

  # train_test_split yields X_train, X_test, y_train, y_test; reorder on return.
  parts = train_test_split(features, target, test_size=0.1, random_state=1)
  features_train, features_test, target_train, target_test = parts

  return features_train, target_train, features_test, target_test

# Hyper-parameter Tuning Methods

In [None]:
def random_search(X_train, y_train, X_test, y_test, param_range):
  """Tune an XGBRegressor with a 25-candidate randomized search and score it on the test split.

  Parameters
  ----------
  X_train, y_train : training features / target passed to ``RandomizedSearchCV.fit``.
  X_test, y_test : held-out features / target used only for the returned score.
  param_range : dict
      Maps 'learning_rate', 'gamma', 'max_depth', 'n_estimators' and
      'min_child_weight' to a two-element [low, high] sequence. Both bounds
      are treated as inclusive limits of the search interval.

  Returns
  -------
  (runtime, score) : tuple
      ``runtime`` is the duration of the search's ``fit`` call as reported by
      ``measure_runtime``; ``score`` is ``rs.score(X_test, y_test)`` (the
      search is configured with ``scoring='r2'``).
  """

  def continuous(name):
    low, high = param_range[name][0], param_range[name][1]
    # scipy's uniform is parameterised as (loc, scale) and covers
    # [loc, loc + scale], so the width -- not the upper bound -- is the
    # second argument.
    return uniform(low, high - low)

  def integer(name):
    low, high = param_range[name][0], param_range[name][1]
    # range() excludes its stop value; +1 keeps the upper bound reachable,
    # matching the interval bayes_opt explores for the same parameter.
    return range(low, high + 1)

  param_dist = {"learning_rate":    continuous('learning_rate'),
                "gamma":            continuous('gamma'),
                "max_depth":        integer('max_depth'),
                "n_estimators":     integer('n_estimators'),
                "min_child_weight": continuous('min_child_weight')}

  rs = RandomizedSearchCV(XGBRegressor(), param_distributions=param_dist,
                          scoring='r2', n_iter=25)

  _, runtime = measure_runtime(lambda: rs.fit(X_train, y_train))

  score = rs.score(X_test, y_test)

  return runtime, score

In [None]:
def repeat_random_search(X_train, y_train, X_test, y_test, param_range, iterations):
  """Run ``random_search`` ``iterations`` times on the same split and collect the outcomes.

  Returns a dict with two parallel lists: 'runtimes' and 'scores', one entry
  per repetition in execution order.
  """
  outcomes = [random_search(X_train, y_train, X_test, y_test, param_range)
              for _ in range(iterations)]

  return {
      'runtimes': [elapsed for elapsed, _ in outcomes],
      'scores': [quality for _, quality in outcomes],
  }

In [None]:
def bayes_opt(X_train, y_train, X_test, y_test, param_range):
  """Tune an XGBRegressor with GPyOpt Bayesian optimization and score it on the test split.

  Parameters
  ----------
  X_train, y_train : training features / target, used for the cross-validated
      objective and for the final refit.
  X_test, y_test : held-out features / target used only for the returned score.
  param_range : dict
      Maps 'learning_rate', 'gamma', 'max_depth', 'n_estimators' and
      'min_child_weight' to a two-element [low, high] sequence that becomes
      the optimizer's search interval for that parameter.

  Returns
  -------
  (runtime, score) : tuple
      ``runtime`` is the duration reported by ``measure_runtime`` for the
      whole tuning procedure: building the optimizer, running it, and
      refitting the best configuration on the training data. ``score`` is
      the refit model's ``score(X_test, y_test)``.
  """

  names = ['learning_rate', 'gamma', 'max_depth', 'n_estimators', 'min_child_weight']

  # Every parameter is searched as a continuous variable; the two integer
  # ones are rounded when the model is built.
  domain = [{'name': name,
             'type': 'continuous',
             'domain': (param_range[name][0], param_range[name][1])}
            for name in names]

  def build_model(params):
    # int(round(...)) guarantees a Python int: round() on a numpy float is
    # not guaranteed to return one across numpy versions.
    return XGBRegressor(learning_rate    = params[0],
                        gamma            = params[1],
                        max_depth        = int(round(params[2])),
                        n_estimators     = int(round(params[3])),
                        min_child_weight = params[4])

  # Optimization objective: mean cross-validated R^2 of one configuration.
  def cv_score(params):
    # The optimizer hands over a 2-D array holding a single row.
    params = params[0]
    score = cross_val_score(build_model(params),
                            X_train, y_train, scoring='r2').mean()
    return np.array(score)

  def tune_and_refit():
    # The optimizer evaluates its initial design when it is constructed, so
    # construction has to sit inside the timed region; otherwise those
    # model fits are missing from the reported runtime.
    optimizer = BayesianOptimization(f=cv_score,
                                     domain=domain,
                                     model_type='GP',
                                     acquisition_type='EI',
                                     acquisition_jitter=0.05,
                                     exact_feval=True,
                                     maximize=True,
                                     initial_design_numdata=5)

    # 5 initial points + 20 iterations = 25 evaluations, the same budget
    # random_search gets through n_iter=25.
    optimizer.run_optimization(max_iter=20)

    # optimizer.Y is sign-flipped before argmax (GPyOpt is understood to
    # store the negated objective when maximize=True), so this picks the
    # configuration with the highest cross-validated score.
    best_params_index = (-optimizer.Y).argmax()
    best_params = optimizer.X[best_params_index]

    # RandomizedSearchCV refits its best candidate inside fit(); timing the
    # refit here keeps the two runtimes comparable.
    best_xgb = build_model(best_params)
    best_xgb.fit(X_train, y_train)
    return best_xgb

  best_xgb, runtime = measure_runtime(tune_and_refit)

  score = best_xgb.score(X_test, y_test)

  return runtime, score

In [None]:
def repeat_bayes_opt(X_train, y_train, X_test, y_test, param_range, iterations):
  """Run ``bayes_opt`` ``iterations`` times on the same split and collect the outcomes.

  Returns a dict with two parallel lists: 'runtimes' and 'scores', one entry
  per repetition in execution order.
  """
  outcomes = [bayes_opt(X_train, y_train, X_test, y_test, param_range)
              for _ in range(iterations)]

  return {
      'runtimes': [elapsed for elapsed, _ in outcomes],
      'scores': [quality for _, quality in outcomes],
  }

# Load Data

In [None]:
# One dict per dimensionality-reduction technique. Keys are the percentage
# labels used throughout the notebook; values are the frames read from the
# matching CSV file in the working directory.
pca_dataframes = {
    '10%': pd.read_csv('pca_10.csv'),
    '50%': pd.read_csv('pca_50.csv'),
    '75%': pd.read_csv('pca_75.csv'),
}

umap_dataframes = {
    '10%': pd.read_csv('umap_10.csv'),
    '50%': pd.read_csv('umap_50.csv'),
    '75%': pd.read_csv('umap_75.csv'),
}

autoencoder_dataframes = {
    '10%': pd.read_csv('autoencoder_10.csv'),
    '50%': pd.read_csv('autoencoder_50.csv'),
    '75%': pd.read_csv('autoencoder_75.csv'),
}

ikpca_dataframes = {
    '10%': pd.read_csv('ikpca_10.csv'),
    '50%': pd.read_csv('ikpca_50.csv'),
    '75%': pd.read_csv('ikpca_75.csv'),
}

In [None]:
# Reduction-method name -> {percentage label -> DataFrame}. The insertion order
# here is the order in which the experiment loops visit the methods.
dimension_reduced_data = {
    'PCA': pca_dataframes,
    'UMAP': umap_dataframes,
    'Autoencoder': autoencoder_dataframes,
    'IKPCA': ikpca_dataframes,
}

In [None]:
# Hyper-parameter search space handed to both tuning functions.
# Each entry is a two-element [low, high] list; bayes_opt passes the pair to
# GPyOpt as that parameter's domain.
param_range = {
    "learning_rate": [0, 1],
    "gamma": [0, 5],
    "max_depth": [1, 10],
    "n_estimators": [1, 50],
    "min_child_weight": [1, 10],
}

# Create Models

In [None]:
# WARNING: this cell runs for hours.
# For every reduced dataset, repeat each tuning strategy 10 times and store the
# collected runtimes/scores as
#   xgboost_results[method][percentage]['random' | 'bayes'].

xgboost_results = {}

for reduction_method, percentages in dimension_reduced_data.items():

  percentage_results = xgboost_results.setdefault(reduction_method, {})

  for percentage, df in percentages.items():

    # Same split for both strategies so their test scores are comparable.
    X_train, y_train, X_test, y_test = split_data(df)

    percentage_results[percentage] = {}
    percentage_results[percentage]['random'] = repeat_random_search(
        X_train, y_train, X_test, y_test, param_range, 10)
    percentage_results[percentage]['bayes'] = repeat_bayes_opt(
        X_train, y_train, X_test, y_test, param_range, 10)

In [None]:
# Baseline: a single LinearRegression fit per reduced dataset, stored as
#   linear_results[method][percentage]['runtimes' | 'scores'].
# Unlike xgboost_results, each entry here is a single number, not a list.
linear_results = {}

for reduction_method, percentages in dimension_reduced_data.items():

  percentage_results = linear_results.setdefault(reduction_method, {})

  for percentage, df in percentages.items():

    X_train, y_train, X_test, y_test = split_data(df)
    reg = LinearRegression()

    percentage_results[percentage] = {}

    # Only the fit is timed; the fitted estimator itself is discarded into `_`.
    _, percentage_results[percentage]['runtimes'] = measure_runtime(
        lambda: reg.fit(X_train, y_train))
    percentage_results[percentage]['scores'] = reg.score(X_test, y_test)

# Graph Results

In [None]:
def label_bars(x_linear, x_random, x_bayes, y_linear, y_random, y_bayes, y_type):
  """Write each bar's value next to it on the current matplotlib axes.

  For every group index, the linear, random-search and Bayesian bars are
  labelled in that order. Values are rendered in scientific notation
  (two decimals) when ``y_type == 'scores'`` and as fixed-point with two
  decimals otherwise. Labels hang below negative bars and sit on top of
  non-negative ones.
  """
  number_format = '.2e' if y_type == 'scores' else '.2f'
  series = ((x_linear, y_linear), (x_random, y_random), (x_bayes, y_bayes))

  for group in range(len(y_linear)):
    for xs, ys in series:
      value = ys[group]
      anchor = 'top' if value < 0 else 'bottom'
      plt.text(xs[group], value, format(value, number_format), ha='center', va=anchor)

In [None]:
def plot_reduction_method(reduction_method, y_type):
  """Draw a grouped bar chart comparing the three models for one reduction method.

  ``reduction_method`` selects the entry of the module-level ``linear_results``
  and ``xgboost_results`` dicts; ``y_type`` is the metric key to plot
  ('scores' or 'runtimes'). Linear-regression values are plotted as stored,
  while the XGBoost values are averaged over their repetitions. One group of
  three bars is drawn per percentage, each bar is annotated through
  ``label_bars``, and the figure is shown.
  """

  percentages = ['10%', '50%', '75%']

  # Bar heights: one value per percentage for each of the three models.
  y_linear = [linear_results[reduction_method][pct][y_type] for pct in percentages]

  tuned = xgboost_results[reduction_method]
  y_random = [statistics.mean(tuned[pct]['random'][y_type]) for pct in percentages]
  y_bayes = [statistics.mean(tuned[pct]['bayes'][y_type]) for pct in percentages]

  # Group centres are 3 units apart; the outer bars sit one bar-width either side.
  positions = 3 * np.arange(len(percentages))
  bar_width = 0.8

  x_linear = positions - bar_width
  x_random = positions
  x_bayes = positions + bar_width

  bar_series = (
      (x_linear, y_linear, 'Linear Regression', 'lightgreen'),
      (x_random, y_random, 'XGBoost Random Search', 'orange'),
      (x_bayes, y_bayes, 'XGBoost Bayesian Optimization', 'skyblue'),
  )
  for xs, heights, series_label, series_color in bar_series:
    plt.bar(xs, heights, bar_width, label=series_label, color=series_color)

  # Zero reference line, useful because scores can be negative.
  plt.axhline(y=0, color='gray', linestyle='--')

  plt.xlabel('Percentage of Original Dimensions')
  if y_type == 'scores':
    plt.ylabel('Average R-squared')
  else:
    plt.ylabel('Average Runtime (s)')

  plt.xticks(positions, percentages)

  # Legend is anchored outside the axes, to the right.
  plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

  # Numeric annotation on every bar.
  label_bars(x_linear, x_random, x_bayes, y_linear, y_random, y_bayes, y_type)

  plt.title(reduction_method + " " + y_type.capitalize())

  figure = plt.gcf()
  figure.set_size_inches(9, 6)
  plt.show()


In [None]:
# One score chart per reduction method, in the same order as before.
for reduction_method in ('PCA', 'UMAP', 'Autoencoder', 'IKPCA'):
  plot_reduction_method(reduction_method, 'scores')

In [None]:
# One runtime chart per reduction method, in the same order as before.
for reduction_method in ('PCA', 'UMAP', 'Autoencoder', 'IKPCA'):
  plot_reduction_method(reduction_method, 'runtimes')