In [None]:
from bayes_opt import BayesianOptimization
import subprocess
from time import time_ns

In [None]:
def run_spind(CHUNK_SIZE, SORT_SIZE, MERGE_SIZE, VALIDATION_SIZE):
    """Run spind once and return its negated elapsed time in seconds.

    Each parameter is rounded to the nearest integer and passed, in the
    order of the signature, as a command-line argument to ``spind.jar``.

    Parameters
    ----------
    CHUNK_SIZE, SORT_SIZE, MERGE_SIZE, VALIDATION_SIZE : int or float
        Candidate parameter values; rounded with ``round`` before use.

    Returns
    -------
    float
        Negative elapsed time in seconds, truncated to millisecond
        resolution. The sign is flipped because the optimizer maximizes
        its target while we want to minimize the runtime.

    Raises
    ------
    subprocess.CalledProcessError
        If the java process exits with a non-zero status.
    """
    # Monotonic, high-resolution clock: unlike the wall clock it cannot jump
    # backwards or forwards when the system time is adjusted mid-run.
    from time import perf_counter_ns

    command = ['java', '-jar', 'spind.jar']
    command += [
        str(round(value))
        for value in (CHUNK_SIZE, SORT_SIZE, MERGE_SIZE, VALIDATION_SIZE)
    ]

    start_time = perf_counter_ns()
    # execute spind using the provided parameters; its stdout is not needed
    subprocess.check_output(command)
    elapsed_ns = perf_counter_ns() - start_time

    # nanoseconds -> whole milliseconds -> seconds, negated for the maximizer
    return -((elapsed_ns // 1_000_000) / 1_000)

In [None]:
# Search space passed to the optimizer as `pbounds`:
# one (min, max) pair per run_spind argument.
parameter_bounds = dict(
    CHUNK_SIZE=(10_000, 100_000_000),
    SORT_SIZE=(10_000, 5_000_000),
    MERGE_SIZE=(2, 1_000),
    VALIDATION_SIZE=(1, 1_000_000),
)

In [None]:
# Optimizer that maximizes run_spind (i.e. minimizes runtime) within parameter_bounds.
# Verbosity levels: 1 prints only when a maximum is observed, 0 is silent.
optimizer = BayesianOptimization(
    f=run_spind,
    pbounds=parameter_bounds,
    random_state=1,  # pinned seed
    verbose=2,
)

In [None]:
# 20 initial points followed by 65 optimization iterations
optimizer.maximize(init_points=20, n_iter=65)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

OPTIMIZATION_NAME = 'TPCH-1'

# One record per evaluated point: the probed parameters plus the target.
# Each record is a fresh dict so the entries of optimizer.res are not mutated.
# NOTE: the target is the negated runtime returned by run_spind, so the
# 'time' column holds values <= 0.
data = [{**r['params'], 'time': r['target']} for r in optimizer.res]
df = pd.DataFrame(data)

# Round every parameter column (everything except 'time') to whole numbers.
# Label-based rounding avoids writing through `df.values`, which only works
# when pandas happens to hand back a writable view of the underlying data.
param_cols = [c for c in df.columns if c != 'time']
df = df.round({c: 0 for c in param_cols})

# Create the output directory up front so a long optimization run is not lost
# to a missing folder, and name the index column 'iter' because the
# visualization cells read these CSVs with index_col='iter'.
output_path = Path(f'data/{OPTIMIZATION_NAME}.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index_label='iter')

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Render all figure text in Arial.
plt.rcParams.update({"font.family": "Arial"})

In [None]:
# Load the two result CSVs that are plotted below, indexed by their 'iter' column.
result_paths = ('data/data_gov_second.csv', 'data/TPCH-unary.csv')
df1, df2 = [pd.read_csv(path, index_col='iter') for path in result_paths]

In [None]:
# Hexbin maps of CHUNK_SIZE (y axis) against each other parameter (x axis),
# one row of panels per result table (df1 on top, df2 below), coloured by the
# mean of the 'time' column inside each hexagon.

# set column order
cols = ['CHUNK_SIZE', 'SORT_SIZE', 'MERGE_SIZE', 'VALIDATION_SIZE']
n = len(cols)
fig, ax = plt.subplots(2, n-1)
fig.set_figheight(9)
fig.set_figwidth(16)

col1 = 0  # index of the parameter shown on the y axis of every panel
y_name = cols[col1]
row_hexbins = []  # one hexbin artist per row, used as that row's colorbar source

# The loop variable is `frame`, not `df`, so the DataFrame named `df` created
# by the export cell is not overwritten when this cell runs.
for i, frame in enumerate([df1, df2]):
    row_panels = []
    for col2 in range(col1+1, n):
        x_name = cols[col2]
        panel = ax[i, col2-1]
        panel.grid()
        if i == 1:
            # only the bottom row carries x-axis labels
            panel.set_xlabel(x_name.replace('_', ' '))
        if col2 == 1:
            # only the left-most panel carries the y-axis label
            panel.set_ylabel(y_name.replace('_', ' '))
        hexes = panel.hexbin(frame[x_name], frame[y_name], gridsize=30, C=frame['time'], cmap='Spectral', reduce_C_function=np.mean)
        row_panels.append(hexes)

    # Every hexbin autoscales its colours to its own bin means, so a single
    # colorbar per row is only truthful if the whole row shares one range.
    row_values = np.ma.concatenate([h.get_array() for h in row_panels]).compressed()
    if row_values.size:
        for h in row_panels:
            h.set_clim(row_values.min(), row_values.max())
    row_hexbins.append(row_panels[-1])

fig.subplots_adjust(right=0.89)
# add_axes takes [left, bottom, width, height] in figure fractions, so the
# larger `bottom` value sits next to the top row and the smaller one next to
# the bottom row.
for hexes, bottom in zip(row_hexbins, (0.54, 0.12)):
    cbar_ax = fig.add_axes([0.9, bottom, 0.02, 0.33])
    fig.colorbar(hexes, cax=cbar_ax).set_label('Execution time (sec)')

# NOTE(review): the dataset names and iteration counts in the title are
# hard-coded; confirm they match the CSVs loaded into df1 and df2.
fig.suptitle('Effectiveness of Chunk Size\nData.gov 85 iterations (top) & TPCH-1 36 iterations (bottom)', fontsize=16)
fig.savefig('out.pdf', bbox_inches='tight')
plt.show()