# Benchmark Results

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Load the raw benchmark measurements, one CSV per implementation
# (groupby and foldby); the first line of each file is the header row.
group_csv = 'benchmark_data/Benchmark_group.csv'
fold_csv = 'benchmark_data/Benchmark_fold.csv'
df_group, df_fold = (pd.read_csv(csv_path, header=0) for csv_path in (group_csv, fold_csv))


# Slider on n_workers

In [3]:
def df_plot_preprocessing(df):
    """Return a copy of ``df`` whose ``nthreads`` column is cast to ``str``.

    The input frame is left untouched, so calling this function (or re-running
    the cell that calls it) never alters data that other code already holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``nthreads`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; only the ``nthreads`` column differs from the input.

    Raises
    ------
    KeyError
        If ``df`` has no ``nthreads`` column.
    """
    # ``assign`` builds a new frame instead of writing into the caller's one.
    return df.assign(nthreads=df['nthreads'].astype(str))

def df_to_plotly(df):
    """Reshape long-format timings into the ``z``/``x``/``y`` dict of a heatmap.

    The frame is pivoted with ``nthreads`` as rows and ``blocksize_MB`` as
    columns, and the ``compute_time`` values fill the grid.

    Returns
    -------
    dict
        ``'z'``: nested list of compute times (one inner list per row),
        ``'x'``: the column labels, ``'y'``: the row labels.
    """
    pivoted = df.pivot(index='nthreads', columns='blocksize_MB')
    time_grid = pivoted['compute_time']
    return dict(
        z=time_grid.values.tolist(),
        x=time_grid.columns.tolist(),
        y=time_grid.index.tolist(),
    )
    

In [4]:
import numpy as np

# Slider values: the distinct worker counts, largest first.
n_workers_list = np.sort(df_group['n_workers'].unique())[::-1].tolist()

df_group = df_plot_preprocessing(df_group)
df_fold = df_plot_preprocessing(df_fold)

# Slider step shown on first render; clamped so a slider with a single step
# (or none) does not point past its last entry.
initial_step = max(0, min(1, len(n_workers_list) - 1))

# Two stacked panels: groupby on top, foldby below.
fig_work = make_subplots(
    2, 1,
    x_title='Blocksize [MB]',
    y_title='n_threads',
    subplot_titles=("<b>Groupby</b>", "<b>Foldby</b>")
)

plot_columns = ['nthreads', 'blocksize_MB', 'compute_time']
hover_text = 'x: %{x:.s}<br>y: %{y:.0f} <br><b>time: %{z:.1f}</b>'

# Add one pair of traces (groupby, foldby) per slider step.
for step_idx, nwork in enumerate(n_workers_list):
    panels = [
        bench.loc[bench['n_workers'] == nwork, plot_columns]
        for bench in (df_group, df_fold)
    ]

    # Both panels of a step share one color range so they can be compared.
    # floor/ceil keep the fastest and the slowest cell inside that range
    # (plain int() would cut the upper bound below the true maximum).
    step_times = pd.concat([panel['compute_time'] for panel in panels])
    value_min = int(np.floor(step_times.min()))
    value_max = int(np.ceil(step_times.max()))

    for row, panel in enumerate(panels, start=1):
        fig_work.add_trace(
            go.Heatmap(
                df_to_plotly(panel),
                zmin=value_min, zmax=value_max,
                # Traces are visible by default; only the pair belonging to the
                # initially active slider step may be drawn on first render,
                # otherwise all heatmaps are stacked on top of each other.
                visible=(step_idx == initial_step),
                reversescale=True, colorbar={"title": 'Time [s]'}, hoverongaps=False,
                hovertemplate=hover_text
            ),
            row=row, col=1
        )

# Create and add slider: step i shows traces 2*i and 2*i+1, hides the rest.
steps = []
for step_idx, nwork in enumerate(n_workers_list):
    visibility = [False] * len(fig_work.data)
    visibility[2 * step_idx:2 * step_idx + 2] = [True, True]
    steps.append(dict(
        method="restyle",
        args=[{"visible": visibility}],
        label=str(nwork)
    ))

sliders = [dict(
    active=initial_step,
    currentvalue={"prefix": "n_workers = "},
    pad={"t": 50},
    steps=steps
)]

fig_work.update_layout(
    sliders=sliders,
    title_text="<b>Benchmark with fixed n_workers</b>",
    width=1000,
    height=600
)

# Enlarge all annotations, then move the first two (index 0 and 1) to the
# left edge of the figure.
fig_work.update_annotations(font_size=15)
fig_work.layout.annotations[0].update(x=0.05)
fig_work.layout.annotations[1].update(x=0.04);

# fig_work.show()


# Slider on nthreads

In [5]:
def df_plot_preprocessing_wor(df):
    """Return a copy of ``df`` whose ``n_workers`` column is cast to ``str``.

    The caller's frame is not modified; a new frame is returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``n_workers`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; only the ``n_workers`` column differs from the input.

    Raises
    ------
    KeyError
        If ``df`` has no ``n_workers`` column.
    """
    workers_as_text = df['n_workers'].astype(str)
    # ``assign`` returns a new frame, leaving the argument as it was.
    return df.assign(n_workers=workers_as_text)

def df_to_plotly_wor(df):
    """Pivot long-format timings into heatmap arguments keyed by worker count.

    Rows of the grid are the ``n_workers`` values, columns are the
    ``blocksize_MB`` values, and cells hold ``compute_time``.

    Returns
    -------
    dict
        Keys ``'z'`` (nested list of times), ``'x'`` (column labels) and
        ``'y'`` (row labels), in that order.
    """
    grid = df.pivot(index='n_workers', columns='blocksize_MB')['compute_time']
    heatmap_args = {}
    heatmap_args['z'] = grid.values.tolist()
    heatmap_args['x'] = grid.columns.tolist()
    heatmap_args['y'] = grid.index.tolist()
    return heatmap_args


In [6]:
# Slider values: the distinct thread counts, in descending sort order.
# NOTE(review): if ``nthreads`` already holds strings at this point the order
# is lexicographic, which differs from numeric order for multi-digit values.
n_threads_list = np.sort(df_group['nthreads'].unique())[::-1].tolist()

df_group = df_plot_preprocessing_wor(df_group)
df_fold = df_plot_preprocessing_wor(df_fold)

# Slider step shown on first render; clamped so a slider with a single step
# (or none) does not point past its last entry.
initial_step = max(0, min(1, len(n_threads_list) - 1))

# Two stacked panels: groupby on top, foldby below.
fig_thread = make_subplots(
    2, 1,
    x_title='Blocksize [MB]',
    y_title='n_workers',
    subplot_titles=("<b>Groupby</b>", "<b>Foldby</b>")
)

plot_columns = ['n_workers', 'blocksize_MB', 'compute_time']
hover_text = 'x: %{x:.s}<br>y: %{y:.0f} <br><b>time: %{z:.1f}</b>'

# Add one pair of traces (groupby, foldby) per slider step.
for step_idx, nthr in enumerate(n_threads_list):
    panels = [
        bench.loc[bench['nthreads'] == nthr, plot_columns]
        for bench in (df_group, df_fold)
    ]

    # Both panels of a step share one color range so they can be compared.
    # floor/ceil keep the fastest and the slowest cell inside that range
    # (plain int() would cut the upper bound below the true maximum).
    step_times = pd.concat([panel['compute_time'] for panel in panels])
    value_min = int(np.floor(step_times.min()))
    value_max = int(np.ceil(step_times.max()))

    for row, panel in enumerate(panels, start=1):
        fig_thread.add_trace(
            go.Heatmap(
                df_to_plotly_wor(panel),
                zmin=value_min, zmax=value_max,
                # Traces are visible by default; only the pair belonging to the
                # initially active slider step may be drawn on first render,
                # otherwise all heatmaps are stacked on top of each other.
                visible=(step_idx == initial_step),
                reversescale=True, colorbar={"title": 'Time [s]'}, hoverongaps=False,
                hovertemplate=hover_text
            ),
            row=row, col=1
        )

# Create and add slider: step i shows traces 2*i and 2*i+1, hides the rest.
steps = []
for step_idx, nthr in enumerate(n_threads_list):
    visibility = [False] * len(fig_thread.data)
    visibility[2 * step_idx:2 * step_idx + 2] = [True, True]
    steps.append(dict(
        method="restyle",
        args=[{"visible": visibility}],
        label=str(nthr)
    ))

sliders = [dict(
    active=initial_step,
    currentvalue={"prefix": "n_threads = "},
    pad={"t": 50},
    steps=steps
)]

fig_thread.update_layout(
    sliders=sliders,
    title_text="<b>Benchmark with fixed n_threads</b>",
    width=1000,
    height=600
)

# Enlarge all annotations, then move the first two (index 0 and 1) to the
# left edge of the figure.
fig_thread.update_annotations(font_size=15)
fig_thread.layout.annotations[0].update(x=0.05)
fig_thread.layout.annotations[1].update(x=0.04);

# fig_thread.show()

# Slider on block_size

In [7]:
def df_plot_preprocessing_thr(df):
    """Return a copy of ``df`` with ``nthreads`` and ``n_workers`` cast to ``str``.

    The caller's frame is not modified; a new frame is returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``nthreads`` and ``n_workers`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; only the two converted columns differ from the input.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    # ``assign`` returns a new frame, so the argument keeps its original dtypes.
    return df.assign(
        nthreads=df['nthreads'].astype(str),
        n_workers=df['n_workers'].astype(str),
    )

def df_to_plotly_thr(df):
    """Pivot long-format timings into a threads-by-workers heatmap dict.

    Rows of the grid are the ``nthreads`` values, columns are the
    ``n_workers`` values, and cells hold ``compute_time``.

    Returns
    -------
    dict
        Keys ``'z'`` (nested list of times), ``'x'`` (column labels) and
        ``'y'`` (row labels), in that order.
    """
    time_grid = df.pivot(index='nthreads', columns='n_workers')['compute_time']
    parts = (time_grid.values, time_grid.columns, time_grid.index)
    return {key: part.tolist() for key, part in zip(('z', 'x', 'y'), parts)}

In [8]:
# Slider values: the distinct block sizes, in descending sort order.
n_blk_list = np.sort(df_group['blocksize_MB'].unique())[::-1].tolist()

df_group = df_plot_preprocessing_thr(df_group)
df_fold = df_plot_preprocessing_thr(df_fold)

# Slider step shown on first render; clamped so a slider with a single step
# (or none) does not point past its last entry.
initial_step = max(0, min(1, len(n_blk_list) - 1))

# Two stacked panels: groupby on top, foldby below.
fig_bs = make_subplots(
    2, 1,
    x_title='n_workers',
    y_title='n_threads',
    subplot_titles=("<b>Groupby</b>", "<b>Foldby</b>")
)

plot_columns = ['nthreads', 'n_workers', 'compute_time']
hover_text = 'x: %{x:.0f}<br>y: %{y:.0f} <br><b>time: %{z:.1f}</b>'

# Add one pair of traces (groupby, foldby) per slider step.
for step_idx, blk in enumerate(n_blk_list):
    panels = [
        bench.loc[bench['blocksize_MB'] == blk, plot_columns]
        for bench in (df_group, df_fold)
    ]

    # Both panels of a step share one color range so they can be compared.
    # floor/ceil keep the fastest and the slowest cell inside that range
    # (plain int() would cut the upper bound below the true maximum).
    step_times = pd.concat([panel['compute_time'] for panel in panels])
    value_min = int(np.floor(step_times.min()))
    value_max = int(np.ceil(step_times.max()))

    for row, panel in enumerate(panels, start=1):
        fig_bs.add_trace(
            go.Heatmap(
                df_to_plotly_thr(panel),
                zmin=value_min, zmax=value_max,
                # Traces are visible by default; only the pair belonging to the
                # initially active slider step may be drawn on first render,
                # otherwise all heatmaps are stacked on top of each other.
                visible=(step_idx == initial_step),
                reversescale=True, colorbar={"title": 'Time [s]'}, hoverongaps=False,
                hovertemplate=hover_text
            ),
            row=row, col=1
        )

# Create and add slider: step i shows traces 2*i and 2*i+1, hides the rest.
steps = []
for step_idx, blk in enumerate(n_blk_list):
    visibility = [False] * len(fig_bs.data)
    visibility[2 * step_idx:2 * step_idx + 2] = [True, True]
    steps.append(dict(
        method="restyle",
        args=[{"visible": visibility}],
        label=str(blk)
    ))

sliders = [dict(
    active=initial_step,
    currentvalue={"prefix": "blocksize [MB] = "},
    pad={"t": 50},
    steps=steps
)]

fig_bs.update_layout(
    sliders=sliders,
    title_text="<b>Benchmark with fixed blocksize [MB]</b>",
    width=1000,
    height=600
)

# Enlarge all annotations, then move the first two (index 0 and 1) to the
# left edge of the figure.
fig_bs.update_annotations(font_size=15)
fig_bs.layout.annotations[0].update(x=0.05)
fig_bs.layout.annotations[1].update(x=0.04);

# fig_bs.show()

# Results

In [9]:
# Render the three slider figures in the order they were built:
# fixed n_workers, fixed n_threads, fixed blocksize.
for figure in (fig_work, fig_thread, fig_bs):
    figure.show()

In [11]:
# Round the groupby timings to two decimals in place (later cells read the
# rounded column from df_group), then show the five fastest configurations.
print('Best combinations with groupby:')
group_times_rounded = df_group['compute_time'].round(2)
df_group['compute_time'] = group_times_rounded
df_group.sort_values(by='compute_time').iloc[:5]


Best combinations with groupby:


Unnamed: 0,nthreads,n_workers,blocksize_MB,compute_time
0,1,4,1.181MB,817.66
14,1,4,9.455MB,1063.41
15,1,4,4.7275MB,1067.58
2,1,8,4.7275MB,1100.91
13,1,8,1.181MB,1138.52


In [21]:
# Round the foldby timings to two decimals in place (later cells read the
# rounded column from df_fold), then show the five fastest configurations.
print('Best combinations with foldby:')
fold_times_rounded = df_fold['compute_time'].round(2)
df_fold['compute_time'] = fold_times_rounded
df_fold.sort_values(by='compute_time').iloc[:5]

Best combinations with foldby:


Unnamed: 0,nthreads,n_workers,blocksize_MB,compute_time
2,1,4,1.181MB,947.26
1,1,4,4.7275MB,1019.52
5,1,8,1.181MB,1101.83
6,1,8,4.7275MB,1190.66
15,1,4,9.455MB,1281.49


In [20]:
# Pair every configuration measured with both implementations and compare
# them: a ratio above 1 means foldby took longer than groupby.
config_keys = ['nthreads', 'n_workers', 'blocksize_MB']
df_ = pd.merge(
    df_group, df_fold,
    how='inner',
    on=config_keys,
    suffixes=['_groupby', '_foldby'],
)
df_['foldby/groupby'] = df_['compute_time_foldby'].div(df_['compute_time_groupby'])
df_.sort_values(by='foldby/groupby', ascending=False)

Unnamed: 0,nthreads,n_workers,blocksize_MB,compute_time_groupby,compute_time_foldby,foldby/groupby
9,1,2,9.455MB,1561.27,2075.05,1.329078
17,2,2,9.455MB,2008.62,2552.82,1.270932
14,1,4,9.455MB,1063.41,1281.49,1.205076
0,1,4,1.181MB,817.66,947.26,1.158501
16,2,4,9.455MB,1760.2,2030.04,1.153301
8,4,1,9.455MB,4517.18,5139.59,1.137787
10,1,2,1.181MB,1479.0,1660.78,1.122907
1,1,2,4.7275MB,1596.85,1769.65,1.108213
4,2,4,4.7275MB,1459.7,1615.39,1.106659
11,2,2,4.7275MB,1880.65,2064.6,1.097812
