In [35]:
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd 
import os 
import plotly.io as pio
pio.kaleido.scope.mathjax = None 
from math import log

In [None]:
# Resolve the benchmark CSV location against the directory the kernel was started in.
cwd = os.getcwd()
csv_path = f"{cwd}/tmp/benchmarks/test.csv"

In [None]:
# Load the benchmark results found at `csv_path` and draw one line chart per metric
# against the 'Set size' column.
df = pd.read_csv(csv_path)

# The first row's 'Repo Version' labels both figures.
# NOTE(review): assumes every row of the CSV comes from the same repo version - confirm.
# str() keeps the title concatenation working even if pandas parsed the version as a number.
repo_version = str(df['Repo Version'].iloc[0])

fig = px.line(df, x='Set size', y='cache-misses', title="Cache misses relative to amount of sequences compared <b>Repo : " + repo_version + "</b>")
fig.update_layout(yaxis_title='cache misses (%)', xaxis_title='Number of unique sequences compared')
fig.show()

# This figure plots the 'elapsed time' column, so its title and y label must say so
# (they were previously copied unchanged from the cache-miss figure above).
# NOTE(review): unit assumed to be seconds, as in the other elapsed-time plots of this notebook - confirm.
fig = px.line(df, x='Set size', y='elapsed time', title="Elapsed time relative to amount of sequences compared <b>Repo : " + repo_version + "</b>")
fig.update_layout(yaxis_title='elapsed time (s)', xaxis_title='Number of unique sequences compared')
fig.show()

# The following section analyses the relation of file size to input parameters

In [None]:
# Build one row per file in the parameter-sweep directory: the parameters encoded in
# the file name plus the file's size on disk.
# The file name is split on "_"; field 1 is stored as the sequence, field 2 as the
# threshold, field 3 as the min count, and field 4 (up to its first ".") as the max depth.
directory = "data/benchmarking/parameter_test"

records = []
for filename in os.listdir(directory):
    size_bytes = os.stat(os.path.join(directory, filename)).st_size
    parameters = filename.split("_")
    records.append({
        "sequence": parameters[1],
        "threshold": float(parameters[2]),
        "min_count": int(parameters[3]),
        # Everything after the first "." (the file extension) is discarded before parsing.
        "max_depth": int(parameters[4].split(".")[0]),
        "size": size_bytes,
    })

# Explicit column list keeps the column order fixed even when the directory is empty.
df = pd.DataFrame(records, columns=["sequence", "threshold", "min_count", "max_depth", "size"])
df.columns

In [None]:
# Mean file size for each distinct value of a tuning parameter, broadcast back onto
# every row of `df` that has that value.
df['avg_size_threshold'] = df.groupby(['threshold'])['size'].transform('mean')
df['avg_size_min_count'] = df.groupby(['min_count'])['size'].transform('mean')
df['avg_size_max_depth'] = df.groupby(['max_depth'])['size'].transform('mean')

# One row per distinct parameter value, ordered by that value so the lines plot left to right.
# The calls are chained instead of using inplace=True: mutating a column slice of `df`
# in place raises SettingWithCopyWarning, while chaining always yields a fresh frame.
df_threshold = df[['threshold', 'avg_size_threshold']].drop_duplicates().sort_values('threshold')

df_min_count = df[['min_count', 'avg_size_min_count']].drop_duplicates().sort_values('min_count')

df_max_depth = df[['max_depth', 'avg_size_max_depth']].drop_duplicates().sort_values('max_depth')

In [None]:
# One line per tuning parameter: parameter value on x, mean file size on y.
# All three share one x axis even though the parameters are different quantities.
fig = go.Figure()

for parameter, summary in (
    ('threshold', df_threshold),
    ('min_count', df_min_count),
    ('max_depth', df_max_depth),
):
    fig.add_trace(go.Scatter(x=summary[parameter], y=summary['avg_size_' + parameter], name=parameter))

fig.update_layout(
    title='File size change in relation to parameter tuning',
    xaxis_title='Parameter',
    yaxis_title='Size (Bytes)',
)

fig.show()

# <b>Visualizing Benchmarking on Human Dataset</b>

## Visualize time elapsed compared to amount of VLMCs compared

In [43]:
# Human dataset: elapsed time against set size, one subplot row per VLMC size.
# Each container implementation has its own results file; the "Pst" series is the
# PstClassifierSeqan subset of the combined results file.
df = pd.read_csv('csv_results/03_15_11_44.csv')
df_sv = pd.read_csv('csv_results/sorted-vector_human_03_20_12_09.csv')
df_hm = pd.read_csv('csv_results/hashmap_human_03_20_12_10.csv')
df_co = pd.read_csv('csv_results/combo_human_03_20_12_10.csv')
df_pst = df[df.implementation=='PstClassifierSeqan']

fig = make_subplots(
    rows=3, cols=1, shared_yaxes=False,
    x_title="Amount of VLMCs",
    y_title="Elapsed Time (s)",
    subplot_titles=("Small", "Medium", "Large"),
    horizontal_spacing=0.02, vertical_spacing=0.08,
)

# (results frame, trace styling) in drawing order; sorted-vector keeps the default marker.
series = [
    (df_pst, dict(line_color='#EF553B', name="Pst", marker_symbol='x')),
    (df_hm, dict(line_color='#FECB52', name="hashmap", marker_symbol='square')),
    (df_co, dict(line_color='#00CC96', name="combo", marker_symbol='diamond')),
    (df_sv, dict(line_color='#636EFA', name="sorted-vector")),
]

for row, size_name in enumerate(('small', 'medium', 'large'), start=1):
    # Only the first row contributes legend entries, so each implementation is listed once.
    legend_kwargs = {} if row == 1 else {'showlegend': False}
    for frame, style in series:
        subset = frame[frame.vlmc_size == size_name]
        fig.add_trace(
            go.Scatter(x=subset['set_size'], y=subset['elapsed_time'], **style, **legend_kwargs),
            row, 1,
        )

fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=1.08),
    margin=dict(l=100, r=60, t=60, b=80),
    height=800,
    plot_bgcolor="white",
)

# Hand-picked y ticks per subplot row; the x ticks are the same four set sizes in every row.
y_ticks_by_row = {
    1: [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    2: [0, 1, 2, 3, 4, 5],
    3: [0, 5, 10, 15, 20, 25, 30],
}
for row, y_ticks in y_ticks_by_row.items():
    fig.update_xaxes(gridcolor='LightGrey', tickvals=[3, 6, 12, 24],
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)
    fig.update_yaxes(gridcolor='LightGrey', tickvals=y_ticks,
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)

# NOTE(review): annotations 0-2 are presumably the three subplot titles and 3-4 the
# shared x/y axis titles created by make_subplots - confirm.
for index in (0, 1, 2):
    fig.layout.annotations[index]["font"] = {'size': 24}
for index in (3, 4):
    fig.layout.annotations[index]["font"] = {'size': 28}

fig.show()
fig.write_image("images/human_elapsed_time.pdf")

In [27]:
# Preview the first rows of `df` to inspect which columns are available.
df.head()

Unnamed: 0,repo_version,implementation,vlmc_size,set_size,threshold,min_count,max_depth,nr_cores_used,branch_misses,branch_misses_count,...,cycles_count,instructions,instructions_count,cache_references,cache_references_count,cache_misses,cache_misses_count,elapsed_time,user,sys
0,ffb2fb1,PstClassifierSeqan,small,24,3.9075,9,6,8,0.42,5751422,...,4833400970,1.51,7304501229,23.5,32109065,21.085,6770062,0.485253,1.117658,0.189433
1,ffb2fb1,PstClassifierSeqan,medium,24,3.0,6,8,8,0.51,84771752,...,59654976493,1.48,88298984823,26.54,448829552,57.42,257719405,4.548532,14.890683,1.983431
2,ffb2fb1,PstClassifierSeqan,large,24,2.0,3,10,8,0.72,887558266,...,455877695513,1.43,652929352791,31.791,4043921570,65.288,2640188228,33.207907,117.425658,9.743831
3,ffb2fb1,sorted-vector,small,24,3.9075,9,6,8,1.68,1519426,...,530690568,1.06,565072979,148.59,24767974,27.574,6829476,0.039223,0.116043,0.03413
4,ffb2fb1,sorted-vector,medium,24,3.0,6,8,8,2.99,37202042,...,11715782422,0.66,7680225735,100.13,316107529,63.827,201763458,0.495033,2.809867,0.23652


## Visualizing cache-misses in relation to amount of VLMCs compared

In [30]:
# Human dataset: cache misses against set size, one subplot row per VLMC size.
# Each container implementation has its own results file; the "Pst" series is the
# PstClassifierSeqan subset of the combined results file.
df = pd.read_csv('csv_results/03_15_11_44.csv')
df_sv = pd.read_csv('csv_results/sorted-vector_human_03_20_12_09.csv')
df_hm = pd.read_csv('csv_results/hashmap_human_03_20_12_10.csv')
df_co = pd.read_csv('csv_results/combo_human_03_20_12_10.csv')
df_pst = df[df.implementation=='PstClassifierSeqan']

fig = make_subplots(
    rows=3, cols=1, shared_yaxes=False,
    x_title="Amount of VLMCs",
    y_title="Cache misses (%)",
    subplot_titles=("Small", "Medium", "Large"),
    horizontal_spacing=0.02, vertical_spacing=0.08,
)

# (results frame, trace styling) in drawing order; sorted-vector keeps the default marker.
series = [
    (df_pst, dict(line_color='#EF553B', name="Pst", marker_symbol='x')),
    (df_hm, dict(line_color='#FECB52', name="hashmap", marker_symbol='square')),
    (df_co, dict(line_color='#00CC96', name="combo", marker_symbol='diamond')),
    (df_sv, dict(line_color='#636EFA', name="sorted-vector")),
]

for row, size_name in enumerate(('small', 'medium', 'large'), start=1):
    # Only the first row contributes legend entries, so each implementation is listed once.
    legend_kwargs = {} if row == 1 else {'showlegend': False}
    for frame, style in series:
        subset = frame[frame.vlmc_size == size_name]
        fig.add_trace(
            go.Scatter(x=subset['set_size'], y=subset['cache_misses'], **style, **legend_kwargs),
            row, 1,
        )

fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=1.08),
    margin=dict(l=100, r=60, t=60, b=80),
    height=800,
    plot_bgcolor="white",
)

# Same grid/axis-line styling and x ticks for every row; y ticks are left to Plotly.
for row in (1, 2, 3):
    fig.update_xaxes(gridcolor='LightGrey', tickvals=[3, 6, 12, 24],
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)
    fig.update_yaxes(gridcolor='LightGrey',
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)

# NOTE(review): annotations 0-2 are presumably the three subplot titles and 3-4 the
# shared x/y axis titles created by make_subplots - confirm.
for index in (0, 1, 2):
    fig.layout.annotations[index]["font"] = {'size': 24}
for index in (3, 4):
    fig.layout.annotations[index]["font"] = {'size': 28}

fig.show()
fig.write_image("images/human_cache_misses.pdf")

## Visualizing degree of parallelization

In [None]:
# Human dataset, parallelization benchmark: three line charts against the number of
# cores used, one line per VLMC size.
df = pd.read_csv('csv_results/parallelization_human_03_15_13_45.csv')
# Not read by any plot in this cell; kept because it is a notebook-level name that
# other cells may rely on.
df_small = df[df.vlmc_size=="small"]

fig = px.line(df, x='nr_cores_used', y='elapsed_time', color='vlmc_size')

fig.update_layout(title="Elapsed time compared to amount of cores used", yaxis_title="Elapsed time (sec)", xaxis_title="Amount of cores used")

fig.show()


# The 'task_clock' column is presented as the degree of parallelization.
fig = px.line(df, x='nr_cores_used', y='task_clock', color='vlmc_size')

fig.update_layout(title="Degree of parallelization compared to amount of cores used (Human dataset)", yaxis_title="Degree of parallelization", xaxis_title="Amount of cores used")

fig.show()

# This figure plots 'cache_misses', so the y axis is labelled accordingly
# (it previously reused the "Degree of parallelization" label from the figure above).
fig = px.line(df, x='nr_cores_used', y='cache_misses', color='vlmc_size')

fig.update_layout(title="Cache-misses compared to amount of cores used (Human dataset)", yaxis_title="Cache misses (%)", xaxis_title="Amount of cores used")

fig.show()



# <b>Visualizing Benchmarking on E-coli Dataset</b>

## Visualize time elapsed compared to amount of VLMCs compared

In [45]:
# E. coli dataset: elapsed time against set size, one subplot row per VLMC size.
# One results file per implementation; the fourth file is plotted under the name "Pst".
df_sv = pd.read_csv('csv_results/sorted-vector_ecoli_03_20_12_30.csv')
df_hm = pd.read_csv('csv_results/hashmap_ecoli_03_20_12_32.csv')
df_co = pd.read_csv('csv_results/combo_ecoli_03_20_12_51.csv')
df_pst = pd.read_csv('csv_results/ecoli_03_16_09_44.csv')

fig = make_subplots(
    rows=3, cols=1, shared_yaxes=False,
    x_title="Amount of VLMCs",
    y_title="Elapsed Time (s)",
    subplot_titles=("Small", "Medium", "Large"),
    horizontal_spacing=0.02, vertical_spacing=0.08,
)

# (results frame, trace styling) in drawing order; sorted-vector keeps the default marker.
series = [
    (df_pst, dict(line_color='#EF553B', name="Pst", marker_symbol='x')),
    (df_hm, dict(line_color='#FECB52', name="hashmap", marker_symbol='square')),
    (df_co, dict(line_color='#00CC96', name="combo", marker_symbol='diamond')),
    (df_sv, dict(line_color='#636EFA', name="sorted-vector")),
]

for row, size_name in enumerate(('small', 'medium', 'large'), start=1):
    # Only the first row contributes legend entries, so each implementation is listed once.
    legend_kwargs = {} if row == 1 else {'showlegend': False}
    for frame, style in series:
        subset = frame[frame.vlmc_size == size_name]
        fig.add_trace(
            go.Scatter(x=subset['set_size'], y=subset['elapsed_time'], **style, **legend_kwargs),
            row, 1,
        )

fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=1.08),
    margin=dict(l=100, r=60, t=60, b=80),
    height=800,
    plot_bgcolor="white",
)

# Every row uses the set sizes of the small sorted-vector runs as x ticks;
# y ticks are left to Plotly.
set_size_ticks = list(df_sv[df_sv.vlmc_size=='small']['set_size'])
for row in (1, 2, 3):
    fig.update_xaxes(gridcolor='LightGrey', tickvals=set_size_ticks,
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)
    fig.update_yaxes(gridcolor='LightGrey',
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)

# NOTE(review): annotations 0-2 are presumably the three subplot titles and 3-4 the
# shared x/y axis titles created by make_subplots - confirm.
for index in (0, 1, 2):
    fig.layout.annotations[index]["font"] = {'size': 24}
for index in (3, 4):
    fig.layout.annotations[index]["font"] = {'size': 28}

fig.show()
fig.write_image("images/ecoli_elapsed_time.pdf")

## Cache-misses

In [32]:
# List the distinct set sizes present in the sorted-vector results frame `df_sv`.
df_sv['set_size'].unique()

array([7430, 3715, 1857,  928,  464,  232,  116,   58,   29,   14,    7,
          3])

In [44]:
# E. coli dataset: cache misses against set size, one subplot row per VLMC size.
# One results file per implementation; the fourth file is plotted under the name "Pst".
df_sv = pd.read_csv('csv_results/sorted-vector_ecoli_03_20_12_30.csv')
df_hm = pd.read_csv('csv_results/hashmap_ecoli_03_20_12_32.csv')
df_co = pd.read_csv('csv_results/combo_ecoli_03_20_12_51.csv')
df_pst = pd.read_csv('csv_results/ecoli_03_16_09_44.csv')

fig = make_subplots(
    rows=3, cols=1, shared_yaxes=False,
    x_title="Amount of VLMCs",
    y_title="Cache misses (%)",
    subplot_titles=("Small", "Medium", "Large"),
    horizontal_spacing=0.02, vertical_spacing=0.08,
)

# (results frame, trace styling) in drawing order; sorted-vector keeps the default marker.
series = [
    (df_pst, dict(line_color='#EF553B', name="Pst", marker_symbol='x')),
    (df_hm, dict(line_color='#FECB52', name="hashmap", marker_symbol='square')),
    (df_co, dict(line_color='#00CC96', name="combo", marker_symbol='diamond')),
    (df_sv, dict(line_color='#636EFA', name="sorted-vector")),
]

for row, size_name in enumerate(('small', 'medium', 'large'), start=1):
    # Only the first row contributes legend entries, so each implementation is listed once.
    legend_kwargs = {} if row == 1 else {'showlegend': False}
    for frame, style in series:
        subset = frame[frame.vlmc_size == size_name]
        # Always plot the real set sizes; the log scaling of the bottom row is done by
        # the axis itself (see below), not by transforming the data.
        fig.add_trace(
            go.Scatter(x=subset['set_size'], y=subset['cache_misses'], **style, **legend_kwargs),
            row, 1,
        )

fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=1.08),
    margin=dict(l=100, r=60, t=60, b=80),
    height=800,
    plot_bgcolor="white",
)

# Every row uses the set sizes of the small sorted-vector runs as x ticks;
# y ticks are left to Plotly.
set_size_ticks = list(df_sv[df_sv.vlmc_size=='small']['set_size'])
for row in (1, 2, 3):
    fig.update_xaxes(gridcolor='LightGrey', tickvals=set_size_ticks,
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)
    fig.update_yaxes(gridcolor='LightGrey',
                     showline=True, linewidth=1, linecolor='LightGrey', row=row, col=1)

# The bottom row is shown on a logarithmic x axis. Using a log-typed axis (instead of
# plotting log(set_size) on a linear axis) keeps the same point spacing while the tick
# labels show the actual set sizes, matching the shared "Amount of VLMCs" axis title.
# On a log axis, tickvals are given in data units.
fig.update_xaxes(type='log', row=3, col=1)

# NOTE(review): annotations 0-2 are presumably the three subplot titles and 3-4 the
# shared x/y axis titles created by make_subplots - confirm.
for index in (0, 1, 2):
    fig.layout.annotations[index]["font"] = {'size': 24}
for index in (3, 4):
    fig.layout.annotations[index]["font"] = {'size': 28}

fig.show()
fig.write_image("images/ecoli_cache_misses.pdf")

In [None]:
# Cache misses per instruction (in percent) for PstClassifierSeqan and sorted-vector,
# computed from the raw cache-miss and instruction counts of the human result files.
df = pd.read_csv('csv_results/03_15_11_44.csv')
df_sv = pd.read_csv('csv_results/sorted-vector_human_03_20_12_09.csv')
df_hm = pd.read_csv('csv_results/hashmap_human_03_20_12_10.csv')
df_co = pd.read_csv('csv_results/combo_human_03_20_12_10.csv')
# .copy() makes the filtered rows an independent frame, so adding a column below does
# not raise SettingWithCopyWarning on a slice of `df`.
df_pst = df[df.implementation=='PstClassifierSeqan'].copy()
fig = go.Figure()

df_sv['misses_per_instruction'] = (df_sv.cache_misses_count * 100) / df_sv.instructions_count
df_pst['misses_per_instruction'] = (df_pst.cache_misses_count * 100) / df_pst.instructions_count

# Two lines per VLMC size: one for each of the two implementations compared here.
for vlmc_size in ['small', 'medium', 'large']:
    pst_rows = df_pst[df_pst.vlmc_size==vlmc_size]
    sv_rows = df_sv[df_sv.vlmc_size==vlmc_size]
    fig.add_trace(go.Scatter(x=pst_rows['set_size'], y=pst_rows['misses_per_instruction'], name="PstClassifierSeqan " + vlmc_size, marker_symbol='x'))
    fig.add_trace(go.Scatter(x=sv_rows['set_size'], y=sv_rows['misses_per_instruction'], name="Sorted-vector " + vlmc_size))

# The files loaded above are the human benchmark results, so the title names the
# human dataset (it previously said "E-coli dataset").
fig.update_layout(title="Cache-misses per instruction compared to amount of VLMCs compared (Human dataset)", yaxis_title="Cache-misses per instruction (%)", xaxis_title="Size of directory of VLMCs")

fig.show()

# <b>Comparison of VLMC containers</b>

In [None]:
# Human dataset: one figure per VLMC size comparing the elapsed time of the three
# container implementations.
df_sv = pd.read_csv('csv_results/sorted-vector_human_03_20_12_09.csv')
df_hm = pd.read_csv('csv_results/hashmap_human_03_20_12_10.csv')
df_co = pd.read_csv('csv_results/combo_human_03_20_12_10.csv')

containers = ((df_sv, "sorted-vector"), (df_hm, "hashmap"), (df_co, "combo"))

for vlmc_size in ('small', 'medium', 'large'):
    fig = go.Figure()
    for frame, label in containers:
        subset = frame[frame.vlmc_size == vlmc_size]
        fig.add_trace(go.Scatter(x=subset['set_size'], y=subset['elapsed_time'], name=label + " " + vlmc_size))
    fig.update_layout(
        title="Elapsed time compared to amount of VLMCs compared for different vlmc containers (dataset human " + vlmc_size + ")",
        yaxis_title="Elapsed time (sec)",
        xaxis_title="Size of directory of VLMCs",
    )
    fig.show()

In [None]:
# Human dataset: one figure per VLMC size comparing the cache misses of the three
# container implementations.
df_sv = pd.read_csv('csv_results/sorted-vector_human_03_20_12_09.csv')
df_hm = pd.read_csv('csv_results/hashmap_human_03_20_12_10.csv')
df_co = pd.read_csv('csv_results/combo_human_03_20_12_10.csv')

containers = ((df_sv, "sorted-vector"), (df_hm, "hashmap"), (df_co, "combo"))

for vlmc_size in ('small', 'medium', 'large'):
    fig = go.Figure()
    for frame, label in containers:
        subset = frame[frame.vlmc_size == vlmc_size]
        fig.add_trace(go.Scatter(x=subset['set_size'], y=subset['cache_misses'], name=label + " " + vlmc_size))
    # The y data is the 'cache_misses' column, so the title and y label describe cache
    # misses (they previously reused the "Elapsed time" wording of the elapsed-time cell).
    fig.update_layout(
        title="Cache-misses compared to amount of VLMCs compared for different vlmc containers (dataset human " + vlmc_size + ")",
        yaxis_title="Cache-misses (%)",
        xaxis_title="Size of directory of VLMCs",
    )
    fig.show()

In [None]:
# E. coli dataset: one figure per VLMC size comparing the elapsed time of the three
# container implementations.
df_sv = pd.read_csv('csv_results/sorted-vector_ecoli_03_20_12_30.csv')
df_hm = pd.read_csv('csv_results/hashmap_ecoli_03_20_12_32.csv')
df_co = pd.read_csv('csv_results/combo_ecoli_03_20_12_51.csv')

containers = ((df_sv, "sorted-vector"), (df_hm, "hashmap"), (df_co, "combo"))

for vlmc_size in ('small', 'medium', 'large'):
    fig = go.Figure()
    for frame, label in containers:
        subset = frame[frame.vlmc_size == vlmc_size]
        fig.add_trace(go.Scatter(x=subset['set_size'], y=subset['elapsed_time'], name=label + " " + vlmc_size))
    fig.update_layout(
        title="Elapsed time compared to amount of VLMCs compared for different vlmc containers (E-coli dataset " + vlmc_size + ")",
        yaxis_title="Elapsed time (sec)",
        xaxis_title="Size of directory of VLMCs",
    )
    fig.show()

In [None]:
# E-coli benchmark: cache misses against set size for the three container
# implementations, drawn as one figure per VLMC size.
df_sv = pd.read_csv('csv_results/sorted-vector_ecoli_03_20_12_30.csv')
df_hm = pd.read_csv('csv_results/hashmap_ecoli_03_20_12_32.csv')
df_co = pd.read_csv('csv_results/combo_ecoli_03_20_12_51.csv')

for vlmc_size in ('small', 'medium', 'large'):
    fig = go.Figure()
    # Trace order (and so legend order) is sorted-vector, hashmap, combo.
    for label, frame in (("sorted-vector ", df_sv), ("hashmap ", df_hm), ("combo ", df_co)):
        rows = frame[frame.vlmc_size == vlmc_size]
        fig.add_trace(go.Scatter(x=rows['set_size'], y=rows['cache_misses'], name=label + vlmc_size))
    fig.update_layout(
        title="Cache-misses compared to amount of VLMCs compared for different vlmc containers (E-coli dataset " + vlmc_size + ")",
        yaxis_title="Cache-misses (%)",
        xaxis_title="Size of directory of VLMCs",
    )
    fig.show()

# <b>Parameter Sweep for combo container </b>

## Human dataset

In [None]:
# Parameter sweep on the human dataset: mean elapsed time per initial size,
# one line per VLMC size.
df = pd.read_csv('csv_results/parameter_sweep_human_03_20_10_33.csv')

# Attach the mean over all rows sharing (vlmc_size, combo_init_size) to every
# row, then order the rows by combo_init_size for plotting.
df = df.assign(
    mean_elapsed_time=df.groupby(['vlmc_size', 'combo_init_size'])['elapsed_time'].transform('mean')
).sort_values('combo_init_size')

fig = px.line(df, x='combo_init_size', y='mean_elapsed_time', color='vlmc_size')
fig.update_layout(
    title="Elapsed time with different initial sizes for index by value vector",
    yaxis_title="Elapsed time (sec)",
    xaxis_title="Initial size for index by value vector",
)
fig.show()

# Cell output: the smallest mean elapsed time reached for each VLMC size.
df.groupby('vlmc_size')['mean_elapsed_time'].min()

In [None]:
# Parameter sweep on the E-coli dataset: mean elapsed time per initial size,
# one line per VLMC size.
df = pd.read_csv('csv_results/parameter_sweep_ecoli_03_20_10_40.csv')

# Attach the mean over all rows sharing (vlmc_size, combo_init_size) to every
# row, then order the rows by combo_init_size for plotting.
df = df.assign(
    mean_elapsed_time=df.groupby(['vlmc_size', 'combo_init_size'])['elapsed_time'].transform('mean')
).sort_values('combo_init_size')

fig = px.line(df, x='combo_init_size', y='mean_elapsed_time', color='vlmc_size')
fig.update_layout(
    title="Elapsed time with different initial sizes for index by value vector",
    yaxis_title="Elapsed time (sec)",
    xaxis_title="Initial size for index by value vector",
)
fig.show()

# Cell output: the smallest mean elapsed time reached for each VLMC size.
df.groupby('vlmc_size')['mean_elapsed_time'].min()

# <b>Kmer distributions </b>

## Human dataset

In [None]:
# Histogram of the values in the human k-mer distribution file
# (headerless; comma is already the default separator of read_csv).
df = pd.read_csv("./tmp/one_human_VLMCs_kmer-distribution.txt", header=None)
fig = px.histogram(df)
fig.show()

## E-coli dataset

In [None]:
# Histogram of the values in the small-test k-mer distribution file
# (headerless; comma is already the default separator of read_csv).
df = pd.read_csv("./tmp/small_test_kmer-distribution.txt", header=None)
fig = px.histogram(df)
fig.show()

In [None]:
# Preview the first 100 rows of the human k-mer distribution file.
df = pd.read_csv("./tmp/one_human_VLMCs_kmer-distribution.txt", header=None)
df.iloc[:100]

In [None]:
# Bar chart of the rows with index below 400 from the human k-mer
# distribution file, plotted against their row index.
df = pd.read_csv("./tmp/one_human_VLMCs_kmer-distribution.txt", header=None)
df.columns = ["integer_rep"]

first_rows = df[df.index < 400]
fig = px.bar(first_rows, x=first_rows.index, y='integer_rep')
fig.update_layout(yaxis_title="Integer representation", xaxis_title="Index")
fig.show()

In [None]:
# Scratch list, stored and printed in reverse of the order written here.
t = list(reversed([35, 29, 21, 15, 34, 28, 20, 14, 33, 27, 19, 13, 32, 26, 18]))
print(t)

In [None]:
# Bar chart of the rows with index below 100 from the one_vlmcs k-mer
# distribution file, plotted against their row index.
df = pd.read_csv("./tmp/one_vlmcs_kmer-distribution.txt", header=None)
df.columns = ["integer_rep"]

first_rows = df[df.index < 100]
fig = px.bar(first_rows, x=first_rows.index, y='integer_rep')
fig.update_layout(yaxis_title="Integer representation", xaxis_title="Index")
fig.show()

In [None]:
# Compare the elapsed time of PstClassifierSeqan with three sorted-vector
# benchmark runs on the human dataset, one subplot per VLMC size.
df = pd.read_csv('csv_results/03_15_11_44.csv')
df_sv_old = pd.read_csv('csv_results/sorted-vector_human_03_27_11_28.csv')
df_sv_new = pd.read_csv('csv_results/sorted-vector_human_03_27_11_34.csv')
df_sv_tmp = pd.read_csv('csv_results/sorted-vector_human_03_27_12_09.csv')
df_pst = df[df.implementation=='PstClassifierSeqan']

# For every sorted-vector run, attach to each row the mean over all rows that
# share its (vlmc_size, set_size). The mean is taken over 'elapsed_time' so the
# values are comparable with the Pst trace and match the y-axis label.
# 'idx' numbers the rows inside each (vlmc_size, set_size) group from 1, so
# idx == 1 selects exactly one row per group.
# NOTE(review): assumes these CSVs carry an 'elapsed_time' column, as the
# E-coli sorted-vector CSV read elsewhere in this notebook does -- confirm.
for runs in (df_sv_old, df_sv_new, df_sv_tmp):
    per_config = runs.groupby(['vlmc_size', 'set_size'])
    runs['mean_elapsed_time'] = per_config['elapsed_time'].transform('mean')
    runs['idx'] = per_config.cumcount() + 1

fig = make_subplots(rows=2, cols=2, shared_yaxes=False, subplot_titles=("Small", "Medium", "Large"), horizontal_spacing=0.02, vertical_spacing=0.06, specs=[[{}, {}], [{"colspan": 2}, None]])

# (vlmc_size value, subplot row, subplot column)
panels = (('small', 1, 1), ('medium', 1, 2), ('large', 2, 1))
# (data frame, line colour, legend name) for each sorted-vector run
sv_series = (
    (df_sv_old, '#00CC96', "sorted-vector old"),
    (df_sv_new, '#636EFA', "sorted-vector new"),
    (df_sv_tmp, 'yellow', "sorted-vector new new"),
)

for size_name, row, col in panels:
    # Only the first panel contributes legend entries; the other panels reuse
    # the same names and colours so hover labels stay consistent with it.
    in_legend = (row, col) == (1, 1)

    pst_rows = df_pst[df_pst.vlmc_size == size_name]
    fig.add_trace(go.Scatter(x=pst_rows['set_size'], y=pst_rows['elapsed_time'], line_color='#EF553B', name="Pst", marker_symbol='x', showlegend=in_legend), row, col)

    for runs, color, label in sv_series:
        # x and y come from the same filtered rows, so they always have the
        # same length and each mean is paired with its own set_size.
        rows = runs[(runs.vlmc_size == size_name) & (runs.idx == 1)]
        fig.add_trace(go.Scatter(x=rows['set_size'], y=rows['mean_elapsed_time'], line_color=color, name=label, showlegend=in_legend), row, col)

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.08),
    margin=dict(l=60, r=20, t=60, b=60),
    height=700)

# Shared axis titles, placed as paper-relative annotations.
fig.add_annotation(x=-0.045, y=0.5,
            xref="paper",
            yref="paper",
            text="Elapsed time [sec]",
            showarrow=False,
            textangle=-90,
            font_size=20)

fig.add_annotation(x=0.5, y=-0.11,
            xref="paper",
            yref="paper",
            text="Amount of VLMCs",
            showarrow=False,
            font_size=20)

fig.show()

### Amdahl's law

In [None]:
# Speedup of the parallel run relative to the first row of the results file,
# plotted next to the line y = number of cores.
df = pd.read_csv('csv_results/parallelization_ecoli_03_28_16_11.csv')
time = df['elapsed_time']
cores = df['nr_cores_used']

# The first row is used as the baseline.
# NOTE(review): presumably that row is the single-core run -- confirm against the CSV.
single_core_time = time.iloc[0]
speedup = single_core_time / time
infinite_ahmdals = cores

fig = go.Figure()
fig.add_trace(go.Scatter(mode='lines', x=cores, y=speedup, line_color='#00CC96', name="Algorithm"))
fig.add_trace(go.Scatter(mode='lines', x=cores, y=infinite_ahmdals, line_color='#636EFA', name="Theoretical"))
fig.update_layout(
    yaxis_title="Speedup",
    xaxis_title="Amount of cores used",
    yaxis_range=[0,20],
    xaxis_range=[0,17],
    font={'size': 20},
    plot_bgcolor="white",
    yaxis_gridcolor='LightGrey',
    xaxis_gridcolor='LightGrey',
    autosize=False,
    width=1000,
    height=600,
)
fig.show()
fig.write_image("images/ecolit-parallel-to-ahmdals.pdf")

In [5]:
# NOTE(review): the saved output of this cell lists columns such as `nr_elements`
# and `cache_misses_count`, which the following cell reads from
# csv_results/dev_test_6.csv, so this cell was presumably last run after that file
# had been loaded into `df`. On a fresh top-to-bottom run `df` still holds the
# frame loaded by the preceding cell at this point.
df.head()

Unnamed: 0,repo_version,nr_elements,size_of_vector,branch_misses,branch_misses_count,branches,branches_count,task_clock,task_clock_count,cycles,cycles_count,instructions,instructions_count,cache_references,cache_references_count,cache_misses,cache_misses_count
0,91a6b55,4,536,2.05,17195,816.936,841348,0.914,1,4.212,4338353,1.33,4906590,121.505,125136,35.821,46676
1,91a6b55,5,1048,1.96,16452,798.101,833962,0.733,0,3.362,3513382,1.3,4869492,123.082,128612,37.495,48597
2,91a6b55,6,2072,2.0,16773,696.56,836144,0.68,1,3.127,3753554,1.14,4880598,108.084,129743,41.544,53095
3,91a6b55,7,4120,1.92,16158,750.128,837430,0.689,0,3.086,3445436,1.22,4888697,114.802,128163,33.602,44453
4,91a6b55,8,8216,1.96,16427,804.177,840462,0.731,0,3.323,3472838,1.31,4906366,123.138,128694,33.057,43361


In [12]:
# Ratio of cache-miss count to instruction count for each `nr_elements` value
# recorded in dev_test_6.csv.
df = pd.read_csv('csv_results/dev_test_6.csv')

fig = go.Figure()
fig.add_trace(go.Scatter(mode='lines', x=df['nr_elements'], y=df['cache_misses_count'] / df['instructions_count'], line_color='#00CC96'))

# Axis titles describe the plotted columns.
fig.update_layout(
    yaxis_title="Cache misses per instruction",
    xaxis_title="nr_elements",
    font={'size': 20},
    plot_bgcolor="white",
    yaxis_gridcolor='LightGrey',
    xaxis_gridcolor='LightGrey',
    autosize=False,
    width=1000,
    height=600,
)
fig.show()