In [16]:
import pandas as pd
import plotly.express as px
import os

# Make sure the plot output directory exists before any figure is written.
os.makedirs("./results/plots", exist_ok=True)

# Rows whose `wchar` value is not strictly above this threshold are dropped.
min_threshold = 200_000_000


def _load_io_samples(csv_path: str, min_wchar: int) -> pd.DataFrame:
    """Load one IO sample CSV and rebase its timestamps so they start at zero.

    Args:
        csv_path: Path of a CSV file with at least `timestamp` and `wchar` columns.
        min_wchar: Only rows with `wchar` strictly greater than this are kept.

    Returns:
        An independent DataFrame holding the kept rows, whose `timestamp`
        column is the offset from the earliest kept timestamp.
    """
    samples = pd.read_csv(csv_path)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than a slice of the frame that was read
    # (this is what avoids pandas' SettingWithCopyWarning).
    samples = samples[samples["wchar"] > min_wchar].copy()
    # Parse once and use the vectorised Series.min(); unlike the builtin min()
    # it yields NaT instead of raising when no row passed the filter.
    timestamps = pd.to_datetime(samples["timestamp"])
    samples["timestamp"] = timestamps - timestamps.min()
    return samples


concurrent_io = _load_io_samples("./results/concurrent-io.csv", min_threshold)
constant_io = _load_io_samples("./results/constant-io.csv", min_threshold)
correlated_prefix_io = _load_io_samples("./results/correlated-prefix-io.csv", min_threshold)

# Overlay the `wchar` series of the three IO runs on a single line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Write IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["wchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `wchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/io-write-patterns.png", width=1920, height=1080)


# Overlay the `rchar` series of the three IO runs on a single line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Read IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["rchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `rchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/io-read-patterns.png", width=1920, height=1080)


In [17]:
import pandas as pd
import plotly.express as px
import os

# Make sure the plot output directory exists before any figure is written.
os.makedirs("./results/plots", exist_ok=True)

# Rows whose `wchar` value is not strictly above this threshold are dropped.
min_threshold = 200_000_000


def _load_io_samples(csv_path: str, min_wchar: int) -> pd.DataFrame:
    """Load one IO sample CSV and rebase its timestamps so they start at zero.

    Args:
        csv_path: Path of a CSV file with at least `timestamp` and `wchar` columns.
        min_wchar: Only rows with `wchar` strictly greater than this are kept.

    Returns:
        An independent DataFrame holding the kept rows, whose `timestamp`
        column is the offset from the earliest kept timestamp.
    """
    samples = pd.read_csv(csv_path)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than a slice of the frame that was read
    # (this is what avoids pandas' SettingWithCopyWarning).
    samples = samples[samples["wchar"] > min_wchar].copy()
    # Parse once and use the vectorised Series.min(); unlike the builtin min()
    # it yields NaT instead of raising when no row passed the filter.
    timestamps = pd.to_datetime(samples["timestamp"])
    samples["timestamp"] = timestamps - timestamps.min()
    return samples


concurrent_io = _load_io_samples("./results/read-query-concurrent-io.csv", min_threshold)
constant_io = _load_io_samples("./results/read-query-constant-io.csv", min_threshold)
correlated_prefix_io = _load_io_samples("./results/read-query-correlated-prefix-io.csv", min_threshold)

# Overlay the `wchar` series of the three read-query IO runs on one line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Write IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["wchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `wchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/read-query-io-write-patterns.png", width=1920, height=1080)


# Overlay the `rchar` series of the three read-query IO runs on one line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Read IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["rchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `rchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/read-query-io-read-patterns.png", width=1920, height=1080)


In [19]:
import pandas as pd
import plotly.express as px
import os

# Make sure the plot output directory exists before any figure is written.
os.makedirs("./results/plots", exist_ok=True)

# Rows whose `wchar` value is not strictly above this threshold are dropped.
min_threshold = 200_000_000


def _load_io_samples(csv_path: str, min_wchar: int) -> pd.DataFrame:
    """Load one IO sample CSV and rebase its timestamps so they start at zero.

    Args:
        csv_path: Path of a CSV file with at least `timestamp` and `wchar` columns.
        min_wchar: Only rows with `wchar` strictly greater than this are kept.

    Returns:
        An independent DataFrame holding the kept rows, whose `timestamp`
        column is the offset from the earliest kept timestamp.
    """
    samples = pd.read_csv(csv_path)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than a slice of the frame that was read
    # (this is what avoids pandas' SettingWithCopyWarning).
    samples = samples[samples["wchar"] > min_wchar].copy()
    # Parse once and use the vectorised Series.min(); unlike the builtin min()
    # it yields NaT instead of raising when no row passed the filter.
    timestamps = pd.to_datetime(samples["timestamp"])
    samples["timestamp"] = timestamps - timestamps.min()
    return samples


concurrent_io = _load_io_samples("./results/insert-query-concurrent-io.csv", min_threshold)
constant_io = _load_io_samples("./results/insert-query-constant-io.csv", min_threshold)
correlated_prefix_io = _load_io_samples("./results/insert-query-correlated-prefix-io.csv", min_threshold)

# Overlay the `wchar` series of the three insert-query IO runs on one line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Write IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["wchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `wchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/insert-query-io-write-patterns.png", width=1920, height=1080)


# Overlay the `rchar` series of the three insert-query IO runs on one line chart.
io_traces = [
    (concurrent_io, "Concurrent IO", "blue"),
    (constant_io, "Constant IO", "red"),
    (correlated_prefix_io, "Correlated Prefix IO", "green"),
]

fig = px.scatter(title="IO Patterns (Read IO)")
for trace_frame, trace_name, trace_color in io_traces:
    fig.add_scatter(
        x=trace_frame["timestamp"],
        y=trace_frame["rchar"],
        mode="lines",
        name=trace_name,
        line={"color": trace_color},
    )
# NOTE(review): the y-axis label says MB/s, but the `rchar` column is plotted
# exactly as loaded (no scaling here) - confirm the units of the CSV values.
fig.update_layout(xaxis_title="Time", yaxis_title="IO (MB/s)")
fig.show()

fig.write_image("./results/plots/insert-query-io-read-patterns.png", width=1920, height=1080)


In [14]:
def show_aggregate_stats(df):
    """Summarise `df` via `describe`, requesting the 50/75/95/99th percentiles.

    Args:
        df: Object with a pandas-style `describe` method (callers in this
            notebook pass DataFrames).

    Returns:
        The result of `df.describe` for those percentiles; nothing is printed
        or displayed by this function itself.
    """
    requested_percentiles = [0.50, 0.75, 0.95, 0.99]
    return df.describe(percentiles=requested_percentiles)


# Load each throughput CSV and keep only its `elapsed_time` column.
constant_throughput = pd.read_csv("./results/constant-throughput.csv")[["elapsed_time"]]
concurrent_throughput = pd.read_csv("./results/concurrent-throughput.csv")[["elapsed_time"]]
correlated_prefix_throughput = pd.read_csv("./results/correlated-prefix-throughput.csv")[["elapsed_time"]]

# Summarise each run. After the transpose there is one row per described
# column and one column per summary value; `type` tags the row with its run.
constant_throughput_stats = (
    show_aggregate_stats(constant_throughput)
    .transpose()
    .assign(type="Constant Throughput")
)
concurrent_throughput_stats = (
    show_aggregate_stats(concurrent_throughput)
    .transpose()
    .assign(type="Concurrent Throughput")
)
correlated_prefix_throughput_stats = (
    show_aggregate_stats(correlated_prefix_throughput)
    .transpose()
    .assign(type="Correlated Prefix Throughput")
)

# Stack the three summaries and reshape to long form for the grouped bar chart:
# one row per (statistic, type, summary value).
summary_columns = ["statistic", "mean", "std", "50%", "75%", "95%", "99%", "type"]
stats = (
    pd.concat(
        [
            constant_throughput_stats,
            concurrent_throughput_stats,
            correlated_prefix_throughput_stats,
        ]
    )
    .reset_index()
    .rename(columns={"index": "statistic"})
    .loc[:, summary_columns]
    .melt(id_vars=["statistic", "type"], var_name="percentile", value_name="value")
)

# One figure per described column.
unique_statistics = stats["statistic"].unique()
for statistic in unique_statistics:
    filtered_stats = stats.loc[stats["statistic"] == statistic]
    fig = px.bar(
        filtered_stats,
        x="percentile",
        y="value",
        color="type",
        barmode="group",
        title=f"Throughput Statistics for {statistic}",
    )
    fig.update_layout(
        xaxis_title="Percentile",
        yaxis_title=f"Value ({statistic})",
        title_x=0.5,
        title_y=0.95,
        title_font={"size": 20},
        legend_title_text="IO Pattern",
    )
    fig.show()

# Runs once, after the loop, so only the figure built by the final iteration
# is exported. The frames above hold just `elapsed_time`, so the loop is
# expected to produce a single figure.
fig.write_image("./results/plots/throughput-statistics.png", width=1920, height=1080)

In [18]:
def show_aggregate_stats(df):
    """Summarise `df` via `describe`, requesting the 50/75/95/99th percentiles.

    Args:
        df: Object with a pandas-style `describe` method (callers in this
            notebook pass DataFrames).

    Returns:
        The result of `df.describe` for those percentiles; nothing is printed
        or displayed by this function itself.
    """
    requested_percentiles = [0.50, 0.75, 0.95, 0.99]
    return df.describe(percentiles=requested_percentiles)


# Load each read-query throughput CSV and keep only its `elapsed_time` column.
constant_throughput = pd.read_csv("./results/read-query-constant-throughput.csv")[["elapsed_time"]]
concurrent_throughput = pd.read_csv("./results/read-query-concurrent-throughput.csv")[["elapsed_time"]]
correlated_prefix_throughput = pd.read_csv("./results/read-query-correlated-prefix-throughput.csv")[["elapsed_time"]]

# Summarise each run. After the transpose there is one row per described
# column and one column per summary value; `type` tags the row with its run.
constant_throughput_stats = (
    show_aggregate_stats(constant_throughput)
    .transpose()
    .assign(type="Constant Throughput")
)
concurrent_throughput_stats = (
    show_aggregate_stats(concurrent_throughput)
    .transpose()
    .assign(type="Concurrent Throughput")
)
correlated_prefix_throughput_stats = (
    show_aggregate_stats(correlated_prefix_throughput)
    .transpose()
    .assign(type="Correlated Prefix Throughput")
)

# Stack the three summaries and reshape to long form for the grouped bar chart:
# one row per (statistic, type, summary value).
summary_columns = ["statistic", "mean", "std", "50%", "75%", "95%", "99%", "type"]
stats = (
    pd.concat(
        [
            constant_throughput_stats,
            concurrent_throughput_stats,
            correlated_prefix_throughput_stats,
        ]
    )
    .reset_index()
    .rename(columns={"index": "statistic"})
    .loc[:, summary_columns]
    .melt(id_vars=["statistic", "type"], var_name="percentile", value_name="value")
)

# One figure per described column.
unique_statistics = stats["statistic"].unique()
for statistic in unique_statistics:
    filtered_stats = stats.loc[stats["statistic"] == statistic]
    fig = px.bar(
        filtered_stats,
        x="percentile",
        y="value",
        color="type",
        barmode="group",
        title=f"Throughput Statistics for {statistic}",
    )
    fig.update_layout(
        xaxis_title="Percentile",
        yaxis_title=f"Value ({statistic})",
        title_x=0.5,
        title_y=0.95,
        title_font={"size": 20},
        legend_title_text="IO Pattern",
    )
    fig.show()

# Runs once, after the loop, so only the figure built by the final iteration
# is exported. The frames above hold just `elapsed_time`, so the loop is
# expected to produce a single figure.
fig.write_image("./results/plots/read-query-throughput-statistics.png", width=1920, height=1080)

In [None]:
def show_aggregate_stats(df):
    """Summarise `df` via `describe`, requesting the 50/75/95/99th percentiles.

    Args:
        df: Object with a pandas-style `describe` method (callers in this
            notebook pass DataFrames).

    Returns:
        The result of `df.describe` for those percentiles; nothing is printed
        or displayed by this function itself.
    """
    requested_percentiles = [0.50, 0.75, 0.95, 0.99]
    return df.describe(percentiles=requested_percentiles)


# Load each insert-query throughput CSV and keep only its `elapsed_time` column.
constant_throughput = pd.read_csv("./results/insert-query-constant-throughput.csv")[["elapsed_time"]]
concurrent_throughput = pd.read_csv("./results/insert-query-concurrent-throughput.csv")[["elapsed_time"]]
correlated_prefix_throughput = pd.read_csv("./results/insert-query-correlated-prefix-throughput.csv")[["elapsed_time"]]

# Summarise each run. After the transpose there is one row per described
# column and one column per summary value; `type` tags the row with its run.
constant_throughput_stats = (
    show_aggregate_stats(constant_throughput)
    .transpose()
    .assign(type="Constant Throughput")
)
concurrent_throughput_stats = (
    show_aggregate_stats(concurrent_throughput)
    .transpose()
    .assign(type="Concurrent Throughput")
)
correlated_prefix_throughput_stats = (
    show_aggregate_stats(correlated_prefix_throughput)
    .transpose()
    .assign(type="Correlated Prefix Throughput")
)

# Stack the three summaries and reshape to long form for the grouped bar chart:
# one row per (statistic, type, summary value).
summary_columns = ["statistic", "mean", "std", "50%", "75%", "95%", "99%", "type"]
stats = (
    pd.concat(
        [
            constant_throughput_stats,
            concurrent_throughput_stats,
            correlated_prefix_throughput_stats,
        ]
    )
    .reset_index()
    .rename(columns={"index": "statistic"})
    .loc[:, summary_columns]
    .melt(id_vars=["statistic", "type"], var_name="percentile", value_name="value")
)

# One figure per described column.
unique_statistics = stats["statistic"].unique()
for statistic in unique_statistics:
    filtered_stats = stats.loc[stats["statistic"] == statistic]
    fig = px.bar(
        filtered_stats,
        x="percentile",
        y="value",
        color="type",
        barmode="group",
        title=f"Throughput Statistics for {statistic}",
    )
    fig.update_layout(
        xaxis_title="Percentile",
        yaxis_title=f"Value ({statistic})",
        title_x=0.5,
        title_y=0.95,
        title_font={"size": 20},
        legend_title_text="IO Pattern",
    )
    fig.show()

# Runs once, after the loop, so only the figure built by the final iteration
# is exported. The frames above hold just `elapsed_time`, so the loop is
# expected to produce a single figure.
fig.write_image("./results/plots/insert-query-throughput-statistics.png", width=1920, height=1080)