In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from zarr_benchmarks.parse_json_for_plots import get_benchmarks_dataframe

In [None]:
# Map each benchmarked package to the JSON file holding its example results.
package_paths_dict = dict(
    zarr_python_2="../../example_results/0001_zarr-python-v2.json",
    zarr_python_3="../../example_results/0002_zarr-python-v3.json",
    tensorstore="../../example_results/0003_tensorstore.json",
)
# Load all of the listed result files into the dataframe used by every plot below.
benchmarks_df = get_benchmarks_dataframe(package_paths_dict)

In [None]:
# Preview the first five rows of the loaded benchmark results.
benchmarks_df.head(n=5)

In [None]:
# Keep chunk sizes 64 and 128 only, and drop every row whose blosc shuffle
# setting is "bitshuffle" or "noshuffle".
has_selected_chunk_size = benchmarks_df["chunk_size"].isin([64, 128])
has_excluded_shuffle = benchmarks_df["blosc_shuffle"].isin(["bitshuffle", "noshuffle"])
read_write_benchmarks = benchmarks_df[has_selected_chunk_size & ~has_excluded_shuffle]

In [None]:
# Preview the first five rows that survived the chunk-size / shuffle filter.
read_write_benchmarks.head(n=5)

# Zarr-python v2 (read-write)

In [None]:
# Select the zarr-python v2 rows, then split them by benchmark group.
is_zarr_v2 = read_write_benchmarks["package"] == "zarr_python_2"
benchmarks_zarr_v2 = read_write_benchmarks.loc[is_zarr_v2]
write_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "write"]
read_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "read"]

In [None]:
# Narrow both zarr-python v2 subsets down to the chunk-size-128 rows.
write_zarr_v2_chunks_128 = write_zarr_v2.loc[write_zarr_v2["chunk_size"] == 128]
read_zarr_v2_chunks_128 = read_zarr_v2.loc[read_zarr_v2["chunk_size"] == 128]

In [None]:
# Scatter of mean write time against compression ratio (zarr-python v2, chunk
# size 128). Compressor sets colour and marker; compression level sets marker size.
# Author's observation: mean write time rises as compression ratio rises.
write_scatter_kwargs = dict(
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    height=4,
    aspect=1.5,
)
graph = sns.relplot(data=write_zarr_v2_chunks_128, **write_scatter_kwargs)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Mean write time against compression ratio again, this time with a
# logarithmic x-axis so the fast and slow compressors are both readable.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.5,
    height=4,
)
# FacetGrid.set returns the grid itself, so the label call can be chained.
graph.set(xscale="log").set_axis_labels(
    x_var="Mean write time (s)", y_var="Compression ratio"
)

In [None]:
# Zoom in on the left-hand side of the write-time plot: drop the gzip and zstd
# rows and limit the x-axis to the 0-5 s range.
is_gzip_or_zstd = write_zarr_v2_chunks_128["compressor"].isin(["gzip", "zstd"])
blosc_only_write = write_zarr_v2_chunks_128[~is_gzip_or_zstd]
graph = sns.relplot(
    data=blosc_only_write,
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    facet_kws={"xlim": (0, 5)},
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Scatter of mean read time against compression ratio (zarr-python v2, chunk size 128).
# Author's observation: read time changes little with compression ratio, but
# differs significantly from one compressor to another.
read_scatter_kwargs = dict(
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    height=4,
    aspect=1.5,
)
graph = sns.relplot(data=read_zarr_v2_chunks_128, **read_scatter_kwargs)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Mean read time against compression ratio with a logarithmic x-axis.
# Author's observation: read time changes little with compression ratio, but
# differs significantly from one compressor to another.
graph = sns.relplot(
    data=read_zarr_v2_chunks_128,
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.5,
    height=4,
)
# FacetGrid.set returns the grid itself, so the label call can be chained.
graph.set(xscale="log").set_axis_labels(
    x_var="Mean read time (s)", y_var="Compression ratio"
)

In [None]:
# Plot the individual read measurements instead of their mean. "stats.data" is
# exploded to one row per element (presumably the raw timings of each
# benchmark - confirm against the JSON results).
#
# The exploded frame gets its own name: the original cell overwrote
# `read_zarr_v2_chunks_128`, so re-running any earlier read plot afterwards
# silently used the exploded rows. ignore_index=True gives the exploded rows a
# unique index (explode otherwise repeats the source row's label), and the cast
# restores a numeric dtype (explode leaves the column as object).
read_zarr_v2_chunks_128_runs = read_zarr_v2_chunks_128.explode(
    column="stats.data", ignore_index=True
).astype({"stats.data": float})

graph = sns.relplot(
    data=read_zarr_v2_chunks_128_runs,
    x="stats.data",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    kind="line",
    height=4,
    aspect=1.5,
)
graph.set(xscale="log")
# The x-axis shows single measurements, not means, so it is labelled accordingly.
graph.set_axis_labels("Read time (s)", "Compression ratio")

In [None]:
# Line plot of every individual read measurement against compression ratio,
# without splitting by compressor.
read_zarr_v2_chunks_128 = read_zarr_v2[read_zarr_v2.chunk_size == 128]

# One row per element of "stats.data". The original reset the index *before*
# exploding, so the exploded rows still shared index labels; ignore_index=True
# renumbers them after the explode instead. The cast restores a numeric dtype
# (explode leaves the column as object).
df = read_zarr_v2_chunks_128.explode(column="stats.data", ignore_index=True)
df = df.astype({"stats.data": float})

graph = sns.relplot(data=df, x="stats.data", y="compression_ratio", kind="line")
graph.set_axis_labels("Read time (s)", "Compression ratio")

In [None]:
# Mean read time against compression ratio (zarr-python v2, chunk size 128),
# with horizontal bars running from the minimum ("stats.min") to the maximum
# ("stats.max") recorded time of each benchmark.
read_zarr_v2_chunks_128 = read_zarr_v2[read_zarr_v2.chunk_size == 128]

df = read_zarr_v2_chunks_128
x = df["stats.mean"]
y = df["compression_ratio"]

# Axes.errorbar expects distances from the point rather than absolute limits:
# row 0 is how far each bar extends below x, row 1 how far above.
xerr_lower = x - df["stats.min"]
xerr_upper = df["stats.max"] - x
xerr = np.array([xerr_lower, xerr_upper])

fig, ax = plt.subplots(figsize=(7, 4))
ax.errorbar(x, y, xerr=xerr, fmt="o", markersize=2)
ax.set_xscale("log")
ax.set_xlabel("Mean read time (s)")
ax.set_ylabel("Compression ratio")
ax.set_title("Read time (min to max range) by compression ratio")
plt.show()

In [None]:
# Mean read time against compression ratio (zarr-python v2, chunk size 128),
# with horizontal bars of two standard deviations either side of the mean.
read_zarr_v2_chunks_128 = read_zarr_v2[read_zarr_v2.chunk_size == 128]

df = read_zarr_v2_chunks_128
x = df["stats.mean"]
y = df["compression_ratio"]

# The bars are symmetric, so a single array of half-widths is sufficient.
xerr = 2 * df["stats.stddev"].to_numpy()

fig, ax = plt.subplots(figsize=(7, 4))
ax.errorbar(x, y, xerr=xerr, fmt="o", markersize=2)
ax.set_xlabel("Mean read time (s)")
ax.set_ylabel("Compression ratio")
ax.set_title("Read time (mean +/- 2 std dev) by compression ratio")
plt.show()

In [None]:
# Faceted scatter of mean read time against compression ratio for chunk size
# 128, one panel per compressor. The settings are held in plain variables so
# they can be changed without touching the plotting call.
read_zarr_v2_chunks_128 = read_zarr_v2.loc[read_zarr_v2["chunk_size"] == 128]

data = read_zarr_v2_chunks_128
x_axis, y_axis = "stats.mean", "compression_ratio"
col = "compressor"
plot_name = "read_zarr_v2_chunks_128"
hue = size = title = None  # no colour/size encoding and no figure title

if col is not None:
    # Faceted: panels do not share axis limits. Wrap after two columns when
    # there are fewer than three panels, otherwise after three.
    facet_kws = {"sharex": False, "sharey": False}
    col_wrap = 2 if data[col].nunique(dropna=False) < 3 else 3
    plot_name += "_subplots"
else:
    # Single panel: no facet options, and the plot name is left as it is.
    facet_kws = col_wrap = None

graph = sns.relplot(
    data=data,
    x=x_axis,
    y=y_axis,
    col=col,
    col_wrap=col_wrap,
    hue=hue,
    style=hue,  # marker style follows the hue variable
    size=size,
    facet_kws=facet_kws,
    height=4,
    aspect=1.5,
)


# Overlay horizontal error bars (mean +/- 2 standard deviations) on every facet.
def add_error_bars(x, y, stddev=None, **kwargs):
    """Draw one facet's points with symmetric horizontal error bars.

    Intended for ``FacetGrid.map``, which calls this once per facet and passes
    that facet's rows of each requested column as positional arguments.

    Parameters
    ----------
    x, y : array-like
        Point coordinates for the current facet.
    stddev : array-like, optional
        Standard deviation of each point's x value, aligned with ``x``. The
        bars extend two standard deviations either side of ``x``. When omitted,
        only the markers are drawn.
    **kwargs
        Forwarded to ``Axes.errorbar`` (``FacetGrid.map`` supplies ``color``).
    """
    # The original read the spread from the whole global dataframe and kept its
    # first len(x) rows, so each facet could be given another facet's bars.
    xerr = None if stddev is None else 2 * np.asarray(stddev, dtype=float)
    plt.gca().errorbar(x, y, xerr=xerr, fmt="o", markersize=2, **kwargs)


# Routing "stats.stddev" through map() keeps it filtered to exactly the same
# rows as x and y in every facet.
graph.map(add_error_bars, x_axis, y_axis, "stats.stddev")

if title is not None:
    graph.figure.suptitle(title)
    graph.tight_layout()

In [None]:
# Line plot of mean read time against compression ratio, one line per
# compressor, requesting a 95% confidence interval.
# seaborn only draws the interval where several rows share the same x value, so
# with one row per benchmark the band may not be visible.
#
# sns.lineplot is axes-level: it returns a matplotlib Axes, which has no
# `set_axis_labels` method (that exists on FacetGrid only), so the scale and
# labels are set through Axes.set instead.
ax = sns.lineplot(
    data=read_zarr_v2_chunks_128,
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    errorbar=("ci", 95),  # 95% confidence interval
)
ax.set(xscale="log", xlabel="Mean read time (s)", ylabel="Compression ratio");

In [None]:
# Read-time scatter with the gzip and zstd rows removed.
is_gzip_or_zstd = read_zarr_v2_chunks_128["compressor"].isin(["gzip", "zstd"])
blosc_only_read = read_zarr_v2_chunks_128[~is_gzip_or_zstd]
graph = sns.relplot(
    data=blosc_only_read,
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Compression ratio against compression level, one panel per compressor
# (each panel keeps its own x-axis range).
# Author's observation: a higher compression level gives a higher compression ratio.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    y="compression_ratio",
    x="compression_level",
    hue="compressor",
    col="compressor",
    col_wrap=3,
    facet_kws={"sharex": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Compression ratio")

In [None]:
# Mean write time against compression level, one panel per compressor
# (panels share neither axis).
# Author's observation: more compression means a larger write time.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    y="stats.mean",
    x="compression_level",
    hue="compressor",
    col="compressor",
    col_wrap=3,
    facet_kws={"sharex": False, "sharey": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Mean write time (s)")

In [None]:
# Mean read time against compression level, one panel per compressor
# (panels share neither axis).
# Author's observation: a higher compression level does not always mean a
# higher read time (though it might for some compressors - more points needed).
graph = sns.relplot(
    data=read_zarr_v2_chunks_128,
    y="stats.mean",
    x="compression_level",
    hue="compressor",
    col="compressor",
    col_wrap=3,
    facet_kws={"sharex": False, "sharey": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Mean read time (s)")

In [None]:
# Mean write time against compression ratio, one panel per chunk size.
# Author's observation: higher compression ratios cost more write time,
# whatever the chunk size.
graph = sns.relplot(
    data=write_zarr_v2,
    col="chunk_size",
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Mean read time against compression ratio, one panel per chunk size.
graph = sns.relplot(
    data=read_zarr_v2,
    col="chunk_size",
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Keeping compression level the same, larger chunk sizes compress worse (WE'D NEED TO DO AT LEAST ONE MORE CHUNK SIZE FOR THIS GRAPH TO WORK)
# for compressor in read_zarr_v2.compressor.unique():
#     compressor_reads = read_zarr_v2[read_zarr_v2.compressor == compressor]
#     graph = sns.relplot(
#         data=compressor_reads,
#         x="chunk_size",
#         y="compression_ratio",
#         hue="compressor",
#         style="compressor",
#         col="compression_level",
#         height=4,
#         aspect=1.2,
#         col_wrap=3
#     )
#     graph.set_axis_labels("Chunk size", "Compression ratio")

In [None]:
# Show the column names of the benchmark dataframe.
benchmarks_df.columns

# Comparison between Python packages

In [None]:
# Split the filtered results (all packages) by benchmark group, then keep the
# chunk-size-128 rows of each group.
write_benchmarks = read_write_benchmarks.loc[read_write_benchmarks["group"] == "write"]
read_benchmarks = read_write_benchmarks.loc[read_write_benchmarks["group"] == "read"]
write_chunks_128 = write_benchmarks.loc[write_benchmarks["chunk_size"] == 128]
read_chunks_128 = read_benchmarks.loc[read_benchmarks["chunk_size"] == 128]

In [None]:
# Preview the first five chunk-size-128 write benchmarks.
write_chunks_128.head(n=5)

In [None]:
# Mean write time against compression ratio, one panel per package.
# Author's observation: zarr-python v3 and tensorstore seem quite a bit faster
# than zarr-python v2 for zstd/gzip.
graph = sns.relplot(
    data=write_chunks_128,
    col="package",
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# One figure per compressor, comparing the packages' mean write times.
# Author's observation: zarr-python v3 and tensorstore seem quite a bit faster
# than zarr-python v2.
for compressor in write_chunks_128.compressor.unique():
    compressor_writes = write_chunks_128[write_chunks_128.compressor == compressor]
    graph = sns.relplot(
        data=compressor_writes,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean write time (s)", "Compression ratio")
    # `FacetGrid.figure` is the supported accessor (`.fig` is deprecated); it
    # is also what the error-bar cell earlier in this notebook uses.
    graph.figure.suptitle(compressor)
    # Leave room above the axes so the suptitle does not overlap the plot.
    graph.figure.subplots_adjust(top=0.9)

In [None]:
# Mean read time against compression ratio, one panel per package.
# Author's observation: tensorstore is winning for read times.
graph = sns.relplot(
    data=read_chunks_128,
    col="package",
    y="compression_ratio",
    x="stats.mean",
    size="compression_level",
    style="compressor",
    hue="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# One figure per compressor, comparing the packages' mean read times.
# Author's note (carried over from the matching write-time cell - confirm it
# holds for reads): zarr-python v3 and tensorstore seem quite a bit faster than
# zarr-python v2.
for compressor in read_chunks_128.compressor.unique():
    compressor_reads = read_chunks_128[read_chunks_128.compressor == compressor]
    graph = sns.relplot(
        data=compressor_reads,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean read time (s)", "Compression ratio")
    # `FacetGrid.figure` is the supported accessor (`.fig` is deprecated); it
    # is also what the error-bar cell earlier in this notebook uses.
    graph.figure.suptitle(compressor)
    # Leave room above the axes so the suptitle does not overlap the plot.
    graph.figure.subplots_adjust(top=0.9)