In [None]:
import seaborn as sns

from zarr_benchmarks.parse_json_for_plots import get_benchmarks_dataframe

In [None]:
# Benchmark result files from the "full_test_run_2" run, keyed by the name of
# the package that produced them.
package_paths_dict = dict(
    zarr_python_2="../../data/json/full_test_run_2/0005_zarr-python-v2.json",
    zarr_python_3="../../data/json/full_test_run_2/0006_zarr-python-v3.json",
    tensorstore="../../data/json/full_test_run_2/0007_tensorstore.json",
)

# Parse the listed files into the dataframe used by every cell below.
benchmarks_df = get_benchmarks_dataframe(package_paths_dict)

In [None]:
# Preview the first rows of the full benchmarks dataframe (5 is the pandas default).
benchmarks_df.head(n=5)

In [None]:
# Keep only runs with chunk size 64 or 128 whose blosc shuffle setting is
# neither "bitshuffle" nor "noshuffle".
read_write_benchmarks = benchmarks_df.loc[
    benchmarks_df["chunk_size"].isin([64, 128])
    & ~benchmarks_df["blosc_shuffle"].isin(["bitshuffle", "noshuffle"])
]

In [None]:
# Preview the first rows of the filtered dataframe (5 is the pandas default).
read_write_benchmarks.head(n=5)

# Zarr-python v2 (read-write)

In [None]:
# Restrict the filtered benchmarks to the zarr-python v2 runs ...
benchmarks_zarr_v2 = read_write_benchmarks.loc[
    read_write_benchmarks["package"] == "zarr_python_2"
]
# ... and split them by benchmark group.
write_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "write"]
read_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "read"]

In [None]:
# Zarr-python v2 runs at chunk size 128 only, for writes and reads respectively.
write_zarr_v2_chunks_128 = write_zarr_v2.loc[write_zarr_v2["chunk_size"] == 128]
read_zarr_v2_chunks_128 = read_zarr_v2.loc[read_zarr_v2["chunk_size"] == 128]

In [None]:
# Zarr-python v2 writes (chunk size 128): compression ratio against mean write
# time. Colour and marker both encode the compressor; marker size encodes the
# compression level.
# Observation: as compression ratio increases, so does mean write time.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Same write plot as above, but with a logarithmic x-axis.
# Observation: as compression ratio increases, so does mean write time.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
).set(xscale="log")
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Blosc-only writes: drop the gzip and zstd rows, and limit the x-axis to
# 0-5 s so the points on the left-hand side can be seen.
blosc_only_write = write_zarr_v2_chunks_128.loc[
    ~write_zarr_v2_chunks_128["compressor"].isin(["gzip", "zstd"])
]
graph = sns.relplot(
    data=blosc_only_write,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
    facet_kws={"xlim": (0, 5)},
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Zarr-python v2 reads (chunk size 128): compression ratio against mean read
# time, with the same compressor / compression-level encodings as the write plots.
# Observation: read time doesn't vary greatly with compression ratio, but it
# does vary significantly between compressors.
graph = sns.relplot(
    data=read_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Same read plot as above, but with a logarithmic x-axis.
# Observation: read time doesn't vary greatly with compression ratio, but it
# does vary significantly between compressors.
graph = sns.relplot(
    data=read_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
).set(xscale="log")
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Blosc-only reads: the read plot again, with the gzip and zstd rows dropped.
blosc_only_read = read_zarr_v2_chunks_128.loc[
    ~read_zarr_v2_chunks_128["compressor"].isin(["gzip", "zstd"])
]
graph = sns.relplot(
    data=blosc_only_read,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Compression ratio against compression level, one panel per compressor
# (three panels per row, each with its own x-axis).
# Observation: higher compression level = higher compression ratio.
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="compression_level",
    y="compression_ratio",
    # one colour and one panel per compressor
    hue="compressor",
    col="compressor",
    col_wrap=3,
    facet_kws={"sharex": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Compression ratio")

In [None]:
# Mean write time against compression level, one panel per compressor; each
# panel has its own x- and y-axis.
# Observation: higher compression level = larger write time.
# (The earlier note here said "compression ratio", but this plot's x-axis is
# compression_level.)
graph = sns.relplot(
    data=write_zarr_v2_chunks_128,
    x="compression_level",
    y="stats.mean",
    col="compressor",
    hue="compressor",
    facet_kws=dict(sharex=False, sharey=False),
    col_wrap=3,
)
graph.set_axis_labels("Compression level", "Mean write time (s)")

In [None]:
# Mean read time against compression level, one panel per compressor; each
# panel has its own x- and y-axis.
# Observation: a higher compression level doesn't always mean a higher read
# time (but maybe it does for some compressors? Would need more points...).
graph = sns.relplot(
    data=read_zarr_v2_chunks_128,
    kind="scatter",
    # axes
    x="compression_level",
    y="stats.mean",
    # one colour and one panel per compressor
    hue="compressor",
    col="compressor",
    col_wrap=3,
    facet_kws={"sharex": False, "sharey": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Mean read time (s)")

In [None]:
# Zarr-python v2 writes for every retained chunk size, one panel per chunk size.
# Observation: higher write time for higher compression ratios, regardless of
# chunk size.
graph = sns.relplot(
    data=write_zarr_v2,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    col="chunk_size",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Zarr-python v2 reads for every retained chunk size, one panel per chunk size:
# compression ratio against mean read time.
graph = sns.relplot(
    data=read_zarr_v2,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    col="chunk_size",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Keeping the compression level the same, larger chunk sizes compress worse. (TODO: this graph is disabled because we'd need at least one more chunk size for it to work.)
# for compressor in read_zarr_v2.compressor.unique():
#     compressor_reads = read_zarr_v2[read_zarr_v2.compressor == compressor]
#     graph = sns.relplot(
#         data=compressor_reads,
#         x="chunk_size",
#         y="compression_ratio",
#         hue="compressor",
#         style="compressor",
#         col="compression_level",
#         height=4,
#         aspect=1.2,
#         col_wrap=3
#     )
#     graph.set_axis_labels("Chunk size", "Compression ratio")

In [None]:
# Show the column names available in the full benchmarks dataframe.
benchmarks_df.columns

# Comparison between python packages

In [None]:
# Split the filtered benchmarks (all packages) by benchmark group ...
write_benchmarks = read_write_benchmarks.loc[read_write_benchmarks["group"] == "write"]
read_benchmarks = read_write_benchmarks.loc[read_write_benchmarks["group"] == "read"]
# ... then keep only the chunk-size-128 runs of each.
write_chunks_128 = write_benchmarks.loc[write_benchmarks["chunk_size"] == 128]
read_chunks_128 = read_benchmarks.loc[read_benchmarks["chunk_size"] == 128]

In [None]:
# Preview the first rows of the chunk-size-128 writes (5 is the pandas default).
write_chunks_128.head(n=5)

In [None]:
# Writes at chunk size 128, one panel per package: compression ratio against
# mean write time.
# Observation: zarr python v3 and tensorstore seem quite a bit faster than
# zarr python v2 for zstd/gzip.
graph = sns.relplot(
    data=write_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    col="package",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Per-compressor package comparison for writes (chunk size 128): one figure
# per compressor, with colour and marker both encoding the package and marker
# size encoding the compression level.
# Observation: zarr python v3 and tensorstore seem quite a bit faster than
# zarr python v2.
for compressor in write_chunks_128.compressor.unique():
    compressor_writes = write_chunks_128[write_chunks_128.compressor == compressor]
    graph = sns.relplot(
        data=compressor_writes,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean write time (s)", "Compression ratio")
    # Use `FacetGrid.figure`; `FacetGrid.fig` is a deprecated alias for it.
    graph.figure.suptitle(compressor)
    # Leave headroom at the top so the figure title does not overlap the axes.
    graph.figure.subplots_adjust(top=0.9)

In [None]:
# Reads at chunk size 128, one panel per package: compression ratio against
# mean read time.
# Observation: Tensorstore is winning for read times!
graph = sns.relplot(
    data=read_chunks_128,
    kind="scatter",
    # axes
    x="stats.mean",
    y="compression_ratio",
    # semantic encodings
    hue="compressor",
    style="compressor",
    size="compression_level",
    # figure layout
    col="package",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Per-compressor package comparison for reads (chunk size 128): one figure
# per compressor, with colour and marker both encoding the package and marker
# size encoding the compression level.
# Observation (as originally noted): zarr python v3 and tensorstore seem quite
# a bit faster than zarr python v2.
# NOTE(review): this note is word-for-word the one on the write comparison -
# confirm it also holds for read times.
for compressor in read_chunks_128.compressor.unique():
    compressor_reads = read_chunks_128[read_chunks_128.compressor == compressor]
    graph = sns.relplot(
        data=compressor_reads,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean read time (s)", "Compression ratio")
    # Use `FacetGrid.figure`; `FacetGrid.fig` is a deprecated alias for it.
    graph.figure.suptitle(compressor)
    # Leave headroom at the top so the figure title does not overlap the axes.
    graph.figure.subplots_adjust(top=0.9)