In [None]:
import seaborn as sns
from rough_plots import get_benchmarks_dataframe

In [None]:
# Benchmark result files, one JSON file per benchmarked package.
zarr_v2_path = "../data/json/0007_zarr-python-v2.json"
zarr_v3_path = "../data/json/0008_zarr-python-v3.json"
tensorstore_path = "../data/json/0009_tensorstore.json"

# The i-th path is paired with the i-th package id, so keep both tuples in the same order.
result_paths = (zarr_v2_path, zarr_v3_path, tensorstore_path)
package_names = ("zarr_python_2", "zarr_python_3", "tensorstore")

# Load all three result files into a single dataframe.
benchmarks_df = get_benchmarks_dataframe(result_paths, package_ids=package_names)

In [None]:
# Preview the first rows of the loaded benchmarks dataframe.
benchmarks_df.head()

# Zarr-python v2

In [None]:
# Keep only the zarr-python v2 rows, then split them by benchmark group.
benchmarks_zarr_v2 = benchmarks_df.loc[benchmarks_df["package"] == "zarr_python_2"]
write_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "write"]
read_zarr_v2 = benchmarks_zarr_v2.loc[benchmarks_zarr_v2["group"] == "read"]

In [None]:
# Restrict the zarr-python v2 write/read benchmarks to a single chunk size (200).
write_zarr_v2_chunks_200 = write_zarr_v2.loc[write_zarr_v2["chunk_size"] == 200]
read_zarr_v2_chunks_200 = read_zarr_v2.loc[read_zarr_v2["chunk_size"] == 200]

In [None]:
# Observation: as compression ratio increases, so does mean write time.
# Mean write time (x) against compression ratio (y) for chunk size 200;
# compressor sets colour and marker, compression level sets marker size.
graph = sns.relplot(
    data=write_zarr_v2_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Observation: read time doesn't vary greatly with compression ratio, but it does
# vary significantly between compressors.
# Mean read time (x) against compression ratio (y) for chunk size 200;
# compressor sets colour and marker, compression level sets marker size.
graph = sns.relplot(
    data=read_zarr_v2_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.5,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Observation: higher compression level = higher compression ratio.
# One panel per compressor; x axes are not shared between panels.
graph = sns.relplot(
    data=write_zarr_v2_chunks_200,
    x="compression_level",
    y="compression_ratio",
    hue="compressor",
    col="compressor",
    facet_kws={"sharex": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Compression ratio")

In [None]:
# Observation: higher compression level = larger write time
# (the x axis here is the compression level, not the compression ratio).
# One panel per compressor; neither axis is shared between panels.
graph = sns.relplot(
    data=write_zarr_v2_chunks_200,
    x="compression_level",
    y="stats.mean",
    hue="compressor",
    col="compressor",
    facet_kws={"sharex": False, "sharey": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Mean write time (s)")

In [None]:
# Observation: a higher compression level doesn't always mean a higher read time
# (but maybe it does for some compressors? Would need more points...).
# One panel per compressor; neither axis is shared between panels.
graph = sns.relplot(
    data=read_zarr_v2_chunks_200,
    x="compression_level",
    y="stats.mean",
    hue="compressor",
    col="compressor",
    facet_kws={"sharex": False, "sharey": False},
)
graph.set_axis_labels(x_var="Compression level", y_var="Mean read time (s)")

In [None]:
# Observation: higher write time for higher compression ratios (regardless of chunk size).
# Same scatter as for chunk size 200, but over all zarr-python v2 write benchmarks,
# with one panel per chunk size.
graph = sns.relplot(
    data=write_zarr_v2,
    x="stats.mean",
    y="compression_ratio",
    col="chunk_size",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Mean read time (x) against compression ratio (y) over all zarr-python v2 read
# benchmarks, with one panel per chunk size.
graph = sns.relplot(
    data=read_zarr_v2,
    x="stats.mean",
    y="compression_ratio",
    col="chunk_size",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Observation: keeping compression level the same, larger chunk sizes compress worse.
# One figure per compressor, with one panel per compression level.
for compressor in read_zarr_v2["compressor"].unique():
    compressor_reads = read_zarr_v2.loc[read_zarr_v2["compressor"] == compressor]
    graph = sns.relplot(
        data=compressor_reads,
        x="chunk_size",
        y="compression_ratio",
        col="compression_level",
        hue="compressor",
        style="compressor",
        aspect=1.2,
        height=4,
    )
    graph.set_axis_labels(x_var="Chunk size", y_var="Compression ratio")

In [None]:
# Show the column names of the benchmarks dataframe.
benchmarks_df.columns

# Comparison between Python packages

In [None]:
# Split the benchmarks of all packages by group, then keep only chunk size 200.
write_benchmarks = benchmarks_df.loc[benchmarks_df["group"] == "write"]
read_benchmarks = benchmarks_df.loc[benchmarks_df["group"] == "read"]
write_chunks_200 = write_benchmarks.loc[write_benchmarks["chunk_size"] == 200]
read_chunks_200 = read_benchmarks.loc[read_benchmarks["chunk_size"] == 200]

In [None]:
# Preview the first rows of the write benchmarks with chunk size 200.
write_chunks_200.head()

In [None]:
# Observation: zarr python v3 and tensorstore seem quite a bit faster than
# zarr python v2 for zstd/gzip.
# Mean write time (x) against compression ratio (y) for chunk size 200,
# with one panel per package.
graph = sns.relplot(
    data=write_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    col="package",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean write time (s)", y_var="Compression ratio")

In [None]:
# Observation: zarr python v3 and tensorstore seem quite a bit faster than zarr python v2.
# One figure per compressor (chunk size 200 write benchmarks); the package sets
# colour and marker, the compression level sets marker size.
for compressor in write_chunks_200.compressor.unique():
    compressor_writes = write_chunks_200[write_chunks_200.compressor == compressor]
    graph = sns.relplot(
        data=compressor_writes,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean write time (s)", "Compression ratio")
    # `FacetGrid.figure` replaces the deprecated `FacetGrid.fig` alias.
    graph.figure.suptitle(compressor)
    # Lower the top of the axes area to leave room for the figure-level title.
    graph.figure.subplots_adjust(top=0.9)

In [None]:
# Observation: Tensorstore is winning for read times!
# Mean read time (x) against compression ratio (y) for chunk size 200,
# with one panel per package.
graph = sns.relplot(
    data=read_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    col="package",
    size="compression_level",
    hue="compressor",
    style="compressor",
    aspect=1.2,
    height=4,
)
graph.set_axis_labels(x_var="Mean read time (s)", y_var="Compression ratio")

In [None]:
# Observation (reads): zarr python v3 and tensorstore seem quite a bit faster than zarr python v2.
# One figure per compressor (chunk size 200 read benchmarks); the package sets
# colour and marker, the compression level sets marker size.
for compressor in read_chunks_200.compressor.unique():
    compressor_reads = read_chunks_200[read_chunks_200.compressor == compressor]
    graph = sns.relplot(
        data=compressor_reads,
        x="stats.mean",
        y="compression_ratio",
        hue="package",
        style="package",
        size="compression_level",
        height=4,
        aspect=1.2,
    )
    graph.set_axis_labels("Mean read time (s)", "Compression ratio")
    # `FacetGrid.figure` replaces the deprecated `FacetGrid.fig` alias.
    graph.figure.suptitle(compressor)
    # Lower the top of the axes area to leave room for the figure-level title.
    graph.figure.subplots_adjust(top=0.9)