In [None]:
import json
import pandas as pd
import seaborn as sns

In [None]:
def load_benchmarks_json(path_to_file: str) -> dict:
    """Read a benchmark results file and return its parsed JSON content.

    Args:
        path_to_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform locale, which can mis-decode
    # non-ASCII content on some systems.
    with open(path_to_file, "r", encoding="utf-8") as in_file_obj:
        # parse straight from the file object rather than buffering the text
        return json.load(in_file_obj)


def prepare_benchmarks_dataframe(json_dict: dict) -> pd.DataFrame:
    """Flatten benchmark JSON into a tidy dataframe ready for plotting.

    Args:
        json_dict: Parsed benchmark results. Must contain a "benchmarks"
            list whose entries flatten to "group", "params.*",
            "extra_info.compression_ratio" and "stats.*" columns.

    Returns:
        A dataframe with the columns "group", "compressor",
        "compression_level", "compression_ratio", "chunk_size" followed by
        every "stats.*" column.

    Raises:
        KeyError: If "benchmarks", "group", "params.chunk_size" or
            "extra_info.compression_ratio" is missing from the input.
    """
    benchmark_df = pd.json_normalize(json_dict["benchmarks"])

    # copy compression ratio from read benchmarks to write benchmarks:
    # rows sharing identical parameters all receive the group's maximum,
    # and dropna=False keeps rows whose unused parameters are NaN
    param_cols = [col for col in benchmark_df if col.startswith("params")]
    benchmark_df["compression_ratio"] = benchmark_df.groupby(
        param_cols, dropna=False, as_index=False
    )["extra_info.compression_ratio"].transform("max")

    # combine compression level columns for different compressors.
    # reindex() supplies an all-NaN column for any compressor family that is
    # absent from this benchmark run, instead of failing with a KeyError.
    level_cols = ["params.blosc_clevel", "params.gzip_level", "params.zstd_level"]
    levels = benchmark_df.reindex(columns=level_cols)
    benchmark_df["compression_level"] = levels.max(axis=1)

    # a row belongs to a compressor when that compressor's level is set
    is_blosc = levels["params.blosc_clevel"].notna()
    is_gzip = levels["params.gzip_level"].notna()
    is_zstd = levels["params.zstd_level"].notna()

    # create column for type of compressor
    benchmark_df["compressor"] = ""
    if is_blosc.any():
        # blosc wraps several codecs, so the codec name is part of the label
        benchmark_df.loc[is_blosc, "compressor"] = (
            "blosc-" + benchmark_df.loc[is_blosc, "params.blosc_cname"]
        )
    # gzip/zstd are assigned after blosc, preserving the original precedence
    benchmark_df.loc[is_gzip, "compressor"] = "gzip"
    benchmark_df.loc[is_zstd, "compressor"] = "zstd"

    # remove un-needed columns
    stats_cols = [col for col in benchmark_df if col.startswith("stats")]
    kept_cols = [
        "group",
        "compressor",
        "compression_level",
        "compression_ratio",
        "params.chunk_size",
    ] + stats_cols

    return benchmark_df[kept_cols].rename(
        columns={"params.chunk_size": "chunk_size"}
    )

In [None]:
# Load one benchmark results file and flatten it into the dataframe that the
# later cells filter and plot. The path is relative, so it is resolved
# against the kernel's working directory.
json_path = "../data/json/0007_zarr-python-v2.json"
json_dict = load_benchmarks_json(json_path)
benchmark_df = prepare_benchmarks_dataframe(json_dict)

In [None]:
# Preview the first rows to sanity-check the prepared columns
benchmark_df.head()

In [None]:
# Split the benchmarks by their "group" label so that write and read timings
# can be analysed separately
write_benchmarks = benchmark_df.loc[benchmark_df["group"] == "write"]
read_benchmarks = benchmark_df.loc[benchmark_df["group"] == "read"]

In [None]:
# Restrict each group to the runs that used a chunk size of 200
write_benchmarks_chunks_200 = write_benchmarks.loc[
    write_benchmarks["chunk_size"] == 200
]
read_benchmarks_chunks_200 = read_benchmarks.loc[
    read_benchmarks["chunk_size"] == 200
]

In [None]:
# Write benchmarks at chunk size 200: mean write time against compression ratio.
# Observation: as compression ratio increases, so does mean write time.
graph = sns.relplot(
    data=write_benchmarks_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    height=4,
    aspect=1.5,
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Mean write time (s)", "Compression ratio");

In [None]:
# Read benchmarks at chunk size 200: mean read time against compression ratio.
# Observation: read time doesn't vary greatly with compression ratio, but it
# does vary significantly between compressors.
graph = sns.relplot(
    data=read_benchmarks_chunks_200,
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    height=4,
    aspect=1.5,
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Mean read time (s)", "Compression ratio");

In [None]:
# Compression level against compression ratio, one panel per compressor
# (x axes are independent because the panels do not share them).
# Observation: higher compression level = higher compression ratio.
graph = sns.relplot(
    data=write_benchmarks_chunks_200,
    x="compression_level",
    y="compression_ratio",
    col="compressor",
    hue="compressor",
    facet_kws=dict(sharex=False),
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Compression level", "Compression ratio");

In [None]:
# Compression level against mean write time, one panel per compressor
# (neither axis is shared between panels).
# Observation: higher compression level = larger write time.
graph = sns.relplot(
    data=write_benchmarks_chunks_200,
    x="compression_level",
    y="stats.mean",
    col="compressor",
    hue="compressor",
    facet_kws=dict(sharex=False, sharey=False),
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Compression level", "Mean write time (s)");

In [None]:
# Compression level against mean read time, one panel per compressor
# (neither axis is shared between panels).
# Observation: a higher compression level doesn't always mean a higher read
# time (though it may for some compressors - more data points would be needed
# to tell).
graph = sns.relplot(
    data=read_benchmarks_chunks_200,
    x="compression_level",
    y="stats.mean",
    col="compressor",
    hue="compressor",
    facet_kws=dict(sharex=False, sharey=False),
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Compression level", "Mean read time (s)");

In [None]:
# All write benchmarks: mean write time against compression ratio, one panel
# per chunk size.
# Observation: higher write time for higher compression ratios (regardless of
# chunk size).
graph = sns.relplot(
    data=write_benchmarks,
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    col="chunk_size",
    height=4,
    aspect=1.2,
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Mean write time (s)", "Compression ratio");

In [None]:
# All read benchmarks: mean read time against compression ratio, one panel
# per chunk size.
graph = sns.relplot(
    data=read_benchmarks,
    x="stats.mean",
    y="compression_ratio",
    hue="compressor",
    style="compressor",
    size="compression_level",
    col="chunk_size",
    height=4,
    aspect=1.2,
)
# trailing ";" keeps the notebook from echoing the returned object's repr
graph.set_axis_labels("Mean read time (s)", "Compression ratio");

In [None]:
# Keeping compression level the same, larger chunk sizes compress worse
for compressor in benchmark_df["compressor"].unique():
    # one figure per compressor, with a panel for each compression level
    compressor_reads = read_benchmarks.loc[
        read_benchmarks["compressor"] == compressor
    ]
    graph = sns.relplot(
        data=compressor_reads,
        x="chunk_size",
        y="compression_ratio",
        hue="compressor",
        style="compressor",
        col="compression_level",
        height=4,
        aspect=1.2,
    ).set_axis_labels("Chunk size", "Compression ratio")

In [None]:
# Show the column names available in the prepared dataframe
benchmark_df.columns