In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [61]:
import os

import polars as pl

from src.local.configs import BenchmarksConfig
from src.local.manager import BenchmarksManager
from src.local.settings import REPORT_FILE_NAME, REPORT_FOLDER_NAME, ROOT_FOLDER

In [0]:
# Load the benchmark configuration from a JSON file (the path is relative to
# the notebook's working directory) and pass it to the manager.
cfg = BenchmarksConfig.load_from_json("../../benchmarks_config/local/benchmarks.json")
manager = BenchmarksManager(configs=cfg)
# NOTE(review): presumably this executes the benchmarks and produces the CSV
# report that a later cell reads — confirm against BenchmarksManager.run.
result = manager.run()
# Uncomment to inspect the raw return value of the run:
# print(result)


In [62]:
# Build the report location from the project settings, then load the CSV.
report_path = os.path.join(REPORT_FOLDER_NAME, REPORT_FILE_NAME)
report_df = pl.read_csv(report_path)
# report_df

In [46]:
def get_folder_size(path: str, max_files: int = 1_000_000) -> float:
    """Return the size of a folder in GB (GiB, 1024**3 bytes), including subfolders.

    Walks ``path`` recursively and sums the sizes of the files found, stopping
    once ``max_files`` files have been counted.

    Args:
        path: Folder to measure. A path that does not exist yields ``0.0``
            because ``os.walk`` produces nothing for it.
        max_files: Maximum number of files to include in the total. A value
            of zero or less counts no files at all.

    Returns:
        The accumulated size in GB, rounded to two decimal places.
    """
    # A non-positive limit means "count nothing"; without this guard the loop
    # below would still add one file before noticing the limit.
    if max_files <= 0:
        return 0.0

    total_bytes = 0
    remaining = max_files

    for root, _dirs, files in os.walk(path):
        for name in files:
            try:
                # getsize raises OSError for broken symlinks and for files that
                # vanish or become unreadable mid-walk; skip those entries
                # instead of checking existence first (which is racy).
                total_bytes += os.path.getsize(os.path.join(root, name))
            except OSError:
                continue

            remaining -= 1
            if remaining <= 0:
                return round(total_bytes / 1024**3, 2)

    return round(total_bytes / 1024**3, 2)

In [63]:
# Derive the analysis columns and keep only those used by the aggregation cells.
# NOTE: this cell overwrites `report_df`, and the final `select` drops
# `base_dir` and `n_files`, so it cannot be re-run on its own — re-run the cell
# that reads the CSV first.
report_df = report_df.with_columns(
    # Size (GB) of each row's data folder; `n_files` caps how many files are summed.
    pl.struct(["base_dir", "n_files"]).map_elements(
        lambda row: get_folder_size(
            os.path.join(ROOT_FOLDER, row["base_dir"]),
            row["n_files"],
        ),
        # get_folder_size returns a rounded float; declaring the dtype avoids
        # polars having to infer it from the first results.
        return_dtype=pl.Float64,
    ).alias("size"),
    # Label each (cpu_count, memory_gb) combination; rows matching none of the
    # listed combinations get a null `type`.
    (
        pl.when((pl.col("cpu_count") == 2) & (pl.col("memory_gb") == 8)).then(pl.lit("case_4"))
        .when((pl.col("cpu_count") == 4) & (pl.col("memory_gb") == 16)).then(pl.lit("case_3"))
        .when((pl.col("cpu_count") == 8) & (pl.col("memory_gb") == 32)).then(pl.lit("case_2"))
        .when((pl.col("cpu_count") == 16) & (pl.col("memory_gb") == 64)).then(pl.lit("case_1"))
    ).alias("type"),
    # A recorded duration of exactly 0.0 is flagged as an unsuccessful run (0);
    # anything else is flagged as successful (1).
    pl.when(pl.col("polars(s)") == 0.0).then(pl.lit(0)).otherwise(pl.lit(1)).alias("polars_success"),
    pl.when(pl.col("spark(s)") == 0.0).then(pl.lit(0)).otherwise(pl.lit(1)).alias("spark_success"),
).select(
    "type",
    "size",
    "task",
    pl.col("polars(s)").alias("polars_duration"),
    pl.col("spark(s)").alias("spark_duration"),
    "polars_success",
    "spark_success",
)




In [74]:
# Mean duration per task for each engine, computed over successful runs only.
groups = ["task"]

polars_avg = (
    report_df
    .filter(pl.col("polars_success") == 1)
    .group_by(groups)
    .agg(pl.col("polars_duration").mean().alias("polars_avg_duration"))
)
spark_avg = (
    report_df
    .filter(pl.col("spark_success") == 1)
    .group_by(groups)
    .agg(pl.col("spark_duration").mean().alias("spark_avg_duration"))
)

# Default (inner) join: a task is listed only when both sides have a row for it.
polars_avg.join(spark_avg, on=groups)


task,polars_avg_duration,spark_avg_duration
str,f64,f64
"""join_adjacent_days""",0.646206,2.635587
"""group_by_model""",33.578412,74.517488
"""quarterly_stats""",13.37535,72.870587
"""union_and_aggregate""",33.207794,72.449881


In [84]:
# Share of unsuccessful runs per task, in percent, for each engine.
groups = ["task"]


def _failure_pct(flag_column: str) -> pl.Expr:
    """Percentage of rows in a group whose 0/1 success flag is 0."""
    flags = pl.col(flag_column)
    return (flags.count() - flags.sum()) / flags.count() * 100


polars_failures = report_df.group_by(groups).agg(
    _failure_pct("polars_success").alias("polars_failer(%)")
)
spark_failures = report_df.group_by(groups).agg(
    _failure_pct("spark_success").alias("spark_failer(%)")
)

polars_failures.join(spark_failures, on=groups)


task,polars_failer(%),spark_failer(%)
str,f64,f64
"""join_adjacent_days""",0.0,0.0
"""union_and_aggregate""",0.0,0.0
"""quarterly_stats""",25.0,0.0
"""group_by_model""",0.0,0.0
