In [0]:
from pyspark.sql import SparkSession, functions as f

class SparkBenchmark:
    """Scalable benchmark suite for Apache Spark with diverse task coverage.

    Every public callable attribute of an instance is treated as a benchmark
    task and is exposed through the ``tasks`` mapping (name -> bound method).
    Each task returns the DataFrame it builds; none of them calls an action
    such as ``collect`` or ``show`` on that result, so the caller decides how
    to materialize (and time) it.
    """

    def __init__(self, spark: SparkSession, tasks: "list[str] | None" = None):
        """Store the session and select which benchmark tasks to expose.

        Args:
            spark: Session used by every task to read its CSV input.
            tasks: Names of the benchmark methods to enable. ``None`` (the
                default) or any collection containing ``"*"`` enables all of
                them. Names that match no public method are ignored.
        """
        if tasks is None:
            tasks = ["*"]
        select_all = "*" in tasks

        self._spark = spark

        # Discover tasks by introspection. ``self.tasks`` is assigned only
        # after the scan, so the mapping can never list itself.
        selected = {}
        for bm_name in dir(self):
            if bm_name.startswith("_"):
                continue
            if not (select_all or bm_name in tasks):
                continue
            member = getattr(self, bm_name)
            if callable(member):
                selected[bm_name] = member
        self.tasks = selected

    def _read_multiple_csv_files(self, data_path: str):
        """Read the CSV data at ``data_path`` into a DataFrame.

        The first line is used as the header and column types are inferred
        from the data. ``data_path`` is handed to ``spark.read.csv`` as-is
        (presumably a folder holding several CSV files -- confirm against the
        callers). All benchmarks load their input through this helper so they
        share identical reader options.
        """
        return self._spark.read.csv(data_path, header=True, inferSchema=True)

    def group_by_model(self, data_path: str):
        """
        Benchmark type: Aggregation + statistics.
        Measures group-by and aggregation kernel efficiency.

        Args:
            data_path: Location of the CSV data to aggregate.

        Returns:
            A DataFrame with one row per ``model`` holding the mean and
            standard deviation of ``smart_5_raw``, the number of distinct
            ``serial_number`` values and the sum of ``failure``, ordered by
            the mean in descending order.
        """
        df = self._read_multiple_csv_files(data_path)
        result = (
            df.groupBy("model")
            .agg(
                f.mean("smart_5_raw").alias("avg_smart_5_raw"),
                f.stddev("smart_5_raw").alias("std_smart_5_raw"),
                f.countDistinct("serial_number").alias("unique_disks"),
                f.sum("failure").alias("failures_total"),
            )
            .orderBy(f.desc("avg_smart_5_raw"))
        )
        return result

    def union_and_aggregate(self, data_folders: list[str]):
        """
        Benchmark type: Multi-file union + global reduction.
        Tests concatenation (unionAll) performance and overall aggregation.

        Args:
            data_folders: At least two CSV locations to concatenate.

        Returns:
            A single-row DataFrame with the mean of ``smart_5_raw`` and
            ``smart_187_raw``, the sum of ``failure``, the record count and
            ``failure_rate`` (total failures divided by total records).

        Raises:
            ValueError: If fewer than two locations are given.
        """
        if len(data_folders) < 2:
            raise ValueError("At least two files are required for union test.")

        first_path, *other_paths = data_folders

        base_df = self._read_multiple_csv_files(first_path)
        for data_path in other_paths:
            next_df = self._read_multiple_csv_files(data_path)
            # Union by column name rather than position; allowMissingColumns
            # lets inputs whose column sets differ still be combined.
            base_df = base_df.unionByName(next_df, allowMissingColumns=True)

        result = (
            base_df.agg(
                f.mean("smart_5_raw").alias("mean_smart_5"),
                f.mean("smart_187_raw").alias("mean_smart_187"),
                f.sum("failure").alias("total_failures"),
                f.count("*").alias("records_total"),
            )
            .withColumn(
                "failure_rate",
                f.col("total_failures") / f.col("records_total"),
            )
        )
        return result

    def join_adjacent_days(self, data_folders: list[str]):
        """
        Benchmark type: Join between adjacent files.
        Tests join performance and shuffle behavior (Spark equivalent of Polars version).

        The first input contributes ``serial_number``, ``smart_5_raw`` and
        ``date``. Every following input ``i`` (1-based) is inner-joined on the
        serial number and contributes ``smart_5_raw_next_{i}`` and
        ``date_next_{i}``.

        Args:
            data_folders: At least two CSV locations, joined in the given order.

        Returns:
            The joined DataFrame plus one ``smart_delta_{k}`` column, where
            ``k`` is the index of the last input: that input's ``smart_5_raw``
            minus the first input's. No delta is added for intermediate inputs.

        Raises:
            ValueError: If fewer than two locations are given.
        """
        if len(data_folders) < 2:
            raise ValueError("At least two files are required for join test.")

        first_path, *other_paths = data_folders

        base_df = (
            self._read_multiple_csv_files(first_path)
            .select("serial_number", "smart_5_raw", "date")
        )

        for i, data_path in enumerate(other_paths, start=1):
            # Rename every column so repeated joins never produce duplicate
            # column names in the accumulated result.
            next_df = (
                self._read_multiple_csv_files(data_path)
                .select(
                    f.col("serial_number").alias("serial_number_next"),
                    f.col("smart_5_raw").alias(f"smart_5_raw_next_{i}"),
                    f.col("date").alias(f"date_next_{i}"),
                )
            )

            base_df = (
                base_df.join(
                    next_df,
                    base_df.serial_number == next_df.serial_number_next,
                    "inner",
                )
                .drop("serial_number_next")
            )

        # Index of the last joined input, computed explicitly instead of
        # relying on the loop variable still being bound after the loop.
        last_suffix = f"_{len(data_folders) - 1}"
        base_df = base_df.withColumn(
            f"smart_delta{last_suffix}",
            f.col(f"smart_5_raw_next{last_suffix}") - f.col("smart_5_raw"),
        )

        return base_df