In [0]:
%pip install --upgrade "polars[rt64]"

In [0]:
import glob
from pathlib import Path
from typing import List

import polars as pl


class PolarsBenchmark:
    """
    Polars benchmark suite built on lazy CSV scans.

    Inputs may be single CSV files, folders (searched recursively) or glob
    patterns. Frames are combined with ``how="diagonal_relaxed"`` so that
    files with extra/missing columns can still be merged.

    Attributes:
        tasks: mapping of task name -> bound method for every selected
            public benchmark method of this instance.
    """

    # Characters that mark a path string as a glob pattern.
    _GLOB_CHARS = frozenset("*?[")

    def __init__(self, tasks: List[str] | None = None):
        """
        Select which benchmark tasks this instance exposes.

        Args:
            tasks: names of public methods to register. ``None``, an empty
                list, or any list containing ``"*"`` registers all of them.
                Names that do not match a public callable are ignored.
        """
        wanted = tasks or ["*"]
        run_all = "*" in wanted

        # Private helpers (leading underscore) are never registered as tasks.
        self.tasks = {
            name: getattr(self, name)
            for name in dir(self)
            if not name.startswith("_")
            and (run_all or name in wanted)
            and callable(getattr(self, name))
        }

    @staticmethod
    def _csv_files_under(root: Path) -> List[str]:
        """
        Return the sorted paths of all CSV files below ``root`` (recursive).

        The extension check is case-insensitive, matching the check used for
        a single-file path, and directories that merely end in ``.csv`` are
        skipped.
        """
        return sorted(
            str(entry)
            for entry in root.rglob("*")
            if entry.suffix.lower() == ".csv" and entry.is_file()
        )

    @staticmethod
    def _scan_files(files: List[str]) -> pl.LazyFrame:
        """
        Lazily scan every path in ``files`` and concatenate the scans.
        """
        scans = [pl.scan_csv(f, low_memory=True) for f in files]
        return pl.concat(scans, how="diagonal_relaxed")

    def _lazy_from_path(self, path: str) -> pl.LazyFrame:
        """
        Build a LazyFrame from a CSV location.

        ``path`` may be:
        - a folder (all CSV files below it, including nested subfolders)
        - a single ``.csv`` file
        - a glob pattern containing ``*``, ``?`` or ``[...]``

        Returns:
            A LazyFrame; multiple files are merged with schema-relaxed
            diagonal concatenation.

        Raises:
            FileNotFoundError: a folder holds no CSV files, or a pattern
                matches nothing.
            ValueError: ``path`` is not a folder, a CSV file, or a pattern.
        """
        p = Path(path)

        if p.is_dir():
            files = self._csv_files_under(p)
            if not files:
                raise FileNotFoundError(f"No CSV files found under {path}")
            return self._scan_files(files)

        # An existing literal file takes precedence over wildcard handling,
        # so file names that happen to contain "[" or "?" still load.
        if p.is_file() and p.suffix.lower() == ".csv":
            return pl.scan_csv(str(p), low_memory=True)

        if any(ch in self._GLOB_CHARS for ch in path):
            files = sorted(glob.glob(path))
            if not files:
                raise FileNotFoundError(f"No matching CSV for pattern {path}")
            return self._scan_files(files)

        raise ValueError(f"Path is neither file, folder, nor pattern: {path}")

    def _lazy_union(self, paths: List[str]) -> pl.LazyFrame:
        """
        Load every entry of ``paths`` and stack them into one LazyFrame.

        Each entry is resolved by ``_lazy_from_path``; the results are
        concatenated with ``how="diagonal_relaxed"``.
        """
        frames = [self._lazy_from_path(p) for p in paths]
        return pl.concat(frames, how="diagonal_relaxed")

    def group_by_model(self, data_path: str) -> pl.DataFrame:
        """
        Aggregate per ``model``: mean/std of ``smart_5_raw``, number of
        distinct ``serial_number`` values and the sum of ``failure``.

        Args:
            data_path: file, folder or glob pattern of CSV data.

        Returns:
            One row per model, sorted by ``avg_smart_5_raw`` descending.
        """
        # Same lenient casts as union_and_aggregate: after merging files with
        # differing inferred schemas these columns may not be numeric.
        lf = self._lazy_from_path(data_path).with_columns([
            pl.col("smart_5_raw").cast(pl.Float64, strict=False),
            pl.col("failure").cast(pl.Int64, strict=False),
        ])

        per_model = lf.group_by("model").agg([
            pl.col("smart_5_raw").mean().alias("avg_smart_5_raw"),
            pl.col("smart_5_raw").std().alias("std_smart_5_raw"),
            pl.col("serial_number").n_unique().alias("unique_disks"),
            pl.col("failure").sum().alias("failures_total"),
        ])

        return per_model.sort("avg_smart_5_raw", descending=True).collect()

    def union_and_aggregate(self, data_folders: List[str]) -> pl.DataFrame:
        """
        Union several inputs and compute global statistics over all rows.

        Args:
            data_folders: at least two files/folders/patterns.

        Returns:
            A single-row frame with ``mean_smart_5``, ``mean_smart_187``,
            ``total_failures``, ``records_total`` and ``failure_rate``
            (``total_failures / records_total``).

        Raises:
            ValueError: fewer than two inputs were given.
        """
        if len(data_folders) < 2:
            raise ValueError("At least two folders/files required")

        # strict=False: values that cannot be converted become null instead
        # of raising.
        lf = self._lazy_union(data_folders).with_columns([
            pl.col("smart_5_raw").cast(pl.Float64, strict=False),
            pl.col("smart_187_raw").cast(pl.Float64, strict=False),
            pl.col("failure").cast(pl.Int64, strict=False),
        ])

        totals = lf.select([
            pl.col("smart_5_raw").mean().alias("mean_smart_5"),
            pl.col("smart_187_raw").mean().alias("mean_smart_187"),
            pl.col("failure").sum().alias("total_failures"),
            pl.len().alias("records_total"),
        ])

        return totals.with_columns(
            (pl.col("total_failures") / pl.col("records_total")).alias("failure_rate")
        ).collect(engine="streaming")

    def join_adjacent_days(self, data_folders: List[str]) -> pl.DataFrame:
        """
        Inner-join the ``smart_5_raw`` readings of several inputs on
        ``serial_number`` and add per-input deltas.

        Input ``i`` contributes the column ``smart_5_raw_{i}``. For every
        ``i >= 1`` a column ``smart_delta_{i}`` is added; it is computed
        against input 0 (``smart_5_raw_{i} - smart_5_raw_0``), not against
        the previous input.

        Args:
            data_folders: at least two files/folders/patterns.

        Returns:
            One row per joined ``serial_number`` match with the value and
            delta columns described above.

        Raises:
            ValueError: fewer than two inputs were given.
        """
        if len(data_folders) < 2:
            raise ValueError("At least two folders/files required")

        def readings(source: str, index: int) -> pl.LazyFrame:
            # Keep only the join key and the reading, renamed per input.
            return self._lazy_from_path(source).select([
                pl.col("serial_number"),
                pl.col("smart_5_raw").cast(pl.Float64).alias(f"smart_5_raw_{index}"),
            ])

        joined = readings(data_folders[0], 0)

        # NOTE(review): if a serial number occurs more than once within an
        # input, the inner join multiplies rows -- confirm each input holds
        # one row per disk.
        for i, source in enumerate(data_folders[1:], start=1):
            joined = joined.join(
                readings(source, i),
                on="serial_number",
                how="inner",
            )

        deltas = [
            (pl.col(f"smart_5_raw_{i}") - pl.col("smart_5_raw_0")).alias(f"smart_delta_{i}")
            for i in range(1, len(data_folders))
        ]

        return joined.with_columns(deltas).collect(engine="streaming")
