In [None]:
import polars as pl
import time
import os
import itertools as it

# Columns the benchmark queries group by.
indices = ['filename', 'application', 'io_zone', 'redundancy_type']

# Unordered pairs of grouping columns, in itertools order; only the first
# five of the six possible pairs are benchmarked.
combinations = list(it.islice(it.combinations(indices, 2), 5))

def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    The line written is ``Times: <time_elapsed>, Memory: <memory_usage>``.
    The parent directory of ``result_file`` is created if it does not exist,
    so a measurement is not lost just because the output folder is missing.

    Args:
        result_file: Path of the text file to append to.
        time_elapsed: Elapsed time of the query, written as-is.
        memory_usage: Memory figure of the query, written as-is.

    Raises:
        OSError: If the directory or the file cannot be created or written.
            An error line is printed before the exception propagates.
    """
    try:
        parent_dir = os.path.dirname(result_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(result_file, 'a', encoding='utf-8') as f:
            f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")
    except OSError:
        # Checking os.path.exists() after a successful open() can never
        # fail; a failed write surfaces as OSError, so report it here.
        print(f"Error: {result_file} was not created.")
        raise
    print(f"File {result_file} created/updated successfully!")

def base_hard_polars_queries(log_dir, result_file):
    """Run the baseline "hard" queries directly against the CSV scan.

    For every column pair in ``combinations`` and each of the two metric
    columns, one query is built on the lazy scan, collected, timed, and
    logged through ``log_results_to_file``. A ``hard,Q<n>,<time>,<memory>``
    line is also printed per query.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Path of the file the measurements are appended to.
    """
    print(",base_hard_polars_queries")
    df = pl.scan_csv(log_dir)  # Lazy scan: work happens at .collect()
    query_index = 0
    for ix, iy in combinations:
        for col in ['request_io_size_bytes', 'disk_time']:
            query_index += 1
            # perf_counter is monotonic, so the measurement cannot be skewed
            # by system clock adjustments during a long-running query.
            t1 = time.perf_counter()

            res = (
                df.group_by(ix)
                .agg([
                    pl.col(iy).alias(iy),          # Keep the iy values of each ix group
                    pl.col(col).sum().alias(col)   # Per-ix-group sum of the metric
                ])
                .explode(iy)                      # One row per collected iy value
                .group_by(iy)
                .agg([
                    pl.col(ix).alias(ix),          # Keep the ix values of each iy group
                    pl.col(col).sum().alias(col)   # Sum the per-ix sums within each iy group
                ])
            ).collect(streaming=True)  # Materialize; this is the timed work

            time_elapsed = time.perf_counter() - t1
            memory_usage = res.estimated_size()  # Estimated size of the result frame

            log_results_to_file(result_file, time_elapsed, memory_usage)
            print(f"hard,Q{query_index},{time_elapsed},{memory_usage}")

def iomax_hard_polars_queries(log_dir, result_file):
    """Run the "hard" queries against a pre-aggregated, in-memory frame.

    On the first query the CSV scan is grouped by all ``indices`` columns
    with both metric columns summed, and the result is collected once. Every
    query (including the first) is then evaluated eagerly on that collected
    frame. Each query is timed, logged through ``log_results_to_file``, and
    printed as ``hard,Q<n>,<time>,<memory>``.

    The time of the first query includes building the pre-aggregated frame.
    The logged memory is the estimated size of that frame for the first
    query and 0 for all later ones.
    NOTE(review): the per-query result size is not recorded here, unlike in
    the baseline function -- confirm this is the intended metric.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Path of the file the measurements are appended to.
    """
    print(",iomax_hard_polars_queries")
    df = pl.scan_csv(log_dir)  # Lazy scan: work happens at .collect()
    pre_aggregated = None      # Built on the first query, reused afterwards
    query_index = 0
    for ix, iy in combinations:
        for col in ['request_io_size_bytes', 'disk_time']:
            query_index += 1
            # perf_counter is monotonic, so the measurement cannot be skewed
            # by system clock adjustments during a long-running query.
            t1 = time.perf_counter()
            memory_usage = 0
            if pre_aggregated is None:
                # Built inside the timed region so the one-off cost is
                # charged to the first query.
                pre_aggregated = df.group_by(indices).agg([
                    pl.col("request_io_size_bytes").sum(),
                    pl.col("disk_time").sum()
                ]).collect(streaming=True)
                memory_usage = pre_aggregated.estimated_size()

            # Only the elapsed time is measured; the result is discarded.
            res = (
                pre_aggregated.group_by(ix)
                .agg([
                    pl.col(iy).alias(iy),   # Keep the iy values of each ix group
                    pl.col(col).sum()       # Per-ix-group sum of the metric
                ])
                .explode(iy)                # One row per collected iy value
                .group_by(iy)
                .agg([
                    pl.col(ix).alias(ix),   # Keep the ix values of each iy group
                    pl.col(col).sum()       # Sum the per-ix sums within each iy group
                ])
            )

            time_elapsed = time.perf_counter() - t1

            log_results_to_file(result_file, time_elapsed, memory_usage)
            print(f"hard,Q{query_index},{time_elapsed},{memory_usage}")

# Run the baseline hard queries and append the measurements to the results file.
base_hard_polars_queries(
    log_dir="../datasets_thesios_io_traces/dataset-125m.csv",
    result_file="results_rust_hard_queries_csv/results_rust_plain_hard_125m.txt",
)

,base_hard_polars_queries
File results_rust_hard_queries_csv/results_rust_plain_hard_125m.txt created/updated successfully!
hard,Q1,819.9540143013,8000071592
