In [None]:
import polars as pl
import os
import time
import resource

# Log results to a file
def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    The appended line has the form
    ``Times: <time_elapsed>, Memory: <memory_usage>``. Missing parent
    directories are created first, so logging into a results folder that
    does not exist yet no longer fails with ``FileNotFoundError``.

    Args:
        result_file: Path of the text file to append to.
        time_elapsed: Elapsed-time value to record (formatted verbatim).
        memory_usage: Memory value to record (formatted verbatim).

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # A bare file name has an empty dirname; only create real directories.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")

    # Sanity check kept from the original: after a successful write the file
    # is expected to exist, so the error branch only fires if it vanished.
    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        print(f"Error: {result_file} was not created.")

# def get_memory_usage():
#     return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  # Memory in KB

# Base Medium Polars Queries with LazyFrame
def base_medium_polars_queries(log_dir, result_file):
    """Run the baseline "medium" benchmark queries over a parquet dataset.

    Each query sums one metric column grouped by a pair of keys, then sums
    that result again grouped by the first key only. The whole pipeline is
    built lazily and materialised with a single ``collect()``.

    Args:
        log_dir: Parquet source passed to ``pl.scan_parquet``.
        result_file: Text file that receives one line per query via
            ``log_results_to_file``.

    For every query the wall-clock time of building and collecting the
    pipeline and the ``estimated_size()`` of the collected result are logged
    and printed as ``medium,Q<n>,<time>,<size>``.
    """
    print(",base_medium_polars_queries")
    lazy_frame = pl.scan_parquet(log_dir)  # nothing is read until collect()

    group_pairs = [('op_type', 'service_class')]
    metric_columns = ['disk_io_size_bytes', 'disk_time']

    # Flatten the (key pair x metric) grid so queries are numbered in order.
    query_plan = [
        (outer_key, inner_key, metric)
        for outer_key, inner_key in group_pairs
        for metric in metric_columns
    ]

    for query_number, (outer_key, inner_key, metric) in enumerate(query_plan, start=1):
        started_at = time.time()

        # Stage 1: sum the metric per (outer_key, inner_key) pair.
        per_pair = lazy_frame.group_by([outer_key, inner_key]).agg(
            pl.col(metric).sum()
        )
        # Stage 2: roll the per-pair sums up to outer_key and execute the plan.
        per_outer = per_pair.group_by(outer_key).agg(pl.col(metric).sum())
        collected = per_outer.collect()

        elapsed = time.time() - started_at
        result_size = collected.estimated_size()

        log_results_to_file(result_file, elapsed, result_size)
        print(f"medium,Q{query_number},{elapsed},{result_size}")

# IOMAX Medium Polars Queries with LazyFrame
def iomax_medium_polars_queries(log_dir, result_file):
    """Run the "iomax" variant of the medium benchmark queries.

    For each pair of grouping keys, both metric columns are summed once per
    (outer, inner) key pair and materialised. Every per-column query then
    re-aggregates that eager frame by the outer key alone. The first query
    of each key pair therefore pays for the shared pre-aggregation, and its
    logged memory value is the process peak RSS right after that step.
    Subsequent queries for the same pair log a memory value of 0.

    Args:
        log_dir: Parquet source passed to ``pl.scan_parquet``.
        result_file: Text file that receives one line per query via
            ``log_results_to_file``.

    Each query is logged and printed as ``medium,Q<n>,<time>,<memory>``.
    """
    print(",iomax_medium_polars_queries")
    df = pl.scan_parquet(log_dir)  # Lazy loading of parquet
    query_index = 0
    combinations = [('op_type', 'service_class')]
    columns = ['disk_io_size_bytes', 'disk_time']

    for ix, iy in combinations:
        # Rebuilt for every key pair so a later pair never reuses a frame
        # that was grouped by a different pair's keys.
        pre_aggregated = None

        for col in columns:
            query_index += 1
            t1 = time.time()
            memory_usage = 0

            # Shared first-stage aggregation, done (and timed) only on the
            # first query of this key pair.
            if pre_aggregated is None:
                pre_aggregated = df.group_by([ix, iy]).agg(
                    [pl.col(metric).sum() for metric in columns]
                ).collect()
                # Peak resident set size of this process as reported by
                # getrusage; the unit is platform dependent (kilobytes on
                # Linux per the resource module docs).
                memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

            # Secondary group_by on the individual column. The result is
            # discarded; only the time it takes is of interest.
            pre_aggregated.group_by(ix).agg(pl.col(col).sum())
            time_elapsed = time.time() - t1

            # Log results
            log_results_to_file(result_file, time_elapsed, memory_usage)
            print(f"medium,Q{query_index},{time_elapsed},{memory_usage}")

# Example run: iomax medium queries over the 125m dataset, with results
# appended to the matching file in the parquet results folder.
iomax_medium_polars_queries(
    "../datasets_thesios_io_traces/dataset-125m",
    "results_rust_medium_queries_parquet/results_parquet_rust_iomax_medium_125m.txt",
)

,iomax_medium_polars_queries
