In [None]:
import polars as pl
import os
import time
import sys
import resource

# Log results to a file
def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    The line written is ``Times: <time_elapsed>, Memory: <memory_usage>``
    followed by a newline. Missing parent directories are created first, so
    pointing at a results folder that does not exist yet no longer aborts
    the run with ``FileNotFoundError``.

    Args:
        result_file: Path of the text file to append to.
        time_elapsed: Elapsed-time value to record (formatted as-is).
        memory_usage: Memory figure to record (formatted as-is).
    """
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        # open(..., 'a') creates the file itself but never its directories.
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")

    # Check if the file was created
    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        print(f"Error: {result_file} was not created.")

# Base Polars queries with LazyFrame
def base_simple_polars_queries(log_dir, result_file):
    """Time every (group key, column) sum query directly against the parquet data.

    For each of the two grouping keys and each of the five numeric columns,
    a lazy ``group_by(...).agg(sum)`` is built on the scanned parquet input
    and collected. The elapsed time and the estimated size of the collected
    result are appended to ``result_file`` and printed as a CSV-style line
    ``simple,Q<n>,<seconds>,<size>``.

    Args:
        log_dir: Parquet source passed to ``pl.scan_parquet``.
        result_file: Path of the text file that receives one line per query.
    """
    print(",base_simple_polars_queries")
    df = pl.scan_parquet(log_dir)  # Lazy reading of the parquet
    group_keys = ['redundancy_type', 'service_class']
    value_cols = ['request_io_size_bytes', 'disk_io_size_bytes', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    query_index = 0

    for ix in group_keys:
        for col in value_cols:
            query_index += 1
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted mid-measurement.
            t1 = time.perf_counter()

            res = df.group_by(ix).agg([pl.col(col).sum()]).collect()
            time_elapsed = time.perf_counter() - t1

            # Size of the collected result as reported by estimated_size()
            memory_usage = res.estimated_size()

            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# IOMAX Polars queries with LazyFrame
def iomax_simple_polars_queries(log_dir, result_file):
    """Time the sum queries when they are served from one shared pre-aggregation.

    On the first query the parquet input is grouped by both keys at once and
    all five columns are summed; that collected frame is then reused. Every
    query (including the first) re-aggregates the reused frame by a single
    key and column. Consequently the first query's elapsed time includes the
    pre-aggregation, and ``memory_usage`` is the estimated size of the
    pre-aggregated frame for the first query and ``0`` for all later ones.

    Each measurement is appended to ``result_file`` and printed as
    ``simple,Q<n>,<seconds>,<size>``.

    Args:
        log_dir: Parquet source passed to ``pl.scan_parquet``.
        result_file: Path of the text file that receives one line per query.
    """
    print(",iomax_simple_polars_queries")
    query_index = 0
    df = pl.scan_parquet(log_dir)  # Lazy reading of the parquet
    group_keys = ['redundancy_type', 'service_class']
    cols = ['request_io_size_bytes', 'disk_io_size_bytes', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    sum_exprs = [pl.col(col).sum() for col in cols]

    # Shared pre-aggregated frame; built lazily inside the first timed query
    # so that its cost is charged to Q1. An explicit sentinel avoids relying
    # on the query counter to guarantee the name is bound.
    x = None

    for ix in group_keys:
        for col in cols:
            query_index += 1
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted mid-measurement.
            t1 = time.perf_counter()
            memory_usage = 0
            if x is None:
                x = df.group_by(["redundancy_type", "service_class"]).agg(sum_exprs).collect()
                memory_usage = x.estimated_size()
            # The result is not needed; only the time to compute it is measured.
            x.group_by(ix).agg([pl.col(col).sum()])
            time_elapsed = time.perf_counter() - t1

            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# Run the pre-aggregated benchmark on the 125m dataset and append its results.
iomax_simple_polars_queries(
    log_dir="../datasets_thesios_io_traces/dataset-125m",
    result_file="results_rust_simple_queries_parquet/results_parquet_rust_iomax_simple_125m.txt",
)

,iomax_simple_polars_queries
