In [1]:
import polars as pl
import os
import time
import resource
import itertools as it


# Log results to a file
def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    Args:
        result_file: Path of the text file to append to. Missing parent
            directories are created before the file is opened.
        time_elapsed: Elapsed-time value for the query; written verbatim.
        memory_usage: Memory value for the query; written verbatim.

    Side effects:
        Appends a ``Times: ..., Memory: ...`` line to ``result_file`` and
        prints whether the file exists afterwards.
    """
    # open(..., 'a') creates the file but not its directory. Without this,
    # a missing results folder raises FileNotFoundError only after the
    # (potentially minutes-long) query has already been executed.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")

    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        print(f"Error: {result_file} was not created.")

# Column names used as group-by keys by the benchmark queries.
indices = [
    'filename',
    'application',
    'io_zone',
    'redundancy_type',
]
# First five unordered pairs of those keys, in itertools.combinations order.
combinations = list(it.islice(it.combinations(indices, 2), 5))

# Base Medium Polars Queries with LazyFrame
def base_medium_polars_queries(log_dir, result_file):
    """Run the baseline two-level group-by queries directly on the CSV scan.

    For every key pair in the module-level ``combinations`` and for each of
    the two summed columns, the lazily scanned CSV is summed per
    ``(ix, iy)``, re-summed per ``ix``, and collected with
    ``streaming=True``. Each query is timed individually.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Text file that receives one line per query through
            ``log_results_to_file``.

    Side effects:
        Prints a header line and one ``medium,Q<n>,<seconds>,<size>`` line
        per query, and appends each measurement to ``result_file``.
    """
    print(",base_medium_polars_queries")
    lazy_frame = pl.scan_csv(log_dir)  # lazy: nothing is read until collect()
    value_columns = ['request_io_size_bytes', 'disk_time']

    # product() yields the same order as the nested loops it replaces:
    # every key pair, and for each pair both value columns.
    queries = it.product(combinations, value_columns)
    for query_index, ((ix, iy), col) in enumerate(queries, start=1):
        # perf_counter() is monotonic; time.time() can jump when the system
        # clock is adjusted, which would corrupt a long-running measurement.
        t1 = time.perf_counter()

        per_pair = lazy_frame.group_by([ix, iy]).agg(pl.col(col).sum())
        res = per_pair.group_by(ix).agg(pl.col(col).sum()).collect(streaming=True)

        time_elapsed = time.perf_counter() - t1
        # Estimated size of the collected result frame, taken outside the
        # timed region.
        memory_usage = res.estimated_size()

        # Log results
        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"medium,Q{query_index},{time_elapsed},{memory_usage}")

# IOMAX Medium Polars Queries with LazyFrame
def iomax_medium_polars_queries(log_dir, result_file):
    """Run the two-level group-by queries against a cached pre-aggregation.

    The first query aggregates the scanned CSV once over all keys in the
    module-level ``indices`` (summing both value columns) and collects it.
    That query and every later one are then answered from the collected
    frame instead of the CSV.

    Measurement attribution (unchanged from the original behaviour):
        * The time of the first query includes building the pre-aggregation.
        * ``memory_usage`` is the estimated size of the pre-aggregated frame
          for the first query and ``0`` for every later query.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Text file that receives one line per query through
            ``log_results_to_file``.

    Side effects:
        Prints a header line and one ``medium,Q<n>,<seconds>,<size>`` line
        per query, and appends each measurement to ``result_file``.
    """
    print(",iomax_medium_polars_queries")
    lazy_frame = pl.scan_csv(log_dir)  # lazy: nothing is read until collect()
    value_columns = ['request_io_size_bytes', 'disk_time']

    # Explicit sentinel instead of a name that only exists once the
    # `query_index == 1` branch has run.
    pre_aggregated = None

    # product() yields the same order as the nested loops it replaces.
    queries = it.product(combinations, value_columns)
    for query_index, ((ix, iy), col) in enumerate(queries, start=1):
        # perf_counter() is monotonic; time.time() can jump when the system
        # clock is adjusted, which would corrupt a long-running measurement.
        t1 = time.perf_counter()
        memory_usage = 0

        if pre_aggregated is None:
            # Built inside the timed region on purpose: the first query
            # carries the cost of the pre-aggregation.
            pre_aggregated = lazy_frame.group_by(indices).agg([
                pl.col("request_io_size_bytes").sum(),
                pl.col("disk_time").sum()
            ]).collect(streaming=True)
            memory_usage = pre_aggregated.estimated_size()

        res = pre_aggregated.group_by([ix, iy]).agg(pl.col(col).sum())
        res = res.group_by(ix).agg(pl.col(col).sum())
        time_elapsed = time.perf_counter() - t1

        # Log results
        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"medium,Q{query_index},{time_elapsed},{memory_usage}")

# Run the baseline queries on dataset-125m.csv and append the measurements
# to the plain-medium results file.
base_medium_polars_queries(
    log_dir="../datasets_thesios_io_traces/dataset-125m.csv",
    result_file="results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt",
)

,base_medium_polars_queries
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q1,167.08983635902405,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q2,89.22701263427734,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q3,63.051602363586426,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q4,78.68493509292603,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q5,74.0441517829895,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q6,79.70606541633606,303038352
File results_rust_medium_queries_csv/results_rust_plain_medium_125m.txt created/updated successfully!
medium,Q7,45.09219264984131,6

In [2]:
import polars as pl
# Print the installed Polars version.
print(pl.__version__)

1.13.1
