In [1]:
import polars as pl
import os
import time
import sys
import resource
import pandas as pd
import tracemalloc

# Log results to a file
def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    The appended line has the form
    ``Times: <time_elapsed>, Memory: <memory_usage>``.

    The parent directory of ``result_file`` is created when it is missing,
    so the first run against a fresh results folder does not fail with
    ``FileNotFoundError``.

    Args:
        result_file: Path of the log file; opened in append mode.
        time_elapsed: Value written after ``Times:`` (formatted with ``str``).
        memory_usage: Value written after ``Memory:`` (formatted with ``str``).

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened or written.
    """
    # A bare file name has an empty dirname; makedirs('') would raise.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a', encoding='utf-8') as f:
        f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")

    # Reaching this line means the append succeeded (any failure raises
    # above), so a separate existence check would never report an error.
    print(f"File {result_file} created/updated successfully!")

# Base Polars queries with LazyFrame
def base_simple_polars_queries(log_dir: str, result_file: str) -> None:
    """Run the baseline group-by/sum benchmark, one full scan per query.

    For every combination of a grouping key (``filename``, ``application``)
    and a value column, the CSV is scanned lazily, grouped, summed and
    collected. Each query is timed individually. Its elapsed time and the
    estimated size of its result frame are appended to ``result_file`` and
    printed as ``simple,Q<n>,<seconds>,<size>``.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Path of the log file that receives one line per query.
    """
    print(",base_simple_polars_queries")
    df = pl.scan_csv(log_dir)
    group_keys = ['filename', 'application']
    value_cols = [
        'request_io_size_bytes',
        'file_offset',
        'response_io_size_bytes',
        'disk_time',
        'simulated_latency',
    ]
    query_index = 0

    for ix in group_keys:
        for col in value_cols:
            query_index += 1

            # perf_counter is monotonic, so the interval is not affected by
            # wall-clock adjustments during a long-running query.
            t1 = time.perf_counter()

            # Build the lazy group-by/sum plan and materialize it.
            res = df.group_by(ix).agg([pl.col(col).sum()]).collect(streaming=True)
            time_elapsed = time.perf_counter() - t1

            # NOTE: this is the estimated size of the *result* frame, not the
            # peak memory consumed while executing the query.
            memory_usage = res.estimated_size()

            # Persist the measurement, then echo it in CSV form.
            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# IOMAX Polars queries with LazyFrame
def iomax_simple_polars_queries(log_dir: str, result_file: str) -> None:
    """Run the IOMAX variant of the group-by/sum benchmark.

    The CSV is aggregated once, grouped by both ``filename`` and
    ``application`` with every value column summed. The same queries as the
    baseline are then answered from that smaller in-memory frame.

    The pre-aggregation happens inside the timed region of the first query.
    Q1 therefore carries its cost and its estimated size; later queries
    report a memory figure of ``0``. Each measurement is appended to
    ``result_file`` and printed as ``simple,Q<n>,<seconds>,<size>``.

    Args:
        log_dir: Path passed to ``pl.scan_csv``.
        result_file: Path of the log file that receives one line per query.
    """
    print(",iomax_simple_polars_queries")
    query_index = 0
    df = pl.scan_csv(log_dir)
    cols = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    sum_exprs = [pl.col(col).sum() for col in cols]

    # Filled lazily by the first query so that its cost is charged to Q1.
    pre_aggregated = None

    for ix in ['filename', 'application']:
        for col in cols:
            query_index += 1

            # perf_counter is monotonic, so the interval is not affected by
            # wall-clock adjustments during the long first query.
            t1 = time.perf_counter()
            memory_usage = 0
            if pre_aggregated is None:
                pre_aggregated = (
                    df.group_by(["filename", "application"])
                    .agg(sum_exprs)
                    .collect(streaming=True)
                )
                memory_usage = pre_aggregated.estimated_size()

            # The result is discarded; only the time needed to answer the
            # query from the pre-aggregated frame is being measured.
            pre_aggregated.group_by(ix).agg([pl.col(col).sum()])
            time_elapsed = time.perf_counter() - t1

            # Persist the measurement, then echo it in CSV form.
            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# Run the IOMAX benchmark on the 125m dataset and append to its results file.
iomax_simple_polars_queries(
    log_dir="../datasets_thesios_io_traces/dataset-125m.csv",
    result_file="results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt",
)

,iomax_simple_polars_queries
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q1,262.2061264514923,571819816
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q2,0.6832168102264404,0
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q3,2.932708978652954,0
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q4,0.649237871170044,0
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q5,0.6661269664764404,0
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q6,0.36383867263793945,0
File results_rust_simple_queries_csv/results_rust_iomax_simple_125m.txt created/updated successfully!
simple,Q7,0.43001675605773926,0
File results_rust_simple_queries_