In [5]:
# The params below are needed only if the researcher wants to run the notebook with papermill
# query_function = None
# dataset_path = None
# result_file = None

In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pandas as pd
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import numpy as np
import json
import resource
import numpy as np

# Function to log results to a file
def log_results_to_file(result_file, time, memory):
    """Append one benchmark measurement to ``result_file``.

    The line written has the form ``Times: <time>, Memory: <memory>``.

    Args:
        result_file: Path of the text file to append to. Missing parent
            directories are created first, so logging works even when the
            results folder has not been set up by hand.
        time: Elapsed time of the measured query, written verbatim. Inside
            this function the name shadows the ``time`` function imported at
            the top of the file; it is kept so existing callers keep working.
        memory: Memory figure of the measured query, written verbatim.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # A bare file name has an empty dirname; only create a directory when there is one
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time}, Memory: {memory}\n")

    # open()/write() raise on failure, so reaching this line means the append succeeded
    print(f"File {result_file} created/updated successfully!")

# Baseline pandas benchmark: every query is an independent group-by on the raw frame
def base_simple_pandas_queries(log_dir, result_file):
    """Run ten single-column group-by/sum queries with pandas and record each one.

    The CSV is loaded once, outside the timed region. For every combination of
    grouping key ('filename', 'application') and value column, only the
    group-by/aggregate call is timed; the deep memory usage of the resulting
    frame is measured afterwards.

    Args:
        log_dir: Passed straight to ``pd.read_csv`` (despite the name, the
            call below treats it as a CSV path).
        result_file: File that ``log_results_to_file`` appends each
            measurement to.

    Returns:
        None. Each measurement is appended to ``result_file`` and echoed to
        stdout as ``simple,Q<n>,<seconds>,<bytes>``.
    """
    print(",base_simple_pandas_queries")
    frame = pd.read_csv(log_dir)

    group_keys = ['filename', 'application']
    value_columns = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    # Keys vary slowest, columns fastest, so Q1-Q5 group by filename and Q6-Q10 by application
    query_plan = [(key, column) for key in group_keys for column in value_columns]

    for query_number, (key, column) in enumerate(query_plan, start=1):
        started = time()
        result = frame.groupby([key]).agg({column: 'sum'})
        elapsed = time() - started

        # Size of the aggregated result only, not of the source frame
        result_bytes = result.memory_usage(deep=True).sum()

        log_results_to_file(result_file, elapsed, result_bytes)
        print(f"simple,Q{query_number},{elapsed},{result_bytes}")

def base_simple_dask_queries(log_dir, result_file):
    """Run ten single-column group-by/sum queries with dask and record each one.

    For every combination of grouping key ('filename', 'application') and
    value column, the timed region covers building the group-by and the
    ``.compute()`` call. The deep memory usage of the computed result is
    measured afterwards.

    Args:
        log_dir: Formatted into a string and handed to ``dd.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.

    Returns:
        None. Each measurement is appended to ``result_file`` and echoed to
        stdout as ``simple,Q<n>,<seconds>,<bytes>``.
    """
    print(",base_simple_dask_queries")
    # NOTE(review): dask evaluates lazily, so the CSV read presumably happens inside
    # each timed compute() below - confirm before comparing against the pandas numbers
    lazy_frame = dd.read_csv(f"{log_dir}")

    group_keys = ['filename', 'application']
    value_columns = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    # Keys vary slowest, columns fastest, so Q1-Q5 group by filename and Q6-Q10 by application
    query_plan = [(key, column) for key in group_keys for column in value_columns]

    for query_number, (key, column) in enumerate(query_plan, start=1):
        started = time()
        grouped_sum = lazy_frame.groupby([key]).agg({column: 'sum'})
        result = grouped_sum.compute()
        elapsed = time() - started

        # Size of the computed result only
        result_bytes = result.memory_usage(deep=True).sum()

        log_results_to_file(result_file, elapsed, result_bytes)
        print(f"simple,Q{query_number},{elapsed},{result_bytes}")


def iomax_simple_pandas_queries(log_dir, result_file):
    """Run the ten group-by/sum queries against one shared pandas pre-aggregation.

    During the first query the raw frame is grouped by both 'filename' and
    'application' with every value column summed. All ten queries then
    re-aggregate that smaller frame instead of the raw data.

    What gets recorded:
        * Time: the first query's elapsed time includes building the shared
          pre-aggregation; later queries time only the re-aggregation.
        * Memory: the deep memory usage of the shared pre-aggregation for the
          first query, and 0 for every later query.

    The per-query re-aggregation result itself is discarded.

    Args:
        log_dir: Passed straight to ``pd.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.

    Returns:
        None. Each measurement is appended to ``result_file`` and echoed to
        stdout as ``simple,Q<n>,<seconds>,<bytes>``.
    """
    print(",iomax_simple_pandas_queries")
    frame = pd.read_csv(log_dir)

    group_keys = ['filename', 'application']
    value_columns = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    sum_every_column = dict.fromkeys(value_columns, 'sum')
    # Keys vary slowest, columns fastest, so Q1-Q5 group by filename and Q6-Q10 by application
    query_plan = [(key, column) for key in group_keys for column in value_columns]

    shared_view = None  # built inside the timed region of the first query
    for query_number, (key, column) in enumerate(query_plan, start=1):
        started = time()
        if shared_view is None:
            shared_view = frame.groupby(['filename', 'application']).agg(sum_every_column)
            view_bytes = shared_view.memory_usage(deep=True).sum()
        else:
            view_bytes = 0
        # Re-aggregate the shared view; the result is intentionally not kept
        shared_view.groupby([key]).agg({column: 'sum'})
        elapsed = time() - started

        log_results_to_file(result_file, elapsed, view_bytes)
        print(f"simple,Q{query_number},{elapsed},{view_bytes}")

def iomax_simple_dask_queries(log_dir, result_file):
    """Run the ten group-by/sum queries against one shared dask pre-aggregation.

    During the first query the dask frame is grouped by both 'filename' and
    'application' with every value column summed, and the result is
    materialised with ``.compute()``. All ten queries then re-aggregate that
    computed frame.

    What gets recorded:
        * Time: the first query's elapsed time includes building and
          computing the shared pre-aggregation; later queries time only the
          re-aggregation.
        * Memory: the deep memory usage of the shared pre-aggregation for the
          first query, and 0 for every later query.

    The per-query re-aggregation result itself is discarded.

    Args:
        log_dir: Formatted into a string and handed to ``dd.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.

    Returns:
        None. Each measurement is appended to ``result_file`` and echoed to
        stdout as ``simple,Q<n>,<seconds>,<bytes>``.
    """
    print(",iomax_simple_dask_queries")
    lazy_frame = dd.read_csv(f"{log_dir}")

    group_keys = ['filename', 'application']
    value_columns = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    sum_every_column = dict.fromkeys(value_columns, 'sum')
    # Keys vary slowest, columns fastest, so Q1-Q5 group by filename and Q6-Q10 by application
    query_plan = [(key, column) for key in group_keys for column in value_columns]

    shared_view = None  # built and computed inside the timed region of the first query
    for query_number, (key, column) in enumerate(query_plan, start=1):
        started = time()
        if shared_view is None:
            shared_view = lazy_frame.groupby(['filename', 'application']).agg(sum_every_column).compute()
            view_bytes = shared_view.memory_usage(deep=True).sum()
        else:
            view_bytes = 0
        # Re-aggregate the already-computed view; the result is intentionally not kept
        shared_view.groupby([key]).agg({column: 'sum'})
        elapsed = time() - started

        log_results_to_file(result_file, elapsed, view_bytes)
        print(f"simple,Q{query_number},{elapsed},{view_bytes}")

# Run the baseline pandas benchmark on the dataset-200k trace; measurements are appended to the results file
base_simple_pandas_queries(
    log_dir="datasets_thesios_io_traces/dataset-200k.csv",
    result_file="results_simple_queries_csv/results_pandas_plain_simple_200k.txt",
)

,base_simple_pandas_queries
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q1,0.01770305633544922,2037684
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q2,0.01626443862915039,2037684
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q3,0.017340660095214844,2037684
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q4,0.01747608184814453,2037684
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q5,0.017590999603271484,2037684
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q6,0.010231256484985352,10281
File results_simple_queries_csv/results_pandas_plain_simple_200k.txt created/updated successfully!
simple,Q7,0.009399652481079102,10281
File results_