In [3]:
# The params below are needed if the researcher wants to run the notebook in isolation with papermill
# query_function = None
# dataset_path = None
# result_file = None

In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pandas as pd
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import numpy as np
import json
import resource
import numpy as np

# Columns used as group-by keys by the benchmark queries below.
indices = ['filename', 'application', 'io_zone', 'redundancy_type']
# The first five unordered pairs of those columns, in itertools' generation order.
combinations = list(it.islice(it.combinations(indices, 2), 5))

# Function to log results to a file
def log_results_to_file(result_file, time, memory):
    """Append one benchmark measurement to ``result_file``.

    The line written has the form ``Times: <time>, Memory: <memory>``.
    The file is opened in append mode, so repeated calls accumulate lines.
    Missing parent directories are created first, so a fresh checkout
    without the results folder does not lose the measurement.

    Parameters
    ----------
    result_file : str or os.PathLike
        Path of the text file to append to.
    time :
        Elapsed-time value to record. The parameter name shadows the
        module-level ``time`` import inside this function only; it is kept
        unchanged for compatibility with existing callers.
    memory :
        Memory value to record.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened
        or written.
    """
    parent_dir = os.path.dirname(result_file)
    # A bare filename has an empty dirname; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time}, Memory: {memory}\n")

    # Reaching this point means open()/write() succeeded (they raise on
    # failure), so a separate existence check would be redundant.
    print(f"File {result_file} created/updated successfully!")

def base_hard_pandas_queries(log_dir, result_file):
    """Run the "hard" two-level group-by benchmark on raw rows with pandas.

    The CSV at ``log_dir`` (a file path, despite the name) is loaded once
    with ``pd.read_csv``. Then, for every ``(ix, iy)`` pair in the
    module-level ``combinations`` and for each of the two metric columns,
    one query is timed: group by ``ix`` (collecting ``iy`` into lists and
    summing the metric), explode the lists, then group by ``iy``
    (collecting ``ix`` into lists and summing the metric again).

    For each query the elapsed time and the deep memory usage of the result
    frame are appended to ``result_file`` via ``log_results_to_file`` and
    printed as ``hard,Q<n>,<time>,<memory>``.
    """
    print(",base_hard_pandas_queries")
    frame = pd.read_csv(log_dir)
    metric_columns = ['request_io_size_bytes', 'disk_time']

    # product() iterates pairs in the outer position and metrics in the
    # inner one, i.e. the same order as two nested loops.
    workload = it.product(combinations, metric_columns)
    for query_index, ((ix, iy), col) in enumerate(workload, start=1):
        started = time()
        res = (
            frame.groupby([ix])
            .agg({iy: list, col: 'sum'})
            .explode(iy)
            .reset_index()
            .groupby([iy])
            .agg({ix: list, col: 'sum'})
        )
        time_elapsed = time() - started
        memory_usage = res.memory_usage(deep=True).sum()

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, memory_usage)

        print(f"hard,Q{query_index},{time_elapsed},{memory_usage}")

def base_hard_dask_queries(log_dir, result_file):
    """Run the "hard" two-level group-by benchmark on raw rows with dask.

    A dask dataframe is created from the CSV at ``log_dir`` (a file path,
    despite the name) with ``dd.read_csv``. Then, for every ``(ix, iy)``
    pair in the module-level ``combinations`` and for each of the two
    metric columns, one query is built and materialised with ``.compute()``
    inside the timed region: group by ``ix`` (collecting ``iy`` into lists
    and summing the metric), explode the lists, then group by ``iy``
    (collecting ``ix`` into lists and summing the metric again).

    For each query the elapsed time and the deep memory usage of the
    computed result are appended to ``result_file`` via
    ``log_results_to_file`` and printed as ``hard,Q<n>,<time>,<memory>``.
    """
    print(",base_hard_dask_queries")
    lazy_frame = dd.read_csv(log_dir)
    metric_columns = ['request_io_size_bytes', 'disk_time']

    # product() iterates pairs in the outer position and metrics in the
    # inner one, i.e. the same order as two nested loops.
    workload = it.product(combinations, metric_columns)
    for query_index, ((ix, iy), col) in enumerate(workload, start=1):
        started = time()
        res = (
            lazy_frame.groupby([ix])
            .agg({iy: list, col: 'sum'})
            .explode(iy)
            .reset_index()
            .groupby([iy])
            .agg({ix: list, col: 'sum'})
            .compute()
        )
        time_elapsed = time() - started
        memory_usage = res.memory_usage(deep=True).sum()

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, memory_usage)

        print(f"hard,Q{query_index},{time_elapsed},{memory_usage}")

def iomax_hard_pandas_queries(log_dir, result_file):
    """Run the "hard" benchmark with pandas over a pre-aggregated frame.

    The CSV at ``log_dir`` (a file path, despite the name) is loaded once
    with ``pd.read_csv``. During the first query only, the data is grouped
    by all columns in the module-level ``indices`` with both metric columns
    summed; that pre-aggregation happens inside the timed region of Q1 and
    its deep memory usage is the memory value reported for Q1. Every query
    (including Q1) then runs the two-level group-by over the pre-aggregated
    frame rather than over the raw rows. The query result itself is
    discarded.

    For queries after the first, the memory value logged is ``0``. Each
    measurement is appended to ``result_file`` via ``log_results_to_file``
    and printed as ``hard,Q<n>,<time>,<memory>``.
    """
    print(",iomax_hard_pandas_queries")
    frame = pd.read_csv(log_dir)
    metric_columns = ['request_io_size_bytes', 'disk_time']

    # product() iterates pairs in the outer position and metrics in the
    # inner one, i.e. the same order as two nested loops.
    workload = it.product(combinations, metric_columns)
    for query_index, ((ix, iy), col) in enumerate(workload, start=1):
        started = time()
        if query_index == 1:
            # Built once, inside Q1's timing window, and reused afterwards.
            preaggregated = (
                frame.groupby(indices)
                .agg({'request_io_size_bytes': 'sum', 'disk_time': 'sum'})
                .reset_index()
            )
            cache_bytes = preaggregated.memory_usage(deep=True).sum()
        else:
            cache_bytes = 0

        # Result intentionally not bound to a name: only the timing matters.
        (
            preaggregated.groupby([ix])
            .agg({iy: list, col: 'sum'})
            .reset_index()
            .explode(iy)
            .groupby([iy])
            .agg({ix: list, col: 'sum'})
        )
        time_elapsed = time() - started

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, cache_bytes)

        print(f"hard,Q{query_index},{time_elapsed},{cache_bytes}")

def iomax_hard_dask_queries(log_dir, result_file):
    """Run the "hard" benchmark over a frame pre-aggregated with dask.

    A dask dataframe is created from the CSV at ``log_dir`` (a file path,
    despite the name) with ``dd.read_csv``. During the first query only,
    the data is grouped by all columns in the module-level ``indices`` with
    both metric columns summed and the result is materialised with
    ``.compute()``; that step happens inside the timed region of Q1 and its
    deep memory usage is the memory value reported for Q1. Every query
    (including Q1) then runs the two-level group-by over that computed
    frame rather than over the raw rows. The query result itself is
    discarded.

    For queries after the first, the memory value logged is ``0``. Each
    measurement is appended to ``result_file`` via ``log_results_to_file``
    and printed as ``hard,Q<n>,<time>,<memory>``.
    """
    print(",iomax_hard_dask_queries")
    lazy_frame = dd.read_csv(log_dir)
    metric_columns = ['request_io_size_bytes', 'disk_time']

    # product() iterates pairs in the outer position and metrics in the
    # inner one, i.e. the same order as two nested loops.
    workload = it.product(combinations, metric_columns)
    for query_index, ((ix, iy), col) in enumerate(workload, start=1):
        started = time()
        if query_index == 1:
            # Built once, inside Q1's timing window, and reused afterwards.
            preaggregated = (
                lazy_frame.groupby(indices)
                .agg({'request_io_size_bytes': 'sum', 'disk_time': 'sum'})
                .reset_index()
                .compute()
            )
            cache_bytes = preaggregated.memory_usage(deep=True).sum()
        else:
            cache_bytes = 0

        # Result intentionally not bound to a name: only the timing matters.
        (
            preaggregated.groupby([ix])
            .agg({iy: list, col: 'sum'})
            .reset_index()
            .explode(iy)
            .groupby([iy])
            .agg({ix: list, col: 'sum'})
        )
        time_elapsed = time() - started

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, cache_bytes)

        print(f"hard,Q{query_index},{time_elapsed},{cache_bytes}")

# Run the pre-aggregated pandas benchmark on dataset-25m.csv; each query's
# measurement is appended as one line to the results text file.
iomax_hard_pandas_queries("datasets_thesios_io_traces/dataset-25m.csv", "results_hard_queries_csv/results_pandas_iomax_hard_25m.txt")

,iomax_hard_pandas_queries
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q1,21.519726991653442,326177196
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q2,7.544407844543457,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q3,6.922470569610596,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q4,6.818227291107178,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q5,6.8785998821258545,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q6,9.437772989273071,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q7,0.17970538139343262,0
File results_hard_queries_csv/results_pandas_iomax_hard_25m.txt created/updated successfully!
hard,Q8,0.195