In [1]:
import itertools as it
import numpy as np
import pandas as pd
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import numpy as np
import json
import resource
import numpy as np
import ray
import ray.data as rd
from ray.data.aggregate import AggregateFn

# Start the Ray runtime for this session. Session/temp files are rooted at
# /tmp/ray and the object store is capped via object_store_memory
# (10**9; presumably bytes per the Ray docs).
ray.init(
    object_store_memory=1_000_000_000,
    _temp_dir="/tmp/ray",
)

def log_results_to_file(result_file, time, memory):
    """Append one timing/memory measurement to ``result_file``.

    The line written has the form ``Times: <time>, Memory: <memory>``.
    The parent directory of ``result_file`` is created when it does not
    exist yet, so the first measurement of a fresh benchmark run is not lost
    to a ``FileNotFoundError``.

    Args:
        result_file: Path of the text file to append to.
        time: Elapsed time of the query. NOTE: inside this function the
            parameter shadows the module-level ``time`` import.
        memory: Memory figure recorded for the query.
    """
    # os.path.dirname returns "" for a bare filename; makedirs("") would raise.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time}, Memory: {memory}\n")

    # Report whether the file is present after the write.
    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        print(f"Error: {result_file} was not created.")

@ray.remote
def base_simple_ray_queries(log_dir, result_file):
    """Run the baseline group-by/sum benchmark queries with Ray Data.

    For each grouping key ('filename', 'application') and each numeric
    column, a ``groupby(...).sum(col)`` is executed against the CSV dataset
    and materialized. The elapsed wall-clock time and the size in bytes of
    the materialized result are appended to ``result_file`` and printed as
    ``simple,Q<n>,<seconds>,<bytes>``.

    The input dataset is not materialized up front, so every query is run
    from the lazily-defined ``read_csv`` dataset.

    Args:
        log_dir: Path (file or directory) of the CSV data to read.
        result_file: Text file that receives one line per successful query.
    """
    print(",base_simple_ray_queries")

    # Load the dataset using Ray (lazy; executed per query below).
    df = rd.read_csv(log_dir)

    group_keys = ['filename', 'application']
    value_cols = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']

    res = None
    queries = it.product(group_keys, value_cols)
    for query_index, (ix, col) in enumerate(queries, start=1):
        t1 = time()

        # Failures are reported and the benchmark moves on to the next query.
        try:
            res = df.groupby([ix]).sum(col).materialize()

            # Stop the clock before probing memory so the measurement covers
            # only the query itself.
            time_elapsed = time() - t1

            # materialize() always yields a Ray Dataset, so its size can be
            # read directly (approximation of the result's memory usage).
            memory_usage = res.size_bytes()

            # Store the time and memory usage for graphing later
            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

        except Exception as e:
            print(f"Error in query {query_index}: {e}")
            res = None

    # Cleanup
    del df, res
    gc.collect()

@ray.remote
def iomax_simple_ray_queries(log_dir, result_file):
    """Run the group-by/sum benchmark using a shared pre-aggregated dataset.

    The CSV data is first grouped by ('filename', 'application') with one
    ``<col>_sum`` aggregate per numeric column and materialized. Each of the
    benchmark queries (grouping key x column) then re-aggregates that
    intermediate result into ``<col>_sum_final``.

    For every successful query the elapsed time and a memory figure are
    appended to ``result_file`` and printed as
    ``simple,Q<n>,<seconds>,<bytes>``. The memory figure is the size in bytes
    of the query's materialized result, plus the size of the intermediate
    dataset for the query that had to build it. The time of that query also
    includes building the intermediate.

    Args:
        log_dir: Path (file or directory) of the CSV data to read.
        result_file: Text file that receives one line per successful query.
    """
    print(",iomax_simple_ray_queries")
    cols = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    group_keys = ['filename', 'application']

    def _sum_of(source_column, output_name):
        # Build a row-wise sum aggregate. Going through a factory gives each
        # aggregate its own binding of source_column; lambdas created directly
        # in a loop/comprehension would all read the last column name.
        return AggregateFn(
            init=lambda _: 0,
            accumulate_row=lambda acc, row: acc + row[source_column],
            merge=lambda acc1, acc2: acc1 + acc2,
            name=output_name,
        )

    # Load dataset with Ray
    df = ray.data.read_csv(f"{log_dir}")

    # One "<col>_sum" aggregate per numeric column for the shared first stage.
    agg_fns = [_sum_of(col, f"{col}_sum") for col in cols]
    x = None  # Materialized intermediate result, built on first use

    queries = it.product(group_keys, cols)
    for query_index, (ix, col) in enumerate(queries, start=1):
        t1 = time()

        # Failures are reported and the benchmark moves on to the next query.
        try:
            # Build the intermediate when it is missing (first query, or a
            # retry after an earlier failure) instead of only on query 1.
            built_intermediate = x is None
            if built_intermediate:
                x = df.groupby(group_keys).aggregate(*agg_fns).materialize()

            # Re-aggregate the pre-summed column by the requested key.
            result = x.groupby([ix]).aggregate(
                _sum_of(f"{col}_sum", f"{col}_sum_final")
            ).materialize()

            time_elapsed = time() - t1

            # Compute memory usage (approximation) after stopping the clock.
            memory_usage = result.size_bytes()
            if built_intermediate:
                memory_usage += x.size_bytes()

            # Log results
            log_results_to_file(result_file, time_elapsed, memory_usage)

            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

        except Exception as e:
            print(f"Error in query {query_index}: {e}")

    # Cleanup
    del df, x
    gc.collect()

# Submit the pre-aggregated benchmark as a Ray task and block until it is done.
try:
    ray.get(iomax_simple_ray_queries.remote("../datasets_thesios_io_traces/dataset-125m.csv", "results_ray_simple_queries_csv/results_ray_iomax_simple__TEST.txt"))
finally:
    # Shut down Ray after queries complete, and also when the task raises,
    # so a failed run does not leave the Ray instance running.
    ray.shutdown()

2024-11-25 11:22:17,646	INFO worker.py:1816 -- Started a local Ray instance.


[36m(base_simple_ray_queries pid=50254)[0m ,base_simple_ray_queries


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV]


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q1,3.0122838020324707,1200496


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q2,0.9253604412078857,1200496


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q3,0.775383472442627,1200496


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q4,0.7818777561187744,1200496


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q5,0.7616517543792725,1200496


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q6,0.5755198001861572,5988


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q7,0.5712475776672363,5988


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q8,1.161604642868042,5988


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


[36m(base_simple_ray_queries pid=50254)[0m File results_ray_simple_queries_csv/results_ray_base_simple_200k_TEST.txt created/updated successfully!
[36m(base_simple_ray_queries pid=50254)[0m simple,Q9,0.5451247692108154,5988


(pid=50254) Running 0: 0.00 row [00:00, ? row/s]

(pid=50254) Sort Sample 2:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Map 3:   0%|                                                          | 0.00/1.00 [00:00<?…

(pid=50254) Shuffle Reduce 4:   0%|                                                       | 0.00/1.00 [00:00<?…

[36m(base_simple_ray_queries pid=50254)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-25_11-22-15_763215_49327/logs/ray-data
[36m(base_simple_ray_queries pid=50254)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]
