In [1]:
import ray
import ray.data as rd
from ray.data.aggregate import AggregateFn
from time import time
import os
import pandas as pd
import itertools as it
import gc


# Initialize Ray.
# `ignore_reinit_error=True` keeps this cell re-runnable: if an earlier run
# failed before reaching `ray.shutdown()`, Ray is still initialized in the
# kernel and a plain `ray.init()` would raise instead of reusing it.
ray.init(
    _temp_dir="/tmp/ray",
    object_store_memory=10**9,  # object store size, in bytes
    ignore_reinit_error=True,
)

# Define combinations of indices
indices = ['filename', 'application', 'io_zone', 'redundancy_type']
# Four columns give six unordered pairs; only the first five are benchmarked.
combinations = list(it.combinations(indices, r=2))[:5]

# Function to log results to a file
def log_results_to_file(result_file, time_elapsed, memory_usage):
    """Append one benchmark measurement to ``result_file``.

    Each call adds a single line of the form
    ``Times: <time_elapsed>, Memory: <memory_usage>``.

    Args:
        result_file: Path of the text file to append to. Missing parent
            directories are created first.
        time_elapsed: Value written after ``Times:``.
        memory_usage: Value written after ``Memory:``.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # Append mode creates the file itself but not the folder it lives in, so
    # make sure the folder exists before opening.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time_elapsed}, Memory: {memory_usage}\n")

    # Reaching this line means the write did not raise, so the file exists.
    print(f"File {result_file} created/updated successfully!")

@ray.remote
def base_medium_ray_queries(log_dir, result_file):
    """Run the baseline "medium" group-by benchmark directly on the CSV data.

    For every pair ``(ix, iy)`` in the module-level ``combinations`` and for
    each measured column, the column is summed per ``(ix, iy)`` group and the
    partial sums are then summed again per ``ix``. The elapsed time and the
    size of the final result are appended to ``result_file`` and printed.

    A failing query is reported and skipped; the remaining queries still run.

    Args:
        log_dir: Path passed to ``ray.data.read_csv``.
        result_file: File that ``log_results_to_file`` appends to.
    """
    print("base_medium_ray_queries")

    def sum_of(source_column, output_name):
        """Build an AggregateFn that adds up ``source_column`` as ``output_name``."""
        return AggregateFn(
            init=lambda _key: 0,
            accumulate_row=lambda acc, row: acc + row[source_column],
            merge=lambda acc1, acc2: acc1 + acc2,
            name=output_name,
        )

    # Load the dataset with Ray
    df = rd.read_csv(log_dir)

    # Bound up front so the cleanup below is valid even if every query fails.
    aggregated = None
    res = None

    query_index = 0
    for ix, iy in combinations:
        for col in ['request_io_size_bytes', 'disk_time']:
            query_index += 1
            t1 = time()

            try:
                # groupby() returns a GroupedData object, which cannot be
                # materialized itself; only the aggregated Dataset can.
                aggregated = (
                    df.groupby([ix, iy])
                    .aggregate(sum_of(col, f"{col}_sum"))
                    .materialize()
                )

                # Second level: roll the per-(ix, iy) sums up to ix.
                res = (
                    aggregated.groupby([ix])
                    .aggregate(sum_of(f"{col}_sum", f"{col}_sum_final"))
                    .materialize()
                )

                # Compute memory usage of the final, materialized result.
                memory_usage = res.size_bytes()

                time_elapsed = time() - t1

                # Log results
                log_results_to_file(result_file, time_elapsed, memory_usage)
                print(f"medium,Q{query_index},{time_elapsed},{memory_usage}")

            except Exception as e:
                print(f"Error in query {query_index}: {e}")
                continue

    # Cleanup
    del df, aggregated, res
    gc.collect()

@ray.remote
def iomax_medium_ray_queries(log_dir, result_file):
    """Run the "medium" group-by benchmark on top of one shared pre-aggregation.

    The first query aggregates the CSV data once over all columns in the
    module-level ``indices`` (summing ``request_io_size_bytes`` and
    ``disk_time``) and materializes that as ``x``. Every query, the first one
    included, then sums one measured column of ``x`` per ``(ix, iy)`` pair from
    the module-level ``combinations`` and again per ``ix``.

    The time reported for the first query therefore includes building ``x``.
    ``memory_usage`` is ``x.size_bytes()`` for that query and 0 for all later
    ones.

    A failing query is reported and skipped. If the shared pre-aggregation
    itself fails there is nothing to reuse, so the remaining queries are
    skipped as well.

    Args:
        log_dir: Path passed to ``ray.data.read_csv``.
        result_file: File that ``log_results_to_file`` appends to.
    """
    print(",iomax_medium_ray_queries")

    def sum_of(source_column, output_name):
        """Build an AggregateFn that adds up ``source_column`` as ``output_name``."""
        return AggregateFn(
            init=lambda _key: 0,
            accumulate_row=lambda acc, row: acc + row[source_column],
            merge=lambda acc1, acc2: acc1 + acc2,
            name=output_name,
        )

    # Load the dataset
    df = rd.read_csv(log_dir)
    query_index = 0
    x = None  # Placeholder for the precomputed result

    measured_columns = ['request_io_size_bytes', 'disk_time']

    # product() yields the same order as nesting the column loop inside the
    # pair loop, and a single loop lets `break` stop the whole run.
    for (ix, iy), col in it.product(combinations, measured_columns):
        query_index += 1
        t1 = time()
        memory_usage = 0

        try:
            if x is None:
                # Built inside the timed region on purpose: its cost is
                # charged to the first query.
                x = df.groupby(indices).aggregate(
                    sum_of('request_io_size_bytes', "request_io_size_bytes"),
                    sum_of('disk_time', "disk_time"),
                ).materialize()

                # Compute memory usage (approximation)
                memory_usage = x.size_bytes()

            # Reuse `x` for this and all subsequent queries
            x.groupby([ix, iy]).aggregate(
                sum_of(col, f"{col}_sum_final")
            ).groupby([ix]).aggregate(
                sum_of(f"{col}_sum_final", f"{col}_sum_total")
            ).materialize()

            time_elapsed = time() - t1

            # Log results
            log_results_to_file(result_file, time_elapsed, memory_usage)
            print(f"medium,Q{query_index},{time_elapsed},{memory_usage}")

        except Exception as e:
            print(f"Error in query {query_index}: {e}")
            if x is None:
                # Without the pre-aggregated dataset every later query would
                # only fail on `None.groupby`, hiding the real cause above.
                print("Pre-aggregation failed; skipping the remaining queries.")
                break
            continue

    # Cleanup
    del df, x
    gc.collect()

# NOTE(review): this runs the iomax variant but appends to a file named
# "..._base_medium_..." - confirm that the result path is the intended one.
try:
    ray.get(
        iomax_medium_ray_queries.remote(
            "../datasets_thesios_io_traces/dataset-25m.csv",
            "results_ray_medium_queries_csv/results_ray_base_medium_25m.txt",
        )
    )
finally:
    # Shut down Ray after queries complete, including when the remote task
    # raised, so the cell does not leave a Ray instance behind.
    ray.shutdown()

2024-11-18 19:54:42,442	INFO worker.py:1816 -- Started a local Ray instance.


[36m(iomax_medium_ray_queries pid=106303)[0m ,iomax_medium_ray_queries


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(raylet)[0m Spilled 3075 MiB, 122 objects, write throughput 904 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[36m(raylet)[0m Spilled 6001 MiB, 239 objects, write throughput 847 MiB/s.
[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q1,319.1110146045685,132205968


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q2,20.84027123451233,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q3,23.626575231552124,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q4,24.867728233337402,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q5,22.06327509880066,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q6,24.09229016304016,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q7,5.961038589477539,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q8,5.836777925491333,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q9,6.996642351150513,0


[36m(iomax_medium_ray_queries pid=106303)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-18_19-54-40_768269_106003/logs/ray-data
[36m(iomax_medium_ray_queries pid=106303)[0m Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> AllToAllOperator[Aggregate]


(pid=106303) Running 0: 0.00 row [00:00, ? row/s]

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

(pid=106303) Sort Sample 2:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Map 3:   0%|                                                         | 0.00/1.00 [00:00<?…

(pid=106303) Shuffle Reduce 4:   0%|                                                      | 0.00/1.00 [00:00<?…

[36m(iomax_medium_ray_queries pid=106303)[0m File results_ray_medium_queries_csv/results_ray_base_medium_25m.txt created/updated successfully!
[36m(iomax_medium_ray_queries pid=106303)[0m medium,Q10,6.09261679649353,0


In [2]:
# Explicit shutdown in its own cell; the previous cell also calls
# ray.shutdown() once its queries have finished.
ray.shutdown()