In [None]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import json
import resource
import numpy as np
import cudf
import dask_cudf
import rmm
import itertools
from dask import delayed

import pandas as pd

from dask.base import normalize_token

# Register custom tokenization for cudf.Index
@normalize_token.register(cudf.core.index.Index)
def normalize_cudf_index(index):
    """Build the Dask token for a cuDF index.

    The token is a 2-tuple: the normalized token of the index's type,
    followed by a tuple of the index values obtained after converting the
    index to pandas.
    """
    type_token = normalize_token(type(index))
    values = index.to_pandas().values
    return type_token, tuple(values)

# Custom tokenization for cudf.MultiIndex
@normalize_token.register(cudf.MultiIndex)
def normalize_cudf_multiindex(index):
    """Build the Dask token for a cuDF MultiIndex.

    The token is a 2-tuple: the normalized token of the index's type,
    followed by a tuple of the index values obtained after converting the
    index to pandas.
    """
    type_token = normalize_token(type(index))
    values = index.to_pandas().values
    return type_token, tuple(values)

# Columns used as group-by keys by every query below.
indices = [
    'filename',
    'application',
    'io_zone',
    'redundancy_type',
]
# Unordered column pairs, capped at the first five that itertools yields.
combinations = list(it.islice(it.combinations(indices, 2), 5))

# Log results to a file
def log_results_to_file(result_file, time, memory):
    """Append one measurement line to ``result_file`` and print the outcome.

    The line has the form ``Times: <time>, Memory: <memory>`` followed by a
    newline. Existing file content is kept (append mode).

    Args:
        result_file: Path of the text file to append to. Missing parent
            directories are created.
        time: Elapsed-time value to record. The name shadows the ``time``
            function imported at file level, but only inside this function;
            it is kept so keyword callers continue to work.
        memory: Memory-usage value to record.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for appending.
    """
    # Make sure the target directory exists: without this, open() raises
    # FileNotFoundError and the measurement that was just taken is lost.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time}, Memory: {memory}\n")

    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        print(f"Error: {result_file} was not created.")

def base_hard_cudf_queries(log_dir, result_file):
    """Time the "hard" two-stage group-by queries on a raw cuDF DataFrame.

    For every key pair in the module-level ``combinations`` and each of the
    two metric columns, the frame is grouped by the first key (collecting
    the second key into lists and summing the metric), exploded on the
    second key, then grouped by the second key (collecting the first key
    and summing the metric again). Only the query itself is timed; the
    memory figure is ``memory_usage(deep=True).sum()`` of the result.

    Args:
        log_dir: Path handed to ``cudf.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.
    """
    print(",base_hard_pandas_queries")
    frame = cudf.read_csv(log_dir)

    # Flatten the (key pair x metric) grid so queries are numbered from 1.
    workload = [
        (first_key, second_key, metric)
        for first_key, second_key in combinations
        for metric in ('request_io_size_bytes', 'disk_time')
    ]

    for query_index, (first_key, second_key, metric) in enumerate(workload, start=1):
        started = time()
        res = (
            frame.groupby([first_key])
            .agg({second_key: list, metric: 'sum'})
            .explode(second_key)
            .reset_index()
            .groupby([second_key])
            .agg({first_key: list, metric: 'sum'})
        )
        time_elapsed = time() - started
        memory_usage = res.memory_usage(deep=True).sum()

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, memory_usage)

        print(f"hard,Q{query_index},{time_elapsed},{memory_usage}")

def base_hard_dask_queries(log_dir, result_file):
    """Time the "hard" two-stage group-by queries on a Dask-cuDF DataFrame.

    For every key pair in the module-level ``combinations`` and each of the
    two metric columns, a lazy pipeline is built: group by the first key
    (collecting the second key, summing the metric), explode the collected
    column partition by partition, then group by the second key (collecting
    the first key, summing the metric). The timed region covers building
    the pipeline and the final ``compute()``.

    Args:
        log_dir: Path handed to ``dask_cudf.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.
    """
    print(",base_hard_dask_queries")

    # Lazily read the data into a Dask-cuDF DataFrame.
    lazy_frame = dask_cudf.read_csv(log_dir)

    # Flatten the (key pair x metric) grid so queries are numbered from 1.
    workload = [
        (first_key, second_key, metric)
        for first_key, second_key in combinations
        for metric in ('request_io_size_bytes', 'disk_time')
    ]

    for query_index, (first_key, second_key, metric) in enumerate(workload, start=1):
        started = time()

        collected = (
            lazy_frame.groupby([first_key])
            .agg({second_key: "collect", metric: 'sum'})
            .reset_index()
        )

        # Generic dtypes for the key columns to avoid meta conflicts.
        exploded_meta = {
            first_key: "object",
            second_key: "object",
            metric: "int64",
        }
        exploded = collected.map_partitions(
            lambda part: part.explode(second_key),
            meta=exploded_meta,
        )

        regrouped = exploded.groupby(second_key).agg({first_key: "collect", metric: 'sum'})
        final_res = regrouped.reset_index().compute()

        time_elapsed = time() - started
        memory_usage = final_res.memory_usage(deep=True).sum()

        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"hard,Q{query_index}, {time_elapsed}, Memory: {memory_usage}")


def iomax_hard_cudf_queries(log_dir, result_file):
    """Time the "hard" queries against a pre-aggregated cuDF table.

    On the first query only, the raw frame is grouped by all columns in the
    module-level ``indices`` with both metrics summed; that step falls
    inside the first query's timed region. Every query then runs the
    two-stage group-by on the pre-aggregated table and discards the result.
    The logged memory value is the size of the pre-aggregated table for the
    first query and 0 for all later ones.

    Args:
        log_dir: Path handed to ``cudf.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.
    """
    print(",iomax_hard_pandas_queries")
    frame = cudf.read_csv(log_dir)

    # Flatten the (key pair x metric) grid so queries are numbered from 1.
    workload = [
        (first_key, second_key, metric)
        for first_key, second_key in combinations
        for metric in ('request_io_size_bytes', 'disk_time')
    ]

    for query_index, (first_key, second_key, metric) in enumerate(workload, start=1):
        started = time()
        preagg_memory = 0
        if query_index == 1:
            # Built once, on the first query, and reused by every later one.
            preagg = (
                frame.groupby(indices)
                .agg({'request_io_size_bytes': 'sum', 'disk_time': 'sum'})
                .reset_index()
            )
            preagg_memory = preagg.memory_usage(deep=True).sum()

        # Result intentionally discarded: only the elapsed time is measured.
        (
            preagg.groupby([first_key])
            .agg({second_key: list, metric: 'sum'})
            .reset_index()
            .explode(second_key)
            .groupby([second_key])
            .agg({first_key: list, metric: 'sum'})
        )
        time_elapsed = time() - started

        # Persist the measurement so it can be graphed later.
        log_results_to_file(result_file, time_elapsed, preagg_memory)

        print(f"{time_elapsed},{preagg_memory}")

def iomax_hard_dask_queries(log_dir, result_file):
    """Time the "hard" queries against a table pre-aggregated with Dask-cuDF.

    On the first query only, the Dask-cuDF frame is grouped by all columns
    in the module-level ``indices`` with both metrics summed, and the result
    is materialised with ``compute()``; that step falls inside the first
    query's timed region. Every query then runs the two-stage group-by on
    the materialised table and discards the result. The logged memory value
    is the size of the pre-aggregated table for the first query and 0 for
    all later ones.

    Args:
        log_dir: Path handed to ``dask_cudf.read_csv``.
        result_file: File that ``log_results_to_file`` appends each
            measurement to.
    """
    print(",iomax_hard_dask_queries")

    # Lazily read the data into a Dask-cuDF DataFrame.
    lazy_frame = dask_cudf.read_csv(log_dir)

    # Flatten the (key pair x metric) grid so queries are numbered from 1.
    workload = [
        (first_key, second_key, metric)
        for first_key, second_key in combinations
        for metric in ('request_io_size_bytes', 'disk_time')
    ]

    for query_index, (first_key, second_key, metric) in enumerate(workload, start=1):
        started = time()
        preagg_memory = 0
        if query_index == 1:
            # Built once, on the first query, and reused by every later one.
            preagg = (
                lazy_frame.groupby(indices)
                .agg({'request_io_size_bytes': 'sum', 'disk_time': 'sum'})
                .reset_index()
                .compute()
            )
            preagg_memory = preagg.memory_usage(deep=True).sum()

        # Result intentionally discarded: only the elapsed time is measured.
        (
            preagg.groupby([first_key])
            .agg({second_key: list, metric: 'sum'})
            .reset_index()
            .explode(second_key)
            .groupby([second_key])
            .agg({first_key: list, metric: 'sum'})
        )

        time_elapsed = time() - started

        log_results_to_file(result_file, time_elapsed, preagg_memory)
        print(f"hard,Q{query_index}, {time_elapsed}, Memory: {preagg_memory}")
            
# Run the plain cuDF benchmark on the 25M-row trace. The function defined in
# this file is base_hard_cudf_queries; calling base_hard_pandas_queries would
# raise NameError because no function of that name is defined here.
base_hard_cudf_queries('datasets_thesios_io_traces/dataset-25m.csv', 'results_cudf_hard_queries_csv/results_cudf_csv_pandas_plain_hard_25m_TEST.txt')

,base_hard_pandas_queries


get_mempolicy: Function not implemented


File results_cudf_hard_queries_csv/results_cudf_csv_pandas_plain_hard_25m_TEST.txt created/updated successfully!
hard,Q1,558.418375492096,200024560
