In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import json
import resource
import numpy as np
import cudf
import dask_cudf
import rmm

import pandas as pd

from dask.base import normalize_token

# Register custom tokenization for cudf.Index
@normalize_token.register(cudf.core.index.Index)
def normalize_cudf_index(index):
    """Return a Dask token for a ``cudf`` index.

    The token is a pair: the normalized token of the index's type, and a
    tuple of the index's values obtained via ``to_pandas().values``.
    """
    type_token = normalize_token(type(index))
    host_values = index.to_pandas().values
    return type_token, tuple(host_values)

# Custom tokenization for cudf.MultiIndex
@normalize_token.register(cudf.MultiIndex)
def normalize_cudf_multiindex(index):
    """Return a Dask token for a ``cudf.MultiIndex``.

    The token is a pair: the normalized token of the index's type, and a
    tuple of the index's values obtained via ``to_pandas().values``.
    """
    type_token = normalize_token(type(index))
    host_values = index.to_pandas().values
    return type_token, tuple(host_values)

# Grouping columns used by the benchmark queries.
indices = [
    'filename',
    'application',
    'io_zone',
    'redundancy_type',
]
# First five unordered column pairs, in itertools' generation order.
combinations = list(it.islice(it.combinations(indices, 2), 5))

# Log results to a file
def log_results_to_file(result_file, time, memory):
    """Append one timing/memory record to ``result_file``.

    The record is written as ``Times: <time>, Memory: <memory>`` followed by
    a newline. A confirmation (or error) message is printed afterwards,
    depending on whether the file exists on disk.

    Args:
        result_file: Path of the log file; opened in append mode.
        time: Elapsed time value to record. Note that inside this function
            the parameter name hides any module-level ``time`` import.
        memory: Memory-usage value to record.
    """
    # Create the parent directory on demand: without this, a missing results
    # directory raises FileNotFoundError and the measurement just taken
    # (possibly minutes of work) is lost.
    parent_dir = os.path.dirname(result_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(result_file, 'a') as f:
        f.write(f"Times: {time}, Memory: {memory}\n")

    if os.path.exists(result_file):
        print(f"File {result_file} created/updated successfully!")
    else:
        # Only reachable if the file disappears right after being written.
        print(f"Error: {result_file} was not created.")

# cuDF query function
def base_medium_cudf_queries(log_dir, result_file):
    """Run the baseline "medium" group-by queries with cuDF.

    Loads the CSV at ``log_dir`` with ``cudf.read_csv``. For every column
    pair in the module-level ``combinations`` and each of the two metric
    columns, it sums the metric per pair and then re-sums per first column.
    Each query is timed, and its elapsed time and result memory usage are
    appended to ``result_file`` and printed.

    Note: the header line printed at the start reads
    ``base_medium_pandas_queries`` even though cuDF is used.

    Args:
        log_dir: Path passed to ``cudf.read_csv``.
        result_file: Log file passed to ``log_results_to_file``.
    """
    print(",base_medium_pandas_queries")
    frame = cudf.read_csv(log_dir)
    query_plan = (
        (outer, inner, metric)
        for outer, inner in combinations
        for metric in ['request_io_size_bytes', 'disk_time']
    )
    for query_no, (outer, inner, metric) in enumerate(query_plan, start=1):
        started = time()
        per_pair = frame.groupby([outer, inner]).agg({metric: 'sum'})
        res = per_pair.groupby([outer]).sum()
        time_elapsed = time() - started

        # Memory is measured after the timer stops so it is not timed.
        memory_usage = res.memory_usage(deep=True).sum()

        # Persist the measurement for later graphing.
        log_results_to_file(result_file, time_elapsed, memory_usage)

        print(f"medium,Q{query_no},{time_elapsed},{memory_usage}")
    
# Dask query function
def base_medium_dask_queries(log_dir, result_file):
    """Run the baseline "medium" group-by queries with dask_cudf.

    Opens the CSV at ``log_dir`` with ``dask_cudf.read_csv``. For every
    column pair in the module-level ``combinations`` and each of the two
    metric columns, it sums the metric per pair, re-sums per first column,
    and calls ``.compute()`` on the result. The timed region covers building
    the expression and computing it. Elapsed time and result memory usage
    are appended to ``result_file`` and printed.

    Args:
        log_dir: Path of the CSV; formatted to a string before being passed
            to ``dask_cudf.read_csv``.
        result_file: Log file passed to ``log_results_to_file``.
    """
    print(",base_medium_dask_queries")
    lazy_frame = dask_cudf.read_csv(f"{log_dir}")
    query_plan = (
        (outer, inner, metric)
        for outer, inner in combinations
        for metric in ['request_io_size_bytes', 'disk_time']
    )
    for query_no, (outer, inner, metric) in enumerate(query_plan, start=1):
        started = time()
        per_pair = lazy_frame.groupby([outer, inner]).agg({metric: 'sum'})
        per_outer = per_pair.groupby([outer]).sum()
        res = per_outer.compute()
        time_elapsed = time() - started

        # Memory is measured after the timer stops so it is not timed.
        memory_usage = res.memory_usage(deep=True).sum()

        # Persist the measurement for later graphing.
        log_results_to_file(result_file, time_elapsed, memory_usage)

        print(f"medium,Q{query_no},{time_elapsed},{memory_usage}")

# cuDF IOMAX queries
def iomax_medium_cudf_queries(log_dir, result_file):
    """Run the IOMAX "medium" queries with cuDF.

    Loads the CSV at ``log_dir`` with ``cudf.read_csv``. During the first
    query it builds an aggregate that sums both metric columns grouped by
    all columns in the module-level ``indices``. Every query, including the
    first, is then answered from that aggregate rather than the raw frame.

    The timed region of the first query includes building the aggregate.
    The logged memory value is the aggregate's memory usage for the first
    query and ``0`` for all later ones. Per-query results are discarded.

    Note: the header line printed at the start reads
    ``iomax_medium_pandas_queries`` even though cuDF is used.

    Args:
        log_dir: Path passed to ``cudf.read_csv``.
        result_file: Log file passed to ``log_results_to_file``.
    """
    print(",iomax_medium_pandas_queries")
    frame = cudf.read_csv(log_dir)
    query_plan = (
        (outer, inner, metric)
        for outer, inner in combinations
        for metric in ['request_io_size_bytes', 'disk_time']
    )
    for query_no, (outer, inner, metric) in enumerate(query_plan, start=1):
        started = time()
        if query_no == 1:
            # One-off pre-aggregation, charged to the first query's time.
            sums_wanted = {'request_io_size_bytes': 'sum', 'disk_time': 'sum'}
            x = frame.groupby(indices).agg(sums_wanted)
            m = x.memory_usage(deep=True).sum()
        else:
            m = 0
        per_pair = x.groupby([outer, inner]).agg({metric: 'sum'})
        per_pair.groupby([outer]).sum()
        time_elapsed = time() - started

        # Persist the measurement for later graphing.
        log_results_to_file(result_file, time_elapsed, m)

        print(f"medium,Q{query_no},{time_elapsed},{m}")
    
# Dask IOMAX queries
def iomax_medium_dask_queries(log_dir, result_file):
    """Run the IOMAX "medium" queries with dask_cudf.

    Opens the CSV at ``log_dir`` with ``dask_cudf.read_csv``. During the
    first query it computes an aggregate that sums both metric columns
    grouped by all columns in the module-level ``indices``. Every query,
    including the first, is then answered from that computed aggregate.

    The timed region of the first query includes computing the aggregate.
    The logged memory value is the aggregate's memory usage for the first
    query and ``0`` for all later ones. Per-query results are discarded.

    Args:
        log_dir: Path of the CSV; formatted to a string before being passed
            to ``dask_cudf.read_csv``.
        result_file: Log file passed to ``log_results_to_file``.
    """
    print(",iomax_medium_dask_queries")
    lazy_frame = dask_cudf.read_csv(f"{log_dir}")
    query_plan = (
        (outer, inner, metric)
        for outer, inner in combinations
        for metric in ['request_io_size_bytes', 'disk_time']
    )
    for query_no, (outer, inner, metric) in enumerate(query_plan, start=1):
        started = time()
        if query_no == 1:
            # One-off pre-aggregation, charged to the first query's time.
            sums_wanted = {'request_io_size_bytes': 'sum', 'disk_time': 'sum'}
            x = lazy_frame.groupby(indices).agg(sums_wanted).compute()
            m = x.memory_usage(deep=True).sum()
        else:
            m = 0
        per_pair = x.groupby([outer, inner]).agg({metric: 'sum'})
        per_pair.groupby([outer]).sum()
        time_elapsed = time() - started

        # Persist the measurement for later graphing.
        log_results_to_file(result_file, time_elapsed, m)

        print(f"medium,Q{query_no},{time_elapsed},{m}")

# Benchmark run: dask_cudf IOMAX queries over the 125m-row CSV trace.
iomax_medium_dask_queries(
    log_dir='datasets_thesios_io_traces/dataset-125m.csv',
    result_file='results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt',
)

,iomax_medium_dask_queries


get_mempolicy: Function not implemented


File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q1,224.6024146080017,1032070996
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q2,0.47611522674560547,0
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q3,0.4142496585845947,0
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q4,0.41880297660827637,0
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q5,0.36164069175720215,0
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q6,0.37160444259643555,0
File results_cudf_medium_queries_csv/results_cudf_csv_dask_iomax_medium_125m.txt created/updated successfully!
medium,Q7,0.1215882301330