In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pyarrow
import numba
import cython
from time import time
import os
import matplotlib.pyplot as plt
import gc
import json
import resource
import numpy as np
import cudf
import dask_cudf
import rmm
import pandas as pd

from dask.base import normalize_token

# Register custom tokenization for cudf.Index
@normalize_token.register(cudf.core.index.Index)
def normalize_cudf_index(index):
    """Return a dask token for a cudf Index.

    The token is a 2-tuple: the normalized token of the index's type,
    followed by a tuple of the index values obtained after converting the
    index to pandas.
    """
    type_token = normalize_token(type(index))
    pandas_values = index.to_pandas().values
    return type_token, tuple(pandas_values)

# Custom tokenization for cudf.MultiIndex
@normalize_token.register(cudf.MultiIndex)
def normalize_cudf_multiindex(index):
    """Return a dask token for a cudf MultiIndex.

    The token pairs the normalized token of the index's type with a tuple
    of the values exposed by the pandas conversion of the index.
    """
    kind = normalize_token(type(index))
    as_pandas = index.to_pandas()
    entries = tuple(as_pandas.values)
    return kind, entries

# Log results to a file
def log_results_to_file(result_file, time, memory):
    """Append one timing/memory record to ``result_file`` and report status.

    Args:
        result_file: Path of the text file to append to (opened in 'a' mode).
        time: Elapsed time value written after ``Times:``.
        memory: Memory value written after ``Memory:``.

    Returns:
        None. A status line is printed depending on whether the file exists
        after the write.
    """
    with open(result_file, 'a') as handle:
        handle.write(f"Times: {time}, Memory: {memory}\n")

    # Confirm on stdout that the results file is present after the append.
    if not os.path.exists(result_file):
        print(f"Error: {result_file} was not created.")
        return
    print(f"File {result_file} created/updated successfully!")

# cuDF query function
def base_simple_cudf_queries(log_dir, result_file):
    """Time single-column sum group-bys on a CSV loaded with cudf.

    For each grouping key ('filename', 'application') and each of five value
    columns, one ``groupby(...).agg({col: 'sum'})`` is timed. Each query's
    elapsed time and the memory usage of its result are appended to
    ``result_file`` and printed as ``simple,Q<n>,<time>,<memory>``.

    Args:
        log_dir: Path passed to ``cudf.read_csv``.
        result_file: Path of the results file to append to.
    """
    print(",base_simple_pandas_queries")
    frame = cudf.read_csv(log_dir)

    group_keys = ('filename', 'application')
    value_columns = (
        'request_io_size_bytes',
        'file_offset',
        'response_io_size_bytes',
        'disk_time',
        'simulated_latency',
    )

    # Queries are numbered from 1 in (key, column) nesting order.
    numbered_queries = enumerate(it.product(group_keys, value_columns), start=1)
    for query_index, (key, value_column) in numbered_queries:
        # Only the group-by/aggregation itself is inside the timed region.
        started = time()
        summed = frame.groupby([key]).agg({value_column: 'sum'})
        time_elapsed = time() - started

        # Memory usage of the aggregated result.
        memory_usage = summed.memory_usage(deep=True).sum()

        # Persist the measurement for later graphing, then echo it.
        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")
    
# Dask query function
def base_simple_dask_queries(log_dir, result_file):
    """Time single-column sum group-bys on a CSV read through dask_cudf.

    One query is run per (grouping key, value column) pair; the timed region
    covers building the group-by and its ``.compute()`` call. The elapsed
    time and the memory usage of the computed result are appended to
    ``result_file`` and printed as ``simple,Q<n>,<time>,<memory>``.

    Args:
        log_dir: Path (formatted into a string) passed to
            ``dask_cudf.read_csv``.
        result_file: Path of the results file to append to.
    """
    print(",base_simple_dask_queries")

    value_columns = [
        'request_io_size_bytes',
        'file_offset',
        'response_io_size_bytes',
        'disk_time',
        'simulated_latency',
    ]
    # Flattened (key, column) plan, in the same order as nested loops.
    plan = [
        (key, column)
        for key in ['filename', 'application']
        for column in value_columns
    ]

    ddf = dask_cudf.read_csv(f"{log_dir}")
    for query_index, (key, column) in enumerate(plan, start=1):
        begin = time()
        computed = ddf.groupby([key]).agg({column: 'sum'}).compute()
        time_elapsed = time() - begin

        memory_usage = computed.memory_usage(deep=True).sum()

        # Persist the measurement for later graphing, then echo it.
        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# cuDF IOMAX queries
def iomax_simple_cudf_queries(log_dir, result_file):
    """Time sum group-bys served from a shared pre-aggregation (cudf).

    On the first query the CSV data is grouped once by both
    'filename' and 'application' with every value column summed; that
    work is inside the first query's timed region. Every query (including
    the first) then groups the pre-aggregated table by a single key and sums
    a single column.

    The logged memory value is the pre-aggregated table's memory usage for
    the first query and 0 for all later queries.

    Args:
        log_dir: Path passed to ``cudf.read_csv``.
        result_file: Path of the results file to append to.
    """
    print(",iomax_simple_pandas_queries")
    frame = cudf.read_csv(log_dir)

    value_columns = [
        'request_io_size_bytes',
        'file_offset',
        'response_io_size_bytes',
        'disk_time',
        'simulated_latency',
    ]
    sum_everything = {name: 'sum' for name in value_columns}
    query_pairs = [
        (key, name)
        for key in ['filename', 'application']
        for name in value_columns
    ]

    for query_index, (key, name) in enumerate(query_pairs, start=1):
        started = time()
        if query_index == 1:
            # Build the shared two-key pre-aggregation exactly once.
            preaggregated = frame.groupby(['filename', 'application']).agg(sum_everything)
            memory_usage = preaggregated.memory_usage(deep=True).sum()
        else:
            memory_usage = 0
        answer = preaggregated.groupby([key]).agg({name: 'sum'})
        time_elapsed = time() - started

        # Persist the measurement for later graphing, then echo it.
        log_results_to_file(result_file, time_elapsed, memory_usage)
        print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")
    
# Dask IOMAX queries
def iomax_simple_dask_queries(log_dir, result_file):
    """Time sum group-bys served from a shared pre-aggregation (dask_cudf).

    On the first query the data is grouped once by both 'filename' and
    'application' with every value column summed, and the result is
    materialised with ``.compute()``; that work is inside the first query's
    timed region. Every query (including the first) then groups the
    computed table by a single key and sums a single column.

    The logged memory value is the pre-aggregated table's memory usage for
    the first query and 0 for all later queries.

    Args:
        log_dir: Path (formatted into a string) passed to
            ``dask_cudf.read_csv``.
        result_file: Path of the results file to append to.
    """
    print(f",iomax_simple_dask_queries")
    ddf = dask_cudf.read_csv(f"{log_dir}")
    query_index = 0
    cols = ['request_io_size_bytes', 'file_offset', 'response_io_size_bytes', 'disk_time', 'simulated_latency']
    # Use the string aggregation name, matching the cuDF variant and the
    # per-query aggregation below, instead of the Python builtin `sum`.
    agg_dict = {col: 'sum' for col in cols}
    for ix in ['filename', 'application']:
        for col in cols:
            query_index += 1
            t1 = time()
            memory_usage = 0
            if query_index == 1:
                # Build and materialise the shared pre-aggregation once;
                # its cost is deliberately charged to Q1's elapsed time.
                x = ddf.groupby(['filename', 'application']).agg(agg_dict).compute()
                memory_usage = x.memory_usage(deep=True).sum()
            res = x.groupby([ix]).agg({col: 'sum'})
            time_elapsed = time() - t1

            # Store the time and memory usage for graphing later
            log_results_to_file(result_file, time_elapsed, memory_usage)
            print(f"simple,Q{query_index},{time_elapsed},{memory_usage}")

# Run the dask_cudf IOMAX benchmark on the 125m trace CSV.
iomax_simple_dask_queries(
    log_dir='datasets_thesios_io_traces/dataset-125m.csv',
    result_file='results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt',
)

,iomax_simple_dask_queries


get_mempolicy: Function not implemented


File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q1,542.1808652877808,959176948
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q2,0.027724504470825195,0
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q3,0.027349472045898438,0
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q4,0.031377315521240234,0
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q5,0.027112960815429688,0
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q6,0.008910655975341797,0
File results_cudf_simple_queries_csv/results_cudf_csv_dask_iomax_simple_125m.txt created/updated successfully!
simple,Q7,0.00652360