In [1]:
import os
import time
import json

import cupy as cp
import numpy as np

import cudf
import dask
import dask_cudf
import dask.dataframe as dd
dask.config.set({"dataframe.backend": "cudf"})

import pandas as pd

In [2]:
class Timer:
    """Context manager that times the body of a ``with`` block and prints the result.

    Parameters
    ----------
    name : str, default "cpu"
        Label printed in front of the elapsed time.

    Attributes
    ----------
    start : float
        ``time.perf_counter()`` reading taken on entry.
    end : float
        ``time.perf_counter()`` reading taken on exit.
    execute_time : float
        ``end - start`` in seconds; set when the block exits.
    """

    def __init__(self, name="cpu"):
        self.name = name

    def __enter__(self):
        """Start the clock and return the timer itself."""
        self.start = time.perf_counter()
        # Returning self makes `with Timer(...) as t` bind the timer, so
        # `t.execute_time` is readable after the block (previously `t` was None).
        return self

    def __exit__(self, type, value, trackback):
        """Stop the clock, store the elapsed time and print it.

        Returns None, so an exception raised inside the block still propagates
        after the timing line has been printed.
        """
        self.end = time.perf_counter()
        self.execute_time = self.end - self.start
        print(f"{self.name} execute time : {self.execute_time:.4f} seconds")


In [5]:
# JSON-lines input file used by the read benchmark (absolute, machine-specific path).
data_path = '/media/HDD2/Data/OTTO/archive/otto-recsys-train.jsonl'

In [8]:
# Read the same JSON-lines file once with pandas and once with cuDF,
# printing the elapsed time of each read.
with Timer("cpu") as cpu_time:
    df = pd.read_json(data_path, lines=True)

with Timer("gpu") as gpu_time:
    gdf = cudf.read_json(data_path, lines=True)

cpu execute time : 28.8132 seconds
gpu execute time : 4.4809 seconds


In [9]:
# Write each frame to Parquet and print how long the write took.
# Create the output directory first, outside the timed regions, so the cell
# also runs on a fresh checkout where `data/` does not exist yet.
os.makedirs('data', exist_ok=True)

with Timer(name="cpu") as cpu_time:
    df.to_parquet(os.path.join('data','train.parquet'))

# NOTE: same target path as the pandas write above, so the file left on disk
# after this cell is the one written by cuDF.
with Timer(name="gpu") as gpu_time:
    gdf.to_parquet(os.path.join('data','train.parquet'))

cpu execute time : 8.2073 seconds
gpu execute time : 1.6177 seconds


In [11]:
# Load the Parquet file back with pandas and with cuDF, printing the elapsed
# time of each read. `df` and `gdf` are rebound to the freshly loaded frames.
with Timer("cpu") as cpu_time:
    df = pd.read_parquet(os.path.join('data', 'train.parquet'))

with Timer("gpu") as gpu_time:
    gdf = cudf.read_parquet(os.path.join('data', 'train.parquet'))

cpu execute time : 10.4138 seconds
gpu execute time : 0.3840 seconds
