In [41]:
import pandas as pd
import glob, os
import duckdb, time

In [7]:
# Directory that every serialized copy of the dataset is written to and later read back from.
SAVE_PATH = "data/different_formats_t3"

In [4]:
# Collect the 2024 green-taxi partition files. glob yields matches in an
# arbitrary, filesystem-dependent order, so sort them: this keeps the row order
# of the combined frame (and of everything derived from it) reproducible.
parquet_files = sorted(
    glob.glob("data/trip_record_partitioned/green-taxi/year=2024/*.parquet")
)

In [6]:
# Read every collected partition file into one DataFrame.
# NOTE(review): passing a list of paths relies on the installed parquet engine
# accepting multiple files in one call — confirm for the engine in use.
df = pd.read_parquet(parquet_files)

In [17]:
# Write the same DataFrame once per storage format under comparison.
# The writers below expect the target directory to exist already.
os.makedirs(SAVE_PATH, exist_ok=True)

# Plain and gzip-compressed CSV. Both keep pandas' default behaviour of also
# writing the index as the first column.
df.to_csv(os.path.join(SAVE_PATH, "original.csv"))
df.to_csv(os.path.join(SAVE_PATH, "gzipped.csv"), compression="gzip")

# HDF5 in table format. mode="w" recreates the file on every run; the default
# append mode keeps the old file, and HDF5 does not reclaim space automatically,
# so a re-run could inflate the file and skew the size comparison.
df.to_hdf(os.path.join(SAVE_PATH, "data.h5"), key="df", format="t", mode="w")

# DuckDB: `df` is resolved from the surrounding Python scope by name.
# "create or replace" keeps the cell re-runnable, and the context manager closes
# the connection so the database file is released before its size is measured.
with duckdb.connect(database=os.path.join(SAVE_PATH, "data.duckdb")) as con:
    con.sql("create or replace table data as select * from df")

In [30]:
# Record the on-disk size in bytes of every file directly inside SAVE_PATH.
file_information = {}
for entry in os.listdir(SAVE_PATH):
    full_path = os.path.join(SAVE_PATH, entry)
    if not os.path.isfile(full_path):
        continue  # skip sub-directories and anything else that is not a file
    file_information[entry] = os.path.getsize(full_path)

# Report largest first: negating the size turns the ascending sort into a
# descending one.
by_size_desc = sorted(file_information.items(), key=lambda pair: -pair[1])
for name, n_bytes in by_size_desc:
    print(f"{name.ljust(15)} : {n_bytes}")

data.h5         : 59432981
original.csv    : 58999309
data.duckdb     : 13643776
gzipped.csv     : 12140122


In [37]:
def time_function(func):
    """Call ``func`` once with no arguments and return the elapsed seconds.

    Uses ``time.perf_counter`` rather than ``time.time``: it is the
    high-resolution clock intended for measuring short durations and is not
    affected by system clock adjustments during the measurement.

    Args:
        func: A zero-argument callable. Its return value is discarded.

    Returns:
        Elapsed time of the single call, in seconds, as a float.

    Any exception raised by ``func`` propagates to the caller unchanged.
    """
    start = time.perf_counter()
    func()
    return time.perf_counter() - start


def read_csv_original():
    """Load the uncompressed CSV copy stored in SAVE_PATH.

    Returns:
        The parsed DataFrame. Returning it (instead of binding it to an unused
        local) lets callers inspect the result; timing callers can ignore it.
    """
    return pd.read_csv(os.path.join(SAVE_PATH, "original.csv"))


def read_csv_compressed():
    """Load the gzip-compressed CSV copy stored in SAVE_PATH.

    Returns:
        The parsed DataFrame. Returning it (instead of binding it to an unused
        local) lets callers inspect the result; timing callers can ignore it.
    """
    # The file name ends in ".csv", not ".gz", so the compression has to be
    # stated explicitly rather than inferred from the extension.
    return pd.read_csv(os.path.join(SAVE_PATH, "gzipped.csv"), compression="gzip")


def read_hdf():
    """Load the HDF5 copy stored in SAVE_PATH.

    Returns:
        The DataFrame stored under key "df". Returning it (instead of binding
        it to an unused local) lets callers inspect the result; timing callers
        can ignore it.
    """
    # Name the key explicitly: the notebook writes the frame with key="df", and
    # omitting the key only works while the file holds exactly one object.
    return pd.read_hdf(os.path.join(SAVE_PATH, "data.h5"), key="df")


def read_duckdb():
    """Load the full ``data`` table from the DuckDB file stored in SAVE_PATH.

    Returns:
        The table contents as a pandas DataFrame. Returning it (instead of
        binding it to an unused local) lets callers inspect the result; timing
        callers can ignore it.
    """
    # The context manager closes the connection on exit, so repeated calls do
    # not accumulate open connections. ``.df()`` materializes the result while
    # the connection is still open.
    with duckdb.connect(database=os.path.join(SAVE_PATH, "data.duckdb")) as con:
        return con.sql("select * from data").df()

In [49]:
# Stored file name -> zero-argument function that loads that file.
read_functions = {
    "original.csv": read_csv_original,
    "gzipped.csv": read_csv_compressed,
    "data.h5": read_hdf,
    "data.duckdb": read_duckdb,
}

# One timed call per reader, in the order the readers are listed above.
results = {
    stored_file: time_function(reader)
    for stored_file, reader in read_functions.items()
}

# Print slowest first: negating the duration turns the ascending sort into a
# descending one.
slowest_first = sorted(results.items(), key=lambda pair: -pair[1])
for stored_file, elapsed in slowest_first:
    print(f"{stored_file.ljust(15)} : {elapsed}")

original.csv    : 2.0467405319213867
gzipped.csv     : 1.9688646793365479
data.h5         : 0.0970005989074707
data.duckdb     : 0.06825733184814453
