In [1]:
import dask.dataframe as dd
from constants import TASK1_OUT_ROOT, RAW_DATA_ROOT, TASK1_NP_SCHEMA, RESULTS_ROOT
import os
from glob import glob
import timeit
import numpy as np
import pandas as pd

def get_raw_files(root_path, year):
    """Return the sorted paths of the raw yellow-taxi parquet files for one year.

    Args:
        root_path: Directory that is searched (non-recursively).
        year: Year prefix substituted into the file-name pattern
            ``yellow_tripdata_{year}*.parquet``.

    Returns:
        A lexicographically sorted list of matching paths (empty if none match).
    """
    pattern = f"{root_path}/yellow_tripdata_{year}*.parquet"
    matches = glob(pattern)
    matches.sort()
    return matches

def get_total_size_GB(paths):
    """Return the combined on-disk size of ``paths`` in GB, rounded to 3 decimals.

    One GB is taken as 1024**3 bytes. Every path must exist, because
    ``os.path.getsize`` raises ``OSError`` for a missing file.
    """
    bytes_per_gb = 1024 * 1024 * 1024
    total_bytes = 0
    for path in paths:
        total_bytes += os.path.getsize(path)
    return round(total_bytes / bytes_per_gb, 3)

def analize_format(paths, id):
    # read and concat the data (if needed)
    # measure the time needed for reading and concating
    dfs = []
    format = paths[0].split(".")[-1]
    match format:
        case "parquet":
            dfs = [dd.read_parquet(p) for p in paths]
        case "csv":
            # parse dates separately because pandas backend doesn't support dtype=datetime directly during read_csv :(
            datetime_cols = ["pickup_datetime", "dropoff_datetime"]
            corrected_schema = TASK1_NP_SCHEMA.copy()
            del corrected_schema["pickup_datetime"]
            del corrected_schema["dropoff_datetime"]
            dfs = [dd.read_csv(p, dtype=corrected_schema, parse_dates=datetime_cols) for p in paths]
        case "h5":
            dfs = [dd.read_hdf(p, key="taxidata") for p in paths]
            # parse dates separately because pandas backend doesn't support dtype=datetime directly during read_hdf :(
            for i in range(len(dfs)):
                dfs[i]["pickup_datetime"] = dd.to_datetime(dfs[i]["pickup_datetime"])
                dfs[i]["dropoff_datetime"] = dd.to_datetime(dfs[i]["dropoff_datetime"])
    
    # timeit read, concat and compute (multiple runs for average and std)
    times = []
    df = None
    for _ in range(5):
        start = timeit.default_timer()
        df = dd.concat(dfs)
        df = df.compute()
        end = timeit.default_timer()
        times.append(end - start)
    
    # display size of df in memory
    n_rows, n_cols = df.shape
    results = {
        "id": id,
        "format": format,
        "n_rows": n_rows,
        "n_cols": n_cols,
        "mem_usage_GB": df.memory_usage(deep=True).sum() / (1024 * 1024 * 1024),
        "mean_read_time": np.mean(times),
        "std_read_time": np.std(times),
        "size_on_disk_GB": get_total_size_GB(paths),
    }

    del df # force cleanup
    return results

# Years covered by the "five years" subsets (2020-2024 inclusive); shared by the
# raw-file lookup and the processed-file directory filter so they cannot drift apart.
FIVE_YEAR_SPAN = range(2020, 2025)
FIVE_YEAR_DIR_SUFFIXES = tuple(str(year) for year in FIVE_YEAR_SPAN)

# Original (raw) parquet files. Sorted so the file order is deterministic across runs.
all_original_parquet = sorted(glob(os.path.join(RAW_DATA_ROOT, "yellow_tripdata_*.parquet")))
five_years_original_parquet = [
    path for year in FIVE_YEAR_SPAN for path in get_raw_files(RAW_DATA_ROOT, year)
]
one_year_original_parquet = get_raw_files(RAW_DATA_ROOT, 2024)

# Processed outputs: TASK1_OUT_ROOT/all/<dir>/*.parquet, where the parent directory
# name is matched on its year suffix.
all_parquet = sorted(glob(os.path.join(TASK1_OUT_ROOT, "all", "*", "*.parquet")))
five_years_parquet = [
    path for path in all_parquet if os.path.dirname(path).endswith(FIVE_YEAR_DIR_SUFFIXES)
]
five_years_csv = os.path.join(TASK1_OUT_ROOT, "five_years","2020_2024.csv")
one_year_parquet = [path for path in all_parquet if os.path.dirname(path).endswith("2024")]
one_year_csv = os.path.join(TASK1_OUT_ROOT, "one_year","2024.csv")
one_year_hdf5 = os.path.join(TASK1_OUT_ROOT, "one_year","2024.h5")

# Sanity checks: fail early, and with a diagnostic, if any file set is incomplete.
assert len(all_original_parquet) == 193, f"Expected 193 original parquet files, but found {len(all_original_parquet)}"
assert len(all_parquet) == 193, f"Expected 193 parquet files, but found {len(all_parquet)}"
assert len(five_years_original_parquet) == 12*5, f"Expected 60 parquet files for 5 years, but found {len(five_years_original_parquet)}"
assert len(five_years_parquet) == 12*5, f"Expected 60 processed parquet files for 5 years, but found {len(five_years_parquet)}"
assert len(one_year_original_parquet) == 12, f"Expected 12 parquet files for 1 year, but found {len(one_year_original_parquet)}"
assert len(one_year_parquet) == 12, f"Expected 12 processed parquet files for 1 year, but found {len(one_year_parquet)}"


### File sizes

In [2]:
# Each report section is a heading plus (label, paths) rows. Labels carry their
# own padding so the sizes line up in one column.
size_report = [
    ("Original data sizes:", [
        ("Total size (all years, parquet): ", all_original_parquet),
        ("Total size (2020-2024, parquet): ", five_years_original_parquet),
        ("Total size (2024, parquet):      ", one_year_original_parquet),
    ]),
    ("Processed data sizes:", [
        ("Total size (all years, parquet): ", all_parquet),
        ("Total size (2020-2024, parquet): ", five_years_parquet),
        ("Total size (2020-2024, csv):     ", [five_years_csv]),
        ("Total size (2024, parquet):      ", one_year_parquet),
        ("Total size (2024, csv):          ", [one_year_csv]),
        ("Total size (2024, hdf5):         ", [one_year_hdf5]),
    ]),
]

for heading, rows in size_report:
    print(heading)
    for label, file_paths in rows:
        print(f"{label}{get_total_size_GB(file_paths)} GB")

Original data sizes:
Total size (all years, parquet): 28.941 GB
Total size (2020-2024, parquet): 2.606 GB
Total size (2024, parquet):      0.645 GB
Processed data sizes:
Total size (all years, parquet): 35.057 GB
Total size (2020-2024, parquet): 3.772 GB
Total size (2020-2024, csv):     22.429 GB
Total size (2024, parquet):      0.895 GB
Total size (2024, csv):          5.305 GB
Total size (2024, hdf5):         1.014 GB


### Comments
- As expected, the binary formats (parquet and HDF5) achieve the smallest file sizes, with parquet pulling ahead.
- Our processed datasets are larger than the original parquet ones due to extra columns.

### Comparing formats

In [3]:
# (input files, result label) for each benchmark, run in this order.
# The three processed variants share a label and are told apart by the
# "format" field that analize_format adds to every result row.
benchmark_inputs = (
    (one_year_original_parquet, "original_1year"),
    (one_year_parquet, "processed_1year"),
    ([one_year_csv], "processed_1year"),
    ([one_year_hdf5], "processed_1year"),
)

format_results = []
for input_paths, run_label in benchmark_inputs:
    format_results.append(analize_format(input_paths, run_label))

In [4]:
# Decimal places kept per numeric column in the report; other columns are untouched.
decimals_by_column = {
    "mem_usage_GB": 3,
    "mean_read_time": 2,
    "std_read_time": 2,
    "size_on_disk_GB": 3,
}
format_results_df = pd.DataFrame(format_results).round(decimals_by_column)

# Save the table next to the other results, then render it in the notebook.
report_path = os.path.join(RESULTS_ROOT, "format_analysis.csv")
format_results_df.to_csv(report_path, index=False)
display(format_results_df)

Unnamed: 0,id,format,n_rows,n_cols,mem_usage_GB,mean_read_time,std_read_time,size_on_disk_GB
0,original_1year,parquet,41169720,19,5.714,3.32,0.83,0.645
1,processed_1year,parquet,41169720,22,3.297,4.12,1.75,0.895
2,processed_1year,csv,41169720,22,3.336,89.8,1.87,5.305
3,processed_1year,h5,41169720,22,3.336,16.07,0.32,1.014


### Comments
- The original parquet files have wasteful datatypes - our processed datasets have more columns, yet take up 42% less space in memory. The main culprits are the string datatypes (e.g. the store_and_forward flag column is a string of either 'Y' or 'N'; we change this to 1 or 0 respectively).
- Out of all the formats, parquet is by far the fastest to read: the processed parquet files load roughly 4x faster than HDF5 and about 22x faster than CSV, i.e. more than an order of magnitude (CSV being the slowest).
- We had trouble using dask's implementation of HDF (dask.dataframe.to_hdf) as it produced larger files than the CSV equivalents. Using a different implementation (h5py) we achieve similar file sizes to parquet.
- Bottom line: for this project we will stick to parquet, as it is the fastest to work with and preserves data types (CSV does not, while HDF allows only ints/floats, so datetimes are not parsed automatically). Additionally, plain-text formats like CSV lose their appeal in this case, as we cannot inspect them in a text editor due to the sheer size of the file.