### Install dependencies:

In [3]:
!pip3 install pandas pyarrow numpy

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pyarrow
  Downloading pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting numpy
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl (31.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.2/31.2 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

### Define functions

In [5]:
import os
import time
import numpy as np
import pandas as pd


# -----------------------------
# Configuration
# -----------------------------
# Seed for NumPy's global RNG, which generate_chunk draws from.
SEED = 42

# Size threshold, in GiB (1024**3 bytes), at which the writers stop adding chunks.
TARGET_SIZE_GB = 1.0
ROWS_PER_CHUNK = 5_000_000  # adjust if you hit memory limits

# All benchmark files live under a single output directory.
OUTPUT_DIR = "benchmark_data"
CSV_PATH = os.path.join(OUTPUT_DIR, "data.csv")
PARQUET_PATH = os.path.join(OUTPUT_DIR, "data.parquet")

# Module-level side effects: create the output directory and seed the RNG.
os.makedirs(OUTPUT_DIR, exist_ok=True)
np.random.seed(SEED)


# -----------------------------
# Helper functions
# -----------------------------
def sizeof_gb(path):
    """Return the size of the file at ``path`` in GiB (bytes / 1024**3)."""
    bytes_per_gib = 1024 ** 3
    size_in_bytes = os.path.getsize(path)
    return size_in_bytes / bytes_per_gib


def timed(label, fn):
    """Call ``fn()`` once, print how long it took, and return its result.

    Parameters
    ----------
    label : str
        Text printed left-aligned in a 35-character column before the time.
    fn : callable
        Zero-argument callable to measure.

    Returns
    -------
    object
        Whatever ``fn()`` returned. If ``fn`` raises, the exception
        propagates and nothing is printed.
    """
    clock = time.perf_counter
    began = clock()
    outcome = fn()
    elapsed = clock() - began
    print(f"{label:<35} {elapsed:8.2f} s")
    return outcome


# -----------------------------
# Data generation
# -----------------------------
def generate_chunk(n):
    """Build a DataFrame of ``n`` synthetic rows using NumPy's global RNG.

    Columns, in order:
      * ``user_id``    - integers drawn from [0, 1_000_000)
      * ``event_type`` - integers drawn from [0, 50)
      * ``value``      - standard-normal draws scaled by 100
      * ``timestamp``  - integers drawn from [1_600_000_000, 1_700_000_000)
                         (presumably Unix epoch seconds; not enforced here)
      * ``category``   - one of "A".."E", chosen uniformly
    """
    # All columns share the global RNG stream, so the draw order below is
    # part of the function's reproducible behaviour for a given seed.
    user_ids = np.random.randint(0, 1_000_000, size=n)
    event_types = np.random.randint(0, 50, size=n)
    values = np.random.randn(n) * 100
    timestamps = np.random.randint(1_600_000_000, 1_700_000_000, size=n)
    categories = np.random.choice(["A", "B", "C", "D", "E"], size=n)

    columns = {
        "user_id": user_ids,
        "event_type": event_types,
        "value": values,
        "timestamp": timestamps,
        "category": categories,
    }
    return pd.DataFrame(columns)


# -----------------------------
# Write CSV & Parquet
# -----------------------------

def write_csv():
    """Append synthetic chunks to ``CSV_PATH`` until it reaches the target size.

    Any existing file at ``CSV_PATH`` is deleted first. Chunks of
    ``ROWS_PER_CHUNK`` rows are then appended until the file's on-disk size
    is no longer below ``TARGET_SIZE_GB``; the header row is written only
    with the first chunk.

    Returns
    -------
    int
        Total number of data rows written.
    """
    # Start from a clean slate so append mode does not extend an old run.
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)

    rows_written = 0
    while True:
        # Stop once the file exists and is no longer below the size target.
        if os.path.exists(CSV_PATH) and not sizeof_gb(CSV_PATH) < TARGET_SIZE_GB:
            break

        chunk = generate_chunk(ROWS_PER_CHUNK)
        # The file is absent only before the first append, so that is the
        # one time a header row is emitted.
        needs_header = not os.path.exists(CSV_PATH)
        chunk.to_csv(CSV_PATH, mode="a", index=False, header=needs_header)
        rows_written += len(chunk)

    return rows_written


def write_parquet(target_rows=None):
    """Generate synthetic chunks and write them as one snappy-compressed Parquet file.

    Parameters
    ----------
    target_rows : int, optional
        When given, write exactly this many rows (for example the count
        returned by ``write_csv``) so both files hold the same amount of
        data. When omitted, keep generating ``ROWS_PER_CHUNK``-row chunks
        until their combined in-memory size (``memory_usage(deep=True)``)
        reaches ``TARGET_SIZE_GB`` GiB.

    Returns
    -------
    int
        Number of rows written to ``PARQUET_PATH``.

    Raises
    ------
    ValueError
        If ``target_rows`` is given but is less than 1.

    Notes
    -----
    The default stopping rule measures in-memory size, whereas ``write_csv``
    measures on-disk size, so the two writers can produce different row
    counts unless ``target_rows`` is passed. All chunks are held in memory
    and concatenated before the single ``to_parquet`` call.
    """
    if target_rows is not None and target_rows < 1:
        raise ValueError("target_rows must be a positive integer")

    target_bytes = TARGET_SIZE_GB * (1024 ** 3)
    chunks = []
    total_rows = 0
    in_memory_bytes = 0

    # Always produce at least one chunk (as write_csv does), so a
    # non-positive size target cannot leave pd.concat with an empty list.
    while True:
        if target_rows is None:
            n_rows = ROWS_PER_CHUNK
        else:
            n_rows = min(ROWS_PER_CHUNK, target_rows - total_rows)

        chunk = generate_chunk(n_rows)
        chunks.append(chunk)
        total_rows += n_rows

        if target_rows is None:
            # Running total: each chunk is measured once instead of
            # re-measuring every accumulated chunk on every iteration.
            in_memory_bytes += chunk.memory_usage(deep=True).sum()
            finished = in_memory_bytes >= target_bytes
        else:
            finished = total_rows >= target_rows

        if finished:
            break

    pd.concat(chunks, ignore_index=True).to_parquet(
        PARQUET_PATH,
        engine="pyarrow",
        compression="snappy",
    )
    return total_rows

### Writing synthetic data...
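Note how Parquet is not only faster to write but also much smaller on disk. Caveat: `write_csv` stops at a target on-disk size while `write_parquet` stops at a target in-memory size, so the two files may not contain the same number of rows.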

In [6]:
# Run each writer once; timed() prints the elapsed wall-clock time and
# returns the writer's row count.
# NOTE(review): write_csv stops on on-disk file size while write_parquet
# stops on in-memory DataFrame size, so rows_csv and rows_parquet can
# differ - compare the row counts before attributing the size/time gap
# to the file format alone.
rows_csv = timed("Write CSV", write_csv)
rows_parquet = timed("Write Parquet", write_parquet)

# sizeof_gb divides by 1024**3, so the "GB" printed here are GiB.
print(f"\nCSV size:     {sizeof_gb(CSV_PATH):.2f} GB")
print(f"Parquet size: {sizeof_gb(PARQUET_PATH):.2f} GB")

Write CSV                              29.37 s
Write Parquet                           3.69 s

CSV size:     1.16 GB
Parquet size: 0.29 GB


### Read benchmarks

In [7]:
# Each lambda defers the read so that timed() measures the full load of the
# file into a DataFrame; the resulting frames are kept in df_csv / df_parquet.
# NOTE(review): both files were written just before this cell, so the reads
# may be served from the OS file cache - confirm if cold-read timings matter.
df_csv = timed("Read CSV", lambda: pd.read_csv(CSV_PATH))
df_parquet = timed("Read Parquet", lambda: pd.read_parquet(PARQUET_PATH))

Read CSV                                5.81 s
Read Parquet                            0.26 s
