# Pandas CSV vs Delta Lake Benchmarking

In [18]:
import pathlib

import pyarrow.dataset as ds
from deltalake import DeltaTable

import pandas as pd

## 1e8 CSV

In [19]:
%%time

# Baseline: parse the 1e8-row CSV with pandas, reading only the columns the
# query needs, then filter on id1 and sum v1 per id2.
csv_1e8_path = "~/data/G1_1e8_1e2_0_0.csv"
csv_1e8_columns = ["id1", "id2", "v1"]
(
    pd.read_csv(csv_1e8_path, usecols=csv_1e8_columns)
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 18.4 s, sys: 2.19 s, total: 20.6 s
Wall time: 20.6 s


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,29918
id002,30343
id003,30180
id004,30581
id005,30769
...,...
id096,30011
id097,29728
id098,30131
id099,30141


## 1e8 Delta Lake

In [20]:
%%time

# Open the 1e8 Delta table and expose it as a PyArrow dataset so the row filter
# and column projection are applied during the scan.
dt = DeltaTable(f"{pathlib.Path.home()}/data/delta/G1_1e8_1e2_0_0")
dataset = dt.to_pyarrow_dataset()
condition = ds.field("id1") == "id016"
(
    # `filter=condition` already restricts the rows to id1 == 'id016', so the
    # pandas side does not re-filter and `id1` is not materialised at all.
    dataset.to_table(filter=condition, columns=["id2", "v1"])
    .to_pandas()
    .groupby("id2")
    .agg({"v1": "sum"})
)

CPU times: user 2.11 s, sys: 219 ms, total: 2.33 s
Wall time: 970 ms


Unnamed: 0_level_0,v1
id2,Unnamed: 1_level_1
id001,29918
id002,30343
id003,30180
id004,30581
id005,30769
...,...
id096,30011
id097,29728
id098,30131
id099,30141


## 1e9 CSV

In [None]:
%%time

# Baseline on the 1e9-row CSV: load only the three columns the query touches.
# The frame is bound to a name so it can be deleted explicitly afterwards.
csv_1e9_path = "~/data/G1_1e9_1e2_0_0.csv"
csv_1e9_columns = ["id1", "id2", "v1"]
df_csv_1e9_usecols = pd.read_csv(csv_1e9_path, usecols=csv_1e9_columns)

# Filter on id1, then sum v1 per id2; the last expression is the cell output.
(
    df_csv_1e9_usecols
    .query("id1 == 'id016'")
    .groupby("id2")
    .agg({"v1": "sum"})
)

In [None]:
# Unbind the 1e9 CSV frame from the kernel namespace.
# NOTE(review): presumably done to release its memory before the next
# benchmark runs — confirm no other reference keeps the frame alive.
del df_csv_1e9_usecols

## 1e9 Delta Lake

In [None]:
%%time

# Open the 1e9 Delta table and expose it as a PyArrow dataset so the row filter
# and column projection are applied during the scan.
dt = DeltaTable(f"{pathlib.Path.home()}/data/delta/G1_1e9_1e2_0_0")
dataset = dt.to_pyarrow_dataset()
condition = ds.field("id1") == "id016"
(
    # `filter=condition` already restricts the rows to id1 == 'id016', so the
    # pandas side does not re-filter and `id1` is not materialised at all.
    dataset.to_table(filter=condition, columns=["id2", "v1"])
    .to_pandas()
    .groupby("id2")
    .agg({"v1": "sum"})
)