# Polars: CSV vs Parquet vs Delta Lake

In [5]:
import polars as pl
import gc

In [12]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects it found. Presumably here to release memory left over
# from earlier runs before timing the queries — confirm intent.
gc.collect()

0

## 1e8 CSV

In [14]:
%%time

(
    pl.read_csv("~/data/G1_1e8_1e2_0_0.csv")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
)

CPU times: user 16.2 s, sys: 5.5 s, total: 21.7 s
Wall time: 3.73 s


id1,v1
str,i64
"""id016""",2998623


In [15]:
%%time

(
    pl.scan_csv("~/data/G1_1e8_1e2_0_0.csv")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
).collect()

CPU times: user 7.44 s, sys: 535 ms, total: 7.97 s
Wall time: 916 ms


id1,v1
str,i64
"""id016""",2998623


## 1e8 Delta Lake

In [16]:
%%time

(
    pl.read_delta("~/data/delta/G1_1e8_1e2_0_0")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
)

CPU times: user 7.01 s, sys: 3.28 s, total: 10.3 s
Wall time: 3.89 s


id1,v1
str,i32
"""id016""",2998623


In [17]:
%%time

(
    pl.scan_delta("~/data/delta/G1_1e8_1e2_0_0")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
).collect()

CPU times: user 1.27 s, sys: 161 ms, total: 1.43 s
Wall time: 829 ms


id1,v1
str,i32
"""id016""",2998623


## 1e9 Parquet

In [6]:
%%time

(
    pl.scan_parquet("~/data/G1_1e9_1e2_0_0.parquet")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
).collect(streaming=True)

CPU times: user 25.3 s, sys: 32.9 s, total: 58.2 s
Wall time: 11.5 s


id1,v1
str,i64
"""id016""",30003304


In [4]:
%%time
(
    pl.read_parquet("~/data/G1_1e9_1e2_0_0.parquet")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
)

CPU times: user 1min 48s, sys: 3min 39s, total: 5min 27s
Wall time: 5min 30s


id1,v1
str,i64
"""id016""",30003304


## 1e9 Delta Lake

In [2]:
%%time

(
    pl.scan_delta("~/data/delta/G1_1e9_1e2_0_0")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
).collect()

CPU times: user 1.68 s, sys: 295 ms, total: 1.98 s
Wall time: 1.11 s


id1,v1
str,i32
"""id016""",30003304


## 1e9 CSV

In [4]:
%%time

(
    pl.scan_csv("~/data/G1_1e9_1e2_0_0.csv")
    .filter(pl.col("id1") == 'id016')
    .groupby("id1")
    .agg(pl.sum("v1"))
).collect()

CPU times: user 1min 8s, sys: 36.6 s, total: 1min 45s
Wall time: 34.2 s


id1,v1
str,i64
"""id016""",30003304
