In [1]:
import pandas as pd
import polars as pl
import numpy as np
import time

In [2]:
def gen_csv(path="data.csv", num_rows=50_000_000, seed=None):
    """Write a synthetic benchmark dataset to a CSV file.

    The generated frame has three columns:

    * ``user``  -- sampled from ``'user1'``, ``'user2'``, ``'user3'``
    * ``num_1`` -- integers sampled from the half-open range ``[0, 100)``
    * ``cat_1`` -- sampled from ``'a'``, ``'b'``, ``'c'``, ``'d'``

    Args:
        path: Destination of the CSV file. Defaults to ``"data.csv"`` in the
            current working directory.
        num_rows: Number of rows to generate. Defaults to 50,000,000.
        seed: Optional seed for NumPy's random generator. With the default
            ``None`` every call produces different data; pass an integer to
            make the dataset reproducible.
    """
    rng = np.random.default_rng(seed)
    df = pl.DataFrame({
        'user': rng.choice(['user1', 'user2', 'user3'], num_rows),
        'num_1': rng.integers(0, 100, num_rows),
        'cat_1': rng.choice(['a', 'b', 'c', 'd'], num_rows),
    })
    df.write_csv(path)


# One-off data generation, left disabled. Uncomment and run once if data.csv
# does not exist yet (generating the default 50M rows can take a while).
# gen_csv()

In [3]:
# Benchmark: eager CSV load of the same file with pandas and with polars.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock intended for measuring elapsed time.
# pd_df and pl_df are kept as globals for the cells that follow.
t0 = time.perf_counter()
pd_df = pd.read_csv('./data.csv')
print('pd read time:', pd_elapsed := time.perf_counter() - t0)

t0 = time.perf_counter()
pl_df = pl.read_csv('./data.csv')
print('pl read time:', pl_elapsed := time.perf_counter() - t0)
# Relative reduction in elapsed time, expressed as a percentage of the pandas time.
print(f'pl is faster than pd by {(pd_elapsed - pl_elapsed) / pd_elapsed * 100} percent')

pd read time: 10.636310577392578
pl read time: 0.46529173851013184
pl is faster than pd by 95.62544046523888 percent


In [4]:
# Display the 'user' column of the pandas frame; as the last expression of the
# cell it is rendered as the cell output (pandas shows a truncated preview).
pd_df['user']

0           user3
1           user2
2           user2
3           user1
4           user3
            ...  
49999995    user1
49999996    user3
49999997    user3
49999998    user3
49999999    user3
Name: user, Length: 50000000, dtype: object

In [5]:
# Compare the in-memory footprint of the two frames.
# pandas reports bytes per column (deep=True also inspects object contents),
# so the total is converted by dividing by 1024**2.
bytes_per_mb = 1024**2
pd_total_bytes = pd_df.memory_usage(deep=True).sum()
print('pd df memory usage', pd_total_bytes / bytes_per_mb, 'MB')

# polars is asked for its size estimate directly in 'mb'.
pl_total_mb = pl_df.estimated_size('mb')
print('pl df memory usage', pl_total_mb, 'MB')

pd df memory usage 6103.5157470703125 MB
pl df memory usage 667.572021484375 MB


In [6]:
# Benchmark: keep rows with num_1 <= 10, then count distinct values in the
# selected categorical column(s).
cats = ['cat_1']  # plain string literal; the original f-string had no placeholders

# time.perf_counter() is a monotonic, high-resolution clock suited to timing.
t0 = time.perf_counter()
pd_df[pd_df['num_1'] <= 10][cats].nunique()
print('pd filter and select time:', pd_elapsed := time.perf_counter() - t0)

t0 = time.perf_counter()
pl_df.filter(pl.col("num_1") <= 10).select(pl.col(cats).n_unique())
print('pl filter and select time:', pl_elapsed := time.perf_counter() - t0)
# Relative reduction in elapsed time, expressed as a percentage of the pandas time.
print(f'pl is faster than pd by {(pd_elapsed - pl_elapsed) / pd_elapsed * 100} percent')

pd filter and select time: 0.3846902847290039
pl filter and select time: 0.0881507396697998
pl is faster than pd by 77.08527010712064 percent


In [7]:
# Display the polars frame; as the last expression of the cell it is rendered
# as the cell output, including the column names and dtypes.
pl_df

user,num_1,cat_1
str,i64,str
"""user3""",37,"""a"""
"""user2""",2,"""a"""
"""user2""",13,"""a"""
"""user1""",16,"""a"""
"""user3""",39,"""c"""
…,…,…
"""user1""",97,"""a"""
"""user3""",57,"""a"""
"""user3""",85,"""c"""
"""user3""",6,"""c"""


In [8]:
# Benchmark: group by 'user' and compute sum/median/mean/min/max of num_1 plus
# the number of distinct cat_1 values, in pandas and in polars.
# The printed labels now say "group-by and aggregate" (they previously read
# "filter and select", which described a different cell).
# time.perf_counter() is a monotonic, high-resolution clock suited to timing.
t0 = time.perf_counter()
res_df = pd_df.groupby('user').agg({'num_1': ['sum', 'median', 'mean', 'min', 'max'], 'cat_1': 'nunique'})
print('pd group-by and aggregate time:', pd_elapsed := time.perf_counter() - t0)
print(res_df)

t0 = time.perf_counter()
# Each expression is renamed with a suffix so the output columns stay distinct.
res_df = pl_df.group_by('user').agg(
    pl.col('num_1').sum().name.suffix('_sum'),
    pl.col('num_1').median().name.suffix('_median'),
    pl.col('num_1').mean().name.suffix('_mean'),
    pl.col('num_1').min().name.suffix('_min'),
    pl.col('num_1').max().name.suffix('_max'),
    pl.col('cat_1').n_unique().name.suffix('_nunique')
)
print('pl group-by and aggregate time:', pl_elapsed := time.perf_counter() - t0)
print(res_df)
# Relative reduction in elapsed time, expressed as a percentage of the pandas time.
print(f'pl is faster than pd by {(pd_elapsed - pl_elapsed) / pd_elapsed * 100} percent')

pd filter and select time: 4.763331890106201
           num_1                             cat_1
             sum median       mean min max nunique
user                                              
user1  825244928   49.0  49.494851   0  99       4
user2  824851217   50.0  49.503939   0  99       4
user3  824909339   49.0  49.501542   0  99       4
pl filter and select time: 2.124401569366455
shape: (3, 7)
┌───────┬───────────┬──────────────┬────────────┬───────────┬───────────┬───────────────┐
│ user  ┆ num_1_sum ┆ num_1_median ┆ num_1_mean ┆ num_1_min ┆ num_1_max ┆ cat_1_nunique │
│ ---   ┆ ---       ┆ ---          ┆ ---        ┆ ---       ┆ ---       ┆ ---           │
│ str   ┆ i64       ┆ f64          ┆ f64        ┆ i64       ┆ i64       ┆ u32           │
╞═══════╪═══════════╪══════════════╪════════════╪═══════════╪═══════════╪═══════════════╡
│ user3 ┆ 824909339 ┆ 49.0         ┆ 49.501542  ┆ 0         ┆ 99        ┆ 4             │
│ user2 ┆ 824851217 ┆ 50.0         ┆ 49.503939  ┆ 

In [9]:
# Benchmark: sort the whole frame by num_1 in ascending order.
# The sorted results are discarded; only the elapsed time is of interest.
# time.perf_counter() is a monotonic, high-resolution clock suited to timing.
t0 = time.perf_counter()
pd_df.sort_values('num_1', ascending=True)
print('pd sort time:', pd_elapsed := time.perf_counter() - t0)

t0 = time.perf_counter()
pl_df.sort('num_1', descending=False)
print('pl sort time:', pl_elapsed := time.perf_counter() - t0)
# Relative reduction in elapsed time, expressed as a percentage of the pandas time.
print(f'pl is faster than pd by {(pd_elapsed - pl_elapsed) / pd_elapsed * 100} percent')

pd sort time: 7.549022674560547
pl sort time: 2.3664631843566895
pl is faster than pd by 68.6520588641039 percent
