In [1]:
import tidypolars as tp
import polars as pl
from polars import col
import pandas as pd
import numpy as np
from timeit import timeit
import string

np.random.seed(123)

letters = np.asarray(list(string.ascii_lowercase))
letters = np.random.choice(letters, 10)

data_size = 1000000
# data_size = 1000
rand_string = np.random.choice(letters, data_size)
for i in range(3):
    rand_string = np.char.add(rand_string, np.random.choice(letters, data_size))

tidypolars_df = tp.Tibble(
    a = np.random.choice(np.arange(20), data_size),
    b = np.random.choice(np.arange(20), data_size),
    c = np.random.choice(rand_string, data_size),
    d = np.random.choice(rand_string, data_size)
)
polars_df = tidypolars_df.clone().to_polars()
pandas_df = polars_df.to_pandas()

In [2]:
def benchmark_me(d, num_tests):
    for i, (key, value) in enumerate(d.items()):
        if i == 0:
            out = tp.Tibble({key: [timeit(value, number = num_tests)]})
        else:
            step = tp.Tibble({key: [timeit(value, number = num_tests)]})
            out = out.bind_cols(step)
    return out.mutate((tp.col_everything() * 1000).round(3).cast(pl.Float64))

In [37]:
median_x = tidypolars_df.summarize(avg = col('a').mean()).pull('avg')[0]

summarize_funcs = { 
    'filter' : dict(
        tidypolars = lambda: tidypolars_df.filter(col('a') <= 7, col('c') == 'brkc'),
        polars = lambda: polars_df.filter((col('a') <= 7) & (col('c') == 'brkc')),
        pandas = lambda: pandas_df[(pandas_df.a <= 7) & (pandas_df.c == 'brkc')]
    ),
    'summarize': dict(
        tidypolars = lambda: tidypolars_df.summarize(x = col('a').median(), groupby = 'c'),
        polars = lambda: polars_df.groupby('c').agg(col('a').median().alias('x')),
        pandas = lambda: pandas_df.groupby('c', as_index = False)['a'].median()
    ), 
    'distinct': dict(
        tidypolars = lambda: tidypolars_df.summarize(x = tp.n_distinct(col('a')), groupby = 'c'),
        polars = lambda: polars_df.groupby('c').agg(col('a').n_unique().alias('x')),
        pandas = lambda: pandas_df.groupby('c', as_index = False)['a'].nunique()
    ), 
    'mutate' : dict(
        tidypolars = lambda: tidypolars_df.mutate(double_a = col('a') * 2, a_plus_b = col('a') + col('b')),
        polars = lambda: polars_df.with_columns([(col('a') * 2).alias('double_a'), (col('a') + col('b')).alias('a_plus_b')]), 
        pandas = lambda: pandas_df.assign(double_a = lambda x: x.a * 2, a_plus_b = lambda x : x.a + x.b)
        ), 
    'case_when' : dict(
        tidypolars = lambda: tidypolars_df.mutate(x_case = tp.case_when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)),
        polars = lambda: polars_df.with_columns(pl.when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)), 
        pandas = lambda: pandas_df.assign(x_case = lambda x : np.where(x.a > median_x , 1, (np.where(x.a >= median_x, 2, 3))))
    ), 
    'arrange' : dict(
        tidypolars = lambda: tidypolars_df.arrange('a'),
        polars = lambda: polars_df.sort(col('a')), 
        pandas = lambda: pandas_df.sort_values(by=['a'])
    ), 
    'left_join' : dict(
        tidypolars = lambda: tidypolars_df.left_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'left'), 
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='left', rsuffix='right_')
    ), 
    'inner_join' : dict(
        tidypolars = lambda: tidypolars_df.inner_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'inner'), 
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='inner', rsuffix='right_')
    ), 
    # 'right_join' : dict(
    #     tidypolars = lambda: tidypolars_df.right_join(tidypolars_df.head(1000), on = 'c'),
    #     polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'right'), 
    #     pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='right', rsuffix='right_')
    # ), 
    # 'pivot_wider' : dict(
    #     tidypolars = lambda: tidypolars_df.summarize(a = col('a').sum(), groupby = ['c', 'd']).pivot_wider(names_from = 'c', values_from = 'a', values_fill = 0),
    #     polars = lambda: polars_df.groupby('c').pivot(pivot_column='c', values_column='a').sum(), 
    #     pandas = lambda: pandas_df.groupby(['c', 'd'], as_index = False)['a'].sum().pivot(index = "d", columns = "c", values = "a").fillna(0)
    # )
}


# Benchmark Results

In [38]:
for i, (key, value) in enumerate(summarize_funcs.items()):
    value = benchmark_me(value, num_tests = 5).mutate(func_tested = tp.lit(key)).relocate('func_tested')
    if i == 0:
        bench_df = value
    else:
        bench_df = bench_df.bind_rows(value)
        
bench_df

AttributeError: right_join not found

In [39]:
# tidypolars_df.left_join(tidypolars_df.head(1000), on = 'c')
# polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'left')
# pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='left', rsuffix='right_')
tidypolars_df.right_join(tidypolars_df.head(1000), on = 'c')

AttributeError: right_join not found

In [25]:
# ?pl.DataFrame.slice
# polars_df.slice(0, 1000)
pandas_df.head(1000).set_index('c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cbzg,19,3,wntz
brnw,7,3,rgkb
ztzb,17,9,tbcg
cncb,1,10,tncr
cbcb,4,13,tccc
...,...,...,...
wbzg,12,5,cctr
wckk,5,13,gckb
ckrr,3,4,bnbk
gkzt,16,14,nzcb
