In [None]:
!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework 

In [None]:
import pandas as pd
import polars as pl

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True

In [None]:
from datetime import date
from typing import Any

def q4_pandas_native(
    line_item_ds: Any,
    orders_ds: Any,
):
    """Count late orders per order priority using plain pandas calls.

    Line items are joined to their orders, restricted to orders dated in
    [1993-07-01, 1993-10-01) whose line item was received after its commit
    date, and reduced to one row per (priority, order) before counting.

    Args:
        line_item_ds: Frame providing ``l_orderkey``, ``l_commitdate`` and
            ``l_receiptdate``.
        orders_ds: Frame providing ``o_orderkey``, ``o_orderdate`` and
            ``o_orderpriority``.

    Returns:
        A frame with columns ``o_orderpriority`` and ``order_count``, sorted
        by ``o_orderpriority``.
    """
    window_start = date(1993, 7, 1)
    window_end = date(1993, 10, 1)

    joined = line_item_ds.merge(
        orders_ds,
        left_on="l_orderkey",
        right_on="o_orderkey",
    )

    # Build the three row conditions separately, then combine them in one mask.
    before_end = joined["o_orderdate"] < window_end
    from_start = joined["o_orderdate"] >= window_start
    received_late = joined["l_commitdate"] < joined["l_receiptdate"]
    late_rows = joined[before_end & from_start & received_late]

    # An order with several late line items must be counted only once.
    one_row_per_order = late_rows.drop_duplicates(
        subset=["o_orderpriority", "l_orderkey"]
    )

    per_priority = one_row_per_order.groupby("o_orderpriority", as_index=False)
    counts = per_priority.agg(
        order_count=pd.NamedAgg(column="o_orderkey", aggfunc="count")
    )

    return counts.sort_values(["o_orderpriority"])  # type: ignore[no-any-return]

In [None]:
from typing import Any
from datetime import datetime
import narwhals as nw

def q4(
    lineitem_ds_raw: Any,
    orders_ds_raw: Any,
) -> Any:
    """Count late orders per order priority through the Narwhals API.

    Both inputs are wrapped with ``nw.from_native``; the result is converted
    back with ``nw.to_native`` before it is returned.

    Args:
        lineitem_ds_raw: Native frame providing ``l_orderkey``,
            ``l_commitdate`` and ``l_receiptdate``.
        orders_ds_raw: Native frame providing ``o_orderkey``, ``o_orderdate``
            and ``o_orderpriority``.

    Returns:
        Native frame with ``o_orderpriority`` and an Int64 ``order_count``
        column, sorted by ``o_orderpriority``.
    """
    window_start = datetime(1993, 7, 1)
    window_end = datetime(1993, 10, 1)

    lineitem_nw = nw.from_native(lineitem_ds_raw)
    orders_nw = nw.from_native(orders_ds_raw)

    joined = lineitem_nw.join(
        orders_nw, left_on="l_orderkey", right_on="o_orderkey"
    )

    # closed="left" keeps window_start and excludes window_end.
    late_in_window = joined.filter(
        nw.col("o_orderdate").is_between(window_start, window_end, closed="left"),
        nw.col("l_commitdate") < nw.col("l_receiptdate"),
    )

    # One row per (priority, order) so each order is counted once.
    one_row_per_order = late_in_window.unique(
        subset=["o_orderpriority", "l_orderkey"]
    )

    counts = (
        one_row_per_order.group_by("o_orderpriority")
        .agg(nw.len().alias("order_count"))
        .sort(by="o_orderpriority")
    )
    counts = counts.with_columns(nw.col("order_count").cast(nw.Int64))

    return nw.to_native(counts)

In [None]:
from typing import Any
from datetime import datetime
import ibis

def q4_ibis(
    lineitem: Any,
    orders: Any,
    *,
    tool: str
) -> Any:
    """Count late orders per order priority with an ibis expression.

    Args:
        lineitem: ibis table providing ``l_orderkey``, ``l_commitdate`` and
            ``l_receiptdate``.
        orders: ibis table providing ``o_orderkey``, ``o_orderdate`` and
            ``o_orderpriority``.
        tool: Output format, either ``'pandas'`` or ``'polars'``.

    Returns:
        The executed query, materialised with ``to_pandas()`` or
        ``to_polars()`` according to ``tool``.

    Raises:
        ValueError: If ``tool`` is neither ``'pandas'`` nor ``'polars'``.
    """
    window_start = datetime(1993, 7, 1)
    window_end = datetime(1993, 10, 1)

    joined = lineitem.join(orders, lineitem["l_orderkey"] == orders["o_orderkey"])
    in_window = joined.filter(
        (orders["o_orderdate"] >= window_start) & (orders["o_orderdate"] < window_end)
    )
    received_late = in_window.filter(
        lineitem["l_commitdate"] < lineitem["l_receiptdate"]
    )

    # Keep one row per (priority, order) before counting rows per priority.
    q_final = (
        received_late.distinct(on=["o_orderpriority", "l_orderkey"])
        .group_by("o_orderpriority")
        .agg(order_count=ibis._.count())
        .order_by("o_orderpriority")
    )

    # The expression is built before `tool` is checked, as in the original flow.
    if tool not in ('pandas', 'polars'):
        raise ValueError("expected pandas or polars")
    return q_final.to_pandas() if tool == 'pandas' else q_final.to_polars()

In [None]:
# Folder holding the parquet inputs; it ends with "/" so file names are appended directly.
dir_ = "/kaggle/input/tpc-h-data-parquet-s-2/"

# The two tables read by the timing cells visible in this notebook.
lineitem = dir_ + "lineitem.parquet"
orders = dir_ + "orders.parquet"

# Paths of the remaining tables in the same folder.
customer = dir_ + "customer.parquet"
nation = dir_ + "nation.parquet"
part = dir_ + "part.parquet"
partsupp = dir_ + "partsupp.parquet"
region = dir_ + "region.parquet"
supplier = dir_ + "supplier.parquet"

In [None]:
import ibis

# ibis connections used by the "[ibis]" readers below.
# NOTE(review): `ibis.pandas` may be unavailable in recent ibis-framework releases,
# and the install cell does not pin a version -- confirm before re-running.
con_pd = ibis.pandas.connect()
con_pl = ibis.polars.connect()

# Benchmark label -> one-argument reader that takes a parquet path and returns
# the table object for that backend.
IO_FUNCS = {
    # pandas readers
    'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),
    'pandas[pyarrow]': lambda x: pd.read_parquet(
        x, engine='pyarrow', dtype_backend='pyarrow'
    ),
    'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(
        x, engine='pyarrow', dtype_backend='pyarrow'
    ),
    # polars readers
    'polars[eager]': lambda x: pl.read_parquet(x),
    'polars[lazy]': lambda x: pl.scan_parquet(x),
    'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),
}

In [None]:
# Timings collected by the cells below, keyed by benchmark label.
results = dict()

## polars, lazy, via ibis

In [None]:
tool = 'polars[lazy][ibis]'
fn = IO_FUNCS[tool]
timings = %timeit -o q4_ibis(fn(lineitem), fn(orders), tool='polars')
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, native (no Narwhals)

In [None]:
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q4_pandas_native(fn(lineitem), fn(orders))
results[tool+'[native]'] = timings.all_runs

## pandas via Narwhals

In [None]:
tool = 'pandas'
fn = IO_FUNCS[tool]
timings = %timeit -o q4(fn(lineitem), fn(orders))
results[tool] = timings.all_runs

## pandas, pyarrow dtypes, via Narwhals

In [None]:
tool = 'pandas[pyarrow]'
fn = IO_FUNCS[tool]
timings = %timeit -o q4(fn(lineitem), fn(orders))
results[tool] = timings.all_runs

## Polars read_parquet

In [None]:
tool = 'polars[eager]'
fn = IO_FUNCS[tool]
timings = %timeit -o q4(fn(lineitem), fn(orders))
results[tool] = timings.all_runs

## Polars scan_parquet

In [None]:
tool = 'polars[lazy]'
fn = IO_FUNCS[tool]
timings = %timeit -o q4(fn(lineitem), fn(orders)).collect()
results[tool] = timings.all_runs

## Save

In [None]:
import json
# Write the collected timings to results.json in the current working directory.
with open('results.json', 'w') as out_file:
    out_file.write(json.dumps(results))
