In [1]:
import polars as pl
import numpy as np

In [2]:
# Number of synthetic rows generated for each example dataset.
run_rows = 5_000
# Seeded NumPy generator so the random columns below start from a fixed state.
rng = np.random.default_rng(42)

In [3]:
# Synthetic columns for the eager example, one array of length run_rows each.
# Keyword order fixes both the column order and the order in which values are
# drawn from the shared rng.
buildings_data = dict(
    sqft=rng.exponential(scale=1000, size=run_rows),
    year=rng.integers(low=1900, high=2022, size=run_rows),  # high bound is exclusive
    building_type=rng.choice(["house", "apartment", "office"], size=run_rows),
)

In [4]:
# Materialise the generated columns as an eager Polars DataFrame.
buildings = pl.DataFrame(data=buildings_data)

In [5]:
# Display the eager DataFrame; the notebook renders a truncated preview of its rows.
buildings

sqft,year,building_type
f64,i64,str
2404.208604,1940,"""office"""
2336.189656,1902,"""apartment"""
2384.761,1988,"""apartment"""
279.79429,1964,"""office"""
86.4374,1955,"""apartment"""
…,…,…
667.344971,2003,"""house"""
3.45844,1996,"""apartment"""
618.507533,1902,"""house"""
63.796408,1984,"""office"""


In [6]:
# Inspect the column names and the dtypes Polars assigned to each column.
buildings.schema

Schema([('sqft', Float64), ('year', Int64), ('building_type', String)])

In [None]:
# Summary statistics per column: count, null count, mean, std, min, quartiles, max.
buildings.describe()

statistic,sqft,year,building_type
str,f64,f64,str
"""count""",5000.0,5000.0,"""5000"""
"""null_count""",0.0,0.0,"""0"""
"""mean""",983.760414,1960.2354,
"""std""",986.045262,35.221972,
"""min""",0.071226,1900.0,"""apartment"""
"""25%""",289.645565,1930.0,
"""50%""",692.554718,1961.0,
"""75%""",1344.965604,1990.0,
"""max""",9518.1197,2021.0,"""office"""


In [8]:
# Per-building-type summary: mean sqft, median year, and number of rows.
# pl.len() replaces the deprecated pl.count(); the alias keeps the output
# column named "count". The final sort makes the row order stable, because
# group_by does not guarantee the order of groups by default.
(
    buildings
    .group_by("building_type")
    .agg(
        pl.col("sqft").mean().alias("mean_sqft"),
        pl.col("year").median().alias("median_year"),
        pl.len().alias("count"),
    )
    .sort("building_type")
)

  pl.count(),


building_type,mean_sqft,median_year,count
str,f64,f64,u32
"""apartment""",953.724951,1959.0,1648
"""house""",1002.747608,1959.0,1698
"""office""",994.194628,1962.0,1654


## Lazy evaluation with `LazyFrame`

In [11]:
# Synthetic columns for the lazy example: the same three columns as the eager
# dataset plus a price column. Values are drawn afresh from the shared rng, in
# the keyword order given here.
buildings_data_lazy = dict(
    sqft=rng.exponential(scale=1000, size=run_rows),
    year=rng.integers(low=1900, high=2022, size=run_rows),  # high bound is exclusive
    building_type=rng.choice(["house", "apartment", "office"], size=run_rows),
    price=rng.exponential(scale=100_000, size=run_rows),
)

In [12]:
# Wrap the generated columns in a LazyFrame so later operations build a query plan.
buildings_lazy = pl.LazyFrame(data=buildings_data_lazy)

In [13]:
# Show the LazyFrame's notebook representation (no .collect() is called here).
buildings_lazy

In [14]:
# Build (but do not execute) a query: derive price per square foot, then keep
# rows above 100 per sqft from years before 2000. The two filters are kept as
# separate steps, in the same order.
lazy_query = (
    buildings_lazy
    .with_columns(price_per_sqft=pl.col("price") / pl.col("sqft"))
    .filter(pl.col("price_per_sqft").gt(100))
    .filter(pl.col("year").lt(2000))
)

In [15]:
# Show the lazy query's notebook representation; the query is only defined, not collected.
lazy_query

In [17]:
# Disabled: show_graph() needs Graphviz, which is missing from this environment.
# lazy_query.show_graph()


In [18]:
# Print the textual query plan; print() keeps the multi-line string's line breaks.
print(lazy_query.explain())

FILTER [(col("price_per_sqft")) > (100.0)] FROM
   WITH_COLUMNS:
   [[(col("price")) / (col("sqft"))].alias("price_per_sqft")] 
    DF ["sqft", "year", "building_type", "price"]; PROJECT */4 COLUMNS; SELECTION: [(col("year")) < (2000)]
