In [2]:
import polars as pl
import polars.selectors as cs
import numpy as np
from datetime import datetime

In [4]:
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2025, 1, 1),
            datetime(2025, 1, 2),
            datetime(2025, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)

print(df)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘


In [11]:
df.select(pl.col("date").sort(descending=True), pl.col("integer").sum())

date,integer
datetime[μs],i64
2025-01-03 00:00:00,6
2025-01-02 00:00:00,6
2025-01-01 00:00:00,6


In [74]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.418254 ┆ A      │
│ 2    ┆ ham   ┆ 0.382234 ┆ A      │
│ 3    ┆ spam  ┆ 0.164418 ┆ B      │
│ null ┆ egg   ┆ 0.064461 ┆ C      │
│ 5    ┆ null  ┆ 0.975669 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [62]:
df.select((pl.col("groups") == "A").sum())

groups
u32
2


In [64]:
df.select(pl.all().approx_n_unique())

nrs,names,random,groups
u32,u32,u32,u32
5,5,5,3


In [67]:
df.select(pl.col("random"), pl.when(pl.col("nrs") > 1).then(pl.lit(True)).otherwise(pl.lit(False)))

random,literal
f64,bool
0.597688,False
0.903967,True
0.950718,True
0.839258,False
0.39649,True


In [71]:
df.select(pl.col("name").cast(pl.Boolean))

ColumnNotFoundError: name

Error originated just after this operation:
DF ["nrs", "names", "random", "groups"]; PROJECT */4 COLUMNS; SELECTION: "None"

In [76]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.418254,"""A"""
2.0,"""ham""",0.382234,"""A"""
3.0,"""spam""",0.164418,"""B"""
,"""egg""",0.064461,"""C"""
5.0,,0.975669,"""B"""


In [77]:
df.select(pl.col("names").str.len_bytes())
df.select(pl.col("names").str.len_chars())

names
u32
3.0
3.0
4.0
3.0
""


In [81]:
df.select(pl.col("names").str.contains("am"))

names
bool
False
True
True
False
""


In [11]:
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"

dtypes = {
    "first_name": pl.Categorical,
    "gender": pl.Categorical,
    "type": pl.Categorical,
    "state": pl.Categorical,
    "party": pl.Categorical,
}

dataset = pl.read_csv(url, dtypes=dtypes).with_columns(
    pl.col("birthday").str.to_date(strict=False)
)

In [83]:
dataset


last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,twitter_id,facebook,youtube,youtube_id,mastodon,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,cat,str,str,str,str,date,cat,cat,cat,i64,i64,cat,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,str
"""Bassett""","""Richard""",,,,,1745-04-02,"""M""","""sen""","""DE""",,2,"""Anti-Administr…",,,,,,,,,,,,"""B000226""",,,,,,401222,,,,507,"""Richard Basset…"
"""Bland""","""Theodorick""",,,,,1742-03-21,"""M""","""rep""","""VA""",9,,,,,,,,,,,,,,"""B000546""",,,,,,401521,,,,786,"""Theodorick Bla…"
"""Burke""","""Aedanus""",,,,,1743-06-16,"""M""","""rep""","""SC""",2,,,,,,,,,,,,,,"""B001086""",,,,,,402032,,,,1260,"""Aedanus Burke"""
"""Carroll""","""Daniel""",,,,,1730-07-22,"""M""","""rep""","""MD""",6,,,,,,,,,,,,,,"""C000187""",,,,,,402334,,,,1538,"""Daniel Carroll…"
"""Clymer""","""George""",,,,,1739-03-16,"""M""","""rep""","""PA""",-1,,,,,,,,,,,,,,"""C000538""",,,,,,402671,,,,1859,"""George Clymer"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Stewart""","""Chris""",,,,"""Chris Stewart""",1960-07-15,"""M""","""rep""","""UT""",2,,"""Republican""","""https://stewar…","""166 Cannon Hou…","""202-225-9730""",,"""http://stewart…",,,,,,,"""S001192""","""02168""","""N00033932""",,"""H2UT02324""","""68466""",412581,"""135930""","""Chris Stewart""",,21367,"""Chris Stewart …"
"""Santos""","""George""","""Anthony Devold…",,,"""George Santos""",1988-07-22,"""M""","""rep""","""NY""",3,,"""Republican""","""https://santos…","""1117 Longworth…","""202-225-3335""",,,,,,,,,"""S001222""",,,,"""H0NY03083""",,456921,"""191234""","""George Devolde…",,,"""George Santos"""
"""McCarthy""","""Kevin""",,,,"""Kevin McCarthy…",1965-01-26,"""M""","""rep""","""CA""",20,,"""Republican""","""https://kevinm…","""2468 Rayburn H…","""202-225-2915""",,"""http://kevinmc…",,,,,,,"""M001165""","""01833""","""N00028152""",,"""H6CA22125""","""85231""",412190,"""28918""","""Kevin McCarthy…",,20703,"""Kevin McCarthy…"
"""Johnson""","""Bill""",,,,"""Bill Johnson""",1954-11-10,"""M""","""rep""","""OH""",6,,"""Republican""","""https://billjo…","""2082 Rayburn H…","""202-225-5705""",,"""http://billjoh…",,,,,,,"""J000292""","""02046""","""N00032088""",,"""H0OH06189""","""623472""",412460,"""120649""","""Bill Johnson (…",,21162,"""Bill Johnson (…"


In [19]:
q = dataset.lazy().group_by("first_name").agg(pl.len(),
                                   pl.col("gender"),
                                   pl.first("last_name")
                                   ).sort("len", descending=True).limit(5).show_graph(optimized=False)

#q.explain(optimized=True)



ImportError: Graphviz dot binary should be on your PATH

In [17]:
w = dataset.lazy().group_by("state").agg((pl.col("party") == "Anti-Administration").sum().alias("antigroup"),
                                         (pl.col("party") == "Pro-Administration").sum().alias("pro")).sort('state').collect()



AttributeError: 'DataFrame' object has no attribute 'show_graph'

In [4]:
q3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()

print(q3.schema)

OrderedDict([('foo', String), ('bar', Int64)])


In [9]:
q4 = (
    pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]})
    .lazy()
    .with_columns(pl.col("bar").round(0))
)
q4.explain(optimized=True)

q4.collect()

InvalidOperationError: `round` operation not supported for dtype `i64`

In [21]:
lf = pl.LazyFrame(
    {
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    }
)
lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort(
    "a"
).show_graph()  

ImportError: Graphviz dot binary should be on your PATH