# 🚀 Live Coding: Polars

## Setup

In [2]:
import polars as pl
from pathlib import Path
import time

# Your database path
p_departement = Path.cwd().parent / "data" / "villes_france.db"
connection_string = f"sqlite:///{p_departement}"

In [4]:
# Loading DATA with SQL
from sqlalchemy import create_engine

# Create SQLAlchemy engine
engine = create_engine(connection_string)


start = time.time()
df_villes = pl.read_database("""
    SELECT v.*, d.departement_nom 
    FROM villes v
    LEFT JOIN departement d ON v.department = d.departement_code
""", engine)
df_villes

id,department,name,simple_name,population_2012,surface,commune_code,departement_nom
i64,str,str,str,i64,f64,str,str
1,"""01""","""OZAN""","""ozan""",500,6.6,"""01284""","""Ain"""
2,"""01""","""CORMORANCHE-SUR-SAONE""","""cormoranche sur saone""",1000,9.85,"""01123""","""Ain"""
3,"""01""","""PLAGNE""","""plagne""",100,6.2,"""01298""","""Ain"""
4,"""01""","""TOSSIAT""","""tossiat""",1400,10.17,"""01422""","""Ain"""
5,"""01""","""POUILLAT""","""pouillat""",100,6.23,"""01309""","""Ain"""
…,…,…,…,…,…,…,…
36825,"""976""","""SADA""","""sada""",10195,10.92,"""97616""","""Mayotte"""
36826,"""976""","""TSINGONI""","""tsingoni""",10454,34.76,"""97617""","""Mayotte"""
36827,"""971""","""SAINT BARTHELEMY""","""st barthelemy""",8938,24.0,"""97123""","""Guadeloupe"""
36828,"""971""","""SAINT MARTIN""","""st martin""",36979,53.2,"""97127""","""Guadeloupe"""


In [22]:
# We can obviously load csv very easily,
# Read csv accept url
df_titanic = pl.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv")
df_titanic

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
888,1,1,"""Graham, Miss Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S"""
889,0,3,"""Johnston, Miss Catherine Helen…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S"""
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""


## Data Exploration - Cleaner Syntax

### Quick peek at our data

In [7]:
print(f"\nSchema: {df_titanic.schema}")


Schema: Schema([('PassengerId', Int64), ('Survived', Int64), ('Pclass', Int64), ('Name', String), ('Sex', String), ('Age', Float64), ('SibSp', Int64), ('Parch', Int64), ('Ticket', String), ('Fare', Float64), ('Cabin', String), ('Embarked', String)])


In [13]:
# SQL-like expression

# Let's get the people with the most expensive tickets
biggest_polars = (
    df_titanic
    .sort('Fare', descending=True)
    .head(10)
    .select(['Name', 'Sex', 'Age', 'Fare'])
)

biggest_polars

Name,Sex,Age,Fare
str,str,f64,f64
"""Ward, Miss Anna""","""female""",35.0,512.3292
"""Cardeza, Mr. Thomas Drake Mart…","""male""",36.0,512.3292
"""Lesurer, Mr. Gustave J""","""male""",35.0,512.3292
"""Fortune, Mr. Charles Alexander""","""male""",19.0,263.0
"""Fortune, Miss Mabel Helen""","""female""",23.0,263.0
"""Fortune, Miss Alice Elizabeth""","""female""",24.0,263.0
"""Fortune, Mr. Mark""","""male""",64.0,263.0
"""Ryerson, Miss Emily Borie""","""female""",18.0,262.375
"""Ryerson, Miss Susan Parker ""Su…","""female""",21.0,262.375
"""Baxter, Mr. Quigg Edmond""","""male""",24.0,247.5208


In [18]:
# Adding today_fare
# fare_2025 = fare_1912 * 145.97
today_fare = (
    df_titanic
    .with_columns(
        Today_fare = (pl.col('Fare') * 145.97).round(0)
    )
    .select(['Fare', 'Today_fare'])
    .sort('Today_fare', descending=True)
)
today_fare

Fare,Today_fare
f64,f64
512.3292,74785.0
512.3292,74785.0
512.3292,74785.0
263.0,38390.0
263.0,38390.0
…,…
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0


In [23]:
# Creating new column based on if/then statement
df_titanic = df_titanic.with_columns(
    pl.when(pl.col('Age') < 3)
    .then(pl.lit("infant"))
    .when(pl.col("Age") < 18)
    .then(pl.lit("children"))
    .when(pl.col('Age')< 60)
    .then(pl.lit("adult"))
    .when(pl.col('Age')> 60)
    .then(pl.lit("elderly"))
    .otherwise(pl.lit("Unknown"))
    .alias('age_group')
)
df_titanic

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_group
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""adult"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""","""adult"""
3,1,3,"""Heikkinen, Miss Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""","""adult"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""","""adult"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""","""adult"""
…,…,…,…,…,…,…,…,…,…,…,…,…
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S""","""adult"""
888,1,1,"""Graham, Miss Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S""","""adult"""
889,0,3,"""Johnston, Miss Catherine Helen…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S""","""Unknown"""
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C""","""adult"""


---

## Round 4: Window Functions - SQL Power in Python 💪


In [30]:
# Averaging by age group

df_group = df_titanic.group_by('age_group').agg(
    pl.len(),
    pl.col('Age').mean(),
    pl.col('Fare').mean(),
    (pl.col('Survived').mean()*100).round(0)
)
df_group

age_group,len,Age,Fare,Survived
str,u32,f64,f64,f64
"""children""",89,11.117978,30.131603,52.0
"""Unknown""",181,60.0,22.884344,30.0
"""adult""",575,32.158261,34.980463,39.0
"""infant""",24,1.340417,35.259896,62.0
"""elderly""",22,66.022727,41.371214,23.0


In [55]:
# Analysis by Title Extraction Analysis
# Mr, Miss, Mrs, Master ...
# we can remove everything before the first comma to see clearer
# https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_prefix.html#polars.Expr.str.strip_prefix
df_title = (df_titanic
    .with_columns(
        pl.col('Name').str.split(', ').list.get(1).str.split(" ").list.get(0).alias("title")
    )
    .with_columns(
        pl.col("Name").str.extract(r", (\w+\.?)").alias("title_regex") )
    .select(['Name', 'title', "title_regex"]))

print(df_title['title'].unique().to_list())
df_title

['Master', 'Miss', 'Mlle.', 'Mme.', 'Dr.', 'Ms.', 'Don.', 'Mrs.', 'Rev.', 'Mr.', 'Capt.', 'Sir.', 'Col.', 'the', 'Lady.', 'Major.', 'Jonkheer.']


Name,title,title_regex
str,str,str
"""Braund, Mr. Owen Harris""","""Mr.""","""Mr."""
"""Cumings, Mrs. John Bradley (Fl…","""Mrs.""","""Mrs."""
"""Heikkinen, Miss Laina""","""Miss""","""Miss"""
"""Futrelle, Mrs. Jacques Heath (…","""Mrs.""","""Mrs."""
"""Allen, Mr. William Henry""","""Mr.""","""Mr."""
…,…,…
"""Montvila, Rev. Juozas""","""Rev.""","""Rev."""
"""Graham, Miss Margaret Edith""","""Miss""","""Miss"""
"""Johnston, Miss Catherine Helen…","""Miss""","""Miss"""
"""Behr, Mr. Karl Howell""","""Mr.""","""Mr."""


In [None]:
# Joining back to the main table
df_titanic = df_titanic.join(df_title.select(['Name', 'title']), on='Name')

In [59]:
df_group_title = df_titanic.group_by('title').agg(
    pl.len().alias('count'),
    pl.col('Age').mean(),
    pl.col('Fare').mean(),
    (pl.col('Survived').mean()*100).round(0),
).sort('count', descending=True)
df_group_title

title,count,Age,Fare,Survived
str,u32,f64,f64,f64
"""Mr.""",517,32.36809,24.44156,16.0
"""Miss""",182,21.773973,43.797873,70.0
"""Mrs.""",125,35.898148,45.138533,79.0
"""Master""",40,4.574167,34.703125,57.0
"""Dr.""",7,42.0,49.168457,43.0
…,…,…,…,…
"""the""",1,33.0,86.5,100.0
"""Lady.""",1,48.0,39.6,100.0
"""Mme.""",1,24.0,69.3,100.0
"""Don.""",1,40.0,27.7208,0.0


---

## Round 6: Lazy Evaluation - The Game Changer 🎮

### Building a complex query without executing

In [61]:
# BUILD THE LAZY PIPELINE - Nothing executes yet!
lazy_analysis = (
    df_titanic
    .lazy()  # Make it lazy
    .filter(pl.col('Embarked').is_not_null())  # Filter 1
    .filter(pl.col('Age').is_not_null())       # Filter 2
    .filter(pl.col('Fare') > 0)                # Filter 3
    .with_columns([
        (pl.col('SibSp') + pl.col('Parch')).alias('family_members'),
        pl.col('Sex').map_elements(lambda x: 1 if x == 'female' else 0).alias('is_female')
    ])
    .group_by(['Embarked', 'Pclass'])
    .agg([
        pl.len().alias('passenger_count'),
        pl.col('Survived').mean().alias('survival_rate'),
        pl.col('Fare').mean().alias('avg_fare'),
        pl.col('family_members').mean().alias('avg_family_size'),
        pl.col('is_female').mean().alias('female_ratio')
    ])
    .sort('survival_rate', descending=True)
)


print(lazy_analysis.explain())
# Only NOW does the computation happen!
result = lazy_analysis.collect()
result

SORT BY [col("survival_rate")]
  AGGREGATE[maintain_order: false]
    [len().alias("passenger_count"), col("Survived").mean().alias("survival_rate"), col("Fare").mean().alias("avg_fare"), col("family_members").mean().alias("avg_family_size"), col("is_female").mean().alias("female_ratio")] BY [col("Embarked"), col("Pclass")]
    FROM
     WITH_COLUMNS:
     [[(col("SibSp")) + (col("Parch"))].alias("family_members"), col("Sex").python_udf().alias("is_female")] 
      FILTER [([(col("Age").is_not_null()) & ([(col("Fare")) > (0.0)])]) & (col("Embarked").is_not_null())]
      FROM
        DF ["PassengerId", "Survived", "Pclass", "Name", ...]; PROJECT["Survived", "Fare", "Embarked", "Pclass", ...] 8/15 COLUMNS
shape: (9, 7)
┌──────────┬────────┬─────────────────┬───────────────┬────────────┬────────────────┬──────────────┐
│ Embarked ┆ Pclass ┆ passenger_count ┆ survival_rate ┆ avg_fare   ┆ avg_family_siz ┆ female_ratio │
│ ---      ┆ ---    ┆ ---             ┆ ---           ┆ ---        ┆ e

In [None]:
# Common patterns to remember:

# SELECTING
# Pandas: df[['col1', 'col2']]
# Polars: df.select(['col1', 'col2'])

# FILTERING
# Pandas: df[df['col'] > 5]
# Polars: df.filter(pl.col('col') > 5)

# ADDING COLUMNS
# Pandas: df['new'] = df['old'] * 2
# Polars: df.with_columns((pl.col('old') * 2).alias('new'))

# GROUPING
# Pandas: df.groupby('x').agg({'y': 'sum'})
# Polars: df.group_by('x').agg(pl.col('y').sum())

# JOINING
# Pandas: pd.merge(df1, df2, on='key')
# Polars: df1.join(df2, on='key')