# 🚀 Live Coding: Polars vs Pandas with Real French Cities Data

## Setup: The Database Connection Battle

In [3]:
# First, let's see the traditional pandas way
import pandas as pd
import polars as pl
from pathlib import Path
import time

# Your database path
p_departement = Path.cwd().parent / "data" / "villes_france.db"
connection_string = f"sqlite:///{p_departement}"

# Let's time both approaches!

---

## Round 1: Loading Data - Who's Faster? ⚡

In [8]:
# PANDAS: The old reliable
start = time.time()
df_pandas = pd.read_sql("""
    SELECT v.*, d.departement_nom 
    FROM villes v
    LEFT JOIN departement d ON v.department = d.departement_code
""", connection_string)
pandas_load_time = time.time() - start
print(f"🐼 Pandas load time: {pandas_load_time:.3f}s")
print(f"Shape: {df_pandas.shape}")

🐼 Pandas load time: 0.119s
Shape: (36700, 8)


In [9]:
# POLARS: The challenger

from sqlalchemy import create_engine

# Create SQLAlchemy engine
engine = create_engine(connection_string)


start = time.time()
df_polars = pl.read_database("""
    SELECT v.*, d.departement_nom 
    FROM villes v
    LEFT JOIN departement d ON v.department = d.departement_code
""", engine)
polars_load_time = time.time() - start
print(f"🦀 Polars load time: {polars_load_time:.3f}s")
print(f"Shape: {df_polars.shape}")
print(f"\n⚡ Polars is {pandas_load_time/polars_load_time:.1f}x faster!")

🦀 Polars load time: 0.066s
Shape: (36700, 8)

⚡ Polars is 1.8x faster!


---

## Round 2: Data Exploration - Cleaner Syntax Wins 🎯

### Quick peek at our data

In [11]:
# PANDAS way
print("PANDAS APPROACH:")
df_pandas.head()


PANDAS APPROACH:


Unnamed: 0,id,department,name,simple_name,population_2012,surface,commune_code,departement_nom
0,1,1,OZAN,ozan,500,6.6,1284,Ain
1,2,1,CORMORANCHE-SUR-SAONE,cormoranche sur saone,1000,9.85,1123,Ain
2,3,1,PLAGNE,plagne,100,6.2,1298,Ain
3,4,1,TOSSIAT,tossiat,1400,10.17,1422,Ain
4,5,1,POUILLAT,pouillat,100,6.23,1309,Ain


In [12]:
# POLARS way - notice the better display!
print("POLARS APPROACH:")
df_polars.head()


POLARS APPROACH:


id,department,name,simple_name,population_2012,surface,commune_code,departement_nom
i64,str,str,str,i64,f64,str,str
1,"""01""","""OZAN""","""ozan""",500,6.6,"""01284""","""Ain"""
2,"""01""","""CORMORANCHE-SUR-SAONE""","""cormoranche sur saone""",1000,9.85,"""01123""","""Ain"""
3,"""01""","""PLAGNE""","""plagne""",100,6.2,"""01298""","""Ain"""
4,"""01""","""TOSSIAT""","""tossiat""",1400,10.17,"""01422""","""Ain"""
5,"""01""","""POUILLAT""","""pouillat""",100,6.23,"""01309""","""Ain"""


In [13]:
print(f"\nSchema: {df_polars.schema}")


Schema: Schema([('id', Int64), ('department', String), ('name', String), ('simple_name', String), ('population_2012', Int64), ('surface', Float64), ('commune_code', String), ('departement_nom', String)])


### Let's find the biggest cities in France

In [15]:
# PANDAS: Multiple ways, all slightly awkward
biggest_pandas = df_pandas.nlargest(10, 'population_2012')[['name', 'departement_nom', 'population_2012']]
print("Top 10 cities (Pandas):")
biggest_pandas

Top 10 cities (Pandas):


Unnamed: 0,name,departement_nom,population_2012
30437,PARIS,Paris,2211000
4439,MARSEILLE,Bouches-du-Rhône,851400
28152,LYON,Rhône,474900
11718,TOULOUSE,Haute-Garonne,439600
2049,NICE,Alpes-Maritimes,344900
16755,NANTES,Loire-Atlantique,283300
27303,STRASBOURG,Bas-Rhin,272100
13338,MONTPELLIER,Hérault,253000
12678,BORDEAUX,Gironde,235900
22744,LILLE,Nord,225800


In [16]:
# POLARS: Clean, expressive, SQL-like
biggest_polars = (
    df_polars
    .sort('population_2012', descending=True)
    .head(10)
    .select(['name', 'departement_nom', 'population_2012'])
)
print("Top 10 cities (Polars):")
biggest_polars

Top 10 cities (Polars):


name,departement_nom,population_2012
str,str,i64
"""PARIS""","""Paris""",2211000
"""MARSEILLE""","""Bouches-du-Rhône""",851400
"""LYON""","""Rhône""",474900
"""TOULOUSE""","""Haute-Garonne""",439600
"""NICE""","""Alpes-Maritimes""",344900
"""NANTES""","""Loire-Atlantique""",283300
"""STRASBOURG""","""Bas-Rhin""",272100
"""MONTPELLIER""","""Hérault""",253000
"""BORDEAUX""","""Gironde""",235900
"""LILLE""","""Nord""",225800


---

## Round 3: Complex Analytics - Where Polars Shines ✨

### Calculate population density and find urban hotspots

In [17]:
# PANDAS: Watch the memory and CPU struggle
start = time.time()
df_pandas['density'] = df_pandas['population_2012'] / df_pandas['surface']
df_pandas['is_urban'] = df_pandas['density'] > 1000
urban_pandas = df_pandas[df_pandas['is_urban']].copy()
urban_by_dept_pandas = (
    urban_pandas
    .groupby('departement_nom')
    .agg({
        'name': 'count',
        'population_2012': 'sum',
        'density': 'mean'
    })
    .rename(columns={'name': 'urban_cities_count'})
    .sort_values('urban_cities_count', ascending=False)
)
pandas_time = time.time() - start
print(f"🐼 Pandas time: {pandas_time:.3f}s")
print(urban_by_dept_pandas.head(10))

🐼 Pandas time: 0.004s
                   urban_cities_count  population_2012       density
departement_nom                                                     
Nord                               70          1435000   2499.149674
Yvelines                           53           992400   2842.373926
Essonne                            52           907400   2818.847389
Val-d'oise                         47           955500   3570.892782
Pas-de-Calais                      44           609200   1837.662018
Val-de-Marne                       44          1300600   7185.867394
Seine-Saint-Denis                  40          1506600   7401.078031
Seine-et-Marne                     40           624400   2060.365972
Rhône                              38          1209900   2466.024180
Hauts-de-Seine                     35          1547900  10825.816409


In [18]:
# POLARS: Multi-threaded, lazy evaluation magic
start = time.time()
urban_by_dept_polars = (
    df_polars
    .with_columns(
        (pl.col('population_2012') / pl.col('surface')).alias('density')
    )
    .filter(pl.col('density') > 1000)
    .group_by('departement_nom')
    .agg([
        pl.len().alias('urban_cities_count'),
        pl.col('population_2012').sum().alias('total_urban_pop'),
        pl.col('density').mean().alias('avg_density')
    ])
    .sort('urban_cities_count', descending=True)
)
polars_time = time.time() - start
print(f"🦀 Polars time: {polars_time:.3f}s")
print(f"⚡ {pandas_time/polars_time:.1f}x faster!\n")
print(urban_by_dept_polars.head(10))

🦀 Polars time: 0.001s
⚡ 3.1x faster!

shape: (10, 4)
┌───────────────────┬────────────────────┬─────────────────┬──────────────┐
│ departement_nom   ┆ urban_cities_count ┆ total_urban_pop ┆ avg_density  │
│ ---               ┆ ---                ┆ ---             ┆ ---          │
│ str               ┆ u32                ┆ i64             ┆ f64          │
╞═══════════════════╪════════════════════╪═════════════════╪══════════════╡
│ Nord              ┆ 70                 ┆ 1435000         ┆ 2499.149674  │
│ Yvelines          ┆ 53                 ┆ 992400          ┆ 2842.373926  │
│ Essonne           ┆ 52                 ┆ 907400          ┆ 2818.847389  │
│ Val-d'oise        ┆ 47                 ┆ 955500          ┆ 3570.892782  │
│ Val-de-Marne      ┆ 44                 ┆ 1300600         ┆ 7185.867394  │
│ Pas-de-Calais     ┆ 44                 ┆ 609200          ┆ 1837.662018  │
│ Seine-et-Marne    ┆ 40                 ┆ 624400          ┆ 2060.365972  │
│ Seine-Saint-Denis ┆ 40           

---

## Round 4: Window Functions - SQL Power in Python 💪

### Rank cities within each department

In [19]:
# PANDAS: Painful syntax, slow execution
start = time.time()
df_pandas['rank_in_dept'] = df_pandas.groupby('department')['population_2012'].rank(method='dense', ascending=False)
df_pandas['pct_of_dept_pop'] = df_pandas.groupby('department')['population_2012'].transform(lambda x: x / x.sum() * 100)
top_cities_pandas = df_pandas[df_pandas['rank_in_dept'] <= 3].sort_values(['department', 'rank_in_dept'])
pandas_window_time = time.time() - start
print(f"🐼 Pandas window operations: {pandas_window_time:.3f}s")

🐼 Pandas window operations: 0.022s


In [20]:
# POLARS: SQL-like window functions, blazing fast
start = time.time()
top_cities_polars = (
    df_polars
    .with_columns([
        pl.col('population_2012')
            .rank(method='dense', descending=True)
            .over('department')
            .alias('rank_in_dept'),
        (pl.col('population_2012') / pl.col('population_2012').sum().over('department') * 100)
            .alias('pct_of_dept_pop')
    ])
    .filter(pl.col('rank_in_dept') <= 3)
    .sort(['department', 'rank_in_dept'])
    .select(['department', 'departement_nom', 'name', 'population_2012', 'rank_in_dept', 'pct_of_dept_pop'])
)
polars_window_time = time.time() - start
print(f"🦀 Polars window operations: {polars_window_time:.3f}s")
print(f"⚡ {pandas_window_time/polars_window_time:.1f}x faster!\n")
print(top_cities_polars.head(15))

🦀 Polars window operations: 0.002s
⚡ 8.9x faster!

shape: (15, 6)
┌────────────┬─────────────────┬─────────────────┬─────────────────┬──────────────┬────────────────┐
│ department ┆ departement_nom ┆ name            ┆ population_2012 ┆ rank_in_dept ┆ pct_of_dept_po │
│ ---        ┆ ---             ┆ ---             ┆ ---             ┆ ---          ┆ p              │
│ str        ┆ str             ┆ str             ┆ i64             ┆ u32          ┆ ---            │
│            ┆                 ┆                 ┆                 ┆              ┆ f64            │
╞════════════╪═════════════════╪═════════════════╪═════════════════╪══════════════╪════════════════╡
│ 01         ┆ Ain             ┆ BOURG-EN-BRESSE ┆ 40200           ┆ 1            ┆ 6.910779       │
│ 01         ┆ Ain             ┆ OYONNAX         ┆ 23100           ┆ 2            ┆ 3.971119       │
│ 01         ┆ Ain             ┆ AMBERIEU-EN-BUG ┆ 12800           ┆ 3            ┆ 2.200447       │
│            ┆           

---

## Round 5: Advanced Aggregations - Multiple Stats at Once 📊

### Department statistics: the full picture

In [21]:
# PANDAS: Multiple aggregations, messy syntax
start = time.time()
dept_stats_pandas = df_pandas.groupby('departement_nom').agg({
    'population_2012': ['sum', 'mean', 'std', 'median'],
    'surface': ['sum', 'mean'],
    'name': 'count'
}).round(2)
dept_stats_pandas.columns = ['_'.join(col).strip() for col in dept_stats_pandas.columns.values]
dept_stats_pandas = dept_stats_pandas.rename(columns={'name_count': 'city_count'})
pandas_agg_time = time.time() - start
print(f"🐼 Pandas aggregation time: {pandas_agg_time:.3f}s")
print(dept_stats_pandas.head())

🐼 Pandas aggregation time: 0.009s
                         population_2012_sum  population_2012_mean  \
departement_nom                                                      
Ain                                   581700               1388.31   
Aisne                                 540200                662.01   
Allier                                343100               1072.19   
Alpes-Maritimes                      1083900               6649.69   
Alpes-de-Haute-Provence               158000                790.00   

                         population_2012_std  population_2012_median  \
departement_nom                                                        
Ain                                  2738.89                   700.0   
Aisne                                2622.47                   300.0   
Allier                               3108.20                   400.0   
Alpes-Maritimes                     28915.29                   800.0   
Alpes-de-Haute-Provence              2172.9

In [22]:
# POLARS: Clean, expressive, parallel execution
start = time.time()
dept_stats_polars = (
    df_polars
    .group_by('departement_nom')
    .agg([
        pl.len().alias('city_count'),
        pl.col('population_2012').sum().alias('total_population'),
        pl.col('population_2012').mean().alias('avg_population'),
        pl.col('population_2012').std().alias('std_population'),
        pl.col('population_2012').median().alias('median_population'),
        pl.col('surface').sum().alias('total_surface'),
        pl.col('surface').mean().alias('avg_surface'),
        # Bonus: custom expressions!
        (pl.col('population_2012').max() - pl.col('population_2012').min()).alias('pop_range'),
        pl.col('population_2012').quantile(0.9).alias('p90_population')
    ])
    .sort('total_population', descending=True)
)
polars_agg_time = time.time() - start
print(f"🦀 Polars aggregation time: {polars_agg_time:.3f}s")
print(f"⚡ {pandas_agg_time/polars_agg_time:.1f}x faster!\n")
print(dept_stats_polars.head())

🦀 Polars aggregation time: 0.001s
⚡ 6.4x faster!

shape: (5, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ departeme ┆ city_coun ┆ total_pop ┆ avg_popul ┆ … ┆ total_sur ┆ avg_surfa ┆ pop_range ┆ p90_popu │
│ nt_nom    ┆ t         ┆ ulation   ┆ ation     ┆   ┆ face      ┆ ce        ┆ ---       ┆ lation   │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ i64       ┆ ---      │
│ str       ┆ u32       ┆ i64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Nord      ┆ 650       ┆ 2565600   ┆ 3947.0769 ┆ … ┆ 5742.75   ┆ 8.835     ┆ 225800    ┆ 7900.0   │
│           ┆           ┆           ┆ 23        ┆   ┆           ┆           ┆           ┆          │
│ Paris     ┆ 1         ┆ 2211000   ┆ 2.211e6   ┆ … ┆ 105.4     ┆ 105.4     ┆ 0         ┆ 2.211e6  │
│ Bouches-d ┆ 119       ┆ 

---

## Round 6: Lazy Evaluation - The Game Changer 🎮

### Building a complex query without executing

In [23]:
# This is where Polars truly shines - build your entire pipeline first!

# Create a lazy frame from our data
lazy_df = df_polars.lazy()

# Build a complex analytical pipeline
analysis = (
    lazy_df
    # Add computed columns
    .with_columns([
        (pl.col('population_2012') / pl.col('surface')).alias('density'),
        pl.col('department').str.starts_with('97').alias('is_overseas'),
        pl.col('population_2012').log10().alias('log_population')
    ])
    # Filter for interesting cities
    .filter(
        (pl.col('population_2012') > 5000) & 
        (pl.col('density') > 100)
    )
    # Add rankings
    .with_columns([
        pl.col('population_2012').rank(descending=True).alias('national_rank'),
        pl.col('density').rank(descending=True).over('department').alias('dept_density_rank')
    ])
    # Select and reorder
    .select([
        'department',
        'departement_nom', 
        'name',
        'population_2012',
        'density',
        'national_rank',
        'dept_density_rank',
        'is_overseas'
    ])
    .sort('national_rank')
)

# Nothing has been computed yet! Let's see the optimized plan:
print("QUERY PLAN (what Polars will actually execute):")
print(analysis.explain())

QUERY PLAN (what Polars will actually execute):
SORT BY [col("national_rank")]
  simple π 8/8 ["department", ... 7 other columns]
     WITH_COLUMNS:
     [col("population_2012").rank().alias("national_rank"), col("density").rank().over([col("department")]).alias("dept_density_rank")] 
      FILTER [(col("density")) > (100.0)]
      FROM
         WITH_COLUMNS:
         [[(col("population_2012").cast(Float64)) / (col("surface"))].alias("density"), col("department").str.starts_with(["97"]).alias("is_overseas")] 
          FILTER [(col("population_2012")) > (5000)]
          FROM
            DF ["id", "department", "name", "simple_name", ...]; PROJECT["department", "departement_nom", "name", "population_2012", ...] 5/8 COLUMNS


In [24]:
# Now execute the optimized query
start = time.time()
result = analysis.collect()
execution_time = time.time() - start
print(f"\n⚡ Executed entire pipeline in {execution_time:.3f}s")
print(f"Result shape: {result.shape}")
print(result.head(20))


⚡ Executed entire pipeline in 0.001s
Result shape: (1935, 8)
shape: (20, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ department ┆ departemen ┆ name       ┆ populatio ┆ density   ┆ national_ ┆ dept_dens ┆ is_overse │
│ ---        ┆ t_nom      ┆ ---        ┆ n_2012    ┆ ---       ┆ rank      ┆ ity_rank  ┆ as        │
│ str        ┆ ---        ┆ str        ┆ ---       ┆ f64       ┆ ---       ┆ ---       ┆ ---       │
│            ┆ str        ┆            ┆ i64       ┆           ┆ f64       ┆ f64       ┆ bool      │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 75         ┆ Paris      ┆ PARIS      ┆ 2211000   ┆ 20977.229 ┆ 1.0       ┆ 1.0       ┆ false     │
│            ┆            ┆            ┆           ┆ 602       ┆           ┆           ┆           │
│ 13         ┆ Bouches-du ┆ MARSEILLE  ┆ 851400    ┆ 3538.3592 ┆ 2.0       ┆ 1.0       ┆ false     │
│            ┆

---

## Round 7: The "Impossible with Pandas" - Streaming Large Data 🌊

### Simulate processing a massive dataset

In [25]:
# Let's pretend we have yearly data files we need to process
# This would crash pandas on large files!

def process_year_data(year: int):
    """Simulate processing a year's worth of city data"""
    return (
        df_polars.lazy()
        .with_columns(pl.lit(year).alias('year'))
        .with_columns(
            # Simulate population growth
            (pl.col('population_2012') * (1 + (year - 2012) * 0.01)).cast(pl.Int32).alias('estimated_pop')
        )
        .group_by(['department', 'year'])
        .agg([
            pl.col('estimated_pop').sum().alias('dept_total'),
            pl.col('estimated_pop').mean().alias('dept_avg'),
            pl.len().alias('city_count')
        ])
    )

# Process multiple years and combine
all_years = pl.concat([
    process_year_data(year).collect() 
    for year in range(2012, 2025)
])

print("Multi-year department evolution:")
print(all_years.filter(pl.col('department').is_in(['75', '13', '69'])))  # Paris, Marseille, Lyon

Multi-year department evolution:
shape: (39, 5)
┌────────────┬──────┬────────────┬──────────────┬────────────┐
│ department ┆ year ┆ dept_total ┆ dept_avg     ┆ city_count │
│ ---        ┆ ---  ┆ ---        ┆ ---          ┆ ---        │
│ str        ┆ i32  ┆ i32        ┆ f64          ┆ u32        │
╞════════════╪══════╪════════════╪══════════════╪════════════╡
│ 75         ┆ 2012 ┆ 2211000    ┆ 2.211e6      ┆ 1          │
│ 69         ┆ 2012 ┆ 1690700    ┆ 5770.307167  ┆ 293        │
│ 13         ┆ 2012 ┆ 1965400    ┆ 16515.966387 ┆ 119        │
│ 13         ┆ 2013 ┆ 1985054    ┆ 16681.12605  ┆ 119        │
│ 69         ┆ 2013 ┆ 1707607    ┆ 5828.010239  ┆ 293        │
│ …          ┆ …    ┆ …          ┆ …            ┆ …          │
│ 13         ┆ 2023 ┆ 2181594    ┆ 18332.722689 ┆ 119        │
│ 69         ┆ 2023 ┆ 1876677    ┆ 6405.040956  ┆ 293        │
│ 13         ┆ 2024 ┆ 2201248    ┆ 18497.882353 ┆ 119        │
│ 69         ┆ 2024 ┆ 1893584    ┆ 6462.744027  ┆ 293        │
│ 75   

---

## Final Score: Performance Summary 🏆

In [26]:
print("=" * 50)
print("🏁 PERFORMANCE COMPARISON SUMMARY 🏁")
print("=" * 50)

comparisons = {
    'Data Loading': (pandas_load_time, polars_load_time),
    'Complex Analytics': (pandas_time, polars_time),
    'Window Functions': (pandas_window_time, polars_window_time),
    'Aggregations': (pandas_agg_time, polars_agg_time)
}

total_pandas = 0
total_polars = 0

for operation, (p_time, pl_time) in comparisons.items():
    speedup = p_time / pl_time
    total_pandas += p_time
    total_polars += pl_time
    print(f"{operation:20} | Pandas: {p_time:.3f}s | Polars: {pl_time:.3f}s | ⚡ {speedup:.1f}x faster")

print("-" * 50)
total_speedup = total_pandas / total_polars
print(f"{'TOTAL':20} | Pandas: {total_pandas:.3f}s | Polars: {total_polars:.3f}s | ⚡ {total_speedup:.1f}x faster")
print("=" * 50)

🏁 PERFORMANCE COMPARISON SUMMARY 🏁
Data Loading         | Pandas: 0.119s | Polars: 0.066s | ⚡ 1.8x faster
Complex Analytics    | Pandas: 0.004s | Polars: 0.001s | ⚡ 3.1x faster
Window Functions     | Pandas: 0.022s | Polars: 0.002s | ⚡ 8.9x faster
Aggregations         | Pandas: 0.009s | Polars: 0.001s | ⚡ 6.4x faster
--------------------------------------------------
TOTAL                | Pandas: 0.154s | Polars: 0.071s | ⚡ 2.2x faster


---
## Bonus: Quick Migration Cheatsheet 📝

In [None]:
# Common patterns to remember:

# SELECTING
# Pandas: df[['col1', 'col2']]
# Polars: df.select(['col1', 'col2'])

# FILTERING
# Pandas: df[df['col'] > 5]
# Polars: df.filter(pl.col('col') > 5)

# ADDING COLUMNS
# Pandas: df['new'] = df['old'] * 2
# Polars: df.with_columns((pl.col('old') * 2).alias('new'))

# GROUPING
# Pandas: df.groupby('x').agg({'y': 'sum'})
# Polars: df.group_by('x').agg(pl.col('y').sum())

# JOINING
# Pandas: pd.merge(df1, df2, on='key')
# Polars: df1.join(df2, on='key')