In [2]:
import polars as pl
import numpy as np
from sknetwork.data import from_edge_list
from sknetwork.hierarchy import Paris
from sknetwork.clustering import Louvain
import altair as alt
from sklearn.cluster import SpectralBiclustering
import matplotlib.pyplot as plt

In [3]:
df = pl.read_csv("data/dataset.csv", schema_overrides={"dpt": pl.String})
df = (
    # first, remove rare nouns
    df.filter(pl.col("name") != "_PRENOMS_RARES")
        
    .drop_nulls()

    # then, remove nouns that have less that 100 occurences in total
    .filter(pl.col("count").sum().over("name") >= 100)
    .filter((pl.col("dpt").str.len_chars() == 2) & (pl.col("dpt") != "94")) # pas les dom tom et la corse pour l'instant

    .with_columns(
        # add decade
        pl.col("year").floordiv(10).mul(10).alias("decade"),
        
        pl.col("count").sum().over("name").rank("dense", descending=True).sub(1).alias("name_id"),
        pl.col("count").sum().over("name").rank("dense", descending=True).sub(1).alias("dpt_id"),

        pl.col("count").rank("dense", descending=True).over("dpt", pl.col("year").floordiv(10), "sexe").alias("rank"),

        (pl.col("count").sum().over("dpt", "name", "year") / pl.col("count").sum().over("name", "year")).alias("score_over_dpt")
    )
    .sort("name_id")
)
df

sexe,dpt,year,name,count,decade,name_id,dpt_id,rank,score_over_dpt
str,str,i64,str,i64,i64,u32,u32,u32,f64
"""M""","""01""",1900,"""MARIE""",42,1900,0,0,54,0.014258
"""M""","""03""",1900,"""MARIE""",7,1900,0,0,105,0.013651
"""M""","""06""",1900,"""MARIE""",3,1900,0,0,103,0.006909
"""M""","""07""",1900,"""MARIE""",8,1900,0,0,90,0.010783
"""M""","""08""",1900,"""MARIE""",7,1900,0,0,93,0.006491
…,…,…,…,…,…,…,…,…,…
"""F""","""62""",1959,"""ROSE-AIMÉE""",3,1950,2640,2640,333,1.0
"""F""","""75""",1958,"""ROSE-HÉLÈNE""",3,1950,2640,2640,553,1.0
"""F""","""59""",2006,"""SAINA""",3,2000,2640,2640,274,1.0
"""F""","""75""",2008,"""VENISE""",3,2000,2640,2640,244,1.0


In [63]:
test = (
    df
    .with_columns([
        # Calculate total births per (department, sex, decade, noun) if needed
        pl.col("count").sum().over("dpt", "sexe", "decade", "name").alias("births")
    ])
    .with_columns([
        # Calculate total births per (department, sex, decade)
        pl.col("births").sum().over(["dpt", "sexe", "decade"]).alias("total_births_category"),
        
        # Rank names by birth count within each (department, sex, decade)
        pl.col("births").rank(method="dense", descending=True).over(["dpt", "sexe", "decade"]).alias("rank")
    ])
    .with_columns([
        # Mark if name is in top 50
        (pl.col("rank") <= 10).alias("is_top50"),
    ])
    .group_by(["dpt", "sexe", "decade"])
    .agg([
        # Sum births for top 50 names
        pl.col("births").filter(pl.col("is_top50")).sum().alias("top50_births"),
        # Get total births (same for all rows in group, so first() works)
        pl.col("total_births_category").first().alias("total_births"),
        pl.col("name").filter(pl.col("rank") == 1).first().alias("top_noun"),
        pl.col("births").filter(pl.col("rank") == 1).first().alias("top_noun_births")
    ])
    .with_columns([
        # Calculate proportion
        (pl.col("top50_births") / pl.col("total_births")).alias("top50_score"),
        (pl.col("top_noun_births") / pl.col("total_births")).alias("top_noun_proportion"),
    ])
    #.select(["dpt", "sexe", "decade", "top50_score", "top_noun", "top_noun_proportion"])
)
test

dpt,sexe,decade,top50_births,total_births,top_noun,top_noun_births,top50_score,top_noun_proportion
str,str,i64,i64,i64,str,i64,f64,f64
"""73""","""F""",1990,36330,148414,"""LAURA""",505,0.244788,0.003403
"""44""","""M""",1960,336310,821345,"""JEAN""",6008,0.409463,0.007315
"""87""","""M""",1910,117560,188501,"""JEAN""",2524,0.623657,0.01339
"""58""","""F""",1980,34980,101072,"""AURÉLIE""",541,0.34609,0.005353
"""05""","""F""",1970,16650,37942,"""SANDRINE""",258,0.438828,0.0068
…,…,…,…,…,…,…,…,…
"""44""","""M""",1940,303470,596665,"""JEAN""",8430,0.50861,0.014129
"""87""","""M""",2020,347,1161,"""GABIN""",28,0.29888,0.024117
"""76""","""M""",1980,248600,902588,"""NICOLAS""",3855,0.27543,0.004271
"""38""","""F""",2000,88570,470742,"""EMMA""",1244,0.18815,0.002643


In [64]:
url_geojson = "https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
geodata = alt.Data(url=url_geojson, format=alt.DataFormat(property="features"))
geodata

Data({
  format: DataFormat({
    property: 'features'
  }),
  url: 'https://france-geojson.gregoiredavid.fr/repo/departements.geojson'
})

In [68]:
data = (alt.Chart(test)
    .transform_lookup(
        lookup='dpt',
        from_=alt.LookupData(geodata, 'properties.code'),
        as_="geo"
    )
    .transform_lookup(
        lookup='dpt',
        from_=alt.LookupData(geodata, 'properties.code'),
        as_="geo"
    )
)

# Map visualization
data.mark_geoshape().encode(
    color='top50_score:Q',
    shape='geo:G',
    tooltip=[alt.Tooltip("top_noun_proportion"), alt.Tooltip("dpt")],
    #opacity= alt.value(0.1)
).facet("decade")

In [73]:
df.filter((pl.col("decade")==2010),pl.col("dpt")=="48")["count"].sum()

1321

In [None]:
df.filter((pl.col("decade")==2010),pl.col("dpt")=="48")["count"].sum()