In [3]:
import polars as pl
import json
import altair as alt
import geopolars as gpl

In [4]:
df = pl.read_csv("data/dataset.csv")

In [5]:
is_rare = pl.col("name") == "_PRENOMS_RARES"

In [108]:
name_dpt_sexe = (
    # first, remove rare nouns
    df.filter(pl.col("name") != "_PRENOMS_RARES")

    # then, remove nouns that have less that 100 occurences in total
    .filter(pl.sum("count").over("name") >= 400000)

    .group_by(["name", "dpt", "sexe"])
    .agg(pl.col("count").sum())
    .with_columns(pl.col("count").log().alias("score"))
    .with_columns(pl.col("dpt").map_elements(lambda x: f"0{x}" if x < 10 else str(x)))
)
name_dpt_sexe

  .with_columns(pl.col("dpt").map_elements(lambda x: f"0{x}" if x < 10 else str(x)))


name,dpt,sexe,count,score
str,str,str,i64,f64
"""MONIQUE""","""45""","""F""",3994,8.292549
"""ROGER""","""971""","""F""",44,3.78419
"""CLAUDE""","""30""","""M""",3294,8.099858
"""ALAIN""","""41""","""M""",2838,7.950855
"""PAUL""","""31""","""M""",6453,8.7723
…,…,…,…,…
"""DANIEL""","""81""","""M""",1763,7.474772
"""MARIE""","""40""","""F""",21364,9.969463
"""MONIQUE""","""61""","""F""",3117,8.044626
"""PIERRE""","""93""","""M""",1933,7.566828


In [109]:
alt.Chart(name_dpt_sexe).mark_circle().encode(
    alt.X("dpt:N"),
    alt.Y("name:N"),
    alt.Size("score"),
    alt.Color("sexe:N"),
)

In [107]:
dom_tom = name_dpt_sexe.filter(pl.col("dpt").is_between(970, 979))

InvalidOperationError: got invalid or ambiguous dtypes: '[str, dyn int, dyn int]' in expression 'is_between'

Consider explicitly casting your input types to resolve potential ambiguity.

In [10]:
alt.Chart(dom_tom).mark_bar().encode(
    alt.Y("name:N", sort="x"),
    alt.X("len:Q"),
    alt.Color("sexe:N"),
    alt.Column("dpt")
)

In [11]:
alt.Chart(name_dpt_sexe).mark_bar().encode(
    alt.X("dpt:N"),
    alt.Y("name:N"),
    alt.Color("len:Q"),
)

In [92]:
url_geojson = "https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
geodata = alt.Data(url=url_geojson, format=alt.DataFormat(property="features"))
geodata

Data({
  format: DataFormat({
    property: 'features'
  }),
  url: 'https://france-geojson.gregoiredavid.fr/repo/departements.geojson'
})

In [93]:
alt.Chart(geodata).mark_geoshape().encode(color='properties.code:N')

In [94]:
centers = pl.read_csv("data/dpt_positions.csv")

In [112]:
foo = (name_dpt_sexe
    .filter(pl.col("name")=="ANDRÉ")
    .with_columns(
    pl.col("count").sum().over("name", "dpt", "sexe") / pl.col("count").sum().over("name", "dpt")
))

In [113]:
data = (alt.Chart(foo)
    .transform_filter(alt.datum.dpt <= 100).transform_filter(alt.datum.sexe == "M")
    .encode(
        
        #facet=alt.Facet('name:N', columns=5),
    )
 .transform_lookup(
    lookup='dpt',
    from_=alt.LookupData(geodata, 'properties.code'),
      as_="geo"
)
.transform_lookup(
    lookup='dpt',
    from_=alt.LookupData(centers, 'dpt', ["lon", "lat"]),
)
)

bg = data.mark_geoshape().encode(color='count:Q', shape='geo:G')
fg = data.mark_circle().encode(longitude="lon:Q", latitude="lat:Q", shape="sexe:N")

(bg + fg)

In [119]:
df.filter(pl.col("name")=="ANDRÉ", pl.col("sexe")=="F").sort("count").tail(10)

sexe,dpt,year,name,count
str,i64,i64,str,i64
"""F""",971.0,1950.0,"""ANDRÉ""",3
"""F""",972.0,1958.0,"""ANDRÉ""",3
"""F""",972.0,1900.0,"""ANDRÉ""",4
"""F""",75.0,1909.0,"""ANDRÉ""",4
"""F""",75.0,1918.0,"""ANDRÉ""",4
"""F""",38.0,1924.0,"""ANDRÉ""",4
"""F""",75.0,1916.0,"""ANDRÉ""",5
"""F""",972.0,1954.0,"""ANDRÉ""",5
"""F""",75.0,1905.0,"""ANDRÉ""",6
"""F""",,,"""ANDRÉ""",563


In [None]:
df.select(
)

In [87]:
foo.select("dpt")

dpt
i64
25
3
48
40
62
…
51
80
83
63


# Entropy and diversity

In [134]:
name_dpt_sexe.with_columns(
     (
         pl.col("count").sum().over("name", "dpt") / pl.col("count").sum().over("name")
     ).alias("prob")
).group_by("name").agg(
    pl.col("prob").sum()
)

name,prob
str,f64
"""MARCEL""",1.005315
"""MONIQUE""",1.015637
"""CLAUDE""",2.0
"""LOUIS""",1.012183
"""PAUL""",1.016084
…,…
"""GEORGES""",1.027912
"""FRANÇOISE""",1.005011
"""RENÉ""",1.096401
"""JACQUES""",1.013703


In [141]:
name_dpt_sexe.with_columns(
     (
         pl.col("count").sum().over("name", "dpt") / pl.col("count").sum().over("name")
     ).alias("prob")
).group_by("name").agg(
    (-pl.col("prob").log(2)*pl.col("prob")).sum().alias("entropy")
).sort("entropy")

name,entropy
str,f64
"""JACQUES""",5.943753
"""FRANÇOISE""",6.037988
"""PHILIPPE""",6.099776
"""ALAIN""",6.114558
"""ROGER""",6.138869
…,…
"""PIERRE""",6.572402
"""ANDRÉ""",6.806552
"""DOMINIQUE""",11.883053
"""CLAUDE""",11.964841


# Timeline

In [None]:
name_dpt_yr = (
    # first, remove rare nouns
    df.filter(pl.col("name") != "_PRENOMS_RARES")

    # then, remove nouns that have less that 100 occurences in total
    .filter(pl.sum("count").over("name") >= 400000)

    .group_by(["name", "dpt", "year"])
    .agg(pl.col("count").sum())
    #.with_columns(pl.col("count").log().alias("score"))
    .with_columns(pl.col("dpt").map_elements(lambda x: f"0{x}" if x < 10 else str(x)))
)

>Caution :
>
>The following map is really heavy (at least for my poor computer) and will take a long time processing any of your clicks. Expect at least one, more often two, "This window is not responding" warning. Be patient and "Keep waiting" : the results will come at their own pace...

In [None]:
import pandas as pd

click_dpt = alt.selection_point(fields=['dpt'])
pop_selection = alt.selection_interval(encodings=['x'])
selection = alt.selection_point(fields=['name'])

color = alt.condition(selection, alt.Color('name:N', legend=None), alt.value('lightgray'))

make = pd.DataFrame({'name': name_dpt_yr.select("name").unique().to_series().to_list()})
make_selector = alt.Chart(make).mark_rect().encode(y='name', color=color).add_params(selection)


population = alt.Chart(name_dpt_yr, width=800, height=100, title = "Évolution des prénoms par date et par région").mark_bar().encode(
    x=alt.X('year:Q', bin=alt.Bin(maxbins=60)),
    y='count:Q',
    color = alt.Color('name').legend(columns=3),
    tooltip = [alt.Tooltip('name', title='Name'), alt.Tooltip('count:Q', title='Count')],
).add_params(pop_selection
).add_params(selection
).transform_filter(click_dpt
).transform_filter(selection
)



alt.data_transformers.disable_max_rows()
map2 = (alt.Chart(name_dpt_yr, width=800)
.mark_geoshape()
.transform_filter(alt.datum.dpt <= 100)
.transform_lookup(
    lookup='dpt',
    from_=alt.LookupData(geodata, 'properties.code'),
      as_="geo"
)
.encode(
    color='count:Q', 
    shape='geo:G',
    tooltip = [alt.Tooltip('dpt:N', title='Department Code'), alt.Tooltip('count:Q', title='Count')],
    stroke = alt.condition(
        click_dpt,  # Highlight the selected department
        alt.value('black'),  # Stroke color for the selected department
        alt.value('transparent')
    ))
.add_params(click_dpt)
.transform_filter(selection)
.transform_filter(pop_selection)
)


make_selector | (population & map2)