In [1]:
import os
from utils import load_baby_names_data
import altair as alt
import pandas as pd
import polars as pl

In [2]:
df = load_baby_names_data()
df

sexe,preusuel,annais,dpt,nombre,nom,aire,longitude,latitude
i64,str,str,str,i64,str,str,f64,f64
1,"""_PRENOMS_RARES…","""1900""","""02""",7,"""Aisne""","""7 419""",3.558333,49.559444
1,"""_PRENOMS_RARES…","""1900""","""04""",9,"""Alpes-de-Haute…","""6 996""",6.243889,44.106111
1,"""_PRENOMS_RARES…","""1900""","""05""",8,"""Hautes-Alpes""","""5 689""",6.263056,44.663611
1,"""_PRENOMS_RARES…","""1900""","""06""",23,"""Alpes-Maritime…","""4 297""",7.116389,43.9375
1,"""_PRENOMS_RARES…","""1900""","""07""",9,"""Ardèche""","""5 565""",4.424722,44.751667
1,"""_PRENOMS_RARES…","""1900""","""08""",4,"""Ardennes""","""5 244""",4.640833,49.615556
1,"""_PRENOMS_RARES…","""1900""","""09""",6,"""Ariège""","""4 908""",1.503889,42.920833
1,"""_PRENOMS_RARES…","""1900""","""10""",3,"""Aube""","""6 027""",4.161667,48.304444
1,"""_PRENOMS_RARES…","""1900""","""11""",11,"""Aude""","""6 345""",2.414167,43.103333
1,"""_PRENOMS_RARES…","""1900""","""12""",7,"""Aveyron""","""8 774""",2.679722,44.280278


In [3]:
preusuel_sexe = df.groupby(["preusuel", "sexe"], maintain_order=True).count()
preusuel_sexe = preusuel_sexe.groupby("preusuel").count()
preusuel_unisex = preusuel_sexe.with_columns((pl.col('count')>1).alias('unisex'))
df = df.join(preusuel_unisex, on='preusuel', how='left')

In [4]:
df = df.with_columns((pl.col("sexe").cast(pl.Utf8) + '_' + pl.col('unisex').cast(pl.Utf8)).alias('sexe_unisex'))

We should also add a threshold on the unisex names. So that a name with only one occurence within another sex is not considered as unisex.

This should reduce the amount of names considered unisex. There might be a lot with the exceptions !

In [5]:
# Filter data for the department "Aisne"
df_aisne = df.filter(pl.col('dpt') == '02')

In [6]:
# Pivot the DataFrame
df_pivot = df_aisne.pivot(values='nombre', index='annais', columns=['sexe', 'sexe_unisex'], aggregate_function='sum')

In [7]:
# Calculate part of male population with unisex name, part of female population with unisex name
df_pivot = df_pivot.with_columns([(-100 * pl.col('1_true')/pl.col('1')).alias('homme_unisexe'), 
                       (-100 * pl.col('1_false')/pl.col('1')).alias('homme_seulmt'), 
                       (100 * pl.col('2_true')/pl.col('2')).alias('femme_unisexe'), 
                       (100 * pl.col('2_false')/pl.col('2')).alias('femme_seulmt')])

In order to create a stacked bar chart we must go back to a long format.

In [8]:
df_bar = df_pivot.melt(id_vars="annais", value_vars=['homme_unisexe', 'homme_seulmt', 'femme_unisexe', 'femme_seulmt'], variable_name='categorie_nom_sexe')

In [9]:
# Create a stacked bar chart
# cf https://altair-viz.github.io/gallery/stacked_bar_chart.html
# x axis years
# y axis all the percentages
chart = alt.Chart(df_bar).mark_bar().encode(
    x=alt.X('annais:O', title='Year'),
    y=alt.Y('sum(value):Q'),
    color=alt.Color('categorie_nom_sexe:N')
)

We are ready to make a stacked bar plot representing for each year the part of the male population with unisex name and part of the female population with unisex name. 

In [10]:
chart.display()

In order to fit in a square, the bars width has to be diminished, and we can only keep a tick like every 20 years for instance. We might restrict ourselves to a more narrow interval for the years.

In [11]:
# Determine the years to use as ticks
# Determine the years to use as ticks
years = list(range(1900, 2021, 20))

# Convert the years to string
years = [str(year) for year in years]
years

['1900', '1920', '1940', '1960', '1980', '2000', '2020']

In [23]:
chart = alt.Chart(df_bar).mark_bar().encode(
    x=alt.X('annais:O', title=None, axis=alt.Axis(values=years)), # year
    y=alt.Y('sum(value):Q', title=None), # values
    color=alt.Color('categorie_nom_sexe:N', legend=None)
).properties(
    width=300,
    height=300
)

In [24]:
chart.display()

Maybe I'll invert x and y

---

Now we should place organize our graphs knowing their location. Unfortunately altair does not seem fit for this task.

But even before that we need to generate a graph for each dpt.

In [25]:
df_dpt = df.groupby(['nom', 'dpt', 'aire', 'longitude', 'latitude'], maintain_order=True).agg(pl.col('nombre').sum().alias("population_periode"))
df_dpt

nom,dpt,aire,longitude,latitude,population_periode
str,str,str,f64,f64,i64
"""Aisne""","""02""","""7 419""",3.558333,49.559444,848804
"""Alpes-de-Haute…","""04""","""6 996""",6.243889,44.106111,106288
"""Hautes-Alpes""","""05""","""5 689""",6.263056,44.663611,127230
"""Alpes-Maritime…","""06""","""4 297""",7.116389,43.9375,827519
"""Ardèche""","""07""","""5 565""",4.424722,44.751667,357088
"""Ardennes""","""08""","""5 244""",4.640833,49.615556,449218
"""Ariège""","""09""","""4 908""",1.503889,42.920833,157497
"""Aube""","""10""","""6 027""",4.161667,48.304444,383359
"""Aude""","""11""","""6 345""",2.414167,43.103333,358564
"""Aveyron""","""12""","""8 774""",2.679722,44.280278,405758


Actually, let's try first with 3 départements Aube, Aude, and Haute-Vienne for instance

In [125]:
# function to get the chart for a dpt
def gen_dpt_chart(dpt):
    df_dpt = df.filter(pl.col('dpt') == dpt)
    # Pivot the DataFrame
    df_dpt_pivot = df_dpt.pivot(values='nombre', index='annais', columns=['sexe', 'sexe_unisex'], aggregate_function='sum')
    # Calculate part of male population with unisex name, part of female population with unisex name
    df_dpt_pivot = df_dpt_pivot.with_columns([(-100 * pl.col('1_true')/pl.col('1')).alias('homme_unisexe'), 
                        (-100 * pl.col('1_false')/pl.col('1')).alias('homme_seulmt'), 
                        (100 * pl.col('2_true')/pl.col('2')).alias('femme_unisexe'), 
                        (100 * pl.col('2_false')/pl.col('2')).alias('femme_seulmt')])
    df_dpt_bar = df_dpt_pivot.melt(id_vars="annais", value_vars=['homme_unisexe', 'homme_seulmt', 'femme_unisexe', 'femme_seulmt'], variable_name='categorie_nom_sexe')
    # Create a stacked bar chart
    chart = alt.Chart(df_bar).mark_bar().encode(
        x=alt.X('annais:O', title=None, axis=None), # year
        y=alt.Y('sum(value):Q', title=None, axis=None), # values
        color=alt.Color('categorie_nom_sexe:N', legend=None)
    ).properties(
        width=100,
        height=100
    )
    return chart

In [126]:
gen_dpt_chart('10'), gen_dpt_chart('11'), gen_dpt_chart('87')

(alt.Chart(...), alt.Chart(...), alt.Chart(...))

In [127]:
# Location of the center of France, Allier is near that
df_dpt.filter(pl.col('dpt')=='87')

nom,dpt,aire,longitude,latitude,population_periode
str,str,str,f64,f64,i64
"""Haute-Vienne""","""87""","""5 560""",1.235278,45.891667,484520


In [128]:
# Create a new map centered at some location
# Allier : 46.393611, 3.188333

# Save the charts as HTML
chart1_html = gen_dpt_chart('10').save('chart1.html') # 48.304444, 4.161667
chart2_html = gen_dpt_chart('11').save('chart2.html') # 43.103333, 2.414167
chart3_html = gen_dpt_chart('87').save('chart3.html') # 45.891667, 1.235278


Web techs are worth trying here, as there are more flexible.
So as to plot all the graphs on a same web page.


In [129]:
# Generate a html graph for a dpt
import math
def gen_dpt_chart_html(dpt, precision=5):
    dpt_info = df_dpt.filter(pl.col('dpt')==dpt)
    longitude = int(round(dpt_info["longitude"][0], precision)*math.pow(10, precision))
    latitude = int(round(dpt_info["latitude"][0], precision)*math.pow(10, precision))
    gen_dpt_chart(dpt).save(f'html/chart_{dpt}_{longitude}_{latitude}.html')

In [130]:
gen_dpt_chart_html('10'), gen_dpt_chart_html('11'), gen_dpt_chart_html('87')

(None, None, None)

In [155]:
import os
import re

precision=5

# Get all html files in the html directory
dir = 'html'
html_files = [f for f in os.listdir(dir) if f.endswith('.html')]

# Define the base html structure
html = """
<!DOCTYPE html>
<html>
  <head>
    <script src="https://d3js.org/d3.v5.min.js"></script>
  </head>
  <body>
    <script>
      // Create the projection
      var projection = d3.geoMercator()
          .scale(10000)
          .center([2.454071, 46.279229]);  // Longitude and latitude of France's center

      // Load the positions and charts from a list of files
      var charts = {varcharts};

      // Create divs for each chart and load them
      for (var id in charts) {{
        var coords = projection([charts[id].long, charts[id].lat]);
        var div = d3.select('body').append('div')
            .attr('id', id)
            .style('position', 'absolute')
            .style('left', coords[0] + 'px')
            .style('top', coords[1] + 'px');
        
        div.html('<iframe src="' + charts[id].file + '" width="150" height="130"></iframe>');
      }}
    </script>
  </body>
</html>
"""


# Generate varcharts
varcharts = {}

for file in html_files:
    # Extract long and lat from the file name
    dpt, long, lat = map(float, re.search(r'chart_(.+?)_(.+?)_(.+?)\.html', file).groups())
    dpt = int(dpt)
    long = long/math.pow(10, precision)
    lat = lat/math.pow(10, precision)

    # id of the chart
    id = f'chart_{dpt}'
    
    # Generate charts var
    varcharts[id] = {'long': f'{long}', 'lat': f'{lat}', 'file': f'{dir}/{file}'}

# Insert divs, iframes and positions into the base html
html = html.format(varcharts=varcharts)

# Save the generated html to a file
with open('france_map_sex_name.html', 'w') as f:
    f.write(html)


Looks about right so let's generate all the departements.

In [153]:
for row in df_dpt.rows(named=True):
    if row['dpt'] != 'XX' and row['longitude'] is not None and row['latitude'] is not None:
        gen_dpt_chart_html(row['dpt'])