In this notebook, we'll try to answer question 2 and see if there's a regional effect on first names.

## Import 

In [2]:
import altair as alt
import pandas as pd
import numpy as np
import geopandas as gpd
# Select Altair's 'json' data transformer for the charts built later in this notebook.
alt.data_transformers.enable('json') 

# End the cell on a statement instead of an expression, presumably so that the
# value returned by the call above is not echoed as the cell output.
pass

## Reading names data

In [3]:
# Read the first-name records. `dpt` is read as text so that department codes
# keep their exact spelling (e.g. the leading zero of '01') and can be matched
# against the `code` column of the map data.
# The '_PRENOMS_RARES' placeholder rows and the rows whose department code is
# the 'XX' placeholder are filtered out in a single non-mutating step.
names = (
    pd.read_csv("dpt2020.csv", sep=";", dtype={"dpt": str})
    .loc[lambda df: (df["preusuel"] != "_PRENOMS_RARES") & (df["dpt"] != "XX")]
)

## Loading map data

In [4]:
# Department outlines; the `code` column is the key used to join the names data.
depts = gpd.read_file('departements-version-simplifiee.geojson')

# Preview a few rows; the fixed seed makes the preview identical on every run.
depts.sample(5, random_state=0)

Unnamed: 0,code,nom,geometry
37,37,Indre-et-Loire,"POLYGON ((0.61443 47.69421, 0.63131 47.70910, ..."
90,90,Territoire de Belfort,"POLYGON ((6.82354 47.81305, 6.84618 47.82295, ..."
20,22,Côtes-d'Armor,"POLYGON ((-3.65914 48.65921, -3.63649 48.67069..."
3,4,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ..."
52,52,Haute-Marne,"POLYGON ((4.67018 48.53189, 4.71801 48.54199, ..."


## Merging the two datasets

In [5]:
# Keep a geometry-free view of the name records for the national statistics.
# The columns are selected explicitly so that re-running this cell (when
# `names` already is the merged frame produced below) yields the same table.
just_names = names[['sexe', 'preusuel', 'annais', 'dpt', 'nombre']]

# Right join: every name record is kept, and records whose department has no
# outline in the map file get empty code/nom/geometry.
names = depts.merge(just_names, how='right', left_on='code', right_on='dpt')

# Preview a few rows; the fixed seed makes the preview identical on every run.
names.sample(5, random_state=0)

Unnamed: 0,code,nom,geometry,sexe,preusuel,annais,dpt,nombre
633845,51.0,Marne,"POLYGON ((4.04797 49.40564, 4.07691 49.40161, ...",1,GINO,1984,51,3
3132813,42.0,Loire,"POLYGON ((3.89953 46.27591, 3.90940 46.25773, ...",2,MAUDE,1986,42,9
1635090,42.0,Loire,"POLYGON ((3.89953 46.27591, 3.90940 46.25773, ...",1,WILLIAM,2015,42,10
2352983,,,,2,ESTELLE,2000,973,6
2945539,75.0,Paris,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",2,MAÏA,2016,75,28


## Grouping the names by department

In [6]:
# Total count per (year, department, first name), ordered so that inside each
# (year, department) pair the most frequent name comes first.
grouped = (
    names
    .groupby(['annais', 'dpt', 'preusuel'], as_index=False)['nombre']
    .sum()
    .sort_values(by=['annais', 'dpt', 'nombre'], ascending=[True, True, False])
)

grouped

Unnamed: 0,annais,dpt,preusuel,nombre
122,1900,01,MARIE,681
90,1900,01,JEAN,144
92,1900,01,JEANNE,134
107,1900,01,LOUIS,100
96,1900,01,JOSEPH,80
...,...,...,...,...
3637568,2020,974,ZARA,3
3637570,2020,974,ZAYNAB,3
3637578,2020,974,ÉLIAS,3
3637580,2020,974,ÉLÉANORE,3


## Finding the 3 most popular names and merging them in one DF

In [7]:
# `grouped` is already ordered by descending count inside each (year, department)
# pair, so a row's position inside its pair is its popularity rank (0 = most popular).
# Selecting by position avoids `groupby(...).nth(k).reset_index()`, whose result
# differs between pandas versions (since pandas 2.0 `nth` acts as a filter, so
# `reset_index()` would add a stray `index` column).
rank_in_dpt = grouped.groupby(['annais', 'dpt']).cumcount()

# Pairs with fewer than 2 (resp. 3) names simply have no row in
# `second_names` (resp. `third_names`).
top_names = grouped[rank_in_dpt == 0].reset_index(drop=True)
second_names = grouped[rank_in_dpt == 1].reset_index(drop=True)
third_names = grouped[rank_in_dpt == 2].reset_index(drop=True)

top_names

Unnamed: 0,annais,dpt,preusuel,nombre
0,1900,01,MARIE,681
1,1900,02,MARIE,319
2,1900,03,MARIE,652
3,1900,04,MARIE,146
4,1900,05,MARIE,155
...,...,...,...,...
11634,2020,95,ADAM,133
11635,2020,971,GABRIEL,24
11636,2020,972,NOAH,22
11637,2020,973,SAMUEL,29


In [8]:
def _prefix_rank_columns(frame, rank_label):
    """Return `frame` with `preusuel`/`nombre` renamed to `<rank_label>_preusuel`/`<rank_label>_nombre`."""
    renamed = {column: f'{rank_label}_{column}' for column in ('preusuel', 'nombre')}
    return frame.rename(columns=renamed)


# Give each per-rank table its own column names so the three can sit side by side.
top_names = _prefix_rank_columns(top_names, 'top')
second_names = _prefix_rank_columns(second_names, 'second')
third_names = _prefix_rank_columns(third_names, 'third')

In [9]:
# Join the three per-rank tables on (year, department) to get one wide row per
# pair; with the default join type only pairs present in all three tables remain.
join_keys = ['annais', 'dpt']
merged = top_names.merge(second_names, on=join_keys).merge(third_names, on=join_keys)

merged

Unnamed: 0,annais,dpt,top_preusuel,top_nombre,second_preusuel,second_nombre,third_preusuel,third_nombre
0,1900,01,MARIE,681,JEAN,144,JEANNE,134
1,1900,02,MARIE,319,JEANNE,153,ANDRÉ,138
2,1900,03,MARIE,652,JEAN,230,JEANNE,194
3,1900,04,MARIE,146,LOUIS,40,JOSEPH,35
4,1900,05,MARIE,155,JOSEPH,50,LOUIS,37
...,...,...,...,...,...,...,...,...
11634,2020,95,ADAM,133,MOHAMED,96,LINA,88
11635,2020,971,GABRIEL,24,LYAM,24,NOAH,23
11636,2020,972,NOAH,22,EDEN,21,ETHAN,19
11637,2020,973,SAMUEL,29,NOAH,28,ENZO,23


## Merging the top-3 table with the department map data

In [10]:
# Attach each department's name and geometry to the per-department top-3 table.
# how='right' keeps every (year, department) row of `merged`; departments that
# have no match in the map data end up with empty code/nom/geometry.
# NOTE(review): `merged` is rebound to its own join result, so re-running this
# cell on its own would join the map columns a second time.
merged = depts.merge(merged, how='right', left_on='code', right_on='dpt')

merged

Unnamed: 0,code,nom,geometry,annais,dpt,top_preusuel,top_nombre,second_preusuel,second_nombre,third_preusuel,third_nombre
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",1900,01,MARIE,681,JEAN,144,JEANNE,134
1,02,Aisne,"POLYGON ((4.04797 49.40564, 4.03991 49.39740, ...",1900,02,MARIE,319,JEANNE,153,ANDRÉ,138
2,03,Allier,"POLYGON ((3.03207 46.79491, 3.04907 46.75808, ...",1900,03,MARIE,652,JEAN,230,JEANNE,194
3,04,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ...",1900,04,MARIE,146,LOUIS,40,JOSEPH,35
4,05,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ...",1900,05,MARIE,155,JOSEPH,50,LOUIS,37
...,...,...,...,...,...,...,...,...,...,...,...
11634,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",2020,95,ADAM,133,MOHAMED,96,LINA,88
11635,,,,2020,971,GABRIEL,24,LYAM,24,NOAH,23
11636,,,,2020,972,NOAH,22,EDEN,21,ETHAN,19
11637,,,,2020,973,SAMUEL,29,NOAH,28,ENZO,23


## Calculating the 3 most popular names across the country by year

In [11]:
# National count per (year, first name), ordered by year and then by
# descending count.
df_top3 = (
    just_names
    .groupby(['annais', 'preusuel'], as_index=False)['nombre']
    .sum()
    .sort_values(by=['annais', 'nombre'], ascending=[True, False])
)

# Keep the three most popular first names of each year.
top3_names_per_year = (
    df_top3
    .groupby('annais')
    .head(3)
    .reset_index(drop=True)
)

top3_names_per_year

Unnamed: 0,annais,preusuel,nombre
0,1900,MARIE,49752
1,1900,JEAN,14100
2,1900,JEANNE,13981
3,1901,MARIE,53177
4,1901,JEAN,15638
...,...,...,...
358,2019,LÉO,4654
359,2019,RAPHAËL,4458
360,2020,LÉO,4491
361,2020,GABRIEL,4410


## Assembling the 3 most popular names across the country in our DF

In [12]:
# Add one column per national rank holding, for each row's year, the name that
# is nationally 1st / 2nd / 3rd most popular that year.
country_columns = ['country_top1_preusuel', 'country_top2_preusuel', 'country_top3_preusuel']

# `top3_names_per_year` lists each year's names from most to least frequent, so
# a row's position inside its year is its national rank (0, 1, 2).
national_rank = top3_names_per_year.groupby('annais').cumcount()

for rank, column in enumerate(country_columns):
    # year -> name lookup for this rank. A year that has no name at this rank
    # maps to NaN instead of raising an IndexError.
    name_by_year = (
        top3_names_per_year
        .loc[national_rank == rank]
        .set_index('annais')['preusuel']
    )
    # Overwriting the column (rather than appending) keeps the cell re-runnable.
    merged[column] = merged['annais'].map(name_by_year)

# Drop every row that has a missing value in any column: this removes the
# departments without map geometry as well as any year lacking a full national top 3.
merged.dropna(inplace=True)

merged

Unnamed: 0,code,nom,geometry,annais,dpt,top_preusuel,top_nombre,second_preusuel,second_nombre,third_preusuel,third_nombre,country_top1_preusuel,country_top2_preusuel,country_top3_preusuel
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",1900,01,MARIE,681,JEAN,144,JEANNE,134,MARIE,JEAN,JEANNE
1,02,Aisne,"POLYGON ((4.04797 49.40564, 4.03991 49.39740, ...",1900,02,MARIE,319,JEANNE,153,ANDRÉ,138,MARIE,JEAN,JEANNE
2,03,Allier,"POLYGON ((3.03207 46.79491, 3.04907 46.75808, ...",1900,03,MARIE,652,JEAN,230,JEANNE,194,MARIE,JEAN,JEANNE
3,04,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ...",1900,04,MARIE,146,LOUIS,40,JOSEPH,35,MARIE,JEAN,JEANNE
4,05,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ...",1900,05,MARIE,155,JOSEPH,50,LOUIS,37,MARIE,JEAN,JEANNE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11630,91,Essonne,"POLYGON ((2.22656 48.77610, 2.23298 48.76620, ...",2020,91,GABRIEL,87,LÉO,86,ADAM,80,LÉO,GABRIEL,RAPHAËL
11631,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ...",2020,92,ADAM,214,RAPHAËL,183,GABRIEL,178,LÉO,GABRIEL,RAPHAËL
11632,93,Seine-Saint-Denis,"POLYGON ((2.55306 49.00982, 2.58031 48.99159, ...",2020,93,MOHAMED,238,ADAM,162,IBRAHIM,140,LÉO,GABRIEL,RAPHAËL
11633,94,Val-de-Marne,"POLYGON ((2.33190 48.81701, 2.36395 48.81632, ...",2020,94,ADAM,136,GABRIEL,114,MOHAMED,97,LÉO,GABRIEL,RAPHAËL


## Displaying the map with Altair

In [13]:
# Cast 'annais' to int (a no-op when it already is an integer) so that the year
# can be compared with the numeric slider value used to filter the chart below.
merged['annais'] = merged['annais'].astype(int)

# Colour category of a department row, relative to that year's national top 3
def assign_color(row, top3_for_year):
    """Classify a row by where its top name sits in the national top 3.

    Parameters
    ----------
    row : mapping with a 'top_preusuel' entry (the department's most popular name).
    top3_for_year : indexable holding the national 1st, 2nd and 3rd names.

    Returns
    -------
    'top1', 'top2' or 'top3' for the first national rank whose name equals the
    row's top name, otherwise 'other' (drawn in gray on the map).
    """
    department_favourite = row['top_preusuel']
    # Ranks are probed in order and one at a time, so the first match wins.
    for position in range(3):
        if department_favourite == top3_for_year[position]:
            return f'top{position + 1}'
    return 'other'

# Colour category of every (year, department) row in one vectorised pass.
# Each row already carries its year's national top 3 in the country_top*_preusuel
# columns, so no per-year loop or repeated filtering of the frame is needed.
department_favourite = merged['top_preusuel']
matches_national_rank = [
    department_favourite == merged['country_top1_preusuel'],
    department_favourite == merged['country_top2_preusuel'],
    department_favourite == merged['country_top3_preusuel'],
]

# np.select keeps the first matching condition, i.e. the same top1 > top2 > top3
# precedence as `assign_color`; rows matching none of them are labelled 'other'.
merged['color'] = np.select(matches_national_rank, ['top1', 'top2', 'top3'], default='other')


In [16]:
# Year picker: a range slider bound to a chart parameter that starts at 2020.
slider = alt.binding_range(min=1900, max=2020, step=1, name='année:')
selector = alt.param(value=2020, bind=slider)

# Fixed colour per category so the legend stays stable from year to year.
color_scale = alt.Scale(
    domain=['top1', 'top2', 'top3', 'other'],
    range=['#1f77b4', '#ff7f0e', '#2ca02c', 'gray'],
)

# (field, label) pairs shown when hovering over a department.
tooltip_fields = [
    ('nom:N', 'Department'),
    ('top_preusuel:N', 'Top 1 Name'),
    ('top_nombre:Q', 'Top 1 Count'),
    ('second_preusuel:N', 'Top 2 Name'),
    ('second_nombre:Q', 'Top 2 Count'),
    ('third_preusuel:N', 'Top 3 Name'),
    ('third_nombre:Q', 'Top 3 Count'),
]
tooltips = [alt.Tooltip(field, title=label) for field, label in tooltip_fields]

# One shape per department, coloured by category and restricted to the rows of
# the year currently selected with the slider.
map_chart = (
    alt.Chart(merged)
    .mark_geoshape(stroke='white')
    .encode(
        color=alt.Color('color:N', scale=color_scale, legend=alt.Legend(title="Top Names")),
        tooltip=tooltips,
    )
    .properties(
        width=800,
        height=500,
        title="Top 3 Baby Names in France by Department for selected year",
    )
    .add_params(selector)
    .transform_filter(alt.datum.annais == selector)
)

map_chart

As our team had planned, we succeeded in creating an interactive map with a slider that shows the regional effect year by year. Thanks to the colors, users can quickly see which departments have the same most popular first name, and in which departments that name is also in the national top 3 for the selected year. They can also hover over a department to see the details of its 3 most popular first names, with the associated counts. What's more, moving the slider through the years shows whether a diversity of first names has always been present or not.
With a little more time, we would have liked to add:
- an adaptive title that changes according to the year selected by the cursor 
- a legend showing not "top 1", "top 2" and "top 3", but the 3 most popular first names for the selected year