In [16]:
import altair as alt
import pandas as pd
import geopandas as gpd 
alt.data_transformers.enable('json') 

pass

* Read the data from the csv file, and clean the data.

In [17]:
names = pd.read_csv("dpt2020.csv", sep=";")
names.drop(names[names.preusuel == '_PRENOMS_RARES'].index, inplace=True)
names.drop(names[names.dpt == 'XX'].index, inplace=True)
names=names[names.dpt.astype('int')<=96]
names

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
10885,1,AADIL,1983,84,3
10886,1,AADIL,1992,92,3
10888,1,AAHIL,2016,95,3
10892,1,AARON,1962,75,3
10893,1,AARON,1976,75,3
...,...,...,...,...,...
3727543,2,ZYA,2011,91,3
3727545,2,ZYA,2013,44,4
3727546,2,ZYA,2013,59,3
3727548,2,ZYA,2018,59,3


* Add a geo file to associate each department code in the data with its corresponding region in the map 

In [18]:
depts = gpd.read_file('departements-version-simplifiee.geojson')

In [19]:
depts

Unnamed: 0,code,nom,geometry
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ..."
1,02,Aisne,"POLYGON ((4.04797 49.40564, 4.03991 49.39740, ..."
2,03,Allier,"POLYGON ((3.03207 46.79491, 3.04907 46.75808, ..."
3,04,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ..."
4,05,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ..."
...,...,...,...
91,91,Essonne,"POLYGON ((2.22656 48.77610, 2.23298 48.76620, ..."
92,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ..."
93,93,Seine-Saint-Denis,"POLYGON ((2.55306 49.00982, 2.58031 48.99159, ..."
94,94,Val-de-Marne,"POLYGON ((2.33190 48.81701, 2.36395 48.81632, ..."


* Find the most popular name (locally popular) in each department. 

In [20]:
dic={}
dpt_code=[]
popu_name=[]
deptss=depts.copy()
for d in names.dpt.unique():
    indx=names[names.dpt==d].nombre.argmax()
    dpt_code.append(d)
    popu_name.append(names[names.dpt==d].iloc[indx].preusuel)

dic['code']=dpt_code
dic['local_popular']=popu_name
dics=pd.DataFrame(dic)
dics

Unnamed: 0,code,local_popular
0,84,JEAN
1,92,NATHALIE
2,95,SANDRINE
3,75,JEAN
4,69,JEAN
...,...,...
90,09,MARIE
91,32,JEAN
92,48,MARIE
93,23,MARIE


* Add the local popularity information into the department data by combining data frame "depts" and "dics" as "depts"

In [21]:
depts = depts.merge(dics, how='right',left_on='code', right_on='code')
depts.sample(5)

Unnamed: 0,code,nom,geometry,local_popular
66,63,Puy-de-Dôme,"POLYGON ((2.56538 46.14303, 2.64069 46.11848, ...",MARIE
88,5,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ...",MARIE
1,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ...",NATHALIE
67,64,Pyrénées-Atlantiques,"POLYGON ((-0.24284 43.58498, -0.21061 43.59324...",MARIE
12,6,Alpes-Maritimes,"POLYGON ((6.88743 44.36105, 6.92257 44.35073, ...",JEAN


* Combine the data frame "depts" and "names" as "names"  

In [22]:
just_names = names
names = depts.merge(names, how='right', left_on='code', right_on='dpt')
names=names.dropna()
names

Unnamed: 0,code,nom,geometry,local_popular,sexe,preusuel,annais,dpt,nombre
0,84,Vaucluse,"MULTIPOLYGON (((4.89291 44.36482, 4.90663 44.3...",JEAN,1,AADIL,1983,84,3
1,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.32697 48.94536, ...",NATHALIE,1,AADIL,1992,92,3
2,95,Val-d'Oise,"POLYGON ((2.59052 49.07965, 2.57203 49.06149, ...",SANDRINE,1,AAHIL,2016,95,3
3,75,Paris,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",JEAN,1,AARON,1962,75,3
4,75,Paris,"POLYGON ((2.41634 48.84924, 2.46226 48.84254, ...",JEAN,1,AARON,1976,75,3
...,...,...,...,...,...,...,...,...,...
3471082,91,Essonne,"POLYGON ((2.22656 48.77610, 2.23298 48.76620, ...",STÉPHANIE,2,ZYA,2011,91,3
3471083,44,Loire-Atlantique,"POLYGON ((-2.45849 47.44812, -2.45343 47.46207...",JEAN,2,ZYA,2013,44,4
3471084,59,Nord,"MULTIPOLYGON (((3.04040 50.15971, 3.06301 50.1...",JEAN,2,ZYA,2013,59,3
3471085,59,Nord,"MULTIPOLYGON (((3.04040 50.15971, 3.06301 50.1...",JEAN,2,ZYA,2018,59,3


In [23]:
names.local_popular.unique()

array(['JEAN', 'NATHALIE', 'SANDRINE', 'MARIE', 'STÉPHANIE', 'MICHEL'],
      dtype=object)

### Visualization 2

Is there a regional effect in the data? Are some names more popular in some regions? Are popular names generally popular across the whole country?

In [24]:
temp = names.groupby(['preusuel'], as_index=False).sum()[["preusuel","nombre"]]
temp = temp.sort_values(by=['nombre'])
most_popular = temp.tail(10)
most_unpopular = temp.head(10)

  temp = names.groupby(['preusuel'], as_index=False).sum()[["preusuel","nombre"]]


In [25]:
temp = names.groupby(['dpt', 'preusuel', 'sexe'], as_index=False).sum()
temp = depts.merge(temp, how='right', left_on='code', right_on='dpt') 
temp = temp[temp.preusuel.isin(most_popular.preusuel.tolist())]

  temp = names.groupby(['dpt', 'preusuel', 'sexe'], as_index=False).sum()


In [26]:
selection = alt.selection_single(empty='all', fields=['preusuel'])
brush = alt.selection_single()
d1 = alt.Chart(most_popular).mark_bar().encode(
    x='preusuel:O',
    y="nombre:Q",
    color=alt.condition(selection,alt.value('orange'),alt.value('grey'))
).add_selection(selection)



In [27]:
map = alt.Chart(temp).mark_geoshape(stroke='white').encode(
    tooltip=['nom', 'code', 'nombre','local_popular'],
    #color='nombre',
    color = alt.Color('nombre', scale=alt.Scale(range=['blue','red'])),
).transform_filter(selection).properties(width=600, height=400,title='Naitional and regional popularity in France')

In [28]:
d1 | map