In [1]:
import pandas as pd
import numpy as np
import altair as alt
import math
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

## 1. Importation et nettoyage des données

In [125]:
# importation des données
data = pd.read_csv('/Users/andressoto/Downloads/Names hints/dpt2020.csv', header = 0,
                   sep=';', names=['gender','name','year','dpt','births'], converters={'name': str.title})

In [9]:
data

Unnamed: 0,gender,name,year,dpt,births
0,1,_Prenoms_Rares,1900,02,7
1,1,_Prenoms_Rares,1900,04,9
2,1,_Prenoms_Rares,1900,05,8
3,1,_Prenoms_Rares,1900,06,23
4,1,_Prenoms_Rares,1900,07,9
...,...,...,...,...,...
3727548,2,Zya,2018,59,3
3727549,2,Zya,XXXX,XX,264
3727550,2,Zyna,2013,93,3
3727551,2,Zyna,XXXX,XX,59


In [126]:
# nettoyage des données
data = data.loc[(data['name'].str.len()>1) & (data['year'] != 'XXXX') 
                & -data['name'].str.startswith('_')].reset_index(drop=True)
data['name'] = data['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
data = data.drop('dpt', axis=1)
data.head()

Unnamed: 0,gender,name,year,births
0,1,Aadil,1983,3
1,1,Aadil,1992,3
2,1,Aahil,2016,3
3,1,Aaron,1962,3
4,1,Aaron,1976,3


In [123]:
# proportions totales des prénoms garçons/filles
data['gender'].value_counts()

2    1966205
1    1702054
Name: gender, dtype: int64

## 2. Graphique 

In [127]:
# noms les plus populaires sur toutes la période
data2 = data.groupby(['year', 'name','gender'],as_index=False).sum()
data_names = data2.groupby(['name'])['births'].sum().sort_values(ascending= False)
data_names.head(10)

name
Marie       2256072
Jean        1913130
Pierre       891794
Michel       818025
Andre        709721
Jeanne       556903
Philippe     535355
Louis        523576
Rene         514560
Alain        504106
Name: births, dtype: int64

In [128]:
data_gender = data2.merge(data2, how ='left', on = ['year','name'], suffixes = ['_h','_f'])
data_gender = data_gender.loc[data_gender['gender_f']> data_gender['gender_h']]
data_gender.rename(columns = {"births_h":"male","births_f":"female"}, inplace = True)
data_gender.drop(['gender_h','gender_f'], axis=1, inplace = True)
data_gender.reset_index()
data_gender['ratio'] = data_gender.apply(lambda x : math.log(x.female/x.male),axis=1)
data_gender

Unnamed: 0,year,name,male,female,ratio
16,1900,Agathe,3,62,3.028522
50,1900,Alix,6,47,2.058388
82,1900,Andre,5530,4,-7.231649
89,1900,Ange,157,23,-1.920752
144,1900,Arsene,209,12,-2.857428
...,...,...,...,...,...
266248,2020,Taylor,18,3,-1.791759
266263,2020,Tenzin,22,21,-0.046520
266276,2020,Thais,45,723,2.776747
266458,2020,Yacine,205,3,-4.224398


In [130]:
# Grahiques
# on se focalise sur les 30 prénoms les plus populaires
top30 = data_names[:30].index.to_list()
top30 = data_gender.loc[data_gender['name'].isin(top30)]

graph = alt.Chart(top30, width=800, height=800 ).mark_line().encode(
    x = alt.X('year:T', title = 'Year'),
    y = alt.Y('ratio:Q', title = 'Log ratio of female/male'),
    color=alt.Color('name:N')
    
).properties(title='Changes in the use of first names between girls and boys')


nearest = alt.selection(type='single', on='mouseover',fields=['name'], nearest=True)

points = graph.mark_circle().encode(
    opacity=alt.value(0),
    tooltip='name'
).add_selection(
    nearest
).properties(
    width=800, height=600
)

lines = graph.mark_line().encode(
    size=alt.condition(~nearest, alt.value(1), alt.value(3))
)

yrule = (
    alt.Chart().mark_rule(strokeDash=[12, 6], size=2).encode(y=alt.datum(0))
)

points + lines + yrule