In [None]:
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the 'Spécialistes' sheet of the fees workbook (2016, per department
# according to the file name) and keep only the columns selected below.
file = 'Honoraires_totaux_des_professionnels_de_sante_par_departement_en_2016.xls'
columns = [
    'Spécialistes',
    'DEPARTEMENT',
    'EFFECTIFS',
    'HONORAIRES SANS DEPASSEMENT (Euros)',
    'DEPASSEMENTS (Euros)',
]
df_med = pd.read_excel(file, sheet_name='Spécialistes')[columns]
df_med.head()

In [None]:
# Load the '2016' sheet of the population-estimate workbook, skipping the
# first four rows (presumably title/banner rows above the header; confirm
# against the workbook).
file = 'estim-pop-dep-sexe-gca-1975-2018.xls'
# 'Unnamed: 0' / 'Unnamed: 1' are renamed to the department number and name
# in a later cell; the '.2' / '.1' suffixed labels are how the duplicated
# age-band headers appear after loading.
columns = [
    'Unnamed: 0',
    'Unnamed: 1',
    'Total',
    '20 à 39 ans.2',
    '40 à 59 ans.1',
]
df_pop = pd.read_excel(file, sheet_name='2016', skiprows=4)[columns]
df_pop.head()

In [None]:
# Load the '2016' sheet of the five-year age-band population workbook,
# skipping the first four rows, and keep the raw frame in `df_pop_age`.
file = 'estim-pop-dep-sexe-aq-1975-2018.xls'
df_pop_age = pd.read_excel(file, sheet_name='2016', skiprows=4)

# Work on a copy: string index labels, explicit department columns and
# shortened labels for the listed age bands.
df_pop_age_transf = df_pop_age.copy().rename(
    index=str,
    columns={
        "Unnamed: 0": "DEPARTEMENT_NUM",
        "Unnamed: 1": "DEPARTEMENT_NOM",
        "0 à 4 ans": "0-4",
        "65 à 69 ans": "65-69",
        "70 à 74 ans": "70-74",
        "75 à 79 ans": "75-79",
        "80 à 84 ans": "80-84",
        "85 à 89 ans": "85-89",
        "90 à 94 ans": "90-94",
        "95 ans et plus": "95-",
    },
)
df_pop_age_transf.head()

In [None]:
# Clean the specialists table: shorten the column names, drop unusable rows,
# tidy the two text columns and convert the amounts to floats.
df_med_transf = df_med.rename(
    index=str,
    columns={
        "Spécialistes": "SPECIALISTES",
        "HONORAIRES SANS DEPASSEMENT (Euros)": "HONORAIRES",
        "DEPASSEMENTS (Euros)": "DEPASSEMENTS",
    },
)

# Purge rows whose amounts carry the 'nc' placeholder or whose headcount is 0.
has_values = (
    (df_med_transf['DEPASSEMENTS'] != 'nc')
    & (df_med_transf['HONORAIRES'] != 'nc')
    & (df_med_transf['EFFECTIFS'] != 0)
)
df_med_transf = df_med_transf.loc[has_values]

# Keep only rows whose DEPARTEMENT label contains a digit (presumably the
# numbered department rows, as opposed to aggregate rows; confirm on the data).
# The masks are named explicitly instead of reusing `filter`, which would
# shadow the Python builtin for the rest of the notebook.
has_digit = df_med_transf['DEPARTEMENT'].str.contains(r'[\d]')
df_med_transf = df_med_transf[has_digit]

# Drop the per-department TOTAL rows. The capture group of the former pattern
# 'TOTAL(.*)' did not change which rows match and only triggered a pandas
# warning about match groups.
is_total = df_med_transf['SPECIALISTES'].str.contains(r'TOTAL')
# Explicit copy so the column assignments below write to an independent frame
# rather than to a slice of the filtered one.
df_med_transf = df_med_transf[~is_total].copy()

# Remove any parenthesised suffix from the department label so that it can
# match the 'NUM- NOM' key built from the population tables.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would silently leave the labels untouched.
df_med_transf['DEPARTEMENT'] = (
    df_med_transf['DEPARTEMENT']
    .str.replace(r'\(.*\)', '', regex=True)
    .str.strip()
)

# Remove digits, one leading whitespace character and punctuation from the
# speciality label, then trim the remaining surrounding whitespace.
df_med_transf['SPECIALISTES'] = (
    df_med_transf['SPECIALISTES']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'^[\s]', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.strip()
)

# Cast the amounts to float (they are not reliably numeric on load because of
# the 'nc' placeholder rows removed above).
df_med_transf['DEPASSEMENTS'] = df_med_transf['DEPASSEMENTS'].astype(str).astype(float)
df_med_transf['HONORAIRES'] = df_med_transf['HONORAIRES'].astype(str).astype(float)

# Sort by department (ascending), then by overrun amount (descending).
df_med_transf = df_med_transf.sort_values(
    by=['DEPARTEMENT', 'DEPASSEMENTS'],
    ascending=[True, False],
)
df_med_transf.head()


In [None]:
# Give the population columns explicit names and build the 'NUM- NOM'
# department label that serves as the merge key with the specialists table.
df_pop_transf = df_pop.rename(
    index=str,
    columns={
        "Unnamed: 0": "DEPARTEMENT_NUM",
        "Unnamed: 1": "DEPARTEMENT_NOM",
        "Total": "POPULATION",
        "20 à 39 ans.2": "femmes_20-39",
        "40 à 59 ans.1": "hommes_40-59",
    },
).assign(
    DEPARTEMENT=lambda d: (
        d['DEPARTEMENT_NUM'].astype(str) + '- ' + d['DEPARTEMENT_NOM'].astype(str)
    )
)
df_pop_transf.head()

In [None]:
# Add the same 'NUM- NOM' department label to the age-band table so it can be
# merged on DEPARTEMENT later.
df_pop_age_transf = df_pop_age_transf.assign(
    DEPARTEMENT=lambda d: (
        d['DEPARTEMENT_NUM'].astype(str) + '- ' + d['DEPARTEMENT_NOM'].astype(str)
    )
)
df_pop_age_transf.head()

In [None]:
# Columns kept at the end of this cell.
col = [
    'DEPARTEMENT',
    'SPECIALISTES',
    'ID_SPECIALITE',
    'EFFECTIFS',
    'RATIO_DEPASSEMENTS_EFFECTIFS',
    'POPULATION',
    'femmes_20-39',
]

df_med_transf = (
    df_med_transf
    # Attach the population figures of each department.
    .merge(df_pop_transf, on='DEPARTEMENT')
    .drop(columns=['DEPARTEMENT_NUM', 'DEPARTEMENT_NOM'])
    # Integer code per speciality label.
    .assign(ID_SPECIALITE=lambda d: d['SPECIALISTES'].astype('category').cat.codes)
    # Share of overruns in the total (overruns + base fees) ...
    .assign(RATIO_DEPASSEMENTS=lambda d: d['DEPASSEMENTS'] / (d['DEPASSEMENTS'] + d['HONORAIRES']))
    # ... and that share divided by the headcount.
    .assign(RATIO_DEPASSEMENTS_EFFECTIFS=lambda d: d['RATIO_DEPASSEMENTS'] / d['EFFECTIFS'])
)[col]
df_med_transf.head()

In [None]:
# Join the age-band table with the specialists table on the department label
# and keep only the four columns listed in `col`.
col = ['DEPARTEMENT', 'ID_SPECIALITE', 'EFFECTIFS', 'femmes_20-39']
df_pop_age_transf = pd.merge(df_pop_age_transf, df_med_transf, on='DEPARTEMENT')[col]
df_pop_age_transf.head()

In [None]:
# Pairwise relationships between the overrun ratio per practitioner, the
# headcount and the department population.
col = ['RATIO_DEPASSEMENTS_EFFECTIFS', 'EFFECTIFS', 'POPULATION']

# Build the grid directly with PairGrid and draw each panel exactly once.
# `sns.pairplot` already fills every panel, so the map_* calls that followed it
# drew a second layer on top of the first. The facet size is passed as
# `height`, the name that replaced the deprecated `size` in seaborn 0.9.
grid = sns.PairGrid(df_med_transf[col], height=4)
grid = grid.map_upper(plt.scatter)
grid = grid.map_diag(plt.hist, bins=10, edgecolor='k')
grid = grid.map_lower(plt.scatter)