In [28]:
from pathlib import Path

import pandas as pd

# Notebook 1 : Data Preparation


## Objectif
Ce notebook vise à préparer les données pour l'analyse et l'application. Nous allons :
1. Charger les données brutes.
2. Nettoyer et structurer les données.
3. Exporter les données nettoyées.


### 1. Charger les données brutes.

In [29]:
# Raw input files, listed in the same order as the variables they are unpacked into below.
raw_csv_paths = (
    "data/raw/child-mortality.csv",               # -> mortalité_infantille
    "data/raw/gdp-per-capita.csv",                # -> pib_habitant
    "data/raw/health-expenditure.csv",            # -> depense_santé
    "data/raw/life-expectancy.csv",               # -> espérance_vie
    "data/raw/population.csv",                    # -> population
    "data/raw/hospital-beds-per-1000-people.csv", # -> lit_hopital
    "data/raw/population-density.csv",            # -> densité_population
)

# Read every file with pandas' default CSV settings, one DataFrame per indicator.
(
    mortalité_infantille,
    pib_habitant,
    depense_santé,
    espérance_vie,
    population,
    lit_hopital,
    densité_population,
) = [pd.read_csv(csv_path) for csv_path in raw_csv_paths]

### 2. Nettoyer les données

1. Retrait de la colonne `Code` de tous les dataframes

In [30]:
# Remove the 'Code' column from every source table; the tables are joined on
# ('Entity', 'Year') further down, so 'Code' is not used as a key.
#
# - Reassignment + errors='ignore' (instead of inplace=True) keeps this cell
#   re-runnable: a second execution no longer raises KeyError because 'Code'
#   has already been removed.
# - densité_population is handled as well, so all seven tables are treated the
#   same way; errors='ignore' makes this a no-op if that file has no 'Code' column.
mortalité_infantille = mortalité_infantille.drop(columns=['Code'], errors='ignore')
pib_habitant = pib_habitant.drop(columns=['Code'], errors='ignore')
depense_santé = depense_santé.drop(columns=['Code'], errors='ignore')
espérance_vie = espérance_vie.drop(columns=['Code'], errors='ignore')
population = population.drop(columns=['Code'], errors='ignore')
lit_hopital = lit_hopital.drop(columns=['Code'], errors='ignore')
densité_population = densité_population.drop(columns=['Code'], errors='ignore')

2. Renommer les colonnes qui nous intéressent

In [31]:
# Give each table's value column a short, readable name before merging.
# Each entry is (table, original column header, short name).
# Note: DataFrame.rename silently skips headers that are not present, so a
# mismatch with the CSV header leaves the original column name untouched.
column_renames = (
    (mortalité_infantille, 'Under-five mortality rate', 'Child Mortality'),
    (espérance_vie, 'Period life expectancy at birth - Sex: total - Age: 0', 'Life Expectancy'),
    (pib_habitant, 'GDP per capita, PPP (constant 2017 international $)', 'GDP per Capita'),
    (depense_santé, 'public_health_expenditure_pc_gdp', 'Health Expenditure'),
    (population, 'Population (historical)', 'population'),
    (lit_hopital, 'Hospital beds (per 1,000 people)', 'Hospital Beds'),
    (densité_population, 'Population density - Sex: all - Age: all - Variant: estimates', 'Population Density'),
)

for table, source_header, short_name in column_renames:
    # In-place so the module-level variables keep pointing at the renamed tables.
    table.rename(columns={source_header: short_name}, inplace=True)

3. Fusionner les données dans un seul dataframe

In [32]:
# Combine all indicator tables into one frame keyed by (Entity, Year).
# how='outer' keeps every (Entity, Year) pair that appears in at least one
# table; indicators missing for a given pair end up as NaN.
join_keys = ['Entity', 'Year']
tables_to_join = (
    espérance_vie,
    pib_habitant,
    depense_santé,
    population,
    lit_hopital,
    densité_population,
)

merged_data = mortalité_infantille
for indicator_table in tables_to_join:
    merged_data = pd.merge(merged_data, indicator_table, on=join_keys, how='outer')

4. Ne garder que les années >= 1960

In [33]:
# Keep only the observations from 1960 onwards.
from_1960_onwards = merged_data['Year'].ge(1960)
merged_data = merged_data.loc[from_1960_onwards]

In [34]:
# Count the missing values in each column of the merged, filtered frame.
missing_per_column = merged_data.isna().sum()
print(missing_per_column)

Entity                    0
Year                      0
Child Mortality        5735
Life Expectancy        2125
GDP per Capita        12267
Health Expenditure    16759
population             1910
Hospital Beds         13704
Population Density     2637
dtype: int64


5. Remplacer les valeurs manquantes par la médiane ou la moyenne (selon nos besoins)

6. Renommer les colonnes pour que ce soit plus cohérent

In [35]:
# Rename the 'Entity' key column to 'Country' in the merged frame.
key_column_alias = {'Entity': 'Country'}
merged_data.rename(columns=key_column_alias, inplace=True)

In [None]:
# Show the merged frame as the cell output for a visual check before export.
merged_data

Unnamed: 0,Country,Year,Child Mortality,Life Expectancy,GDP per Capita,Health Expenditure,population,Hospital Beds,Population Density
197,Afghanistan,1960,35.459850,32.7987,,,9035048.0,0.170627,13.915
198,Afghanistan,1961,34.894880,33.2910,,,9214082.0,,14.191
199,Afghanistan,1962,34.369637,33.7565,,,9404411.0,,14.484
200,Afghanistan,1963,33.836605,34.2008,,,9604491.0,,14.792
201,Afghanistan,1964,33.310510,34.6726,,,9814318.0,,15.115
...,...,...,...,...,...,...,...,...,...
61581,Zimbabwe,2019,5.237655,61.0603,2203.3967,,15271377.0,,39.476
61582,Zimbabwe,2020,5.143166,61.5300,1990.3195,,15526888.0,,40.137
61583,Zimbabwe,2021,4.908745,60.1347,2115.1445,,15797220.0,,40.835
61584,Zimbabwe,2022,4.772906,62.3601,2207.9570,,16069061.0,,41.538


### 3. Exporter le dataframe

In [None]:
# Write the cleaned, merged dataset to disk.
# DataFrame.to_csv does not create missing parent directories, so make sure
# data/clean/ exists first; otherwise the export fails with OSError on a
# checkout where that folder has not been created yet.
output_path = Path('data/clean/cleaned_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
merged_data.to_csv(output_path, index=False)