In [6]:
from pathlib import Path

import pandas as pd
from unidecode import unidecode

In [7]:
# Load the raw population CSV; dtype=str makes pandas read every column as text.
population = pd.read_csv('../database/population_2020.csv', dtype=str)
def preprocess_population_data(population):
    """Return the population frame without its descriptive/metadata columns.

    Args:
        population: DataFrame that contains every column named in
            ``dropped_columns`` below.

    Returns:
        A new DataFrame with those columns removed; the input frame is not
        modified.

    Raises:
        KeyError: if any of the listed columns is absent from ``population``.
    """
    dropped_columns = [
        'Note',
        'Description du Symbole',
        'Symbole',
        'Code année',
        'Produit',
        'Code Produit',
        'Domaine',
        'Code Domaine',
        'Unité',
    ]
    return population.drop(columns=dropped_columns)
# Drop the unused columns, then show the trimmed frame as the cell output.
population = preprocess_population_data(population)
population

Unnamed: 0,Code zone (FAO),Zone,Code Élément,Élément,Année,Valeur
0,2,Afghanistan,511,Population totale,2020,39068.979
1,2,Afghanistan,512,Hommes,2020,19725.45
2,2,Afghanistan,513,Femmes,2020,19343.528
3,2,Afghanistan,551,Population rurale,2020,28150.604
4,2,Afghanistan,561,Population urbaine,2020,9904.337
...,...,...,...,...,...,...
1165,181,Zimbabwe,511,Population totale,2020,15526.888
1166,181,Zimbabwe,512,Hommes,2020,7370.588
1167,181,Zimbabwe,513,Femmes,2020,8156.301
1168,181,Zimbabwe,551,Population rurale,2020,11980.005


In [8]:
def transform_population_data(population):
    """Reshape the long population table into one row per zone.

    Every distinct value of 'Élément' becomes its own column holding the
    matching 'Valeur'; rows are keyed by the ('Code zone (FAO)', 'Zone') pair.
    The key levels are then turned back into ordinary columns and
    'Code zone (FAO)' is renamed to 'country_code'.

    Args:
        population: long-format DataFrame with the columns
            'Code zone (FAO)', 'Zone', 'Élément' and 'Valeur'.

    Returns:
        A new wide-format DataFrame; the input frame is not modified.
    """
    wide = population.pivot(
        index=['Code zone (FAO)', 'Zone'],
        columns='Élément',
        values='Valeur',
    )
    return wide.reset_index().rename(columns={'Code zone (FAO)': 'country_code'})

# Pivot to one row per zone and show the wide table as the cell output.
population_transformed = transform_population_data(population)
population_transformed

Élément,country_code,Zone,Femmes,Hommes,Population rurale,Population totale,Population urbaine
0,1,Arménie,1554.457,1336.435,1078.125,2890.892,1860.554
1,10,Australie,12970.037,12773.754,3494.472,25743.791,21903.705
2,100,Inde,678442.228,724175.468,900099.113,1402617.695,483098.64
3,101,Indonésie,136741.617,138073.249,118034.441,274814.866,154188.546
4,102,Iran (République islamique d'),43070.948,44652.495,20166.625,87723.443,63420.504
...,...,...,...,...,...,...,...
231,95,Honduras,5021.043,5098.597,4047.211,10119.64,5672.054
232,96,Chine - RAS de Hong-Kong,4107.529,3382.707,0,7490.235,7547.652
233,97,Hongrie,5077.383,4672.074,2699.487,9749.457,6921.767
234,98,Croatie,2049.088,1904.87,1747.114,3953.958,2368.833


In [9]:
def convert_population_to_numeric(population_transformed):
    """Return a copy of the frame with the population columns as scaled floats.

    The columns 'Population totale', 'Population urbaine',
    'Population rurale', 'Femmes' and 'Hommes' are cast to float and
    multiplied by 1000. All other columns are carried over untouched.

    The conversion is done on a copy so the caller's DataFrame is never
    modified; calling the function again on the *original* input therefore
    always yields the same result.

    Args:
        population_transformed: DataFrame containing the five columns above,
            with values that ``astype(float)`` can parse.

    Returns:
        A new DataFrame with the five columns converted and scaled.

    Raises:
        KeyError: if one of the five columns is missing.
        ValueError: if a value cannot be parsed as a float.
    """
    value_columns = [
        'Population totale',
        'Population urbaine',
        'Population rurale',
        'Femmes',
        'Hommes',
    ]
    # NOTE(review): the factor presumably converts "thousands of people" into
    # head counts — confirm against the unit of the source dataset.
    scale = 1000

    converted = population_transformed.copy()
    for column in value_columns:
        converted[column] = converted[column].astype(float) * scale
    return converted

# Convert the population columns to scaled floats and show the result.
# Caution: the result is bound back to the same name, so re-running this cell
# on its own multiplies the already-converted values by 1000 again; re-run
# from the pivot cell instead.
population_transformed = convert_population_to_numeric(population_transformed)
population_transformed


Élément,country_code,Zone,Femmes,Hommes,Population rurale,Population totale,Population urbaine
0,1,Arménie,1554457.0,1336435.0,1078125.0,2.890892e+06,1860554.0
1,10,Australie,12970037.0,12773754.0,3494472.0,2.574379e+07,21903705.0
2,100,Inde,678442228.0,724175468.0,900099113.0,1.402618e+09,483098640.0
3,101,Indonésie,136741617.0,138073249.0,118034441.0,2.748149e+08,154188546.0
4,102,Iran (République islamique d'),43070948.0,44652495.0,20166625.0,8.772344e+07,63420504.0
...,...,...,...,...,...,...,...
231,95,Honduras,5021043.0,5098597.0,4047211.0,1.011964e+07,5672054.0
232,96,Chine - RAS de Hong-Kong,4107529.0,3382707.0,0.0,7.490235e+06,7547652.0
233,97,Hongrie,5077383.0,4672074.0,2699487.0,9.749457e+06,6921767.0
234,98,Croatie,2049088.0,1904870.0,1747114.0,3.953958e+06,2368833.0


In [10]:
def transform_regex_population(population_transformed):
    """Return a copy of the frame with normalised column names.

    Each column name is processed in this order: spaces are replaced by
    underscores, the name is lower-cased, and the result is passed through
    ``unidecode``. Despite the function's name, the space replacement is a
    literal substitution, not a regular expression.

    The renaming is applied to a copy, so the caller's DataFrame keeps its
    original column labels.

    Args:
        population_transformed: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame with the same data and the normalised column names.
    """
    renamed = population_transformed.copy()
    # A plain list is assigned (rather than a transformed Index) so that the
    # columns axis ends up without a name, as in the displayed output.
    renamed.columns = [
        unidecode(column.replace(' ', '_').lower())
        for column in renamed.columns
    ]
    return renamed

# Normalise the column names and show the frame as the cell output.
population_transformed = transform_regex_population(population_transformed)
population_transformed

Unnamed: 0,country_code,zone,femmes,hommes,population_rurale,population_totale,population_urbaine
0,1,Arménie,1554457.0,1336435.0,1078125.0,2.890892e+06,1860554.0
1,10,Australie,12970037.0,12773754.0,3494472.0,2.574379e+07,21903705.0
2,100,Inde,678442228.0,724175468.0,900099113.0,1.402618e+09,483098640.0
3,101,Indonésie,136741617.0,138073249.0,118034441.0,2.748149e+08,154188546.0
4,102,Iran (République islamique d'),43070948.0,44652495.0,20166625.0,8.772344e+07,63420504.0
...,...,...,...,...,...,...,...
231,95,Honduras,5021043.0,5098597.0,4047211.0,1.011964e+07,5672054.0
232,96,Chine - RAS de Hong-Kong,4107529.0,3382707.0,0.0,7.490235e+06,7547652.0
233,97,Hongrie,5077383.0,4672074.0,2699487.0,9.749457e+06,6921767.0
234,98,Croatie,2049088.0,1904870.0,1747114.0,3.953958e+06,2368833.0


In [11]:
def save_transformed_population_data(population_transformed, output_path='../export/population_2020.csv'):
    """Write the transformed population table to a UTF-8 CSV file.

    The row index is not written. The destination directory is created when
    it does not exist yet, so a fresh checkout does not fail on the export
    step. A confirmation message is printed once the file has been written.

    Args:
        population_transformed: DataFrame to export.
        output_path: destination file (string or path-like). Defaults to the
            location this notebook has always written to.

    Returns:
        None.
    """
    destination = Path(output_path)
    # to_csv does not create missing directories on its own.
    destination.parent.mkdir(parents=True, exist_ok=True)
    population_transformed.to_csv(destination, index=False, encoding='utf-8')
    print("Population data transformed and saved successfully.")

# Export the final table to disk; the function prints a confirmation message.
save_transformed_population_data(population_transformed)

Population data transformed and saved successfully.
