In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set()

1. Regarder toutes les features et leurs caractéristiques :
    * Nom
    * Type
    * % de valeurs manquantes
    * Bruit
    * Utilité pour le projet 
    * Distribution (gaussien, uniforme…)
2. Visualiser les données
3. Regarder les corrélations entre les features
4. Identifier les transformations / créations / ajouts de features possibles 

1. Nettoyer les données
    * S’occuper des outliers (supprimer, modifier…)
    * S’occuper des valeurs manquantes (remplir avec 0, moyenne, médiane, supprimer les exemples associés…)
2. Sélectionner les features utiles et supprimer les features inutiles pour le projet
3. Faire du Feature engineering
    * Echantilloner  les features continues
    * Décomposer les features catégorielles, date/heure…
    * Appliquer les transformations identifiées (log, sqrt, ^2…)
    * Créer de nouvelles features 
4. Normaliser ou standardiser les features 

In [55]:
path = '/'.join(os.getcwd().split("/")[:-1])

urban_population = pd.read_csv(f'{path}/data/Urban-population-by-country.csv', skiprows=4)
population_growth = pd.read_csv(f'{path}/data/Population-growth-by-country.csv', skiprows=4)
population = pd.read_csv(f'{path}/data/Population-by-country.csv', skiprows=4)
access_electricity = pd.read_csv(f'{path}/data/Access-to-electricity-by-country.csv', skiprows=4)
nitrous_oxyd_emission = pd.read_csv(f'{path}/data/Nitrous-oxyd-emission.csv',skiprows=4)
gdp = pd.read_csv(f'{path}/data/gdp.csv',skiprows=4)
methan_emission = pd.read_csv(f'{path}/data/Methan-emission-by-country.csv',skiprows=4)
co2_emission = pd.read_csv(f'{path}/data/emission_CO2.csv',skiprows=4)

In [56]:
def clean_all_df(df, value):
    df_copy = df.copy()
    df_copy = df_copy.drop(df.columns[len(df.columns)-1], axis=1)
    df_copy = df_copy.drop(columns=['Indicator Code','Indicator Name','2019','Country Code'])
    df_copy = df_copy.fillna(method='bfill')
    df_copy = df_copy.T
    df_copy = df_copy.reset_index()
    header = np.array(df_copy.iloc[0])
    header[0] = 'Date'
    df_copy = df_copy[1:]
    df_copy.columns = header
    df_copy = df_copy.melt(id_vars=['Date'], var_name='Country', value_name=value)
    df_copy = df_copy.sort_values(value,ascending=False)
    return df_copy

def get_region(df, global_region):
    df_region = df[df['Country'].isin(global_region)]
    return df_region

def merge_df(df_list):
    df_demographic = df_list[0]
    for df in df_list[1::]:
        df_demographic = pd.merge(df_demographic, df, on=['Date', 'Country'])
    return df_demographic

def plot_time_serie(df, y, title):
    fig = px.line(df, x='Date', y=y,color='Country',title=title)
    fig.show()

In [57]:
urban_population_cleaned = clean_all_df(urban_population, 'Urban Population')
population_growth_cleaned = clean_all_df(population_growth, 'Population Growth')
population_cleaned = clean_all_df(population, 'Population')
access_electricity_cleaned = clean_all_df(access_electricity, 'Access to Electricity')
gdp_cleaned = clean_all_df(gdp, 'GDP in $')
nitrous_oxyd_emission_cleaned = clean_all_df(nitrous_oxyd_emission, 'Nitrous Oxyd Emission')
methan_emission_cleaned = clean_all_df(methan_emission, 'Methan Oxyd Emission')
co2_emission_cleaned = clean_all_df(co2_emission, 'CO2 Emission')

In [58]:
df_list = [urban_population_cleaned, population_growth_cleaned, population_cleaned,
           access_electricity_cleaned, gdp_cleaned, nitrous_oxyd_emission_cleaned,
          methan_emission_cleaned, co2_emission_cleaned]

In [59]:
df_demographic = merge_df(df_list)

In [60]:
global_region = ['World','North America', 'Euro area', 'Russian Federation', 'Sub-Saharan Africa',
                 'Middle East & North Africa', 'East Asia & Pacific', 'Latin America & Caribbean']

In [61]:
df_demographic_grouped = get_region(df_demographic, global_region)

In [62]:
regions = {'East Asia & Pacific':'Asia Pacific', 'Latin America & Caribbean':'Latin America',
       'Sub-Saharan Africa':'Africa', 'North America':'North America',
       'Middle East & North Africa':'Middle East', 'Euro area':'Europe', 'Russian Federation':'Russia'}

In [68]:
df_demographic_grouped.loc[:,'Country'] = df_demographic_grouped.loc[:,'Country'].replace(regions)

In [69]:
df_demographic_grouped

Unnamed: 0,Date,Country,Urban Population,Population Growth,Population,Access to Electricity,GDP in $,Nitrous Oxyd Emission,Methan Oxyd Emission,CO2 Emission
0,2018,World,4.19636e+09,1.10877,7.59427e+09,,8.58044e+13,,,
1,2017,World,4.11667e+09,1.14309,7.51099e+09,88.8486,8.08913e+13,,,
2,2016,World,4.03654e+09,1.16551,7.4261e+09,87.9769,7.61028e+13,,,
3,2015,World,3.95655e+09,1.17004,7.34055e+09,86.7787,7.50031e+13,,,
4,2014,World,3.87766e+09,1.18104,7.25565e+09,85.5879,7.92961e+13,,,3.61383e+07
...,...,...,...,...,...,...,...,...,...,...
3170,1963,Africa,3.80988e+07,2.42386,2.43893e+08,,3.8685e+10,,,143107
3243,1960,Middle East,3.6736e+07,3.11851,1.05203e+08,,1.304e+10,,,102253
3255,1962,Africa,3.64252e+07,2.38841,2.38122e+08,,3.37602e+10,,,136504
3306,1961,Africa,3.48633e+07,2.34729,2.32567e+08,,3.11574e+10,,,131879


In [73]:
df_demographic_grouped.to_csv(f'{path}/demographic_grouped.csv',index=False)

In [71]:
plots = {'Population': "Evolution de la population mondiale", 
         'Urban Population': 'Evolution de la population mondiale urbaine', 
         'GDP in $': 'Evolution du GDP in $',
         'Nitrous Oxyd Emission': "Evolution des émissions de protoxyde d'azote", 
         'Methan Oxyd Emission': "Evolution des émissions de protoxyde d'azote", 
         'CO2 Emission': "Evolution des émissions de CO2"}

In [72]:
for key in plots.keys():
    plot_time_serie(df_demographic_grouped, key, plots.get(key))