## Cleaning the study dataset

In [72]:
""" Import packages """
import logging
import os
import sys
import warnings

import numpy as np
import pandas as pd

# Make the project-local helper modules importable. The guard keeps repeated
# runs of this cell from stacking duplicate entries on sys.path.
SCRIPTS_DIR = '../scripts/'
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

# logging.basicConfig opens the log file right away and fails with
# FileNotFoundError when its directory is missing (e.g. on a fresh checkout),
# so make sure the directory exists first.
os.makedirs('../logs', exist_ok=True)
logging.basicConfig(filename='../logs/data_cleaning.log', filemode='a',
                    encoding='utf-8', level=logging.DEBUG)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas warnings - consider a
# narrower filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [73]:
# Project-local helper classes; presumably resolved through the '../scripts/'
# entry that the first cell adds to sys.path - TODO confirm.
from explorer import DataTransformer
from dataframe_info_extractor import DataFrameInfo
from utils_cleaner import DataFrameCleaner

# Shared helper instance, used below to load the data and to subset the
# study countries.
transformer = DataTransformer()

In [74]:
# Settings forwarded to transformer.load_data() in the import cell.
REPO = "./"  # passed as `repo`
filepath = "../data/cleaned/final/project_data.csv"  # CSV file to load
rev="updv1"  # passed as `rev`; presumably a data-version tag - TODO confirm

### Importing

In [75]:
# Load the project CSV through the shared transformer, using the repo/rev
# settings defined in the previous cell, then preview the first rows.
data = transformer.load_data(ext="csv", filepath=filepath, repo=REPO, rev=rev)
data.head()

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Angola,AGO,2000-12-31,325.7036,283.27,74.62,2.82,13.15,15.5919,3.259,...,9.623866,205.1,3.244121,17.25515,50.087,5.64867,8182768.0,1053.39,21.72,14.94
1,Angola,AGO,2001-12-31,326.6507,304.97,80.3,3.67,13.5891,15.6426,3.311,...,24.009075,198.9,3.285217,17.778222,51.274,5.627442,8254958.0,1043.69,21.9,15.1
2,Angola,AGO,2002-12-31,309.12094,317.88,83.93,4.72,14.05,15.6964,3.359,...,11.406192,191.9,3.335132,18.309589,52.461,5.623762,8326997.0,1076.96,21.98,15.19
3,Angola,AGO,2003-12-31,313.731,336.87,89.76,5.3,14.5379,15.753,3.466,...,20.081014,184.2,3.413321,18.842102,53.645,5.645138,8401539.0,1022.52,22.08,15.25
4,Angola,AGO,2004-12-31,313.73257,369.5,98.39,4.49,15.0566,15.807,3.545,...,9.329239,175.5,3.506389,19.373746,54.827,5.685845,8479480.0,1059.9,21.81,15.02


In [76]:
# List the column names of the loaded dataset.
data.columns

Index(['Country', 'ISO3', 'Date', 'Malaria_Incidence', 'Malaria_Deaths_U5',
       'Malaria_Deaths', 'ITN_Access', 'PopDensity', 'MedianAgePop',
       'PopGrowthRate', 'TFR', 'IMR', 'Q5', 'CNMR',
       'Population ages 0-14 (% of total population)',
       'Population ages 15-64 (% of total population)',
       'Domestic general government health expenditure (% of general government expenditure)',
       'External health expenditure (% of current health expenditure)',
       'People using at least basic sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'Population living in slums (% of urban population)',
       'Average precipitation in depth (mm per year)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 million (% of to

In [78]:
# Wrap the full dataset in the project's inspection and cleaning helpers.
datainfo = DataFrameInfo(data)
datacleaner = DataFrameCleaner(data)

### Inspecting the dataset

In [79]:
# Report the number of rows and columns of the full dataset.
datainfo.get_dimension()

 There are 920 rows and 32 columns


In [80]:
# Report the overall percentage of missing values in the full dataset.
datainfo.get_percent_missing()

The dataset contains 6.55 % missing values.


In [81]:
# datacleaner.remove_more_20p_missing_values("ISO3")

In [84]:
# Per-column summary of missing values (count, share of total, dtype).
# The table's index (the affected column names) is reused by the
# per-country check in the next cell.
missing_val_table = datainfo.missing_values_table()
missing_val_table

Your selected dataframe has 32 columns.
There are 13 columns that have missing values greater than 0%.


Unnamed: 0,Missing Values,% of Total Values,Dtype
Population living in slums (% of urban population),539,58.6,float64
"People using safely managed sanitation services, rural (% of rural population)",303,32.9,float64
Population in urban agglomerations of more than 1 million (% of total population),253,27.5,float64
External health expenditure (% of current health expenditure),131,14.2,float64
Domestic general government health expenditure (% of general government expenditure),128,13.9,float64
Malaria_Deaths_U5,120,13.0,float64
Malaria_Deaths,120,13.0,float64
Average precipitation in depth (mm per year),104,11.3,float64
Malaria_Incidence,80,8.7,float64
"Foreign direct investment, net inflows (% of GDP)",43,4.7,float64


In [85]:
# For every country, count how many of the columns listed in the
# missing-values table have no observation at all, i.e. a non-null count of
# zero within that country. Despite its name, the resulting Series holds a
# number of variables per country, not a list of countries.
non_null_counts_by_country = data.groupby("Country")[missing_val_table.index].count()
has_no_observation = non_null_counts_by_country.eq(0)
countries_having_no_data_for_a_variable = has_no_observation.sum(axis=1)
countries_having_no_data_for_a_variable

Country
Angola                          1
Benin                           1
Burkina Faso                    0
Burundi                         2
Cameroon                        1
Central African Republic        1
Chad                            0
Comoros                         2
Congo                           1
Cote d'Ivoire                   0
Democratic Republic of Congo    0
Djibouti                        2
Equatorial Guinea               3
Eritrea                         3
Ethiopia                        0
Gabon                           2
Gambia                          1
Ghana                           0
Guinea                          1
Guinea-Bissau                   1
Kenya                           0
Liberia                         1
Madagascar                      0
Malawi                          0
Mali                            0
Mauritania                      1
Mozambique                      0
Niger                           0
Nigeria                         0
Rwanda

In [89]:
# Retain the countries that have at least one observation for every variable,
# i.e. those whose count of entirely-missing variables is zero.
is_fully_covered = countries_having_no_data_for_a_variable == 0
retained_countries = list(countries_having_no_data_for_a_variable.loc[is_fully_covered].index)
len(retained_countries)

21

In [90]:
# Show the names of the retained countries.
retained_countries

['Burkina Faso',
 'Chad',
 "Cote d'Ivoire",
 'Democratic Republic of Congo',
 'Ethiopia',
 'Ghana',
 'Kenya',
 'Madagascar',
 'Malawi',
 'Mali',
 'Mozambique',
 'Niger',
 'Nigeria',
 'Rwanda',
 'Senegal',
 'Sierra Leone',
 'Tanzania',
 'Togo',
 'Uganda',
 'Zambia',
 'Zimbabwe']

### Subset data using retained countries 

In [92]:
# Restrict the dataset to the retained countries, matched on the "Country"
# column. `data` itself is left bound to the full dataset.
cleaned_data = transformer.subset_study_countries(data, "Country", countries=retained_countries)

In [93]:
# Rebuild the inspection/cleaning helpers on the subset; from here on
# `datainfo` and `datacleaner` no longer refer to the full dataset.
datainfo = DataFrameInfo(cleaned_data)
datacleaner = DataFrameCleaner(cleaned_data)

#### Inspecting the new dataset

In [125]:
# Report the number of rows and columns of the retained-countries subset.
datainfo.get_dimension()

 There are 483 rows and 32 columns


In [126]:
# Report the overall percentage of missing values in the subset.
datainfo.get_percent_missing()

The dataset contains 3.95 % missing values.


In [127]:
# datacleaner.remove_more_20p_missing_values("ISO3")

In [128]:
# Recompute the per-column missing-values summary on the subset. This rebinds
# `missing_val_table`, so the earlier full-dataset table is no longer available.
missing_val_table = datainfo.missing_values_table()
missing_val_table

Your selected dataframe has 32 columns.
There are 9 columns that have missing values greater than 0%.


Unnamed: 0,Missing Values,% of Total Values,Dtype
Population living in slums (% of urban population),254,52.6,float64
Malaria_Deaths_U5,63,13.0,float64
Malaria_Deaths,63,13.0,float64
Domestic general government health expenditure (% of general government expenditure),52,10.8,float64
External health expenditure (% of current health expenditure),52,10.8,float64
Malaria_Incidence,42,8.7,float64
Average precipitation in depth (mm per year),42,8.7,float64
ITN_Access,21,4.3,float64
"Mortality rate, under-5 (per 1,000 live births)",21,4.3,float64


In [129]:
# Non-null observation counts per country for the columns that still contain
# missing values in the subset.
(cleaned_data.groupby("Country")[missing_val_table.index].count())

Unnamed: 0_level_0,Population living in slums (% of urban population),Malaria_Deaths_U5,Malaria_Deaths,Domestic general government health expenditure (% of general government expenditure),External health expenditure (% of current health expenditure),Malaria_Incidence,Average precipitation in depth (mm per year),ITN_Access,"Mortality rate, under-5 (per 1,000 live births)"
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Burkina Faso,11,20,20,21,21,21,21,22,22
Chad,11,20,20,21,21,21,21,22,22
Cote d'Ivoire,11,20,20,21,21,21,21,22,22
Democratic Republic of Congo,10,20,20,21,21,21,21,22,22
Ethiopia,11,20,20,21,21,21,21,22,22
Ghana,11,20,20,21,21,21,21,22,22
Kenya,11,20,20,21,21,21,21,22,22
Madagascar,11,20,20,21,21,21,21,22,22
Malawi,11,20,20,21,21,21,21,22,22
Mali,11,20,20,21,21,21,21,22,22


In [130]:
# Fill missing values
# NOTE(review): "ISO3" is presumably the grouping key, so that values are
# filled within each country - confirm against DataFrameCleaner.fill_missing.
datacleaner.fill_missing("ISO3")

In [131]:
# Convert Date to datetime
# NOTE(review): the converted column is only seen by `datainfo` after it is
# rebuilt from datacleaner.data; the dtype counts shown right after this cell
# still list three object columns.
datacleaner.convert_to_date("Date")

In [132]:
# Dtype counts as seen by the current `datainfo` wrapper.
# NOTE(review): `datainfo` has not yet been rebuilt from datacleaner.data at
# this point, so this may not reflect the date conversion - confirm.
datainfo.get_data_types()

float64    29
object      3
Name: count, dtype: int64

In [133]:
# Pick up the cleaner's current frame and rebuild the inspector on it, so the
# checks that follow reflect the filling and the date conversion.
cleaned_data = datacleaner.data
datainfo = DataFrameInfo(cleaned_data)

In [134]:
# Check how much missing data remains after filling.
datainfo.get_percent_missing()

The dataset contains 0.0 % missing values.


In [137]:
# Dtype counts of the cleaned frame; Date should now be a datetime column.
datainfo.get_data_types()

float64           29
object             2
datetime64[ns]     1
Name: count, dtype: int64

In [138]:
# Check that cleaning did not change the number of rows or columns.
datainfo.get_dimension()

 There are 483 rows and 32 columns


In [157]:
# List the column names of the cleaned dataset.
cleaned_data.columns

Index(['Country', 'ISO3', 'Date', 'Malaria_Incidence', 'Malaria_Deaths_U5',
       'Malaria_Deaths', 'ITN_Access', 'PopDensity', 'MedianAgePop',
       'PopGrowthRate', 'TFR', 'IMR', 'Q5', 'CNMR',
       'Population ages 0-14 (% of total population)',
       'Population ages 15-64 (% of total population)',
       'Domestic general government health expenditure (% of general government expenditure)',
       'External health expenditure (% of current health expenditure)',
       'People using at least basic sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'Population living in slums (% of urban population)',
       'Average precipitation in depth (mm per year)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 million (% of to

In [158]:
# Preview the first rows of the cleaned dataset.
cleaned_data.head()

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Burkina Faso,BFA,2000-12-31,603.211,874.85,249.82,2.55,43.4316,15.4232,3.02,...,0.778458,178.7,2.983886,7.749025,17.844,6.857565,9762505.0,714.73,29.06,22.72
1,Burkina Faso,BFA,2001-12-31,601.93774,918.92,264.6,2.97,44.7725,15.5302,3.06,...,0.196437,174.7,3.040729,8.086915,18.54,6.86702,9978658.0,749.26,29.19,22.77
2,Burkina Faso,BFA,2002-12-31,595.85205,958.85,274.54,2.9,46.1706,15.6492,3.089,...,0.414816,170.2,3.07479,8.437518,19.258,6.874386,10199547.0,690.37,29.47,23.24
3,Burkina Faso,BFA,2003-12-31,585.1233,965.41,278.27,2.6,47.6264,15.7656,3.12,...,0.614299,165.0,3.104518,8.800698,19.996,6.865103,10424994.0,935.59,29.34,23.12
4,Burkina Faso,BFA,2004-12-31,562.4113,925.19,267.83,3.0,49.1447,15.871,3.156,...,0.26319,159.1,3.138021,9.17737,20.757,6.873132,10654996.0,752.75,29.41,23.25


### Inspecting extreme outlier values in country datasets

In [144]:
# Split the cleaned data into sub-frames keyed on "ISO3" (presumably one frame
# per country - TODO confirm). The result is indexed positionally below.
subdata = datacleaner.split_in_subframes("ISO3")

In [145]:
# One cleaner per sub-frame, kept in the same order as `subdata`.
subdatacleaner = list(map(DataFrameCleaner, subdata))

In [149]:
# Count the values flagged as outliers across the numeric columns of each
# per-country frame.
# The label is read from the frame's own ISO3 column instead of from the i-th
# entry of cleaned_data.ISO3.unique(): the printed code therefore always
# belongs to the frame being analysed, whatever order split_in_subframes
# returns, and the unique list is no longer rebuilt on every iteration.
# Assumes each sub-frame is non-empty and holds a single ISO3 code - TODO confirm.
for country_frame, country_cleaner in zip(subdata, subdatacleaner):
    iso3_code = country_frame["ISO3"].iloc[0]
    numeric_columns = country_frame.select_dtypes(include="number").columns
    outlier_total = country_cleaner.manage_outlier(numeric_columns).nb_outliers.sum()
    print(f"Number of very extreme outlier values in {iso3_code} data:", outlier_total)

Number of very extreme outlier values in BFA data: 0
Number of very extreme outlier values in TCD data: 0
Number of very extreme outlier values in CIV data: 0
Number of very extreme outlier values in COD data: 0
Number of very extreme outlier values in ETH data: 0
Number of very extreme outlier values in GHA data: 0
Number of very extreme outlier values in KEN data: 1
Number of very extreme outlier values in MDG data: 0
Number of very extreme outlier values in MWI data: 0
Number of very extreme outlier values in MLI data: 1
Number of very extreme outlier values in MOZ data: 0
Number of very extreme outlier values in NER data: 0
Number of very extreme outlier values in NGA data: 0
Number of very extreme outlier values in RWA data: 0
Number of very extreme outlier values in SEN data: 1
Number of very extreme outlier values in SLE data: 1
Number of very extreme outlier values in TZA data: 0
Number of very extreme outlier values in TGO data: 0
Number of very extreme outlier values in UGA d

In [154]:
# Second pass over the per-country frames, this time with cat_values=True;
# judging by the output, this displays the rows holding the flagged values.
for country_frame, country_cleaner in zip(subdata, subdatacleaner):
    numeric_columns = country_frame.select_dtypes(include="number").columns
    country_cleaner.manage_outlier(numeric_columns, cat_values=True)

'Outliers in Average precipitation in depth (mm per year)'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Kenya,KEN,2000-12-31,216.79553,170.09,42.48,2.72,53.1257,15.6428,2.944,...,0.872896,98.8,2.915447,9.390928,19.892,4.64967,24714605.0,498.47,24.91,18.96


'Outliers in CNMR'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
12,Mali,MLI,2012-12-31,435.65057,805.77,190.55,64.52,13.5345,15.0482,2.635,...,3.197568,126.1,2.918109,12.186365,37.599,5.06878,10305330.0,365.1,29.01,22.06


'Outliers in Malaria_Deaths'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Senegal,SEN,2000-12-31,306.24222,629.66,205.47,2.32,50.404,16.4414,2.363,...,1.357745,129.7,2.353492,19.191209,40.32,2.708788,5791518.0,722.54,28.59,21.43


'Outliers in CNMR'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
1,Sierra Leone,SLE,2001-12-31,473.00534,1034.96,296.63,3.45,67.7042,16.8294,7.18,...,0.915079,220.4,5.785413,14.582252,35.867,6.459612,3115001.0,2662.4,26.48,21.45


'Outliers in Urban population growth (annual %)'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Zambia,ZMB,2000-12-31,397.11148,278.08,91.41,2.77,13.3054,14.6309,2.848,...,3.379962,155.7,2.766606,10.85112,34.802,1.464844,6448823.0,1075.27,21.96,15.3


'Outliers in Average precipitation in depth (mm per year)'

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Zimbabwe,ZWE,2000-12-31,113.03278,126.79,44.78,2.79,30.5924,16.7754,0.734,...,0.346788,96.2,1.003969,11.654447,33.758,2.22893,7839526.0,913.34,21.23,14.77


### Export this version of the data

In [34]:
# Write the cleaned dataset to CSV; index=False leaves the row index out of
# the file.
# NOTE(review): this cell's execution count is out of sequence with the rest
# of the notebook - re-run top to bottom before trusting the exported file.
cleaned_data.to_csv("../data/cleaned/final/cleaned_project_dataset.csv", index=False)