## Cleaning the study dataset

In [12]:
""" Import packages """
import sys
import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Put the project's script directory first on the import path so the
# project-local modules imported in the next cell can be found.
sys.path.insert(0, '../scripts/')

# logging.basicConfig opens the log file immediately and raises
# FileNotFoundError if its parent directory is missing (e.g. on a fresh
# checkout), so make sure the directory exists first.
LOG_FILE = '../logs/data_cleaning.log'
Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(filename=LOG_FILE, filemode='a',
                    encoding='utf-8', level=logging.DEBUG)

# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas deprecation/dtype warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [13]:
# Project-local helpers; presumably resolved through the '../scripts/' entry
# that the setup cell puts on sys.path — TODO confirm module locations.
from explorer import DataTransformer
from dataframe_info_extractor import DataFrameInfo
from utils_cleaner import DataFrameCleaner

# Single transformer instance; used below to load the dataset.
transformer = DataTransformer()

In [14]:
# Settings passed to `transformer.load_data` when the dataset is loaded.
REPO = "./"  # repository location given to the loader
filepath = "../data/cleaned/final/project_data.csv"  # CSV file to load
rev = "updv0"  # revision identifier — presumably a Git/DVC rev or tag; TODO confirm

### Importing

In [15]:
# Ask the transformer for the CSV identified by `filepath`, `REPO` and `rev`
# (the loading mechanism lives in DataTransformer.load_data, not shown here).
data = transformer.load_data(ext="csv", filepath=filepath, repo=REPO, rev=rev)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,ISO3_code,PopDensity,MedianAgePop,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Angola,AGO,2000-12-31,325.7036,283.27,74.62,2.82,AGO,13.15,15.5919,...,9.623866,205.1,3.244121,17.25515,50.087,5.64867,8182768.0,1053.39,21.72,14.94
1,Angola,AGO,2001-12-31,326.6507,304.97,80.3,3.67,AGO,13.5891,15.6426,...,24.009075,198.9,3.285217,17.778222,51.274,5.627442,8254958.0,1043.69,21.9,15.1
2,Angola,AGO,2002-12-31,309.12094,317.88,83.93,4.72,AGO,14.05,15.6964,...,11.406192,191.9,3.335132,18.309589,52.461,5.623762,8326997.0,1076.96,21.98,15.19
3,Angola,AGO,2003-12-31,313.731,336.87,89.76,5.3,AGO,14.5379,15.753,...,20.081014,184.2,3.413321,18.842102,53.645,5.645138,8401539.0,1022.52,22.08,15.25
4,Angola,AGO,2004-12-31,313.73257,369.5,98.39,4.49,AGO,15.0566,15.807,...,9.329239,175.5,3.506389,19.373746,54.827,5.685845,8479480.0,1059.9,21.81,15.02


In [16]:
# List the column names of the loaded frame.
data.columns

Index(['Country', 'ISO3', 'Date', 'Malaria_Incidence', 'Malaria_Deaths_U5',
       'Malaria_Deaths', 'ITN_Access', 'ISO3_code', 'PopDensity',
       'MedianAgePop', 'PopGrowthRate', 'TFR', 'IMR', 'Q5', 'CNMR',
       'Population ages 0-14 (% of total population)',
       'Population ages 15-64 (% of total population)',
       'Domestic general government health expenditure (% of general government expenditure)',
       'External health expenditure (% of current health expenditure)',
       'People using at least basic sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'Population living in slums (% of urban population)',
       'Average precipitation in depth (mm per year)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 mil

In [17]:
# (rows, columns) of the loaded frame.
data.shape

(1000, 33)

In [23]:
# Wrap the same `data` object in the project's inspection helper and
# cleaning helper; both are used throughout the rest of the notebook.
datainfo = DataFrameInfo(data)
datacleaner = DataFrameCleaner(data)

### Inspecting the dataset

In [24]:
# Report the number of rows and columns.
datainfo.get_dimension()

 There are 920 rows and 33 columns


In [25]:
# Report the overall percentage of missing values in the dataset.
datainfo.get_percent_missing()

The dataset contains 9.28 % missing values.


### Inspecting and cleaning country by country

In [9]:
# Per the method name, handles columns with more than 20% missing values.
# "ISO3" is presumably the grouping key (the output reports once per group);
# TODO confirm the exact drop rule in utils_cleaner.
datacleaner.remove_more_20p_missing_values("ISO3")

Your selected dataframe has 33 columns.
There are 2 columns that have missing values greater than 20%.


Your selected dataframe has 33 columns.
There are 2 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 1 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 3 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 2 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 2 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 1 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 3 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 2 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 1 columns that have missing values greater than 20%.
Your selected dataframe has 33 columns.
There are 1 columns that have mi

In [10]:
# Re-check the row and column counts after the >20%-missing step.
datainfo.get_dimension()

 There are 1000 rows and 25 columns


In [11]:
# Re-check the overall percentage of missing values.
datainfo.get_percent_missing()

The dataset contains 9.28 % missing values.


In [26]:
# Tabulate missing values per column (count, % of total values, dtype).
datainfo.missing_values_table()

Your selected dataframe has 33 columns.
There are 32 columns that have missing values greater than 0%.


Unnamed: 0,Missing Values,% of Total Values,Dtype
Population living in slums (% of urban population),539,58.6,float64
"People using safely managed sanitation services, rural (% of rural population)",330,35.9,float64
Population in urban agglomerations of more than 1 million (% of total population),282,30.7,float64
External health expenditure (% of current health expenditure),131,14.2,float64
Domestic general government health expenditure (% of general government expenditure),128,13.9,float64
Malaria_Deaths,120,13.0,float64
Malaria_Deaths_U5,120,13.0,float64
Average precipitation in depth (mm per year),104,11.3,float64
"Foreign direct investment, net inflows (% of GDP)",81,8.8,float64
Malaria_Incidence,80,8.7,float64


In [15]:
# Replace " " with "" in the remaining object columns and convert them to int
# (via `replace_in_string_to_int`).
# Identifier columns are filtered out instead of being removed with
# `list.remove`, which raises ValueError when a name is absent — e.g. when this
# cell is re-run after "Date" has already been converted to datetime.
# NOTE(review): the loaded frame also lists `Country` / `ISO3_code`; add them
# here if they are object-typed identifiers in the frame being cleaned.
non_numeric_columns = ("ISO3", "Date")
objects_columns_to_convert = [
    column
    for column in data.select_dtypes(object).columns
    if column not in non_numeric_columns
]

for column in objects_columns_to_convert:
    datacleaner.replace_in_string_to_int(column, " ", "")

In [16]:
# Fill missing values. "ISO3" is presumably the per-country grouping key;
# the fill strategy is defined in DataFrameCleaner.fill_missing — TODO confirm.
datacleaner.fill_missing("ISO3")

In [17]:
# Convert the "Date" column to a datetime dtype.
datacleaner.convert_to_date("Date")

In [18]:
# Count how many columns there are of each dtype.
datainfo.get_data_types()

float64    42
int64       3
object      2
Name: count, dtype: int64

In [19]:
# Re-bind `data` to the frame currently held by the cleaner and rebuild the
# inspection helper on it, so the checks below look at the cleaned data.
data = datacleaner.data
datainfo = DataFrameInfo(data)

In [20]:
# Percentage of missing values remaining after cleaning.
datainfo.get_percent_missing()

The dataset contains 0.19 % missing values.


The "ITN" column will be removed because it contains a large number of zeros, so the series does not make real sense.

In [21]:
# Drop the "ITN" column (rationale in the note above).
datacleaner.drop_columns(["ITN"])

In [22]:
# Re-count columns per dtype after dropping "ITN".
datainfo.get_data_types()

float64           41
int64              3
datetime64[ns]     1
object             1
Name: count, dtype: int64

In [23]:
# Names of the float-typed columns.
data.select_dtypes(float).columns

Index(['Cases', 'Deaths_df3', 'Deaths0_4', 'Deaths5_14', 'Deaths15_49',
       'Deaths50_69', 'Deaths70p', 'Presumed cases', 'Total cases',
       'ITN Access Population (%)', 'Precipitation', 'Min Temperature',
       'Surface Temperature Change', 'Access to electricity (% of population)',
       'Agricultural land (% of land area)', 'Agricultural land (sq. km)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Annual freshwater withdrawals, total (% of internal resources)',
       'Annual freshwater withdrawals, total (billion cubic meters)',
       'Arable land (% of land area)',
       'Average precipitation in depth (mm per year)',
       'CO2 emissions (kg per 2015 US$ of GDP)',
       'CO2 emissions (kg per 2017 PPP $ of GDP)',
       'CO2 emissions (kg per PPP $ of GDP)', 'CO2 emissions (kt)',
       'CO2 emissions (metric tons per capita)',
       'Cereal yield (kg per hectare)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Fo

In [24]:
# Names of the integer-typed columns.
data.select_dtypes(int).columns

Index(['Participated in MVIP', 'Participated in RTS Trials',
       'Participated in R21 Trials'],
      dtype='object')

In [25]:
# Row and column counts of the cleaned dataset.
datainfo.get_dimension()

 There are 115 rows and 46 columns


In [26]:
# Preview the first rows of the cleaned dataset.
data.head()

Unnamed: 0,Date,Cases,Deaths_df3,Deaths0_4,Deaths5_14,Deaths15_49,Deaths50_69,Deaths70p,Presumed cases,Total cases,...,Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Renewable energy consumption (% of total final energy consumption),Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Urban population (% of total population)_df9,Urban population growth (annual %)_df9,Participated in MVIP,Participated in RTS Trials,Participated in R21 Trials
0,2000-12-31,603.210999,235.128065,874.845987,25.571181,62.627879,450.176799,717.149334,70589.82,280123.1,...,7.749025,11882888.0,85.4,15984.91011,2120383.0,17.844,6.857565,0,0,0
1,2001-12-31,601.937744,256.009859,918.916428,27.68654,66.619251,469.845175,876.388458,352587.0,352587.0,...,8.086915,12249764.0,85.49,15108.11122,2271106.0,18.54,6.86702,0,0,0
2,2002-12-31,595.852051,260.730606,958.846796,29.485652,72.230232,505.134949,743.621255,1188870.0,1188870.0,...,8.437518,12632269.0,85.43,15786.33044,2432722.0,19.258,6.874386,0,0,0
3,2003-12-31,585.123291,271.081041,965.412955,29.644367,72.615529,508.274478,881.41382,1443184.0,1443184.0,...,8.800698,13030591.0,85.3,19200.72446,2605597.0,19.996,6.865103,0,0,0
4,2004-12-31,562.411316,264.250419,925.186169,27.621911,71.165711,498.971133,873.94562,1528388.0,1546644.0,...,9.17737,13445977.0,85.31,19733.29397,2790981.0,20.757,6.873132,0,0,0


In [27]:
# Column names of the cleaned dataset.
data.columns

Index(['Date', 'Cases', 'Deaths_df3', 'Deaths0_4', 'Deaths5_14', 'Deaths15_49',
       'Deaths50_69', 'Deaths70p', 'Presumed cases', 'Total cases',
       'ITN Access Population (%)', 'Precipitation', 'Min Temperature', 'ISO3',
       'Surface Temperature Change', 'Access to electricity (% of population)',
       'Agricultural land (% of land area)', 'Agricultural land (sq. km)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Annual freshwater withdrawals, total (% of internal resources)',
       'Annual freshwater withdrawals, total (billion cubic meters)',
       'Arable land (% of land area)',
       'Average precipitation in depth (mm per year)',
       'CO2 emissions (kg per 2015 US$ of GDP)',
       'CO2 emissions (kg per 2017 PPP $ of GDP)',
       'CO2 emissions (kg per PPP $ of GDP)', 'CO2 emissions (kt)',
       'CO2 emissions (metric tons per capita)',
       'Cereal yield (kg per hectare)',
       'Foreign direct investment, net inflows (% of G

### Inspecting extreme outlier values in country datasets

In [28]:
# Split the cleaner's dataset into sub-frames keyed on "ISO3" — presumably one
# frame per country code; the result is indexed by position in the cells below.
subdata = datacleaner.split_in_subframes("ISO3")

In [29]:
# One DataFrameCleaner per sub-frame, in the same order as `subdata`.
subdatacleaner = [DataFrameCleaner(frame) for frame in subdata]

In [30]:
# For each country sub-frame, print the total of `nb_outliers` that
# `manage_outlier` reports over the numeric columns. Labels are tied to the
# positional order of `subdata` (0=BFA, 1=GHA, 2=KEN, 3=MWI, 4=NGA).
country_labels = ("BFA", "GHA", "KEN", "MWI", "NGA")
for position, iso3 in enumerate(country_labels):
    cleaner = subdatacleaner[position]
    numeric_columns = subdata[position].select_dtypes(include="number").columns
    outlier_total = cleaner.manage_outlier(numeric_columns).nb_outliers.sum()
    print(f"Number of very extreme outlier values in {iso3} data:", outlier_total)

Number of very extreme outlier values in BFA data: 0
Number of very extreme outlier values in GHA data: 0
Number of very extreme outlier values in KEN data: 1
Number of very extreme outlier values in MWI data: 0
Number of very extreme outlier values in NGA data: 0


In [31]:
# Detail the outliers for sub-frame 2 (labelled KEN in the summary cell).
# With cat_values=True the call shows the affected rows and returns the list of
# column names — as seen in this cell's output; TODO confirm the flag's exact meaning.
subdatacleaner[2].manage_outlier(subdata[2].select_dtypes(include="number").columns, cat_values=True)

'Outliers in Average precipitation in depth (mm per year)'

Unnamed: 0,Date,Cases,Deaths_df3,Deaths0_4,Deaths5_14,Deaths15_49,Deaths50_69,Deaths70p,Presumed cases,Total cases,...,Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Renewable energy consumption (% of total final energy consumption),Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Urban population (% of total population)_df9,Urban population growth (annual %)_df9,Participated in MVIP,Participated in RTS Trials,Participated in R21 Trials
0,2000-12-31,216.795532,40.316853,170.093842,9.731429,10.931158,68.345402,87.586865,4216531.0,4216531.0,...,9.390928,30851606.0,78.15,35347.18812,6137001.0,19.892,4.64967,0,0,0


['Average precipitation in depth (mm per year)']

In [32]:
# Same outlier detail for sub-frame 3 (labelled MWI in the summary cell).
subdatacleaner[3].manage_outlier(subdata[3].select_dtypes(include="number").columns, cat_values=True)

[]

In [33]:
# Same outlier detail for sub-frame 4 (labelled NGA in the summary cell).
subdatacleaner[4].manage_outlier(subdata[4].select_dtypes(include="number").columns, cat_values=True)

[]

### Export this version of the data

In [34]:
# Write the `data` frame to CSV without the index column.
# NOTE(review): cleaning ran through `datacleaner`, and `data` was last re-bound
# from `datacleaner.data` before the "ITN" drop — confirm `data` is still the
# cleaner's current frame before relying on this export.
data.to_csv("../data/cleaned/study_dataset.csv", index=False)