# Explore source data

#### This notebook explores the source CSV files to retrieve the data that is useful for the project, performing some cleaning tasks along the way

##### Available in the data/source/final folder:

        - "../data/source/final/incidence-of-malaria.csv"
        - "../data/source/final/malaria-death-rates-by-age.csv"
        - "../data/source/final/pop_access_itn.csv"
        - "../data/source/final/WPP2022_Demographic_Indicators_Medium.csv"
        - "../data/source/final/API_4_DS2_en_csv_v2_5873665.csv"
        - "../data/source/final/API_8_DS2_en_csv_v2_5883819.csv"
        - "../data/source/final/API_11_DS2_en_csv_v2_5874711.csv"
        - "../data/source/final/API_19_DS2_en_csv_v2_5873088.csv"
        - "../data/source/final/API_SP.RUR.TOTL_DS2_en_csv_v2_5874544.csv"
        - "../data/source/final/Annual_Surface_Temperature_Change.csv"

In [1]:
""" Import packages """
import sys

import requests
import numpy as np
import pandas as pd

sys.path.insert(1, '../scripts/')


In [2]:
# `explorer` is a project-local module, presumably found through the
# '../scripts/' entry added to sys.path in the imports cell.
from explorer import DataTransformer

# Helper instance used by the loading and cleaning cells of this notebook.
transformer = DataTransformer()

In [3]:
# Converter used in the cleaning cells to derive ISO3 codes from country names
# (see the `cc.convert(..., to="ISO3")` calls).
from country_converter import CountryConverter
cc = CountryConverter()

In [4]:
# Declaring notebook variables
REPO = "./"  # passed as `repo` to transformer.load_data

# Folder holding every source CSV file; change it here if the data is moved.
SOURCE_DIR = "../data/source/final"

# Target variables (malaria incidence and death rates)
filepath_targets_1 = f"{SOURCE_DIR}/incidence-of-malaria.csv"
filepath_targets_2 = f"{SOURCE_DIR}/malaria-death-rates-by-age.csv"

# Feature variables
filepath_feat_1 = f"{SOURCE_DIR}/pop_access_itn.csv"
filepath_feat_2 = f"{SOURCE_DIR}/WPP2022_Demographic_Indicators_Medium.csv"
filepath_feat_3 = f"{SOURCE_DIR}/API_4_DS2_en_csv_v2_5873665.csv"
filepath_feat_4 = f"{SOURCE_DIR}/API_8_DS2_en_csv_v2_5883819.csv"
filepath_feat_5 = f"{SOURCE_DIR}/API_11_DS2_en_csv_v2_5874711.csv"
filepath_feat_6 = f"{SOURCE_DIR}/API_19_DS2_en_csv_v2_5873088.csv"
filepath_feat_7 = f"{SOURCE_DIR}/API_SP.RUR.TOTL_DS2_en_csv_v2_5874544.csv"

In [5]:
# Countries kept for the study; entries that are commented out are excluded.
# Every dataset is filtered on this list by country name, so the spelling used
# here (e.g. "Cote d'Ivoire", "Democratic Republic of Congo", "Tanzania") is
# the one the source tables are harmonised to in the cleaning cells.
AFRICAN_COUNTRIES = [
    # "Algeria",
    "Angola",
    "Benin",
    # "Botswana",
    "Burkina Faso",
    "Burundi",
    # "Cabo Verde",
    "Cameroon",
    "Central African Republic",
    "Chad",
    "Comoros",
    "Congo",
    "Cote d'Ivoire",
    # "Democratic Republic of the Congo",
    "Democratic Republic of Congo",
    "Djibouti",
    # "Egypt",
    "Equatorial Guinea",
    "Eritrea",
    # "Eswatini",
    "Ethiopia",
    "Gabon",
    "Gambia",
    "Ghana",
    "Guinea",
    "Guinea-Bissau",
    # "Ivory Coast",
    "Kenya",
    # "Lesotho",
    "Liberia",
    # "Libya",
    "Madagascar",
    "Malawi",
    "Mali",
    "Mauritania",
    # "Mauritius",
    # "Morocco",
    "Mozambique",
    # "Namibia",
    "Niger",
    "Nigeria",
    "Rwanda",
    # "Sao Tome and Principe",
    "Senegal",
    # "Seychelles",
    "Sierra Leone",
    "Somalia",
    # "South Africa",
    "South Sudan",
    "Sudan",
    "Tanzania",
    "Togo",
    # "Tunisia",
    "Uganda",
    "Zambia",
    "Zimbabwe"
]

# ['Angola',
#  'Benin',
#  'Burkina Faso',
#  'Burundi',
#  'Cameroon',
#  'Central African Republic',
#  'Chad',
#  'Comoros',
#  'Congo',
#  "Côte d'Ivoire",
#  'Democratic Republic of the Congo',
#  'Djibouti',
#  'Equatorial Guinea',
#  'Eritrea',
#  'Ethiopia',
#  'Gabon',
#  'Gambia',
#  'Ghana',
#  'Guinea',
#  'Guinea-Bissau',
#  'Kenya',
#  'Liberia',
#  'Madagascar',
#  'Malawi',
#  'Mali',
#  'Mauritania',
#  'Mozambique',
#  'Niger',
#  'Nigeria',
#  'Rwanda',
#  'Senegal',
#  'Sierra Leone',
#  'Somalia',
#  'South Sudan',
#  'Sudan',
#  'Togo',
#  'Uganda',
#  'United Republic of Tanzania',
#  'Zambia',
#  'Zimbabwe']


In [6]:
len(AFRICAN_COUNTRIES)

40

### Variables of interest

    1- Malaria incidence rate per 1,000 population at risk
    2- Malaria death rate per 100,000 individuals
    3- Malaria death rate among children under 5, per 100,000 individuals

url: https://ourworldindata.org/malaria#malaria-deaths

In [7]:
# Load the two target tables (malaria incidence, malaria death rates)
source_target_df1 = transformer.load_data(ext="csv", filepath=filepath_targets_1, repo=REPO, header=0, parse_dates=None)
source_target_df2 = transformer.load_data(ext="csv", filepath=filepath_targets_2, repo=REPO, header=0, parse_dates=None)

# Only the first five columns of the death-rate table are used. An explicit
# copy makes the result an independent frame, so later edits do not act on a
# slice of the loaded data (which can raise SettingWithCopyWarning).
source_target_df2 = source_target_df2[source_target_df2.columns[:5]].copy()

In [8]:
# Rename columns
source_target_df1.rename(
   columns={
        i:j for i,j in zip(source_target_df1.columns, ["Country", "ISO3", "Date", "Malaria_Incidence"])
    }, inplace=True
)

source_target_df2.rename(
   columns={
        i:j for i,j in zip(source_target_df2.columns, ["Country", "ISO3", "Date", "Malaria_Deaths_U5", "Malaria_Deaths"])
    }, inplace=True
)

In [9]:
# Subset study countries
cleaned_target_df1 = transformer.subset_study_countries(source_target_df1, "Country", countries=AFRICAN_COUNTRIES)
cleaned_target_df2 = transformer.subset_study_countries(source_target_df2, "Country", countries=AFRICAN_COUNTRIES)

# Display cleaned_df1
display(cleaned_target_df1.head())
display(cleaned_target_df2.head())

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence
0,Angola,AGO,2000,325.7036
1,Angola,AGO,2001,326.6507
2,Angola,AGO,2002,309.12094
3,Angola,AGO,2003,313.731
4,Angola,AGO,2004,313.73257


Unnamed: 0,Country,ISO3,Date,Malaria_Deaths_U5,Malaria_Deaths
0,Angola,AGO,1990,207.32,53.73
1,Angola,AGO,1991,211.26,54.71
2,Angola,AGO,1992,215.41,55.89
3,Angola,AGO,1993,220.27,57.54
4,Angola,AGO,1994,225.79,59.39


In [10]:
# Combine incidence and death rates on country and year. The outer join keeps
# the rows that exist in only one of the two tables (missing values become NaN).
target_merge_keys = ["Country", "ISO3", "Date"]
cleaned_targets = pd.merge(cleaned_target_df1, cleaned_target_df2, how="outer", on=target_merge_keys)
cleaned_targets

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths
0,Angola,AGO,2000,325.70360,283.27,74.62
1,Angola,AGO,2001,326.65070,304.97,80.30
2,Angola,AGO,2002,309.12094,317.88,83.93
3,Angola,AGO,2003,313.73100,336.87,89.76
4,Angola,AGO,2004,313.73257,369.50,98.39
...,...,...,...,...,...,...
1235,Zimbabwe,ZWE,1995,,76.06,25.98
1236,Zimbabwe,ZWE,1996,,109.36,37.70
1237,Zimbabwe,ZWE,1997,,109.11,38.10
1238,Zimbabwe,ZWE,1998,,107.86,39.63


In [11]:
# Convert year to datetime format
cleaned_targets = transformer.convert_to_dateformat(cleaned_targets, "Date")
cleaned_targets = cleaned_targets[cleaned_targets.Date >= "2000-12-31"]
cleaned_targets

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths
0,Angola,AGO,2000-12-31,325.703600,283.27,74.62
1,Angola,AGO,2001-12-31,326.650700,304.97,80.30
2,Angola,AGO,2002-12-31,309.120940,317.88,83.93
3,Angola,AGO,2003-12-31,313.731000,336.87,89.76
4,Angola,AGO,2004-12-31,313.732570,369.50,98.39
...,...,...,...,...,...,...
835,Zimbabwe,ZWE,2016-12-31,68.375830,57.43,14.33
836,Zimbabwe,ZWE,2017-12-31,118.824394,58.86,14.56
837,Zimbabwe,ZWE,2018-12-31,55.826330,57.32,14.14
838,Zimbabwe,ZWE,2019-12-31,67.874060,57.20,13.78


In [12]:
cleaned_targets.Country.nunique()

40

### Features variables

#### Population with access to an insecticide-treated bed net (ITN) for malaria protection (%), modelled

url: https://apps.who.int/gho/data/node.main.MALARIAITNCOVERAGE?lang=en

In [13]:
# Load the WHO ITN-access table (wide format: one column per year).
# NOTE(review): header=1 is presumably forwarded to pandas.read_csv, i.e. the
# column names are read from the second row of the file — confirm in explorer.
source_feat_df1 = transformer.load_data(ext="csv", filepath=filepath_feat_1, repo=REPO, header=1, parse_dates=None)
source_feat_df1

Unnamed: 0,"Countries, territories and areas",2021,2020,2019,2018,2017,2016,2015,2014,2013,...,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Angola,16.78,15.49,27.48,52.69,51.68,23.27,22.61,23.26,12.92,...,15.12,17.39,13.02,6.41,4.01,4.49,5.3,4.72,3.67,2.82
1,Benin,63.97,68.49,29.86,58.93,52.89,26.69,64.15,32.67,15.81,...,18.19,22.61,22.37,3.31,3.16,4.1,3.6,3.13,3.05,2.6
2,Burkina Faso,52.04,69.39,66.52,49.46,67.7,69.07,51.96,70.2,49.45,...,7.53,7.73,5.55,5.19,4.28,3.0,2.6,2.9,2.97,2.55
3,Burundi,66.91,65.19,49.99,71.64,59.25,38.81,66.1,65.6,39.48,...,44.59,34.65,27.56,9.09,4.04,4.15,3.28,3.36,3.3,2.78
4,Cameroon,68.36,73.51,69.3,62.92,71.06,70.02,56.71,55.86,60.05,...,6.88,6.14,6.41,6.63,3.3,2.22,2.21,2.49,2.94,2.62
5,Central African Republic,62.01,69.07,74.7,75.96,62.18,63.06,61.54,41.3,22.27,...,47.66,40.63,14.74,5.11,3.17,3.36,3.65,3.71,3.32,2.65
6,Chad,59.83,50.2,19.44,48.56,51.99,10.69,28.93,61.08,16.56,...,5.75,5.43,5.3,4.78,4.14,3.91,3.76,3.49,3.09,2.55
7,Comoros,78.49,71.59,47.26,66.96,81.88,77.65,59.6,75.03,69.67,...,31.0,39.67,56.15,67.58,57.25,19.7,24.23,13.48,3.91,2.84
8,Congo,75.99,86.29,74.83,30.9,32.09,42.34,53.4,60.66,63.37,...,8.97,11.29,5.62,4.64,4.0,3.72,3.17,3.14,3.08,2.62
9,Côte d'Ivoire,81.25,54.85,60.58,74.64,73.55,61.51,75.51,61.41,32.27,...,15.65,12.68,5.17,3.0,2.17,2.22,2.56,3.05,3.07,2.63


In [14]:
# Align the WHO country names with the spelling used in AFRICAN_COUNTRIES
who_country_names = {
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Democratic Republic of the Congo": "Democratic Republic of Congo",
    "United Republic of Tanzania": "Tanzania",
}
source_feat_df1 = source_feat_df1.replace({"Countries, territories and areas": who_country_names})

In [15]:
# Give the first column (the WHO country label) the common name "Country".
# A new frame is assigned rather than mutating the existing one in place, so
# the cell has no side effect on objects created by earlier cells.
source_feat_df1 = source_feat_df1.rename(
    columns={source_feat_df1.columns[0]: "Country"}
)

In [16]:
# Reshape source_feat_df1 from wide (one column per year) to long format:
# one row per country and year, with the value stored in "ITN_Access"
cleaned_feat_df1 = transformer.extract_unique_serie(source_feat_df1, "Country", "ITN_Access")

# Use the same date column name as the other tables (assigned, not in place)
cleaned_feat_df1 = cleaned_feat_df1.rename(columns={"Year": "Date"})

# Display the first rows of cleaned_feat_df1
cleaned_feat_df1.head()

Unnamed: 0,Country,Date,ITN_Access
0,Angola,2021,16.78
1,Angola,2020,15.49
2,Angola,2019,27.48
3,Angola,2018,52.69
4,Angola,2017,51.68


In [17]:
# Restrict the ITN table to the countries listed in AFRICAN_COUNTRIES
cleaned_feat_df1 = transformer.subset_study_countries(cleaned_feat_df1, "Country", countries=AFRICAN_COUNTRIES)

In [18]:
# Cast the year labels to integers before the date conversion.
# `.assign` returns a new frame, which avoids writing into a frame that may be
# a filtered view of another one (SettingWithCopyWarning).
cleaned_feat_df1 = cleaned_feat_df1.assign(Date=cleaned_feat_df1["Date"].astype(int))

In [19]:
# Add the ISO3 code derived from the country name. `.assign` returns a new
# frame instead of writing into a frame that may be a filtered view
# (SettingWithCopyWarning).
cleaned_feat_df1 = cleaned_feat_df1.assign(
    ISO3=cc.convert(cleaned_feat_df1["Country"], to="ISO3")
)

In [20]:
# Convert the integer year to a datetime column (dtype datetime64[ns], shown
# as year-end dates in the output below)
cleaned_feat_df1 = transformer.convert_to_dateformat(cleaned_feat_df1, "Date")
cleaned_feat_df1

Unnamed: 0,Country,Date,ITN_Access,ISO3
0,Angola,2021-12-31,16.78,AGO
1,Angola,2020-12-31,15.49,AGO
2,Angola,2019-12-31,27.48,AGO
3,Angola,2018-12-31,52.69,AGO
4,Angola,2017-12-31,51.68,AGO
...,...,...,...,...
875,Zimbabwe,2004-12-31,3.03,ZWE
876,Zimbabwe,2003-12-31,3.33,ZWE
877,Zimbabwe,2002-12-31,3.34,ZWE
878,Zimbabwe,2001-12-31,3.22,ZWE


In [21]:
cleaned_feat_df1.dtypes

Country               object
Date          datetime64[ns]
ITN_Access           float64
ISO3                  object
dtype: object

#### Some Demographic indicators 

url: https://population.un.org/wpp/Download/Standard/CSV/

In [22]:
# Load the UN WPP 2022 demographic indicators table.
# NOTE(review): the saved output shows a warning fragment pointing at
# pd.read_csv — possibly a DtypeWarning for mixed-type columns; confirm.
source_feat_df2 = transformer.load_data(ext="csv", filepath=filepath_feat_2, repo=REPO, header=0, parse_dates=None)
source_feat_df2.head()

  data = pd.read_csv(


Unnamed: 0,SortOrder,LocID,Notes,ISO3_code,ISO2_code,SDMX_code,LocTypeID,LocTypeName,ParentID,Location,...,Q0060Male,Q0060Female,Q1550,Q1550Male,Q1550Female,Q1560,Q1560Male,Q1560Female,NetMigrations,CNMR
0,1,900,,,,1.0,1,World,0,World,...,580.7496,498.0396,240.3164,271.6253,208.1916,378.6968,430.2594,324.9308,0.0,0.0
1,1,900,,,,1.0,1,World,0,World,...,566.7283,490.1993,231.1772,258.0905,203.7797,368.3186,415.8362,319.3364,0.0,0.0
2,1,900,,,,1.0,1,World,0,World,...,546.3173,477.2639,218.6745,240.0344,197.1422,353.0546,395.5327,309.9103,0.0,0.0
3,1,900,,,,1.0,1,World,0,World,...,535.8289,469.5325,212.872,232.6019,193.0492,345.0826,385.8433,303.9053,0.0,0.0
4,1,900,,,,1.0,1,World,0,World,...,523.1236,458.4842,205.7622,224.0498,187.4439,335.442,374.6576,295.9944,0.0,0.0


In [23]:
source_feat_df2.columns

Index(['SortOrder', 'LocID', 'Notes', 'ISO3_code', 'ISO2_code', 'SDMX_code',
       'LocTypeID', 'LocTypeName', 'ParentID', 'Location', 'VarID', 'Variant',
       'Time', 'TPopulation1Jan', 'TPopulation1July', 'TPopulationMale1July',
       'TPopulationFemale1July', 'PopDensity', 'PopSexRatio', 'MedianAgePop',
       'NatChange', 'NatChangeRT', 'PopChange', 'PopGrowthRate',
       'DoublingTime', 'Births', 'Births1519', 'CBR', 'TFR', 'NRR', 'MAC',
       'SRB', 'Deaths', 'DeathsMale', 'DeathsFemale', 'CDR', 'LEx', 'LExMale',
       'LExFemale', 'LE15', 'LE15Male', 'LE15Female', 'LE65', 'LE65Male',
       'LE65Female', 'LE80', 'LE80Male', 'LE80Female', 'InfantDeaths', 'IMR',
       'LBsurvivingAge1', 'Under5Deaths', 'Q5', 'Q0040', 'Q0040Male',
       'Q0040Female', 'Q0060', 'Q0060Male', 'Q0060Female', 'Q1550',
       'Q1550Male', 'Q1550Female', 'Q1560', 'Q1560Male', 'Q1560Female',
       'NetMigrations', 'CNMR'],
      dtype='object')

In [24]:
# Columns kept from the UN WPP table: the identifiers (ISO3_code, Location,
# Time) followed by the indicators described below.
select_columns = ["ISO3_code", "Location", "Time", "PopDensity", "MedianAgePop", "PopGrowthRate", "TFR", "IMR", "Q5", "CNMR"]

# Description of the selected indicators (same order as in select_columns)
# PopDensity: Population Density, as of 1 July (persons per square km)
# MedianAgePop: Median Age, as of 1 July (years)
# PopGrowthRate: Population Growth Rate (percentage)
# TFR: Total Fertility Rate (live births per woman)
# IMR: Infant Mortality Rate (infant deaths per 1,000 live births)
# Q5: Under-five Mortality Rate (deaths under age 5 per 1,000 live births)
# CNMR: Net Migration Rate (per 1,000 population)

In [25]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so later edits (renaming, new columns) do not act on a slice of the
# loaded table and do not raise SettingWithCopyWarning.
source_feat_df2 = source_feat_df2[select_columns].copy()
source_feat_df2.head()

Unnamed: 0,ISO3_code,Location,Time,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
0,,World,1950,19.1661,22.2243,1.732,4.8596,143.3869,224.0098,0.0
1,,World,1951,19.502,22.1716,1.743,4.8269,141.309,219.1188,0.0
2,,World,1952,19.8635,22.1083,1.929,5.0142,137.2952,212.1977,0.0
3,,World,1953,20.247,22.0364,1.896,4.9366,134.4685,206.944,0.0
4,,World,1954,20.6435,21.9524,1.982,5.0098,131.7228,202.1803,0.0


In [26]:
# Harmonise column names with the other tables ("Country", "Date").
# A new frame is assigned instead of renaming in place: this frame was obtained
# by column selection, and in-place edits on such a selection can raise
# SettingWithCopyWarning.
source_feat_df2 = source_feat_df2.rename(
    columns={
        "Location": "Country",
        "Time": "Date",
    }
)

In [27]:
# Align the UN country names with the spelling used in AFRICAN_COUNTRIES
un_country_names = {
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Democratic Republic of the Congo": "Democratic Republic of Congo",
    "United Republic of Tanzania": "Tanzania",
}
source_feat_df2 = source_feat_df2.replace({"Country": un_country_names})

In [28]:
# Restrict the demographic table to the countries listed in AFRICAN_COUNTRIES
source_feat_df2 = transformer.subset_study_countries(source_feat_df2, "Country", countries=AFRICAN_COUNTRIES)

In [29]:
source_feat_df2.Country.nunique()

40

In [30]:
# Convert year to datetime format.
# The table spans 1950 to 2100 (see output), i.e. it also contains projected
# years; it is split into observed and future periods in the cells below.
source_feat_df2 = transformer.convert_to_dateformat(source_feat_df2, "Date")
source_feat_df2

Unnamed: 0,ISO3_code,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
0,BDI,Burundi,1950-12-31,86.8637,18.3147,2.200,6.9229,166.9203,282.5826,-5.893
1,BDI,Burundi,1951-12-31,88.7571,18.0842,2.114,6.9139,165.9564,280.7266,-5.721
2,BDI,Burundi,1952-12-31,90.6179,17.8744,2.036,6.9002,163.9716,277.5845,-5.815
3,BDI,Burundi,1953-12-31,92.4508,17.6693,1.969,6.9146,162.0241,274.2910,-6.216
4,BDI,Burundi,1954-12-31,94.2874,17.4706,1.965,6.9170,160.1124,271.3371,-5.948
...,...,...,...,...,...,...,...,...,...,...
6075,TGO,Togo,2097-12-31,486.5241,31.9424,0.668,2.1460,13.7251,18.1167,-0.076
6076,TGO,Togo,2098-12-31,489.7579,32.1299,0.657,2.1441,13.6079,17.9371,-0.075
6077,TGO,Togo,2099-12-31,492.9371,32.3163,0.637,2.1292,13.5078,17.7815,-0.075
6078,TGO,Togo,2100-12-31,496.0535,32.5034,0.624,2.1240,13.3691,17.5703,-0.074


In [31]:
# Use the same ISO3 column name as the other tables
source_feat_df2 = source_feat_df2.rename(columns={"ISO3_code": "ISO3"})

In [32]:
# Observed period: year-end dates from 2000 to 2022, both bounds included
in_present_period = source_feat_df2["Date"].between("2000-12-31", "2022-12-31")
present_cleaned_feat_df2 = source_feat_df2[in_present_period]

# Preview both ends of the selection
display(present_cleaned_feat_df2.head(), present_cleaned_feat_df2.tail())

Unnamed: 0,ISO3,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
50,BDI,Burundi,2000-12-31,243.0697,14.6556,2.03,6.8715,95.3024,156.647,-6.338
51,BDI,Burundi,2001-12-31,249.161,14.8623,2.91,6.849,92.3659,150.8253,1.542
52,BDI,Burundi,2002-12-31,256.2211,15.0722,2.682,6.8187,89.1441,144.7844,-2.535
53,BDI,Burundi,2003-12-31,264.3871,15.2822,3.579,6.7855,85.9985,139.1727,5.365
54,BDI,Burundi,2004-12-31,274.3929,15.5281,3.846,6.7464,82.3523,132.0726,6.632


Unnamed: 0,ISO3,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
5996,TGO,Togo,2018-12-31,147.9441,18.3335,2.418,4.4425,48.1653,68.8169,-0.248
5997,TGO,Togo,2019-12-31,151.5553,18.4275,2.406,4.3851,46.7863,66.4965,-0.242
5998,TGO,Togo,2020-12-31,155.223,18.534,2.377,4.3227,45.4896,64.3925,-0.118
5999,TGO,Togo,2021-12-31,158.9415,18.6519,2.358,4.2574,44.2307,62.3662,-0.115
6000,TGO,Togo,2022-12-31,162.6898,18.7787,2.305,4.1956,43.8895,62.4325,-0.226


In [33]:
# Projection period: year-end dates from 2023 to 2070, both bounds included
in_future_period = source_feat_df2["Date"].between("2023-12-31", "2070-12-31")
future_cleaned_feat_df2 = source_feat_df2[in_future_period]

# Preview both ends of the selection
display(future_cleaned_feat_df2.head(), future_cleaned_feat_df2.tail())

Unnamed: 0,ISO3,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
73,BDI,Burundi,2023-12-31,510.1564,15.9924,2.648,4.8784,36.9629,52.205,0.151
74,BDI,Burundi,2024-12-31,523.7633,16.2441,2.617,4.778,36.8812,52.4515,0.147
75,BDI,Burundi,2025-12-31,537.5006,16.5154,2.562,4.6623,36.368,51.7445,0.143
76,BDI,Burundi,2026-12-31,551.3609,16.8012,2.53,4.5732,35.8625,51.0459,0.14
77,BDI,Burundi,2027-12-31,565.434,17.0982,2.511,4.4937,35.3496,50.332,0.136


Unnamed: 0,ISO3,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
6044,TGO,Togo,2066-12-31,360.4891,26.1499,1.296,2.6284,19.8782,27.8772,-0.102
6045,TGO,Togo,2067-12-31,365.1698,26.3371,1.284,2.6186,19.5699,27.4026,-0.101
6046,TGO,Togo,2068-12-31,369.8343,26.5235,1.255,2.5896,19.2565,26.9129,-0.099
6047,TGO,Togo,2069-12-31,374.4571,26.7104,1.23,2.5655,18.9701,26.4599,-0.098
6048,TGO,Togo,2070-12-31,379.0608,26.896,1.214,2.5546,18.6794,25.9988,-0.097


#### Some additional demographic indicators

url: https://data.worldbank.org/

In [34]:
# Load a World Bank indicator export (one row per country and indicator, one
# column per year).
# NOTE(review): header=2 is presumably forwarded to pandas.read_csv, i.e. the
# column names are read from the third row of the file — confirm in explorer.
source_feat_df3 = transformer.load_data(ext="csv", filepath=filepath_feat_3, repo=REPO, header=2, parse_dates=None)
source_feat_df3.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Population ages 15-64 (% of total population),SP.POP.1564.TO.ZS,54.495678,54.588701,54.58563,54.674206,54.873448,55.181477,...,69.620827,69.352325,69.027743,68.681269,68.320569,67.962515,67.712003,67.657094,67.617079,
1,Aruba,ABW,Population ages 0-14 (% of total population),SP.POP.0014.TO.ZS,43.131043,42.949419,42.852732,42.661157,42.359159,41.936664,...,19.045505,18.799607,18.571721,18.334859,18.069771,17.767339,17.351022,16.799407,16.240782,
2,Aruba,ABW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,"Unemployment, male (% of male labor force) (mo...",SL.UEM.TOTL.MA.ZS,,,,,,,...,,,,,,,,,,
4,Aruba,ABW,"Unemployment, female (% of female labor force)...",SL.UEM.TOTL.FE.ZS,,,,,,,...,,,,,,,,,,


In [35]:
# Align the World Bank country names with the spelling used in AFRICAN_COUNTRIES
world_bank_country_names = {
    "Congo, Rep.": "Congo",
    "Congo, Dem. Rep.": "Democratic Republic of Congo",
    "Gambia, The": "Gambia",
}
source_feat_df3 = source_feat_df3.replace({"Country Name": world_bank_country_names})

In [36]:
# Restrict the table to the countries listed in AFRICAN_COUNTRIES
source_feat_df3 = transformer.subset_study_countries(source_feat_df3, "Country Name", countries=AFRICAN_COUNTRIES)

In [37]:
source_feat_df3["Country Name"].nunique()

40

In [38]:
# Year columns from 2000 onwards: the headers are year strings ("1960" ... "2022"),
# so a leading "2" selects the columns "2000" to "2022".
cleaned_feat_df3_needed_columns = [
    column_name
    for column_name in source_feat_df3.columns
    if column_name.startswith("2")
]

In [39]:
# Reshape the selected year columns: the result (see the outputs below) has a
# "Year" column, a "Country Name" column and one column per indicator.
# NOTE(review): the meaning of the list argument (year columns, value label,
# empty string) and of `multiple_index` is defined in
# explorer.DataTransformer.extract_series — confirm there.
cleaned_feat_df3 = transformer.extract_series(
    [cleaned_feat_df3_needed_columns, "Indicator Value", ""],
    source_data=source_feat_df3,
    immutable_columns=["Country Name", "Indicator Name"], 
    multiple_index=True
)

In [40]:
# Columns with 50% or more missing values are not useful.
# The list name is kept as is (spelling included) in case other cells use it.
missing_share_feat_df3 = cleaned_feat_df3.isna().mean()
cokumns_50_more_missing_feat_df3 = [
    column_name
    for column_name, missing_share in missing_share_feat_df3.items()
    if missing_share >= 0.5
]

# Drop those sparse columns from cleaned_feat_df3
cleaned_feat_df3 = cleaned_feat_df3.drop(columns=cokumns_50_more_missing_feat_df3)

In [41]:
cleaned_feat_df3.isna().sum()

Indicator Name
Year                                                                                          0
Country Name                                                                                  0
Children out of school (% of primary school age)                                            457
Compulsory education, duration (years)                                                      171
Government expenditure on education, total (% of GDP)                                       336
Government expenditure on education, total (% of government expenditure)                    297
Gross intake ratio in first grade of primary education, female (% of relevant age group)    401
Gross intake ratio in first grade of primary education, male (% of relevant age group)      401
Gross intake ratio in first grade of primary education, total (% of relevant age group)     401
Labor force, female (% of total labor force)                                                  0
Labor force, total       

In [42]:
# Columns kept from cleaned_feat_df3: the two identifiers and the indicators
# listed below.
# NOTE: `select_columns` is re-assigned here; it held the UN WPP column
# selection in the demographic-indicators section.
select_columns = ["Year", "Country Name", "Population ages 0-14 (% of total population)", "Population ages 15-64 (% of total population)"]

# Selected indicators:
# Population ages 0-14 (% of total population)
# Population ages 15-64 (% of total population)

In [43]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so later edits (renaming, new columns) do not act on a slice and do
# not raise SettingWithCopyWarning.
cleaned_feat_df3 = cleaned_feat_df3[select_columns].copy()
cleaned_feat_df3.head()

Indicator Name,Year,Country Name,Population ages 0-14 (% of total population),Population ages 15-64 (% of total population)
0,2000-12-31,Angola,46.442279,51.131172
1,2000-12-31,Benin,45.178538,51.539678
2,2000-12-31,Burkina Faso,46.574494,50.487924
3,2000-12-31,Burundi,48.198607,49.370484
4,2000-12-31,Cameroon,45.242154,51.437732


In [44]:
# Harmonise column names with the other tables ("Country", "Date").
# A new frame is assigned instead of renaming in place: this frame was obtained
# by column selection, and in-place edits on such a selection can raise
# SettingWithCopyWarning.
cleaned_feat_df3 = cleaned_feat_df3.rename(
    columns={
        "Country Name": "Country",
        "Year": "Date",
    }
)

In [45]:
# Add the ISO3 code derived from the country name. `.assign` returns a new
# frame instead of writing into a frame obtained by column selection
# (SettingWithCopyWarning).
cleaned_feat_df3 = cleaned_feat_df3.assign(
    ISO3=cc.convert(cleaned_feat_df3["Country"], to="ISO3")
)

In [46]:
cleaned_feat_df3.head()

Indicator Name,Date,Country,Population ages 0-14 (% of total population),Population ages 15-64 (% of total population),ISO3
0,2000-12-31,Angola,46.442279,51.131172,AGO
1,2000-12-31,Benin,45.178538,51.539678,BEN
2,2000-12-31,Burkina Faso,46.574494,50.487924,BFA
3,2000-12-31,Burundi,48.198607,49.370484,BDI
4,2000-12-31,Cameroon,45.242154,51.437732,CMR


In [47]:
cleaned_feat_df3.tail()

Indicator Name,Date,Country,Population ages 0-14 (% of total population),Population ages 15-64 (% of total population),ISO3
915,2022-12-31,Tanzania,43.357466,53.539108,TZA
916,2022-12-31,Togo,39.9603,56.904922,TGO
917,2022-12-31,Uganda,44.781386,53.524551,UGA
918,2022-12-31,Zambia,42.856553,55.395954,ZMB
919,2022-12-31,Zimbabwe,40.634003,56.044152,ZWE


#### Some health indicators

url: https://data.worldbank.org/

In [48]:
# Load a World Bank indicator export (one row per country and indicator, one
# column per year).
# NOTE(review): header=2 is presumably forwarded to pandas.read_csv, i.e. the
# column names are read from the third row of the file — confirm in explorer.
source_feat_df4 = transformer.load_data(ext="csv", filepath=filepath_feat_4, repo=REPO, header=2, parse_dates=None)
source_feat_df4.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Unmet need for contraception (% of married wom...,SP.UWT.TFRT,,,,,,,...,,,,,,,,,,
1,Aruba,ABW,Completeness of death registration with cause-...,SP.REG.DTHS.ZS,,,,,,,...,,,,,,,,,,
2,Aruba,ABW,Completeness of birth registration (%),SP.REG.BRTH.ZS,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,"Completeness of birth registration, urban (%)",SP.REG.BRTH.UR.ZS,,,,,,,...,,,,,,,,,,
4,Aruba,ABW,"Completeness of birth registration, rural (%)",SP.REG.BRTH.RU.ZS,,,,,,,...,,,,,,,,,,


In [49]:
# Align the World Bank country names with the spelling used in AFRICAN_COUNTRIES
world_bank_country_names = {
    "Congo, Rep.": "Congo",
    "Congo, Dem. Rep.": "Democratic Republic of Congo",
    "Gambia, The": "Gambia",
}
source_feat_df4 = source_feat_df4.replace({"Country Name": world_bank_country_names})

In [50]:
# Restrict the table to the countries listed in AFRICAN_COUNTRIES
source_feat_df4 = transformer.subset_study_countries(source_feat_df4, "Country Name", countries=AFRICAN_COUNTRIES)

In [51]:
source_feat_df4["Country Name"].nunique()

40

In [52]:
# Year columns from 2000 onwards: the headers are year strings ("1960" ... "2022"),
# so a leading "2" selects the columns "2000" to "2022".
cleaned_feat_df4_needed_columns = [
    column_name
    for column_name in source_feat_df4.columns
    if column_name.startswith("2")
]

In [53]:
# Reshape the selected year columns: the result (see the outputs below) has a
# "Year" column, a "Country Name" column and one column per indicator.
# NOTE(review): the meaning of the list argument (year columns, value label,
# empty string) and of `multiple_index` is defined in
# explorer.DataTransformer.extract_series — confirm there.
cleaned_feat_df4 = transformer.extract_series(
    [cleaned_feat_df4_needed_columns, "Indicator Value", ""],
    source_data=source_feat_df4,
    immutable_columns=["Country Name", "Indicator Name"], 
    multiple_index=True
)

In [54]:
# Columns with 50% or more missing values are not useful.
# The list name is kept as is (spelling included) in case other cells use it.
missing_share_feat_df4 = cleaned_feat_df4.isna().mean()
cokumns_50_more_missing_feat_df4 = [
    column_name
    for column_name, missing_share in missing_share_feat_df4.items()
    if missing_share >= 0.5
]

# Drop those sparse columns from cleaned_feat_df4
cleaned_feat_df4 = cleaned_feat_df4.drop(columns=cokumns_50_more_missing_feat_df4)

In [55]:
cleaned_feat_df4.isna().sum()

Indicator Name
Year                                                                          0
Country Name                                                                  0
Adolescent fertility rate (births per 1,000 women ages 15-19)                40
Adults (ages 15+) and children (ages 0-14) newly infected with HIV          106
Adults (ages 15-49) newly infected with HIV                                 106
                                                                           ... 
Tuberculosis case detection rate (%, all forms)                              77
Tuberculosis treatment success rate (% of new cases)                        131
Vitamin A supplementation coverage rate (% of children ages 6-59 months)    212
Women's share of population ages 15+ living with HIV (%)                     40
Young people (ages 15-24) newly infected with HIV                           106
Length: 166, dtype: int64

In [56]:
# Columns kept from cleaned_feat_df4: the two identifiers and the indicators
# described below. NOTE: `select_columns` is re-assigned here; earlier sections
# use the same name for other column selections.
select_columns = ["Year", "Country Name", "Domestic general government health expenditure (% of general government expenditure)", "External health expenditure (% of current health expenditure)", "People using at least basic sanitation services, rural (% of rural population)", "People using safely managed sanitation services, rural (% of rural population)"]

# Domestic general government health expenditure (% of general government expenditure): Public expenditure on health from domestic sources as a share of total public expenditure. It indicates the priority of the government to spend on health from own domestic public resources.
# External health expenditure (% of current health expenditure): Share of current health expenditures funded from external sources. External sources compose of direct foreign transfers and foreign transfers distributed by government encompassing all financial inflows into the national health system from outside the country. External sources either flow through the government scheme or are channeled through non-governmental organizations or other schemes.
# People using at least basic sanitation services, rural (% of rural population)
# People using safely managed sanitation services, rural (% of rural population)

In [57]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so later edits (renaming, new columns) do not act on a slice and do
# not raise SettingWithCopyWarning.
cleaned_feat_df4 = cleaned_feat_df4[select_columns].copy()
cleaned_feat_df4.head()

Indicator Name,Year,Country Name,Domestic general government health expenditure (% of general government expenditure),External health expenditure (% of current health expenditure),"People using at least basic sanitation services, rural (% of rural population)","People using safely managed sanitation services, rural (% of rural population)"
0,2000-12-31,Angola,2.738583,0.513654,7.531022,
1,2000-12-31,Benin,5.159786,16.741316,2.223545,0.444574
2,2000-12-31,Burkina Faso,4.848369,21.297304,2.024615,0.926887
3,2000-12-31,Burundi,5.94454,0.709387,45.633269,
4,2000-12-31,Cameroon,4.426269,2.665856,24.526613,


In [58]:
# Harmonise column names with the other tables ("Country", "Date").
# A new frame is assigned instead of renaming in place: this frame was obtained
# by column selection, and in-place edits on such a selection can raise
# SettingWithCopyWarning.
cleaned_feat_df4 = cleaned_feat_df4.rename(
    columns={
        "Country Name": "Country",
        "Year": "Date",
    }
)

In [59]:
# Add the ISO3 code derived from the country name. `.assign` returns a new
# frame instead of writing into a frame obtained by column selection
# (SettingWithCopyWarning).
cleaned_feat_df4 = cleaned_feat_df4.assign(
    ISO3=cc.convert(cleaned_feat_df4["Country"], to="ISO3")
)

In [60]:
cleaned_feat_df4.head()

Indicator Name,Date,Country,Domestic general government health expenditure (% of general government expenditure),External health expenditure (% of current health expenditure),"People using at least basic sanitation services, rural (% of rural population)","People using safely managed sanitation services, rural (% of rural population)",ISO3
0,2000-12-31,Angola,2.738583,0.513654,7.531022,,AGO
1,2000-12-31,Benin,5.159786,16.741316,2.223545,0.444574,BEN
2,2000-12-31,Burkina Faso,4.848369,21.297304,2.024615,0.926887,BFA
3,2000-12-31,Burundi,5.94454,0.709387,45.633269,,BDI
4,2000-12-31,Cameroon,4.426269,2.665856,24.526613,,CMR


In [61]:
cleaned_feat_df4.tail()

Indicator Name,Date,Country,Domestic general government health expenditure (% of general government expenditure),External health expenditure (% of current health expenditure),"People using at least basic sanitation services, rural (% of rural population)","People using safely managed sanitation services, rural (% of rural population)",ISO3
915,2022-12-31,Tanzania,,,21.260903,19.764468,TZA
916,2022-12-31,Togo,,,9.078689,4.681563,TGO
917,2022-12-31,Uganda,,,17.854375,16.177081,UGA
918,2022-12-31,Zambia,,,31.526604,30.502608,ZMB
919,2022-12-31,Zimbabwe,,,31.833031,30.400445,ZWE


#### Some indicators related to poverty

In [62]:
# Load a World Bank indicator export (one row per country and indicator, one
# column per year).
# NOTE(review): header=2 is presumably forwarded to pandas.read_csv, i.e. the
# column names are read from the third row of the file — confirm in explorer.
source_feat_df5 = transformer.load_data(ext="csv", filepath=filepath_feat_5, repo=REPO, header=2, parse_dates=None)
source_feat_df5.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Annualized average growth rate in per capita r...,SI.SPR.PCAP.ZG,,,,,,,...,,,,,,,,,,
1,Aruba,ABW,"Survey mean consumption or income per capita, ...",SI.SPR.PCAP,,,,,,,...,,,,,,,,,,
2,Aruba,ABW,Annualized average growth rate in per capita r...,SI.SPR.PC40.ZG,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,"Survey mean consumption or income per capita, ...",SI.SPR.PC40,,,,,,,...,,,,,,,,,,
4,Aruba,ABW,Poverty gap at $6.85 a day (2017 PPP) (%),SI.POV.UMIC.GP,,,,,,,...,,,,,,,,,,


In [63]:
# Align the World Bank country names with the spelling used in AFRICAN_COUNTRIES
world_bank_country_names = {
    "Congo, Rep.": "Congo",
    "Congo, Dem. Rep.": "Democratic Republic of Congo",
    "Gambia, The": "Gambia",
}
source_feat_df5 = source_feat_df5.replace({"Country Name": world_bank_country_names})

In [64]:
# Restrict the table to the countries listed in AFRICAN_COUNTRIES
source_feat_df5 = transformer.subset_study_countries(source_feat_df5, "Country Name", countries=AFRICAN_COUNTRIES)

In [65]:
source_feat_df5["Country Name"].nunique()

40

In [66]:
# Year columns from 2000 onwards: the headers are year strings ("1960" ... "2022"),
# so a leading "2" selects the columns "2000" to "2022".
cleaned_feat_df5_needed_columns = [
    column_name
    for column_name in source_feat_df5.columns
    if column_name.startswith("2")
]

In [67]:
# Reshape the selected year columns: the result (see the outputs below) has a
# "Year" column, a "Country Name" column and one column per indicator.
# NOTE(review): the meaning of the list argument (year columns, value label,
# empty string) and of `multiple_index` is defined in
# explorer.DataTransformer.extract_series — confirm there.
cleaned_feat_df5 = transformer.extract_series(
    [cleaned_feat_df5_needed_columns, "Indicator Value", ""],
    source_data=source_feat_df5,
    immutable_columns=["Country Name", "Indicator Name"], 
    multiple_index=True
)

In [68]:
# Columns with 50% or more missing values are not useful.
# The list name is kept as is (spelling included) in case other cells use it.
missing_share_feat_df5 = cleaned_feat_df5.isna().mean()
cokumns_50_more_missing_feat_df5 = [
    column_name
    for column_name, missing_share in missing_share_feat_df5.items()
    if missing_share >= 0.5
]

# Drop those sparse columns from cleaned_feat_df5
cleaned_feat_df5 = cleaned_feat_df5.drop(columns=cokumns_50_more_missing_feat_df5)

In [69]:
cleaned_feat_df5.isna().sum()

Indicator Name
Year                                                   0
Country Name                                           0
Population living in slums (% of urban population)    77
dtype: int64

In [70]:
# Columns kept from cleaned_feat_df5: the two identifiers and the only
# indicator left after dropping the sparse columns (see the isna() output).
# NOTE: `select_columns` is re-assigned here; earlier sections use the same
# name for other column selections.
select_columns = ["Year", "Country Name", "Population living in slums (% of urban population)"]

# Selected indicator:
# Population living in slums (% of urban population)

In [71]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so later edits (renaming, new columns) do not act on a slice and do
# not raise SettingWithCopyWarning.
cleaned_feat_df5 = cleaned_feat_df5[select_columns].copy()
cleaned_feat_df5.head()

Indicator Name,Year,Country Name,Population living in slums (% of urban population)
0,2000-12-31,Angola,19.7
1,2000-12-31,Benin,71.86569
2,2000-12-31,Burkina Faso,82.21152
3,2000-12-31,Burundi,79.7
4,2000-12-31,Cameroon,65.3638


In [72]:
# Harmonise column names with the other tables ("Country", "Date").
# A new frame is assigned instead of renaming in place: this frame was obtained
# by column selection, and in-place edits on such a selection can raise
# SettingWithCopyWarning.
cleaned_feat_df5 = cleaned_feat_df5.rename(
    columns={
        "Country Name": "Country",
        "Year": "Date",
    }
)

In [73]:
# Add the ISO3 code derived from the country name. `.assign` returns a new
# frame instead of writing into a frame obtained by column selection
# (SettingWithCopyWarning).
cleaned_feat_df5 = cleaned_feat_df5.assign(
    ISO3=cc.convert(cleaned_feat_df5["Country"], to="ISO3")
)

In [74]:
cleaned_feat_df5.head()

Indicator Name,Date,Country,Population living in slums (% of urban population),ISO3
0,2000-12-31,Angola,19.7,AGO
1,2000-12-31,Benin,71.86569,BEN
2,2000-12-31,Burkina Faso,82.21152,BFA
3,2000-12-31,Burundi,79.7,BDI
4,2000-12-31,Cameroon,65.3638,CMR


In [75]:
cleaned_feat_df5.tail()

Indicator Name,Date,Country,Population living in slums (% of urban population),ISO3
453,2021-12-31,Central African Republic,,CAF
454,2021-12-31,Kenya,,KEN
455,2021-12-31,Mali,,MLI
456,2021-12-31,Nigeria,,NGA
457,2022-12-31,Somalia,,SOM


#### Other indicators

In [76]:
# Load a World Bank indicator export (one row per country and indicator, one
# column per year).
# NOTE(review): header=2 is presumably forwarded to pandas.read_csv, i.e. the
# column names are read from the third row of the file — confirm in explorer.
source_feat_df6 = transformer.load_data(ext="csv", filepath=filepath_feat_6, repo=REPO, header=2, parse_dates=None)
source_feat_df6.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Urban population (% of total population),SP.URB.TOTL.IN.ZS,50.776,50.761,50.746,50.73,50.715,50.7,...,43.041,43.108,43.192,43.293,43.411,43.546,43.697,43.866,44.052,
1,Aruba,ABW,Urban population,SP.URB.TOTL,27728.0,28330.0,28764.0,29157.0,29505.0,29802.0,...,44588.0,44943.0,45297.0,45648.0,45999.0,46351.0,46574.0,46734.0,46891.0,
2,Aruba,ABW,Urban population growth (annual %),SP.URB.GROW,,2.147858,1.520329,1.357042,1.186472,1.001576,...,0.810669,0.793026,0.784578,0.771899,0.765986,0.762321,0.479958,0.342951,0.335381,
3,Aruba,ABW,"Population, total",SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,
4,Aruba,ABW,Population growth (annual %),SP.POP.GROW,,2.179059,1.548572,1.389337,1.215721,1.032841,...,0.691615,0.637959,0.590062,0.537296,0.494795,0.45197,0.134255,-0.045045,-0.086392,


In [77]:
# Replace the source's country labels for the two Congos and Gambia with the
# spellings used elsewhere in this notebook.
source_feat_df6 = source_feat_df6.replace(
    {
        "Country Name": {
            "Congo, Rep.": "Congo",
            "Congo, Dem. Rep.": "Democratic Republic of Congo",
            "Gambia, The": "Gambia",
        }
    }
)

In [78]:
# Restrict the table to the countries listed in AFRICAN_COUNTRIES, matched on
# the "Country Name" column (filtering is delegated to the project helper).
source_feat_df6 = transformer.subset_study_countries(source_feat_df6, "Country Name", countries=AFRICAN_COUNTRIES)

In [79]:
source_feat_df6["Country Name"].nunique()

40

In [80]:
# Collect every column whose label starts with "2" (the year columns 2000 onwards).
cleaned_feat_df6_needed_columns = list(
    filter(lambda label: label.startswith("2"), source_feat_df6.columns)
)

In [81]:
# Build the cleaned table from the wide source frame with the project helper,
# using the year columns collected above and keeping "Country Name" and
# "Indicator Name" as identifier columns. The cells that follow show the result
# has "Year", "Country Name" and one column per indicator.
# NOTE(review): the meaning of the positional list
# ([year columns, "Indicator Value", ""]) and of multiple_index=True is defined
# in the explorer module — confirm there.
cleaned_feat_df6 = transformer.extract_series(
    [cleaned_feat_df6_needed_columns, "Indicator Value", ""],
    source_data=source_feat_df6,
    immutable_columns=["Country Name", "Indicator Name"], 
    multiple_index=True
)

In [82]:
# Columns with 50% or more missing values are not useful:
# isna().mean() gives the share of missing rows per column.
cokumns_50_more_missing_feat_df6 = cleaned_feat_df6.columns[
    (cleaned_feat_df6.isna().mean() >= 0.5).to_numpy()
].tolist()

# Remove those sparse columns from cleaned_feat_df6.
cleaned_feat_df6 = cleaned_feat_df6.drop(columns=cokumns_50_more_missing_feat_df6)

In [83]:
cleaned_feat_df6.isna().sum()

Indicator Name
Year                                                                                           0
Country Name                                                                                   0
Access to electricity (% of population)                                                       56
Agricultural land (% of land area)                                                            52
Agricultural land (sq. km)                                                                    52
Agriculture, forestry, and fishing, value added (% of GDP)                                    87
Annual freshwater withdrawals, total (% of internal resources)                               106
Annual freshwater withdrawals, total (billion cubic meters)                                  106
Arable land (% of land area)                                                                  52
Average precipitation in depth (mm per year)                                                 104
CO2 emissions (

In [84]:
# Key columns plus the indicators retained from this table.
select_columns = [
    "Year",
    "Country Name",
    "Average precipitation in depth (mm per year)",
    "Foreign direct investment, net inflows (% of GDP)",
    "Mortality rate, under-5 (per 1,000 live births)",
    "Population growth (annual %)",
    "Population in urban agglomerations of more than 1 million (% of total population)",
    "Urban population (% of total population)",
    "Urban population growth (annual %)",
]


In [85]:
# Restrict the frame to the selected columns, then preview the first rows.
cleaned_feat_df6 = cleaned_feat_df6.loc[:, select_columns]
cleaned_feat_df6.head()

Indicator Name,Year,Country Name,Average precipitation in depth (mm per year),"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %)
0,2000-12-31,Angola,1010.0,9.623866,205.1,3.244121,17.25515,50.087,5.64867
1,2000-12-31,Benin,1039.0,-0.363491,136.8,3.038457,,38.333,3.871488
2,2000-12-31,Burkina Faso,748.0,0.778458,178.7,2.983886,7.749025,17.844,6.857565
3,2000-12-31,Burundi,1218.0,1.342152,154.6,2.041721,,8.246,4.621538
4,2000-12-31,Cameroon,1604.0,1.511445,144.4,2.636027,18.827103,45.542,3.957808


In [86]:
# Rename the key columns to "Country" and "Date", the names used as merge keys
# when the final dataset is assembled later in this notebook.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate an object that earlier cells already displayed.
cleaned_feat_df6 = cleaned_feat_df6.rename(
    columns={"Country Name": "Country", "Year": "Date"}
)

In [87]:
# Add an "ISO3" column derived from the "Country" column with country_converter.
cleaned_feat_df6 = cleaned_feat_df6.assign(
    ISO3=cc.convert(cleaned_feat_df6["Country"], to="ISO3")
)

In [88]:
cleaned_feat_df6.head()

Indicator Name,Date,Country,Average precipitation in depth (mm per year),"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),ISO3
0,2000-12-31,Angola,1010.0,9.623866,205.1,3.244121,17.25515,50.087,5.64867,AGO
1,2000-12-31,Benin,1039.0,-0.363491,136.8,3.038457,,38.333,3.871488,BEN
2,2000-12-31,Burkina Faso,748.0,0.778458,178.7,2.983886,7.749025,17.844,6.857565,BFA
3,2000-12-31,Burundi,1218.0,1.342152,154.6,2.041721,,8.246,4.621538,BDI
4,2000-12-31,Cameroon,1604.0,1.511445,144.4,2.636027,18.827103,45.542,3.957808,CMR


In [89]:
cleaned_feat_df6.tail()

Indicator Name,Date,Country,Average precipitation in depth (mm per year),"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),ISO3
915,2022-12-31,Tanzania,,1.468065,,2.958573,13.206764,36.682,4.963155,TZA
916,2022-12-31,Togo,,-2.792602,,2.33091,21.760453,43.921,3.621041,TGO
917,2022-12-31,Uganda,,3.349903,,2.998628,7.728997,26.159,5.342485,UGA
918,2022-12-31,Zambia,,0.036292,,2.758032,15.195516,45.761,4.009238,ZMB
919,2022-12-31,Zimbabwe,,1.651509,,2.024036,9.544661,32.395,2.308437,ZWE


##### Rural population

In [90]:
# Load the raw rural-population export (wide layout: one column per year) and preview it.
# NOTE(review): header=2 is forwarded to the project loader; presumably the
# index of the header row in the CSV — confirm in DataTransformer.load_data.
source_feat_df7 = transformer.load_data(
    ext="csv",
    filepath=filepath_feat_7,
    repo=REPO,
    header=2,
    parse_dates=None,
)
source_feat_df7.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Rural population,SP.RUR.TOTL,26880.0,27481.0,27918.0,28318.0,28673.0,28980.0,...,59006.0,59314.0,59577.0,59791.0,59963.0,60091.0,60011.0,59803.0,59554.0,
1,Africa Eastern and Southern,AFE,Rural population,SP.RUR.TOTL,111658758.0,114296883.0,117064799.0,119893122.0,122832934.0,125871499.0,...,385577648.0,393452021.0,401294164.0,409014341.0,416834821.0,424791846.0,432797498.0,440600045.0,448184780.0,
2,Afghanistan,AFG,Rural population,SP.RUR.TOTL,7898093.0,8026804.0,8163985.0,8308019.0,8458694.0,8617815.0,...,24672275.0,25381619.0,25970228.0,26643455.0,27333488.0,28042342.0,28829317.0,29547690.0,30181937.0,
3,Africa Western and Central,AFW,Rural population,SP.RUR.TOTL,82954373.0,84327418.0,85737206.0,87186879.0,88657553.0,90158852.0,...,222082250.0,225573122.0,229093774.0,232644696.0,236154623.0,239621856.0,243081107.0,246481855.0,249836404.0,
4,Angola,AGO,Rural population,SP.RUR.TOTL,4798172.0,4853778.0,4902782.0,4948903.0,4989125.0,5019280.0,...,10110460.0,10281807.0,10452268.0,10621656.0,10784991.0,10942954.0,11089900.0,11227528.0,11359649.0,


In [91]:
# Replace the source's country labels for the two Congos and Gambia with the
# spellings used elsewhere in this notebook.
source_feat_df7 = source_feat_df7.replace(
    {
        "Country Name": {
            "Congo, Rep.": "Congo",
            "Congo, Dem. Rep.": "Democratic Republic of Congo",
            "Gambia, The": "Gambia",
        }
    }
)

In [92]:
# Restrict the table to the countries listed in AFRICAN_COUNTRIES, matched on
# the "Country Name" column (filtering is delegated to the project helper).
source_feat_df7 = transformer.subset_study_countries(source_feat_df7, "Country Name", countries=AFRICAN_COUNTRIES)

In [93]:
source_feat_df7["Country Name"].nunique()

40

In [94]:
# Collect every column whose label starts with "2" (the year columns 2000 onwards).
cleaned_feat_df7_needed_columns = list(
    filter(lambda label: label.startswith("2"), source_feat_df7.columns)
)

In [95]:
# Build the cleaned table from the wide source frame with the project helper,
# using the year columns collected above and keeping "Country Name" and
# "Indicator Name" as identifier columns. The cells that follow show the result
# has "Year", "Country Name" and a single "Rural population" column.
# NOTE(review): the meaning of the positional list
# ([year columns, "Indicator Value", ""]) and of multiple_index=True is defined
# in the explorer module — confirm there.
cleaned_feat_df7 = transformer.extract_series(
    [cleaned_feat_df7_needed_columns, "Indicator Value", ""],
    source_data=source_feat_df7,
    immutable_columns=["Country Name", "Indicator Name"], 
    multiple_index=True
)

In [96]:
cleaned_feat_df7.isna().sum()

Indicator Name
Year                0
Country Name        0
Rural population    0
dtype: int64

In [97]:
# Rename the key columns to "Country" and "Date", the names used as merge keys
# when the final dataset is assembled later in this notebook.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate an object that earlier cells already displayed.
cleaned_feat_df7 = cleaned_feat_df7.rename(
    columns={"Country Name": "Country", "Year": "Date"}
)

In [98]:
# Add an "ISO3" column derived from the "Country" column with country_converter.
cleaned_feat_df7 = cleaned_feat_df7.assign(
    ISO3=cc.convert(cleaned_feat_df7["Country"], to="ISO3")
)

In [99]:
cleaned_feat_df7.head()

Indicator Name,Date,Country,Rural population,ISO3
0,2000-12-31,Angola,8182768.0,AGO
1,2000-12-31,Benin,4315471.0,BEN
2,2000-12-31,Burkina Faso,9762505.0,BFA
3,2000-12-31,Burundi,5787529.0,BDI
4,2000-12-31,Cameroon,8218580.0,CMR


In [100]:
cleaned_feat_df7.tail()

Indicator Name,Date,Country,Rural population,ISO3
915,2022-12-31,Tanzania,41471864.0,TZA
916,2022-12-31,Togo,4962262.0,TGO
917,2022-12-31,Uganda,34889566.0,UGA
918,2022-12-31,Zambia,10857387.0,ZMB
919,2022-12-31,Zimbabwe,11033499.0,ZWE


### Climatic factors


In [101]:
# Climate variable codes to request; they are renamed further down to
# Precipitation ("pr"), Average Mean Surface Air Temperature ("tas") and
# Average Minimum Surface Air Temperature ("tasmin").
variables = ["pr", "tas", "tasmin"]
# Distinct ISO3 codes present in the cleaned targets table.
countries = cleaned_targets["ISO3"].unique().tolist()

In [102]:
# Build the request URL for the selected climate variables and countries
# (the URL format is defined in DataTransformer.get_url).
url = transformer.get_url(variables, countries)

In [103]:
# Download the climate data as JSON.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# fail later (or be silently parsed) in .json().
response = requests.get(url, timeout=60)
response.raise_for_status()
response_data = response.json()

In [104]:
# Extract one table per climate variable from the downloaded payload,
# in the order pr, tas, tasmin.
cleaned_feat_df8_pr, cleaned_feat_df8_tas, cleaned_feat_df8_tasmin = [
    transformer.get_climate_data(response_data, variable_code)
    for variable_code in ("pr", "tas", "tasmin")
]

In [105]:
# Combine the three climate tables on (ISO3, Date); outer joins keep
# country-years that are present in only some of the tables.
cleaned_feat_df8 = cleaned_feat_df8_pr.merge(
    cleaned_feat_df8_tas, on=["ISO3", "Date"], how="outer"
)
cleaned_feat_df8 = cleaned_feat_df8.merge(
    cleaned_feat_df8_tasmin, on=["ISO3", "Date"], how="outer"
)

In [106]:
# Give the climate variable codes descriptive column names.
cleaned_feat_df8 = cleaned_feat_df8.rename(
    columns={
        "pr": "Precipitation",
        "tas": "Average Mean Surface Air Temperature",
        "tasmin": "Average Minimum Surface Air Temperature",
    }
)

In [107]:
# Add a "Country" column with the short country name for each ISO3 code.
cleaned_feat_df8 = cleaned_feat_df8.assign(
    Country=cc.convert(cleaned_feat_df8["ISO3"], to="name_short")
)

In [108]:
# Map country_converter's short names for the two Congos to the spellings
# used by the other tables, so the "Country" merge key matches.
cleaned_feat_df8 = cleaned_feat_df8.replace(
    {
        "Country": {
            "DR Congo": "Democratic Republic of Congo",
            "Congo Republic": "Congo",
        }
    }
)

In [109]:
cleaned_feat_df8

Unnamed: 0,ISO3,Date,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature,Country
0,AGO,2000-12-31,1053.39,21.72,14.94,Angola
1,AGO,2001-12-31,1043.69,21.90,15.10,Angola
2,AGO,2002-12-31,1076.96,21.98,15.19,Angola
3,AGO,2003-12-31,1022.52,22.08,15.25,Angola
4,AGO,2004-12-31,1059.90,21.81,15.02,Angola
...,...,...,...,...,...,...
915,ZWE,2018-12-31,635.43,22.09,15.30,Zimbabwe
916,ZWE,2019-12-31,732.28,22.37,15.40,Zimbabwe
917,ZWE,2020-12-31,684.29,21.93,15.25,Zimbabwe
918,ZWE,2021-12-31,634.60,21.87,15.15,Zimbabwe


### Saving source project dataset

In [110]:
# Merge the dataset: start from the targets and outer-join every cleaned
# feature table in turn on the shared (Country, ISO3, Date) key.
merge_keys = ["Country", "ISO3", "Date"]
dataset = cleaned_targets
for feature_table in (
    cleaned_feat_df1,
    present_cleaned_feat_df2,
    cleaned_feat_df3,
    cleaned_feat_df4,
    cleaned_feat_df5,
    cleaned_feat_df6,
    cleaned_feat_df7,
    cleaned_feat_df8,
):
    dataset = dataset.merge(feature_table, on=merge_keys, how="outer")

In [111]:
dataset.head()

Unnamed: 0,Country,ISO3,Date,Malaria_Incidence,Malaria_Deaths_U5,Malaria_Deaths,ITN_Access,PopDensity,MedianAgePop,PopGrowthRate,...,"Foreign direct investment, net inflows (% of GDP)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),Urban population (% of total population),Urban population growth (annual %),Rural population,Precipitation,Average Mean Surface Air Temperature,Average Minimum Surface Air Temperature
0,Angola,AGO,2000-12-31,325.7036,283.27,74.62,2.82,13.15,15.5919,3.259,...,9.623866,205.1,3.244121,17.25515,50.087,5.64867,8182768.0,1053.39,21.72,14.94
1,Angola,AGO,2001-12-31,326.6507,304.97,80.3,3.67,13.5891,15.6426,3.311,...,24.009075,198.9,3.285217,17.778222,51.274,5.627442,8254958.0,1043.69,21.9,15.1
2,Angola,AGO,2002-12-31,309.12094,317.88,83.93,4.72,14.05,15.6964,3.359,...,11.406192,191.9,3.335132,18.309589,52.461,5.623762,8326997.0,1076.96,21.98,15.19
3,Angola,AGO,2003-12-31,313.731,336.87,89.76,5.3,14.5379,15.753,3.466,...,20.081014,184.2,3.413321,18.842102,53.645,5.645138,8401539.0,1022.52,22.08,15.25
4,Angola,AGO,2004-12-31,313.73257,369.5,98.39,4.49,15.0566,15.807,3.545,...,9.329239,175.5,3.506389,19.373746,54.827,5.685845,8479480.0,1059.9,21.81,15.02


In [112]:
dataset.ISO3.unique()

array(['AGO', 'BEN', 'BFA', 'BDI', 'CMR', 'CAF', 'TCD', 'COM', 'COG',
       'CIV', 'COD', 'DJI', 'GNQ', 'ERI', 'ETH', 'GAB', 'GMB', 'GHA',
       'GIN', 'GNB', 'KEN', 'LBR', 'MDG', 'MWI', 'MLI', 'MRT', 'MOZ',
       'NER', 'NGA', 'RWA', 'SEN', 'SLE', 'SOM', 'SSD', 'SDN', 'TZA',
       'TGO', 'UGA', 'ZMB', 'ZWE'], dtype=object)

In [113]:
dataset.columns

Index(['Country', 'ISO3', 'Date', 'Malaria_Incidence', 'Malaria_Deaths_U5',
       'Malaria_Deaths', 'ITN_Access', 'PopDensity', 'MedianAgePop',
       'PopGrowthRate', 'TFR', 'IMR', 'Q5', 'CNMR',
       'Population ages 0-14 (% of total population)',
       'Population ages 15-64 (% of total population)',
       'Domestic general government health expenditure (% of general government expenditure)',
       'External health expenditure (% of current health expenditure)',
       'People using at least basic sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'Population living in slums (% of urban population)',
       'Average precipitation in depth (mm per year)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 million (% of to

In [114]:
# Persist the merged project dataset, without writing the row index.
dataset.to_csv(
    path_or_buf="../data/cleaned/final/project_data.csv",
    index=False,
)

### Projections

#### Demographic factors

In [115]:
future_cleaned_feat_df2

Unnamed: 0,ISO3,Country,Date,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
73,BDI,Burundi,2023-12-31,510.1564,15.9924,2.648,4.8784,36.9629,52.2050,0.151
74,BDI,Burundi,2024-12-31,523.7633,16.2441,2.617,4.7780,36.8812,52.4515,0.147
75,BDI,Burundi,2025-12-31,537.5006,16.5154,2.562,4.6623,36.3680,51.7445,0.143
76,BDI,Burundi,2026-12-31,551.3609,16.8012,2.530,4.5732,35.8625,51.0459,0.140
77,BDI,Burundi,2027-12-31,565.4340,17.0982,2.511,4.4937,35.3496,50.3320,0.136
...,...,...,...,...,...,...,...,...,...,...
6044,TGO,Togo,2066-12-31,360.4891,26.1499,1.296,2.6284,19.8782,27.8772,-0.102
6045,TGO,Togo,2067-12-31,365.1698,26.3371,1.284,2.6186,19.5699,27.4026,-0.101
6046,TGO,Togo,2068-12-31,369.8343,26.5235,1.255,2.5896,19.2565,26.9129,-0.099
6047,TGO,Togo,2069-12-31,374.4571,26.7104,1.230,2.5655,18.9701,26.4599,-0.098


In [116]:
# Rename "ISO3_code" to "ISO3" if that column is present.
# NOTE(review): the preview of this frame in the preceding cell already shows
# an "ISO3" column and no "ISO3_code", so this rename looks like a no-op — confirm
# and remove if so.
future_cleaned_feat_df2 = future_cleaned_feat_df2.rename(columns={"ISO3_code": "ISO3"})

In [117]:
# ISO3 codes of the five countries for which projections are assembled below:
# Nigeria, Ghana, Kenya, Malawi and Burkina Faso.
study_countries = [
    "NGA",
    "GHA",
    "KEN",
    "MWI",
    "BFA",
]

In [118]:
# Restrict the demographic projections to the five study countries, matched on
# the "ISO3" column (filtering is delegated to the project helper).
future_cleaned_feat_df2 = transformer.subset_study_countries(future_cleaned_feat_df2, "ISO3", countries=study_countries)

#### Climatic factors

##### Nigeria

url = https://climateknowledgeportal.worldbank.org/country/nigeria/climate-data-projections

In [119]:
# Projection CSVs for Nigeria, one per climate variable:
# _1 precipitation, _2 average mean surface air temperature,
# _3 average minimum surface air temperature.
filepath_projection_nga_1 = "../data/source/final/projections/nigeria/projected-precipitation_br__nigeria;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_nga_2 = "../data/source/final/projections/nigeria/projected-average-mean-surface-air-temperature_br__nigeria;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_nga_3 = "../data/source/final/projections/nigeria/projected-average-minimum-surface-air-temperature_br__nigeria;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"

In [120]:
# Load the three Nigeria projection files (precipitation, mean temperature,
# minimum temperature) with identical loader options.
nga_projections_df1, nga_projections_df2, nga_projections_df3 = [
    transformer.load_data(ext="csv", filepath=csv_path, repo=REPO, header=0, parse_dates=None)
    for csv_path in (
        filepath_projection_nga_1,
        filepath_projection_nga_2,
        filepath_projection_nga_3,
    )
]

In [121]:
# Keep only the rows whose "Category" (year) lies in 2023..2070, bounds included.
nga_projections_df1 = nga_projections_df1[nga_projections_df1["Category"].between(2023, 2070)]
nga_projections_df2 = nga_projections_df2[nga_projections_df2["Category"].between(2023, 2070)]
nga_projections_df3 = nga_projections_df3[nga_projections_df3["Category"].between(2023, 2070)]

In [122]:
# Keep the year column and the SSP2-4.5 scenario column only.
nga_projections_df1 = nga_projections_df1.loc[:, ["Category", "SSP2-4.5"]]
nga_projections_df2 = nga_projections_df2.loc[:, ["Category", "SSP2-4.5"]]
nga_projections_df3 = nga_projections_df3.loc[:, ["Category", "SSP2-4.5"]]

In [123]:
# Name the year column "Date" and label each scenario column with its variable.
nga_projections_df1 = nga_projections_df1.rename(
    columns={"Category": "Date", "SSP2-4.5": "Precipitation_SSP2-4.5"}
)
nga_projections_df2 = nga_projections_df2.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Mean Surface Air Temperature_SSP2-4.5"}
)
nga_projections_df3 = nga_projections_df3.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Minimum Surface Air Temperature_SSP2-4.5"}
)

In [124]:
# Combine the three Nigeria tables on "Date" with outer joins.
nga_projections = nga_projections_df1.merge(nga_projections_df2, on="Date", how="outer")
nga_projections = nga_projections.merge(nga_projections_df3, on="Date", how="outer")

In [125]:
# Tag every row with Nigeria's ISO3 code.
nga_projections = nga_projections.assign(ISO3="NGA")

In [126]:
# Convert the "Date" column to datetime format via the project helper
# (the preview that follows shows year-end dates such as 2023-12-31).
nga_projections = transformer.convert_to_dateformat(nga_projections, "Date")

In [127]:
nga_projections.head()

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3
0,2023-12-31,1125.18,27.81,22.67,NGA
1,2024-12-31,1145.81,28.04,22.89,NGA
2,2025-12-31,1146.02,27.92,22.78,NGA
3,2026-12-31,1171.1,27.97,22.8,NGA
4,2027-12-31,1140.81,28.01,22.82,NGA


##### Ghana

url = https://climateknowledgeportal.worldbank.org/country/ghana/climate-data-projections

In [128]:
# Projection CSVs for Ghana, one per climate variable:
# _1 precipitation, _2 average mean surface air temperature,
# _3 average minimum surface air temperature.
filepath_projection_gha_1 = "../data/source/final/projections/ghana/projected-precipitation_br__ghana;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_gha_2 = "../data/source/final/projections/ghana/projected-average-mean-surface-air-temperature_br__ghana;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_gha_3 = "../data/source/final/projections/ghana/projected-average-minimum-surface-air-temperature_br__ghana;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"

In [129]:
# Load the three Ghana projection files (precipitation, mean temperature,
# minimum temperature) with identical loader options.
gha_projections_df1, gha_projections_df2, gha_projections_df3 = [
    transformer.load_data(ext="csv", filepath=csv_path, repo=REPO, header=0, parse_dates=None)
    for csv_path in (
        filepath_projection_gha_1,
        filepath_projection_gha_2,
        filepath_projection_gha_3,
    )
]

In [130]:
# Keep only the rows whose "Category" (year) lies in 2023..2070, bounds included.
gha_projections_df1 = gha_projections_df1[gha_projections_df1["Category"].between(2023, 2070)]
gha_projections_df2 = gha_projections_df2[gha_projections_df2["Category"].between(2023, 2070)]
gha_projections_df3 = gha_projections_df3[gha_projections_df3["Category"].between(2023, 2070)]

In [131]:
# Keep the year column and the SSP2-4.5 scenario column only.
gha_projections_df1 = gha_projections_df1.loc[:, ["Category", "SSP2-4.5"]]
gha_projections_df2 = gha_projections_df2.loc[:, ["Category", "SSP2-4.5"]]
gha_projections_df3 = gha_projections_df3.loc[:, ["Category", "SSP2-4.5"]]

In [132]:
# Name the year column "Date" and label each scenario column with its variable.
gha_projections_df1 = gha_projections_df1.rename(
    columns={"Category": "Date", "SSP2-4.5": "Precipitation_SSP2-4.5"}
)
gha_projections_df2 = gha_projections_df2.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Mean Surface Air Temperature_SSP2-4.5"}
)
gha_projections_df3 = gha_projections_df3.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Minimum Surface Air Temperature_SSP2-4.5"}
)

In [133]:
# Combine the three Ghana tables on "Date" with outer joins.
gha_projections = gha_projections_df1.merge(gha_projections_df2, on="Date", how="outer")
gha_projections = gha_projections.merge(gha_projections_df3, on="Date", how="outer")

In [134]:
# Tag every row with Ghana's ISO3 code.
gha_projections = gha_projections.assign(ISO3="GHA")

In [135]:
# Convert the "Date" column to datetime format via the project helper
# (the preview that follows shows year-end dates such as 2023-12-31).
gha_projections = transformer.convert_to_dateformat(gha_projections, "Date")

In [136]:
gha_projections.head()

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3
0,2023-12-31,1179.0,28.05,23.79,GHA
1,2024-12-31,1230.83,28.19,23.94,GHA
2,2025-12-31,1205.24,28.1,23.9,GHA
3,2026-12-31,1202.37,28.12,23.89,GHA
4,2027-12-31,1183.24,28.24,23.98,GHA


##### Kenya

url = https://climateknowledgeportal.worldbank.org/country/kenya/climate-data-projections

In [137]:
# Projection CSVs for Kenya, one per climate variable:
# _1 precipitation, _2 average mean surface air temperature,
# _3 average minimum surface air temperature.
filepath_projection_ken_1 = "../data/source/final/projections/kenya/projected-precipitation_br__kenya;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_ken_2 = "../data/source/final/projections/kenya/projected-average-mean-surface-air-temperature_br__kenya;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_ken_3 = "../data/source/final/projections/kenya/projected-average-minimum-surface-air-temperature_br__kenya;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"

In [138]:
# Load the three Kenya projection files (precipitation, mean temperature,
# minimum temperature) with identical loader options.
ken_projections_df1, ken_projections_df2, ken_projections_df3 = [
    transformer.load_data(ext="csv", filepath=csv_path, repo=REPO, header=0, parse_dates=None)
    for csv_path in (
        filepath_projection_ken_1,
        filepath_projection_ken_2,
        filepath_projection_ken_3,
    )
]

In [139]:
# Keep only the rows whose "Category" (year) lies in 2023..2070, bounds included.
ken_projections_df1 = ken_projections_df1[ken_projections_df1["Category"].between(2023, 2070)]
ken_projections_df2 = ken_projections_df2[ken_projections_df2["Category"].between(2023, 2070)]
ken_projections_df3 = ken_projections_df3[ken_projections_df3["Category"].between(2023, 2070)]

In [140]:
# Keep the year column and the SSP2-4.5 scenario column only.
ken_projections_df1 = ken_projections_df1.loc[:, ["Category", "SSP2-4.5"]]
ken_projections_df2 = ken_projections_df2.loc[:, ["Category", "SSP2-4.5"]]
ken_projections_df3 = ken_projections_df3.loc[:, ["Category", "SSP2-4.5"]]

In [141]:
# Name the year column "Date" and label each scenario column with its variable.
ken_projections_df1 = ken_projections_df1.rename(
    columns={"Category": "Date", "SSP2-4.5": "Precipitation_SSP2-4.5"}
)
ken_projections_df2 = ken_projections_df2.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Mean Surface Air Temperature_SSP2-4.5"}
)
ken_projections_df3 = ken_projections_df3.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Minimum Surface Air Temperature_SSP2-4.5"}
)

In [142]:
# Combine the three Kenya tables on "Date" with outer joins.
ken_projections = ken_projections_df1.merge(ken_projections_df2, on="Date", how="outer")
ken_projections = ken_projections.merge(ken_projections_df3, on="Date", how="outer")

In [143]:
# Tag every row with Kenya's ISO3 code.
ken_projections = ken_projections.assign(ISO3="KEN")

In [144]:
# Convert the "Date" column to datetime format via the project helper
# (the preview that follows shows year-end dates such as 2023-12-31).
ken_projections = transformer.convert_to_dateformat(ken_projections, "Date")

In [145]:
ken_projections.head()

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3
0,2023-12-31,742.53,26.01,21.18,KEN
1,2024-12-31,725.14,26.15,21.32,KEN
2,2025-12-31,777.75,25.97,21.17,KEN
3,2026-12-31,705.59,25.97,21.16,KEN
4,2027-12-31,757.56,26.02,21.21,KEN


##### Malawi

url = https://climateknowledgeportal.worldbank.org/country/malawi/climate-data-projections

In [146]:
# Projection CSVs for Malawi, one per climate variable:
# _1 precipitation, _2 average mean surface air temperature,
# _3 average minimum surface air temperature.
filepath_projection_mwi_1 = "../data/source/final/projections/malawi/projected-precipitation_br__malawi;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_mwi_2 = "../data/source/final/projections/malawi/projected-average-mean-surface-air-temperature_br__malawi;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_mwi_3 = "../data/source/final/projections/malawi/projected-average-minimum-surface-air-temperature_br__malawi;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"

In [147]:
# Load the three Malawi projection files (precipitation, mean temperature,
# minimum temperature) with identical loader options.
mwi_projections_df1, mwi_projections_df2, mwi_projections_df3 = [
    transformer.load_data(ext="csv", filepath=csv_path, repo=REPO, header=0, parse_dates=None)
    for csv_path in (
        filepath_projection_mwi_1,
        filepath_projection_mwi_2,
        filepath_projection_mwi_3,
    )
]

In [148]:
# Keep only the rows whose "Category" (year) lies in 2023..2070, bounds included.
mwi_projections_df1 = mwi_projections_df1[mwi_projections_df1["Category"].between(2023, 2070)]
mwi_projections_df2 = mwi_projections_df2[mwi_projections_df2["Category"].between(2023, 2070)]
mwi_projections_df3 = mwi_projections_df3[mwi_projections_df3["Category"].between(2023, 2070)]

In [149]:
# Keep the year column and the SSP2-4.5 scenario column only.
mwi_projections_df1 = mwi_projections_df1.loc[:, ["Category", "SSP2-4.5"]]
mwi_projections_df2 = mwi_projections_df2.loc[:, ["Category", "SSP2-4.5"]]
mwi_projections_df3 = mwi_projections_df3.loc[:, ["Category", "SSP2-4.5"]]

In [150]:
# Name the year column "Date" and label each scenario column with its variable.
mwi_projections_df1 = mwi_projections_df1.rename(
    columns={"Category": "Date", "SSP2-4.5": "Precipitation_SSP2-4.5"}
)
mwi_projections_df2 = mwi_projections_df2.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Mean Surface Air Temperature_SSP2-4.5"}
)
mwi_projections_df3 = mwi_projections_df3.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Minimum Surface Air Temperature_SSP2-4.5"}
)

In [151]:
# Combine the three Malawi tables on "Date" with outer joins.
mwi_projections = mwi_projections_df1.merge(mwi_projections_df2, on="Date", how="outer")
mwi_projections = mwi_projections.merge(mwi_projections_df3, on="Date", how="outer")

In [152]:
# Tag every row with Malawi's ISO3 code.
mwi_projections = mwi_projections.assign(ISO3="MWI")

In [153]:
# Convert the "Date" column to datetime format via the project helper
# (the preview that follows shows year-end dates such as 2023-12-31).
mwi_projections = transformer.convert_to_dateformat(mwi_projections, "Date")

In [154]:
mwi_projections.head()

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3
0,2023-12-31,1274.74,22.93,18.93,MWI
1,2024-12-31,1309.63,22.97,19.01,MWI
2,2025-12-31,1286.7,22.87,18.93,MWI
3,2026-12-31,1298.59,22.93,18.99,MWI
4,2027-12-31,1272.87,23.09,19.06,MWI


##### Burkina Faso  

url = https://climateknowledgeportal.worldbank.org/country/burkina-faso/climate-data-projections

In [155]:
# Projection CSVs for Burkina Faso, one per climate variable:
# _1 precipitation, _2 average mean surface air temperature,
# _3 average minimum surface air temperature.
filepath_projection_bfa_1 = "../data/source/final/projections/burkina-faso/projected-precipitation_br__burkina-faso;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_bfa_2 = "../data/source/final/projections/burkina-faso/projected-average-mean-surface-air-temperature_br__burkina-faso;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"
filepath_projection_bfa_3 = "../data/source/final/projections/burkina-faso/projected-average-minimum-surface-air-temperature_br__burkina-faso;-(ref.-period_-1995-2014),-multi-model-ensemble.csv"

In [156]:
# Load the three Burkina Faso projection files (precipitation, mean temperature,
# minimum temperature) with identical loader options.
bfa_projections_df1, bfa_projections_df2, bfa_projections_df3 = [
    transformer.load_data(ext="csv", filepath=csv_path, repo=REPO, header=0, parse_dates=None)
    for csv_path in (
        filepath_projection_bfa_1,
        filepath_projection_bfa_2,
        filepath_projection_bfa_3,
    )
]

In [157]:
# Keep only the rows whose "Category" (year) lies in 2023..2070, bounds included.
bfa_projections_df1 = bfa_projections_df1[bfa_projections_df1["Category"].between(2023, 2070)]
bfa_projections_df2 = bfa_projections_df2[bfa_projections_df2["Category"].between(2023, 2070)]
bfa_projections_df3 = bfa_projections_df3[bfa_projections_df3["Category"].between(2023, 2070)]

In [158]:
# Keep the year column and the SSP2-4.5 scenario column only.
bfa_projections_df1 = bfa_projections_df1.loc[:, ["Category", "SSP2-4.5"]]
bfa_projections_df2 = bfa_projections_df2.loc[:, ["Category", "SSP2-4.5"]]
bfa_projections_df3 = bfa_projections_df3.loc[:, ["Category", "SSP2-4.5"]]

In [159]:
# Name the year column "Date" and label each scenario column with its variable.
bfa_projections_df1 = bfa_projections_df1.rename(
    columns={"Category": "Date", "SSP2-4.5": "Precipitation_SSP2-4.5"}
)
bfa_projections_df2 = bfa_projections_df2.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Mean Surface Air Temperature_SSP2-4.5"}
)
bfa_projections_df3 = bfa_projections_df3.rename(
    columns={"Category": "Date", "SSP2-4.5": "Average Minimum Surface Air Temperature_SSP2-4.5"}
)

In [160]:
# Combine the three Burkina Faso tables on "Date" with outer joins.
bfa_projections = bfa_projections_df1.merge(bfa_projections_df2, on="Date", how="outer")
bfa_projections = bfa_projections.merge(bfa_projections_df3, on="Date", how="outer")

In [161]:
# Tag every row with Burkina Faso's ISO3 code.
bfa_projections = bfa_projections.assign(ISO3="BFA")

In [162]:
# Convert the "Date" column to datetime format via the project helper
# (the preview that follows shows year-end dates such as 2023-12-31).
bfa_projections = transformer.convert_to_dateformat(bfa_projections, "Date")

In [163]:
bfa_projections.head()

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3
0,2023-12-31,664.92,29.13,23.46,BFA
1,2024-12-31,695.9,29.43,23.73,BFA
2,2025-12-31,667.79,29.3,23.58,BFA
3,2026-12-31,704.17,29.38,23.68,BFA
4,2027-12-31,655.76,29.4,23.78,BFA


In [164]:
# Stack the five per-country projection tables into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# country's rows keep their own 0-based labels, so index labels repeat five
# times and label-based lookups on the stacked frame become ambiguous.
projections_downloaded_dataset = pd.concat(
    [
        nga_projections,
        gha_projections,
        ken_projections,
        mwi_projections,
        bfa_projections,
    ],
    ignore_index=True,
)

In [165]:
# Attach the demographic projections to the climate projections on (Date, ISO3);
# the outer join keeps country-years present in only one of the two tables.
projections_downloaded_dataset = projections_downloaded_dataset.merge(
    future_cleaned_feat_df2,
    how="outer",
    on=["Date", "ISO3"],
)

In [166]:
projections_downloaded_dataset

Unnamed: 0,Date,Precipitation_SSP2-4.5,Average Mean Surface Air Temperature_SSP2-4.5,Average Minimum Surface Air Temperature_SSP2-4.5,ISO3,Country,PopDensity,MedianAgePop,PopGrowthRate,TFR,IMR,Q5,CNMR
0,2023-12-31,1125.18,27.81,22.67,NGA,Nigeria,245.7312,17.2319,2.369,5.0629,70.5892,109.0630,-0.268
1,2024-12-31,1145.81,28.04,22.89,NGA,Nigeria,251.6027,17.3553,2.354,4.9930,69.3406,107.3496,-0.262
2,2025-12-31,1146.02,27.92,22.78,NGA,Nigeria,257.5553,17.4863,2.323,4.8989,68.1246,105.6722,-0.256
3,2026-12-31,1171.10,27.97,22.80,NGA,Nigeria,263.5731,17.6262,2.297,4.8108,66.9879,104.0770,-0.250
4,2027-12-31,1140.81,28.01,22.82,NGA,Nigeria,269.6610,17.7743,2.271,4.7240,65.9013,102.5497,-0.244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,2066-12-31,685.29,30.60,25.08,BFA,Burkina Faso,182.5827,27.2406,1.069,2.3786,27.3052,43.2057,-0.500
236,2067-12-31,727.20,30.45,24.88,BFA,Burkina Faso,184.5152,27.4981,1.037,2.3540,26.9009,42.4708,-0.495
237,2068-12-31,701.80,30.44,24.92,BFA,Burkina Faso,186.4127,27.7545,1.009,2.3356,26.5530,41.8300,-0.490
238,2069-12-31,707.28,30.61,25.12,BFA,Burkina Faso,188.2788,28.0091,0.983,2.3196,26.2066,41.1909,-0.485


In [167]:
# Persist the combined projections dataset, without writing the row index.
projections_downloaded_dataset.to_csv(
    path_or_buf="../data/cleaned/final/downloaded_projections_data.csv",
    index=False,
)