# About the Notebook

# PART 1. Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd

#Visualisation:
import seaborn               as sns
import matplotlib.pyplot     as plt
sns.set_theme(style="whitegrid")

#Import Data_analysis functions
from data_analysis import *

#Country Data Exploration
from geonamescache import GeonamesCache
import country_converter as coco
import pycountry

In [2]:
#Import csv files
age_grp_death_df = pd.read_csv("../data_input/2018_malaria_deaths_age.csv")
death_rate_df = pd.read_csv("../data_input/2018_malaria_deaths.csv")
inc_rate_df = pd.read_csv("../data_input/2018_malaria_inc.csv")

In [3]:
#List of data set names
dataframe_ls = "age_grp_death_df,death_rate_df,inc_rate_df"
dataframe_ls = dataframe_ls.split(',')

# PART 2. Initial Data Exploration and Processing

## 2.1 Quick Glance of the three data sets
> Snippet of the first three rows of each dataset

In [4]:
print("QUICK GLANCE OF DATA SETS:\n")

for data in dataframe_ls:
    #Name of dataframe
    print("--"*5, f"DATAFRAME: {data}", "--"*5)
    
    #Display first three rows of dataframe
    df_ = globals()[data]
    df_.head(3)
    
    #Show data size
    print(f"Dataframe size: {len(df_)}\n")

QUICK GLANCE OF DATA SETS:

---------- DATAFRAME: age_grp_death_df ----------


Unnamed: 0.1,Unnamed: 0,entity,code,year,age_group,deaths
0,1,Afghanistan,AFG,1990,Under 5,184.606435
1,2,Afghanistan,AFG,1991,Under 5,191.658193
2,3,Afghanistan,AFG,1992,Under 5,197.140197


Dataframe size: 30780

---------- DATAFRAME: death_rate_df ----------


Unnamed: 0,Entity,Code,Year,"Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)"
0,Afghanistan,AFG,1990,6.80293
1,Afghanistan,AFG,1991,6.973494
2,Afghanistan,AFG,1992,6.989882


Dataframe size: 6156

---------- DATAFRAME: inc_rate_df ----------


Unnamed: 0,Entity,Code,Year,"Incidence of malaria (per 1,000 population at risk) (per 1,000 population at risk)"
0,Afghanistan,AFG,2000,107.1
1,Afghanistan,AFG,2005,46.5
2,Afghanistan,AFG,2010,23.9


Dataframe size: 508



---
**COMMENT**: About the data<br>

- There are three data sets with varying size:
    1. `age_grp_death_df`: Showing no. of deaths, per age group, per entity, and per year
    1. `death_rate_df`: Showing death rates per entity, and per year
    1. `inc_rate_df`: Showing incidence rates per entity, and per year
    
PLAN:
- Rename dataframe columns (for simplicity & consistency)
- Remove redundant columns (as seen in age_grp_death_df)
---

## 2.2 Initial Data Processing

**RENAME COLUMNS**<br>
> For consistency and simplicity

`age_grp_death_df`: Upper case column names

In [5]:
# Function to uppercase the first letter of a string
def uppercase_first_letter(s):
    return s[0].upper() + s[1:]

# Uppercase the first letter of each column name for age_grp_death_df
age_grp_death_df.columns = [uppercase_first_letter(col) for col in age_grp_death_df.columns]
age_grp_death_df.head(3)

Unnamed: 0.1,Unnamed: 0,Entity,Code,Year,Age_group,Deaths
0,1,Afghanistan,AFG,1990,Under 5,184.606435
1,2,Afghanistan,AFG,1991,Under 5,191.658193
2,3,Afghanistan,AFG,1992,Under 5,197.140197


`death_rate_df` & `inc_rate_df`: Rename the last column for simplicity

In [6]:
#Simplify Last column name in death_rate_df
last_col = death_rate_df.columns.tolist()[-1]
death_rate_df.rename(
    columns = {last_col:'DeathRate_per100K'},
    inplace = True)
death_rate_df.head(3)

Unnamed: 0,Entity,Code,Year,DeathRate_per100K
0,Afghanistan,AFG,1990,6.80293
1,Afghanistan,AFG,1991,6.973494
2,Afghanistan,AFG,1992,6.989882


In [7]:
#Simplify Last column name in inc_rate_df
last_col = inc_rate_df.columns.tolist()[-1]
inc_rate_df.rename(
    columns = {last_col:'IncRate_per100K'},
    inplace = True)
inc_rate_df.head(3)

Unnamed: 0,Entity,Code,Year,IncRate_per100K
0,Afghanistan,AFG,2000,107.1
1,Afghanistan,AFG,2005,46.5
2,Afghanistan,AFG,2010,23.9


**DROP REDUNDANT COLUMN**<br>
> Drop first column in age_grp_death_df: `Unnamed: 0`

In [8]:
#Showing initial sets of columns in age_grp_death_df
age_grp_death_df.columns.tolist()

['Unnamed: 0', 'Entity', 'Code', 'Year', 'Age_group', 'Deaths']

In [9]:
#Removing the first column which is redundant
age_grp_death_df = age_grp_death_df.drop(age_grp_death_df.columns[0], axis=1)
age_grp_death_df.head(3)

Unnamed: 0,Entity,Code,Year,Age_group,Deaths
0,Afghanistan,AFG,1990,Under 5,184.606435
1,Afghanistan,AFG,1991,Under 5,191.658193
2,Afghanistan,AFG,1992,Under 5,197.140197


---
**COMMENT**: What's done and What's next<br>

We have renamed the columns and dropped redundant columns.<br>
We will now perform data checking and exploration in the next segment.
---

# Part 3: Data Exploration and Merging

## 3.1 Data Cleaning: Inspect for obvious issues
- Inspect datatype
- Inspect missing data
- Inspect data value
- Inspect for duplicates

In [10]:
for data in dataframe_ls:
    df_ = globals()[data]
    df_summary(df_, data)
    print("\n\n")

---------------DATAFRAME SUMMARY OF: age_grp_death_df---------------
SHAPE(col,rows): (30780, 5)

DUPLICATES
Number of duplicates: 0

DATA TYPE
Entity        object
Code          object
Year           int64
Age_group     object
Deaths       float64
dtype: object

MISSING DATA
Columns with missing values:
    col  num_nulls  perc_null
0  Code       4320       0.14

DATA VALUES: Quantitative data
  Column Name  Minimum Value  Maximum Value
0        Year         1990.0    2016.000000
1      Deaths            0.0  752025.548675

DATA VALUES: Qualitative data
--Column 'Entity' has
 228 unique values

--Column 'Code' has
 196 unique values

--Column 'Age_group' has
 5 unique values

---------------END OF DATA SUMMARY OF age_grp_death_df---------------



---------------DATAFRAME SUMMARY OF: death_rate_df---------------
SHAPE(col,rows): (6156, 4)

DUPLICATES
Number of duplicates: 0

DATA TYPE
Entity                object
Code                  object
Year                   int64
DeathRate_per1

---
**COMMENT**: Summary of Data Checking<br>

|Area of inspection|Description|Plan|
|---|---|---|
|Duplicates|Absent|NIL|
|Missing Data|Presence of missing data in `Code` column|`Code` column refers to the country code. Further examination needed to understand the nature of the missing data before coming up with appropriate actions to address the missing data. It is possible to be related to `Entity`, as not all elements in `Entity` column is necessarily a country.|
|Data type|Appropriate data type for respective columns|NIL|
|Data values|<li>Noticed there is different number of unique entities and code across the three data sets<li>Noticed there is different time period the data sets covers<li>There is no obvious abnormaly value (e.g. Negatie values) detected at the moment.|Explore the data further in detail, including exploring the entity to recognise if the data represents a country/region/ US state.|

---

## 3.2 Further Data Exploration: `Entity` & `Code`

**ABOUT [geonamescache](https://pypi.org/project/geonamescache/)**:<br>
A Python library that provides functions to retrieve names and other information of continents, countries as well as US states and counties as Python dictionaries. We will be using this library to label each data row whether it is representing a country, US State or region.

[countryconverter](https://pypi.org/project/country-converter/)<br>
[pycountry](https://pypi.org/project/pycountry/)

### Step 1: Check if `Code`is consistent with the  ISO 3166-1 alpha-3 standard
- We use pycountry to conduct this checking.
- Information about [ISO 3166 ](https://en.wikipedia.org/wiki/ISO_3166#:~:text=ISO%203166%20is%20an%20ISO,e.g.%2C%20provinces%20or%20states))

In [11]:
#Check if code is consistent with the  ISO 3166-1 alpha-3 standard.

#Set of ISO 3166-1 alpha-3 codes in `pycountry`
alpha3_codes = set([country.alpha_3 for country in pycountry.countries])

#Create an empty set
non_alpha3_set = set()

#Populate non_alpha3_set with elements in `Code` columns but not in ISO 3166-1 alpha-3 codes
for data in dataframe_ls:
    df = globals()[data]
    cond_= ~df['Code'].isin(alpha3_codes)
    sub_df = df.loc[cond_, 'Code'] 
    non_alpha3_set = non_alpha3_set| set(sub_df)
non_alpha3_set

{'OWID_WRL', nan}

---
**COMMENT**: About `Code` columns<br>
- The 3-alphabetical code are indeed ISO 3166 country code, each representing a country.
- Next step is to explore data with `Entity` as `OWID_WRL` or *NAN*.
---

### Step 2: What `Entity` uses the Code `OWID_WRL` or has missing Code

In [12]:
# Entity with Code: OWID_WRL
for data in dataframe_ls:
    df = globals()[data]
    cond_= df['Code']=="OWID_WRL"
    _d = df.loc[cond_, ['Entity']]
    _d['Entity'].unique()

array(['World'], dtype=object)

array(['World'], dtype=object)

array(['World'], dtype=object)

In [13]:
# Entity with no Code (nan)
no_code_entity = set()
for data in dataframe_ls:
    df = globals()[data]
    cond_= df['Code'].isna()
    _d = df.loc[cond_, ['Entity']]
    no_code_entity = no_code_entity|set(_d['Entity'].unique())
no_code_entity    

{'Andean Latin America',
 'Australasia',
 'Caribbean',
 'Central Asia',
 'Central Europe',
 'Central Latin America',
 'Central Sub-Saharan Africa',
 'Early-demographic dividend',
 'East Asia',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD)',
 'East Asia & Pacific (excluding high income)',
 'Eastern Europe',
 'Eastern Sub-Saharan Africa',
 'England',
 'Fragile and conflict affected situations',
 'Heavily indebted poor countries (HIPC)',
 'High SDI',
 'High-income Asia Pacific',
 'High-middle SDI',
 'IBRD only',
 'IDA & IBRD total',
 'IDA blend',
 'IDA only',
 'IDA total',
 'Late-demographic dividend',
 'Latin America & Caribbean',
 'Latin America & Caribbean (IDA & IBRD)',
 'Latin America & Caribbean (excluding high income)',
 'Latin America and Caribbean',
 'Least developed countries: UN classification',
 'Low & middle income',
 'Low SDI',
 'Low income',
 'Low-middle SDI',
 'Lower middle income',
 'Middle SDI',
 'Middle income',
 'North Africa and Middle East',
 'North Amer

In [14]:
#Check Entity with Code and Entity without Code are mutually exclusive
have_code_entity = set()
for data in dataframe_ls:
    df = globals()[data]
    cond_= ~df['Code'].isna()
    _d = df.loc[cond_, ['Entity']]
    have_code_entity = have_code_entity|set(_d['Entity'].unique())
have_code_entity & no_code_entity

set()

---
**COMMENT**: Findings<br>
1. Entity with Code are countries
1. Entity without Code are regions, classifications, or groupings related to demographic, economic, or geographical areas, but they do not represent individual countries.
1. 'England','Scotland','Wales','Northern Ireland' do not have `Code` as they are under the 'United Kingdom'
    - Subsequent analysis needs to be mindful of this.

PLAN:<br>
- Create new column `Entity_type` to label the data.
- Rename the countries to ensure consistency:
    - Countries may be named/ referred to in different ways: (e.g. cape verde vs cabo verde)
    - Countries will be renamed using their country code as reference and using the python library `GeonamesCache` 
---

### Step 3: New column `Entity_type` and Standardise Countries naming

**NEW COLUMN: `Entity_type`**

In [16]:
#UK Countries
UK_countries = {'England','Scotland','Wales','Northern Ireland'}

# Income Classifications Entity
keywords = ['income',
            'IDA',
            'IBRD',
            'SDI',
            'demographic',
            'debt',
            'developed']
Income_class_entity = {x for x in no_code_entity if any(keyword in x for keyword in keywords)}

# Region Classifications Entity
Region_class_entity = {x for x in no_code_entity if x not in Income_class_entity|UK_countries}

In [17]:
#Function for labelling data based on Entity type:
def entity_type(ent):
    if ent in UK_countries:
        return 'country'
    elif ent in Income_class_entity:
        return 'income_class'
    elif ent in Region_class_entity:
        return 'region_class'
    else:
        return 'others'

In [21]:
#Create new Enity_type column across the datasets:

for data in dataframe_ls:
    df = globals()[data].copy()
    not_na_rows = df['Code'].notna()
    
    df['Entity_type'] = df['Entity'].apply(lambda x: entity_type(x))
    df.loc[not_na_rows,'Entity_type'] = 'country'
    
    print(f"Dataframe: {data}")
    print("Unique Entity_type in dataframe:")
    df['Entity_type'].unique().tolist()
    df.head(3)
    globals()[data] = df

Dataframe: age_grp_death_df
Unique Entity_type in dataframe:


['country', 'region_class', 'income_class']

Unnamed: 0,Entity,Code,Year,Age_group,Deaths,Entity_type
0,Afghanistan,AFG,1990,Under 5,184.606435,country
1,Afghanistan,AFG,1991,Under 5,191.658193,country
2,Afghanistan,AFG,1992,Under 5,197.140197,country


Dataframe: death_rate_df
Unique Entity_type in dataframe:


['country', 'region_class', 'income_class']

Unnamed: 0,Entity,Code,Year,DeathRate_per100K,Entity_type
0,Afghanistan,AFG,1990,6.80293,country
1,Afghanistan,AFG,1991,6.973494,country
2,Afghanistan,AFG,1992,6.989882,country


Dataframe: inc_rate_df
Unique Entity_type in dataframe:


['country', 'income_class', 'region_class']

Unnamed: 0,Entity,Code,Year,IncRate_per100K,Entity_type
0,Afghanistan,AFG,2000,107.1,country
1,Afghanistan,AFG,2005,46.5,country
2,Afghanistan,AFG,2010,23.9,country


**STANDARDISING COUNTRIES NAME**

In [25]:
#Step 1: create a dictionary with key: country code, value: country name
gc = GeonamesCache()

# Access data for specific entities
gc_countries = gc.get_countries_by_names()

# List of country CODE in GeonamesCache
gc_country_codes = [value['iso3'] for key, value in gc_countries.items()]
# List of country NAME in GeonamesCache
gc_country_names = [key for key, value in gc_countries.items()]
#Create dictionary
gc_country_dict = {k:v for k,v in zip(gc_country_codes,gc_country_names)}

Use the Country ISO 3166-1 alpha-3 codes to classify the countries into their continential region based on their [Continentcode](https://datahub.io/core/continent-codes).
---

In [17]:
countries_to_exclude = ['England','Scotland','Wales','Northern Ireland']

for data in dataframe_ls:
    df = globals()[data]
    print(f"""
    Dataframe: {data}
    Datasize: {len(df)}""")
    df.drop(df[df['Entity'].isin(countries_to_exclude)].index,
           inplace=True)
    print(f"Datasize after dropping rows: {len(df)}")


    Dataframe: age_grp_death_df
    Datasize: 30780
Datasize after dropping rows: 30240

    Dataframe: death_rate_df
    Datasize: 6156
Datasize after dropping rows: 6048

    Dataframe: inc_rate_df
    Datasize: 508
Datasize after dropping rows: 508


In [24]:
country_dataframe_ls = []
for data in dataframe_ls:
    subdata = "country_"+data
    country_dataframe_ls.append(subdata)
    
    globals()[subdata] = globals()[data].dropna()
    print(f"""
    {data}: {len(globals()[data])} rows
    {subdata}: {len(globals()[subdata])} rows)""")
print(f"\nNew datasets with only Country Entities:\n {country_dataframe_ls}")


    age_grp_death_df: 30240 rows
    country_age_grp_death_df: 26460 rows)

    death_rate_df: 6048 rows
    country_death_rate_df: 5292 rows)

    inc_rate_df: 508 rows
    country_inc_rate_df: 400 rows)

New datasets with only Country Entities:
 ['country_age_grp_death_df', 'country_death_rate_df', 'country_inc_rate_df']


In [36]:
gc = GeonamesCache()

# Access data for specific entities
gc_countries = gc.get_countries()

# List of country codes in GeonamesCache
gc_country_codes = [value['iso3'] for key, value in gc_countries_codes.items()]
# List of countries respective continental codes GeonamesCache
gc_country_cont_codes = [value['continentcode'] for key, value in gc_countries_codes.items()]

gc_country_cont_dict = {k:v for k,v in zip(gc_country_codes,gc_country_cont_codes)}

In [41]:
for data in country_dataframe_ls:
    df = globals()[data].copy()
    df["Continental_Code"] = df['Code'].map(gc_country_cont_dict)
    globals()[data] = df

In [42]:
for data in country_dataframe_ls:
    globals()[data].head()

Unnamed: 0,Entity,Code,Year,Age_group,Deaths,Continental_Code
0,Afghanistan,AFG,1990,Under 5,184.606435,AS
1,Afghanistan,AFG,1991,Under 5,191.658193,AS
2,Afghanistan,AFG,1992,Under 5,197.140197,AS
3,Afghanistan,AFG,1993,Under 5,207.357753,AS
4,Afghanistan,AFG,1994,Under 5,226.209363,AS


Unnamed: 0,Entity,Code,Year,DeathRate_per100K,Continental_Code
0,Afghanistan,AFG,1990,6.80293,AS
1,Afghanistan,AFG,1991,6.973494,AS
2,Afghanistan,AFG,1992,6.989882,AS
3,Afghanistan,AFG,1993,7.088983,AS
4,Afghanistan,AFG,1994,7.392472,AS


Unnamed: 0,Entity,Code,Year,IncRate_per100K,Continental_Code
0,Afghanistan,AFG,2000,107.1,AS
1,Afghanistan,AFG,2005,46.5,AS
2,Afghanistan,AFG,2010,23.9,AS
3,Afghanistan,AFG,2015,23.6,AS
4,Algeria,DZA,2000,0.037746,AF


In [32]:
gc_countries['AD']

{'geonameid': 3041565,
 'name': 'Andorra',
 'iso': 'AD',
 'iso3': 'AND',
 'isonumeric': 20,
 'fips': 'AN',
 'continentcode': 'EU',
 'capital': 'Andorra la Vella',
 'areakm2': 468,
 'population': 77006,
 'tld': '.ad',
 'currencycode': 'EUR',
 'currencyname': 'Euro',
 'phone': '376',
 'postalcoderegex': '^(?:AD)*(\\d{3})$',
 'languages': 'ca',
 'neighbours': 'ES,FR'}

In [31]:
gc_country_codes

{'ABW',
 'AFG',
 'AGO',
 'AIA',
 'ALA',
 'ALB',
 'AND',
 'ANT',
 'ARE',
 'ARG',
 'ARM',
 'ASM',
 'ATA',
 'ATF',
 'ATG',
 'AUS',
 'AUT',
 'AZE',
 'BDI',
 'BEL',
 'BEN',
 'BES',
 'BFA',
 'BGD',
 'BGR',
 'BHR',
 'BHS',
 'BIH',
 'BLM',
 'BLR',
 'BLZ',
 'BMU',
 'BOL',
 'BRA',
 'BRB',
 'BRN',
 'BTN',
 'BVT',
 'BWA',
 'CAF',
 'CAN',
 'CCK',
 'CHE',
 'CHL',
 'CHN',
 'CIV',
 'CMR',
 'COD',
 'COG',
 'COK',
 'COL',
 'COM',
 'CPV',
 'CRI',
 'CUB',
 'CUW',
 'CXR',
 'CYM',
 'CYP',
 'CZE',
 'DEU',
 'DJI',
 'DMA',
 'DNK',
 'DOM',
 'DZA',
 'ECU',
 'EGY',
 'ERI',
 'ESH',
 'ESP',
 'EST',
 'ETH',
 'FIN',
 'FJI',
 'FLK',
 'FRA',
 'FRO',
 'FSM',
 'GAB',
 'GBR',
 'GEO',
 'GGY',
 'GHA',
 'GIB',
 'GIN',
 'GLP',
 'GMB',
 'GNB',
 'GNQ',
 'GRC',
 'GRD',
 'GRL',
 'GTM',
 'GUF',
 'GUM',
 'GUY',
 'HKG',
 'HMD',
 'HND',
 'HRV',
 'HTI',
 'HUN',
 'IDN',
 'IMN',
 'IND',
 'IOT',
 'IRL',
 'IRN',
 'IRQ',
 'ISL',
 'ISR',
 'ITA',
 'JAM',
 'JEY',
 'JOR',
 'JPN',
 'KAZ',
 'KEN',
 'KGZ',
 'KHM',
 'KIR',
 'KNA',
 'KOR',
 'KWT',


In [16]:
#Function to label 'Entity type'
def label_entity(ent):
    if ent in country_ls:
        return 'country'
    elif ent in US_states_ls:
        return 'US_state'
    else:
        return 'others'

#Create new column in each data set using the lamda function
for df in df_list:
    df["Entity_Type"] = df['Entity'].apply(lambda x: label_entity(x))

In [18]:
for df in df_list:
    df['Entity_Type'].value_counts()

country    24840
others      5940
Name: Entity_Type, dtype: int64

country    4968
others     1188
Name: Entity_Type, dtype: int64

country    372
others     136
Name: Entity_Type, dtype: int64

In [61]:
country_ls

{'Afghanistan',
 'Aland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire, Saint Eustatius and Saba ',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos Islands',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czechia',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',

In [21]:
non_country_set = set()
for df in df_list:
    cond_ = df['Entity_Type'] == 'others'
    sub_df = df.loc[cond_,'Entity']
    non_country_set = non_country_set| set(sub_df)
non_country_set

{'Andean Latin America',
 'Australasia',
 'Cape Verde',
 'Caribbean',
 'Central Asia',
 'Central Europe',
 'Central Latin America',
 'Central Sub-Saharan Africa',
 'Congo',
 "Cote d'Ivoire",
 'Czech Republic',
 'Democratic Republic of Congo',
 'Early-demographic dividend',
 'East Asia',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD)',
 'East Asia & Pacific (excluding high income)',
 'Eastern Europe',
 'Eastern Sub-Saharan Africa',
 'England',
 'Fragile and conflict affected situations',
 'Heavily indebted poor countries (HIPC)',
 'High SDI',
 'High-income Asia Pacific',
 'High-middle SDI',
 'IBRD only',
 'IDA & IBRD total',
 'IDA blend',
 'IDA only',
 'IDA total',
 'Late-demographic dividend',
 'Latin America & Caribbean',
 'Latin America & Caribbean (IDA & IBRD)',
 'Latin America & Caribbean (excluding high income)',
 'Latin America and Caribbean',
 'Least developed countries: UN classification',
 'Low & middle income',
 'Low SDI',
 'Low income',
 'Low-middle SDI',
 'Lower

In [48]:
len(non_country_set)

69

In [53]:
# Find Entity that are found some but not all the three datasets:
Set_death_rate = set(death_rate_df['Entity'])
Set_inc_rate = set(inc_rate_df['Entity'])
Set_age_death = set(age_grp_death_df['Entity'])

(Set_death_rate | Set_inc_rate | Set_age_death) - (Set_death_rate & Set_inc_rate & Set_age_death)

{'Albania',
 'American Samoa',
 'Andean Latin America',
 'Andorra',
 'Antigua and Barbuda',
 'Armenia',
 'Australasia',
 'Australia',
 'Austria',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Bosnia and Herzegovina',
 'Brunei',
 'Bulgaria',
 'Canada',
 'Caribbean',
 'Central Asia',
 'Central Europe',
 'Central Latin America',
 'Central Sub-Saharan Africa',
 'Chile',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominica',
 'Early-demographic dividend',
 'East Asia',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD)',
 'East Asia & Pacific (excluding high income)',
 'Eastern Europe',
 'Eastern Sub-Saharan Africa',
 'Egypt',
 'England',
 'Estonia',
 'Fiji',
 'Finland',
 'Fragile and conflict affected situations',
 'France',
 'Germany',
 'Greece',
 'Greenland',
 'Grenada',
 'Guam',
 'Heavily indebted poor countries (HIPC)',
 'High SDI',
 'High-income Asia Pacific',
 'High-middle SDI',
 'Hungary',
 'IBRD only',
 'IDA & IBRD total',
 

In [57]:
# Find Code that are found some but not all the three datasets:
Set_death_rate = set(death_rate_df['Code'])
Set_inc_rate = set(inc_rate_df['Code'])
Set_age_death = set(age_grp_death_df['Code'])

# (Set_death_rate | Set_inc_rate | Set_age_death) - (Set_death_rate & Set_inc_rate & Set_age_death)
(Set_death_rate | Set_inc_rate | Set_age_death)

{'AFG',
 'AGO',
 'ALB',
 'AND',
 'ARE',
 'ARG',
 'ARM',
 'ASM',
 'ATG',
 'AUS',
 'AUT',
 'AZE',
 'BDI',
 'BEL',
 'BEN',
 'BFA',
 'BGD',
 'BGR',
 'BHR',
 'BHS',
 'BIH',
 'BLR',
 'BLZ',
 'BMU',
 'BOL',
 'BRA',
 'BRB',
 'BRN',
 'BTN',
 'BWA',
 'CAF',
 'CAN',
 'CHE',
 'CHL',
 'CHN',
 'CIV',
 'CMR',
 'COD',
 'COG',
 'COL',
 'COM',
 'CPV',
 'CRI',
 'CUB',
 'CYP',
 'CZE',
 'DEU',
 'DJI',
 'DMA',
 'DNK',
 'DOM',
 'DZA',
 'ECU',
 'EGY',
 'ERI',
 'ESP',
 'EST',
 'ETH',
 'FIN',
 'FJI',
 'FRA',
 'FSM',
 'GAB',
 'GBR',
 'GEO',
 'GHA',
 'GIN',
 'GMB',
 'GNB',
 'GNQ',
 'GRC',
 'GRD',
 'GRL',
 'GTM',
 'GUM',
 'GUY',
 'HND',
 'HRV',
 'HTI',
 'HUN',
 'IDN',
 'IND',
 'IRL',
 'IRN',
 'IRQ',
 'ISL',
 'ISR',
 'ITA',
 'JAM',
 'JOR',
 'JPN',
 'KAZ',
 'KEN',
 'KGZ',
 'KHM',
 'KIR',
 'KOR',
 'KWT',
 'LAO',
 'LBN',
 'LBR',
 'LBY',
 'LCA',
 'LKA',
 'LSO',
 'LTU',
 'LUX',
 'LVA',
 'MAR',
 'MDA',
 'MDG',
 'MDV',
 'MEX',
 'MHL',
 'MKD',
 'MLI',
 'MLT',
 'MMR',
 'MNE',
 'MNG',
 'MNP',
 'MOZ',
 'MRT',
 'MUS',
 'MWI',
