In [1]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process

### Loading Data

Loading Original Data

In [2]:
# Forward slashes are used as path separators: they are accepted on Windows as well as
# on Linux/macOS, whereas the backslash form only resolves on Windows.
ORIGINAL_DATA_DIR = "../Data_Sets/originalData"

# From Scimagojr
df_academicRelevance = pd.read_csv(f"{ORIGINAL_DATA_DIR}/academic_relevance_1996-2022.csv")

# From OurWorldInData
df_HDI = pd.read_csv(f"{ORIGINAL_DATA_DIR}/human-development-index.csv")
df_migrants = pd.read_csv(f"{ORIGINAL_DATA_DIR}/migrant-stock-total.csv")
df_naturalResources = pd.read_csv(f"{ORIGINAL_DATA_DIR}/naturalResourcesWealth.csv")
df_educationQuality = pd.read_csv(f"{ORIGINAL_DATA_DIR}/education_quality_by_outcome.csv")

# From World Bank (skiprows=4: the first four lines of each file are skipped before the header row)
df_arableLand = pd.read_csv(f"{ORIGINAL_DATA_DIR}/arableLand.csv", skiprows=4)
df_qualifiedLaborForce = pd.read_csv(f"{ORIGINAL_DATA_DIR}/Labor_force_with_advanced_education.csv", skiprows=4)

Loading Data processed in other notebooks

In [3]:
# Forward slashes are used as path separators: they are accepted on Windows as well as
# on Linux/macOS, whereas the backslash form only resolves on Windows.
PROCESSED_DATA_DIR = "../Data_Sets/processed"

df_climateData = pd.read_csv(f"{PROCESSED_DATA_DIR}/addData_climateData.csv")
df_countriesSize_regions = pd.read_csv(f"{PROCESSED_DATA_DIR}/addData_countriesSize_Regions.csv")
df_fromWiki = pd.read_csv(f"{PROCESSED_DATA_DIR}/addData_fromWiki.csv")
df_economicFreedom = pd.read_csv(f"{PROCESSED_DATA_DIR}/economicData_1995-2022.csv")
df_naturalDisasters = pd.read_csv(f"{PROCESSED_DATA_DIR}/addData_disasterData.csv")

### Merging Data - Part 1 

In [4]:
# df_master starts as a column subset of the economic-freedom data:
# the first 15 columns, GDP per capita, columns 19-26 and the under-5 mortality rate
df_master = df_economicFreedom[[
    *df_economicFreedom.columns[:15],
    'GDP per capita (current USD)',
    *df_economicFreedom.columns[19:27],
    'Under-5 mortality rate (per 1k live births)',
]]

In [5]:
# Show which columns were kept in df_master
df_master.columns

Index(['Country Name', 'Index Year', 'Overall Score', 'Property Rights',
       'Government Integrity', 'Judicial Effectiveness', 'Government Spending',
       'Tax Burden', 'Fiscal Health', 'Business Freedom', 'Monetary Freedom',
       'Labor Freedom', 'Financial Freedom', 'Investment Freedom',
       'Trade Freedom', 'GDP per capita (current USD)', 'Total population',
       'Land area (sq. km)', 'Gini', 'Inflation CPI', 'Real interest rate',
       'Labor force size', 'Trade (% of GDP)', 'Trade in services (% of GDP)',
       'Under-5 mortality rate (per 1k live births)'],
      dtype='object')

In [6]:
# df_master holds several rows per country, so df_fromWiki must contribute at most one
# row per 'Country Name'. validate='many_to_one' raises a MergeError if that is not the
# case, instead of silently duplicating df_master rows.
df_master = df_master.merge(df_fromWiki, how='left', on=['Country Name'], validate='many_to_one')

display(df_fromWiki.columns)

Index(['Country Name', 'Borders Length (in KM)', 'Neighbouring Countries',
       'isLandLocked', 'n_accessToSea', 'Rail Density',
       'Pctg of Rail Electrified', 'Warred Against', 'Area Size (km2)',
       'Expanded EconZone Area', 'Amount of Ports', 'Distance from Equator',
       'Majoritary Religions'],
      dtype='object')

In [7]:
# Only the border length and region are taken from df_countriesSize_regions.
# validate='many_to_one' raises a MergeError if a country appears more than once on the
# right-hand side, instead of silently duplicating df_master rows.
df_master = df_master.merge(
    df_countriesSize_regions[['Country Name', 'Shape_Leng', 'Region']],
    how='left',
    on=['Country Name'],
    validate='many_to_one',
)

display(df_countriesSize_regions.columns)

Index(['Country Name', 'Shape_Leng', 'Shape_Area', 'SubRegion', 'Region'], dtype='object')

In [8]:
# validate='many_to_one' raises a MergeError if df_climateData lists a country more than
# once, instead of silently duplicating df_master rows.
df_master = df_master.merge(df_climateData, how='left', on=['Country Name'], validate='many_to_one')

display(df_climateData.columns)

Index(['Country Name', 'Climate Type', 'Average Temperature (C)'], dtype='object')

We'll need to reshape df_naturalDisasters before merging it

In [9]:
# Align the decade column with the 'Index Year' key used by df_master
df_naturalDisasters = df_naturalDisasters.rename(columns={
    "Index Year (Decade)": 'Index Year'
})

# Setting the minimum year in our data [note: 'Death rates from disasters' in 1990 was already halved, to look like half a decade]
df_naturalDisasters.loc[df_naturalDisasters['Index Year'] == 1990, ['Index Year']] = 1995

# Transforming Decade Data into Yearly data
df_naturalDisasters.loc[df_naturalDisasters['Index Year'] == 1995, ['Death rates from disasters']] /= 5
df_naturalDisasters.loc[df_naturalDisasters['Index Year'] == 2000, ['Death rates from disasters']] /= 10
df_naturalDisasters.loc[df_naturalDisasters['Index Year'] == 2010, ['Death rates from disasters']] /= 10

# List of years to add (2000 and 2010 already carry data)
years_to_add = [year for year in range(1996, 2022 + 1) if year not in (2000, 2010)]

# Placeholder rows (NaN rate) for every country/year pair that has no row yet.
# Pairs already present are skipped so the merge key stays unique.
existing_keys = set(zip(df_naturalDisasters['Country Name'], df_naturalDisasters['Index Year']))
new_rows = [
    {
        'Country Name': country,
        'Index Year': year,
        'Death rates from disasters': np.nan
    }
    for year in years_to_add
    for country in df_naturalDisasters['Country Name'].unique()
    if (country, year) not in existing_keys
]

# Combining data
df_temp = pd.DataFrame(new_rows)
df_naturalDisasters = pd.concat([df_naturalDisasters, df_temp], ignore_index=True)

# Sorting by Country, then by year (the forward fill below depends on this order)
df_naturalDisasters = df_naturalDisasters.sort_values(by=['Country Name', 'Index Year']).reset_index(drop=True)

# Forward-filling 'Death rates from disasters' within each country: every placeholder year
# takes the value of the closest earlier year that has data. GroupBy.ffill keeps the
# original row index, so the result lines up with the frame without any re-indexing.
df_naturalDisasters['Death rates from disasters'] = (
    df_naturalDisasters.groupby('Country Name')['Death rates from disasters'].ffill()
)

# Doing the actual merging (validate: at most one disaster row per country and year)
df_master = df_master.merge(
    df_naturalDisasters,
    how='left',
    on=['Country Name', 'Index Year'],
    validate='many_to_one',
)

display(df_naturalDisasters.columns)

Index(['Country Name', 'Index Year', 'Death rates from disasters'], dtype='object')

### Cleaning Original DataSets before merging them into df_master

In [10]:
def fuzzySearchName(name, refNames=None, score_threshold = 87):
    '''
        Given a country name, returns the closest match from refNames using fuzzy search.

        Parameters:
            name: raw country name. Surrounding whitespace is ignored, and letter case is
                  ignored when checking the hand-written exceptions table.
            refNames: DataFrame with a 'Country Name' column holding the reference names.
                      Defaults to the global df_master, looked up when the function is called.
            score_threshold: a fuzzy match is accepted only if its score is strictly
                             greater than this value.

        Returns:
            "UNKNOWN" if name is null or blank;
            the mapped name if name is listed in the exceptions table;
            the best fuzzy match if its score is above score_threshold;
            "PLEASE FILL MANUALLY" if no close match is found.
    '''

    # If the name is empty or null, return "UNKNOWN"
    if pd.isna(name) or not str(name).strip():
        return "UNKNOWN"

    name = str(name).strip()

    exceptions = {
        "Kyrgyzstan": "Kyrgyz Republic",
        "Congo": "Republic of Congo",
        'Congo, Rep.': "Republic of Congo",
        'Congo, Dem. Rep.': "Democratic Republic of Congo",
        'Democratic Republic Of The Congo': 'Democratic Republic of Congo',
        "Czechia": "Czech Republic",
        "Slovakia": "Slovak Republic",
        "Macao Sar, China": "Macau",
        "Macao": "Macau",
        'Republic Of The Congo': 'Republic of Congo',
        'Swaziland': 'Eswatini',
        'Korea, Rep.': 'South Korea',
        "Korea, Dem. People'S Rep.": 'North Korea',
        'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
        'Lao Pdr': 'Laos',
        "Hong Kong Sar, China": "Hong Kong",
        'Cabo Verde': 'Cape Verde',
        'Egypt, Arab Rep.': 'Egypt',
        'Turkiye': 'Turkey',
        'St. Lucia': 'Saint Lucia'

    }

    # Case-insensitive lookup: names reach this function both in their original casing and
    # title-cased (e.g. "... And The ..."), so an exact-case comparison would miss entries.
    exceptions_by_folded_name = {alias.casefold(): target for alias, target in exceptions.items()}
    folded_name = name.casefold()
    if folded_name in exceptions_by_folded_name:
        return exceptions_by_folded_name[folded_name]

    # Resolve the default here rather than in the signature, so the function does not
    # capture whatever df_master happened to be when this cell was executed.
    if refNames is None:
        refNames = df_master

    candidates = refNames['Country Name'].dropna().unique()

    # Nothing to compare against: no match is possible
    if len(candidates) == 0:
        return "PLEASE FILL MANUALLY"

    best = process.extractOne(name, candidates)
    if best is None:
        return "PLEASE FILL MANUALLY"
    match, score = best[0], best[1]

    # If a close match is found, return the match
    if score > score_threshold:
        return match

    # If no close match is found, ask for manual input
    return "PLEASE FILL MANUALLY"

Academic Relevance

In [11]:
# Keep only the country name and the H index, under the names used in the rest of the notebook.
# .copy() makes the result an independent DataFrame: its name column is rewritten later
# (inside cleanedNamesMask), and writing to a column-subset selection would otherwise
# raise a SettingWithCopyWarning.
df_academicRelevance = (
    df_academicRelevance
    .rename(columns={'Country': 'Old Country Name', 'H index': 'H index (Academic Papers)'})
    [['Old Country Name', 'H index (Academic Papers)']]
    .copy()
)

In [12]:
def cleanedNamesMask(df, column = str()):

    '''
        Normalises the country names stored in df[column] and flags the rows to keep.

        Side effect: df[column] is overwritten in place with its stripped, title-cased values.

        Returns a boolean mask that is True for rows whose normalised name is NOT in the
        exclusion list below, i.e. df[mask] drops the excluded territories and aggregates.
    '''

    excluded_names = (
        'British Indian Ocean Territory', 'Africa', 'Europe', 'North America', 'South America', 'Asia', 'Oceania',
        'Sint Maarten (Dutch part)', 'Saint Martin (French Part)', 'Antigua and Barbuda',
        'Netherlands Antilles', 'Northern Mariana Islands', 'Saint Barthelemy',
        'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis',
        'Serbia-Montenegro', "Turks and Caicos Islands", 'United States Virgin Islands',
        'Wallis and Futuna', 'Yugoslavia', 'Cocos (Keeling) Islands', 'Isle Of Man', 'South Georgia And South Sandwich Islands',
        'South Sudan','United States Minor Outlying Islands','Saint Pierre And Miquelon', 'American Samoa',
        'Falkland Islands (Malvinas)', 'Virgin Islands (British)', 'Saint Martin (Dutch)', 'Saint Martin (French)',
        'Heard Island And Mcdonald Islands', 'South Georgia And The South Sandwich Islands', 'Republic Of South Sudan',
        'Somalia',
    )

    # Bring the column and the exclusion list to the same normal form before comparing
    normalised_names = df[column].str.strip().str.title()
    df[column] = normalised_names
    excluded_titles = [entry.title() for entry in excluded_names]

    is_excluded = df[column].isin(excluded_titles)
    return ~is_excluded

In [13]:
# Excluding non-relevant countries to our DataSet
mask = cleanedNamesMask(df_academicRelevance, 'Old Country Name')
# .copy() makes the filtered frame independent, so adding columns to it later does not
# write onto a selection of the unfiltered frame (SettingWithCopyWarning)
df_academicRelevance = df_academicRelevance[mask].copy()


# Share of rows kept by the mask (plain string: there is nothing to interpolate)
display('% of included countries in mask:', mask.mean())

'% of included countries in mask:'

0.9218106995884774

In [14]:
# Map every source name onto the naming used by df_master.
# .assign returns a new DataFrame, so the column is never written onto the
# boolean-filtered selection produced earlier (avoids SettingWithCopyWarning).
df_academicRelevance = df_academicRelevance.assign(**{
    'Country Name': df_academicRelevance['Old Country Name'].apply(fuzzySearchName)
})

In [15]:
# Rows that fuzzySearchName could not resolve are flagged and dropped
mask = df_academicRelevance['Country Name'].eq('PLEASE FILL MANUALLY')
df_academicRelevance = df_academicRelevance.loc[~mask]

# Keep the matched name and the metric only, ordered by country with a fresh index
df_academicRelevance = (
    df_academicRelevance
    .loc[:, ['Country Name', 'H index (Academic Papers)']]
    .sort_values(by=['Country Name'])
    .reset_index(drop=True)
)

In [16]:
# Two countries are appended by hand:
#   Kosovo          - source: https://xk.h-index.com/en
#   North Macedonia - source: https://www.adscientificindex.com/?country_code=mk

df_temp = pd.DataFrame(
    [('Kosovo', 147), ('North Macedonia', 59)],
    columns=['Country Name', 'H index (Academic Papers)'],
)

df_academicRelevance = pd.concat([df_academicRelevance, df_temp], ignore_index=True)


In [17]:
# Compare the country names on both sides of the upcoming merge
set_A, set_B = (
    set(frame['Country Name'].unique())
    for frame in (df_academicRelevance, df_master)
)

# Names present only in df_academicRelevance, then names present only in df_master
display(f'Names we should exclude: {set_A.difference(set_B)}')
display(f'Names that are still missing: {set_B.difference(set_A)}')

'Names we should exclude: set()'

'Names that are still missing: set()'

In [18]:
# Several source names can be mapped onto the same 'Country Name' by the fuzzy matching
# and the manual additions. validate='many_to_one' raises a MergeError if a country ends
# up with more than one H index row, instead of silently duplicating df_master rows.
df_master = df_master.merge(df_academicRelevance, how='left', on=['Country Name'], validate='many_to_one')

display(df_academicRelevance.columns)

Index(['Country Name', 'H index (Academic Papers)'], dtype='object')

World Bank:

Arable Land and Qualified Work Force

In [19]:
# Preview of the raw World Bank layout, first two rows only
# (df_qualifiedLaborForce can be inspected the same way: display(df_qualifiedLaborForce.head(2)))

display(df_arableLand.iloc[:2])

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,Arable land (% of land area),AG.LND.ARBL.ZS,,11.111111,11.111111,11.111111,11.111111,11.111111,...,11.111111,11.111111,11.111111,11.111111,11.111111,11.111111,11.111111,11.111111,,
1,Africa Eastern and Southern,AFE,Arable land (% of land area),AG.LND.ARBL.ZS,,4.702843,4.754588,4.866723,4.918674,4.972683,...,7.971467,8.016863,8.098773,8.193305,8.256337,8.284319,8.30422,8.322455,,


In [20]:
# Every distinct name in the World Bank file, in order of first appearance
df_arableLand['Country Name'].unique().tolist()

['Aruba',
 'Africa Eastern and Southern',
 'Afghanistan',
 'Africa Western and Central',
 'Angola',
 'Albania',
 'Andorra',
 'Arab World',
 'United Arab Emirates',
 'Argentina',
 'Armenia',
 'American Samoa',
 'Antigua and Barbuda',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Burundi',
 'Belgium',
 'Benin',
 'Burkina Faso',
 'Bangladesh',
 'Bulgaria',
 'Bahrain',
 'Bahamas, The',
 'Bosnia and Herzegovina',
 'Belarus',
 'Belize',
 'Bermuda',
 'Bolivia',
 'Brazil',
 'Barbados',
 'Brunei Darussalam',
 'Bhutan',
 'Botswana',
 'Central African Republic',
 'Canada',
 'Central Europe and the Baltics',
 'Switzerland',
 'Channel Islands',
 'Chile',
 'China',
 "Cote d'Ivoire",
 'Cameroon',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Colombia',
 'Comoros',
 'Cabo Verde',
 'Costa Rica',
 'Caribbean small states',
 'Cuba',
 'Curacao',
 'Cayman Islands',
 'Cyprus',
 'Czechia',
 'Germany',
 'Djibouti',
 'Dominica',
 'Denmark',
 'Dominican Republic',
 'Algeria',
 'East Asia & Pacific (excluding high income)

In [21]:
# Drop excluded territories/aggregates. cleanedNamesMask also title-cases 'Country Name' in place.
# .copy() makes each filtered frame independent, so adding columns to it later does not
# write onto a selection of the unfiltered frame (SettingWithCopyWarning).
df_qualifiedLaborForce = df_qualifiedLaborForce[cleanedNamesMask(df_qualifiedLaborForce, 'Country Name')].copy()

df_arableLand = df_arableLand[cleanedNamesMask(df_arableLand, 'Country Name')].copy()

In [22]:
# Map every World Bank name onto the naming used by df_master.
# .assign returns a new DataFrame, so the new column is never written onto the
# boolean-filtered selection produced earlier (avoids SettingWithCopyWarning).
df_arableLand = df_arableLand.assign(**{
    'New Country Name': df_arableLand['Country Name'].apply(fuzzySearchName)
})
df_qualifiedLaborForce = df_qualifiedLaborForce.assign(**{
    'New Country Name': df_qualifiedLaborForce['Country Name'].apply(fuzzySearchName)
})

In [23]:
# Excluding "PLEASE FILL MANUALLY"
# (.copy() makes the filtered frames independent, so overwriting 'Country Name' below
#  does not write onto a selection of the unfiltered frame)
df_arableLand = df_arableLand[~df_arableLand['New Country Name'].isin(["PLEASE FILL MANUALLY"])].copy()
df_qualifiedLaborForce = df_qualifiedLaborForce[~df_qualifiedLaborForce['New Country Name'].isin(["PLEASE FILL MANUALLY"])].copy()

# Checking for duplicated countries (top freq should be 1)
display(df_arableLand['New Country Name'].describe(include=['object']))
display(df_qualifiedLaborForce['New Country Name'].describe(include=['object']))

# Casting New Country Name to Country Name
df_arableLand['Country Name'] = df_arableLand['New Country Name']
df_qualifiedLaborForce['Country Name'] = df_qualifiedLaborForce['New Country Name']

count             183
unique            183
top       Afghanistan
freq                1
Name: New Country Name, dtype: object

count             183
unique            183
top       Afghanistan
freq                1
Name: New Country Name, dtype: object

We'll now exclude some unnecessary columns and then reshape the DataFrames from wide format (one column per year) to long format (one row per country and year)

In [24]:
# Year columns 2022 down to 1995, named explicitly. Selecting them by label does not
# depend on how many non-year columns trail the frame, nor on both frames sharing the
# exact same column positions.
years = [str(year) for year in range(2022, 1994, -1)]
df_arableLand = df_arableLand[['Country Name'] + years[::-1]]
df_qualifiedLaborForce = df_qualifiedLaborForce[['Country Name'] + years[::-1]]

In [25]:
# Wide -> long: the 1995..2022 columns become one row per country and year
df_arableLand = pd.melt(
    df_arableLand,
    id_vars=["Country Name"],
    value_vars=[str(year) for year in range(1995, 2023)],
    var_name="Index Year",
    value_name="Arable Land pct",
)

df_qualifiedLaborForce = pd.melt(
    df_qualifiedLaborForce,
    id_vars=["Country Name"],
    value_vars=[str(year) for year in range(1995, 2023)],
    var_name="Index Year",
    value_name="Qualified Labor Force pct",
)

In [26]:
# 'Index Year' comes out of melt as text (former column labels); store it as integers
df_arableLand = df_arableLand.astype({'Index Year': 'int'})
df_qualifiedLaborForce = df_qualifiedLaborForce.astype({'Index Year': 'int'})

In [29]:
display('df_arableLand Stats')

# Country names on each side of the upcoming merge
set_A, set_B = (
    set(frame['Country Name'].unique())
    for frame in (df_arableLand, df_master)
)

# Names present only in df_arableLand, then names present only in df_master
display(f'Names we should exclude: {set_A.difference(set_B)}')
display(f'Names that are still missing: {set_B.difference(set_A)}')


# (rows, columns) of the long-format frame
display(df_arableLand.shape)

'df_arableLand Stats'

'Names we should exclude: set()'

'Names that are still missing: set()'

(5124, 3)

In [31]:
display('df_qualifiedLaborForce Stats')

# Country names on each side of the upcoming merge
set_A, set_B = (
    set(frame['Country Name'].unique())
    for frame in (df_qualifiedLaborForce, df_master)
)

# Names present only in df_qualifiedLaborForce, then names present only in df_master
display(f'Names we should exclude: {set_A.difference(set_B)}')
display(f'Names that are still missing: {set_B.difference(set_A)}')


# (rows, columns) of the long-format frame
display(df_qualifiedLaborForce.shape)

'df_qualifiedLaborForce Stats'

'Names we should exclude: set()'

'Names that are still missing: set()'

(5124, 3)

In [28]:
# Both frames are merged on country and year. validate='many_to_one' raises a MergeError
# if either frame holds more than one row for a (country, year) pair, instead of silently
# duplicating df_master rows.
df_master = df_master.merge(df_arableLand, how='left', on=['Country Name', 'Index Year'], validate='many_to_one')
df_master = df_master.merge(df_qualifiedLaborForce, how='left', on=['Country Name', 'Index Year'], validate='many_to_one')

display(df_arableLand.columns)
display(df_qualifiedLaborForce.columns)

Index(['Country Name', 'Index Year', 'Arable Land pct'], dtype='object')

Index(['Country Name', 'Index Year', 'Qualified Labor Force pct'], dtype='object')

# Re-check the country names of df_academicRelevance against the merged df_master
set_A, set_B = (
    set(frame['Country Name'].unique())
    for frame in (df_academicRelevance, df_master)
)

# Names only in df_academicRelevance, then names only in df_master
display(set_A.difference(set_B))
display(set_B.difference(set_A))