In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from fuzzywuzzy import process

### Loading Data

Loading Original Data

In [2]:
# Folder holding the original (unprocessed) CSV files. Built with pathlib so
# the path separators are correct on every operating system, not only Windows.
ORIGINAL_DATA_DIR = Path("..") / "Data_Sets" / "originalData"

# From Scimagojr
df_academicRelevance = pd.read_csv(ORIGINAL_DATA_DIR / "academic_relevance_1996-2022.csv")

# From OurWorldInData
df_HDI = pd.read_csv(ORIGINAL_DATA_DIR / "human-development-index.csv")
df_migrants = pd.read_csv(ORIGINAL_DATA_DIR / "migrant-stock-total.csv")
df_naturalResources = pd.read_csv(ORIGINAL_DATA_DIR / "naturalResourcesWealth.csv")
df_educationQuality = pd.read_csv(ORIGINAL_DATA_DIR / "education_quality_by_outcome.csv")

# From World Bank (the first four lines of these files are skipped, so the
# header is read from the fifth line)
df_arableLand = pd.read_csv(ORIGINAL_DATA_DIR / "arableLand.csv", skiprows=4)
df_qualifiedLaborForce = pd.read_csv(ORIGINAL_DATA_DIR / "Labor_force_with_advanced_education.csv", skiprows=4)

Loading Data processed in other notebooks

In [3]:
# Folder holding the CSV files produced by the other notebooks. Built with
# pathlib so the path separators are correct on every operating system.
PROCESSED_DATA_DIR = Path("..") / "Data_Sets" / "processed"

df_climateData = pd.read_csv(PROCESSED_DATA_DIR / "addData_climateData.csv")
df_countriesSize_regions = pd.read_csv(PROCESSED_DATA_DIR / "addData_countriesSize_Regions.csv")
df_fromWiki = pd.read_csv(PROCESSED_DATA_DIR / "addData_fromWiki.csv")
df_economicFreedom = pd.read_csv(PROCESSED_DATA_DIR / "economicData_1995-2022.csv")
df_naturalDisasters = pd.read_csv(PROCESSED_DATA_DIR / "addData_disasterData.csv")

### Merging Data - Part 1 

In [4]:
# Build df_master from a subset of df_economicFreedom: the first 15 columns,
# the columns at positions 19-26, and two columns picked explicitly by name.
df_master = df_economicFreedom[
    [
        *df_economicFreedom.columns[:15],
        'GDP per capita (current USD)',
        *df_economicFreedom.columns[19:27],
        'Under-5 mortality rate (per 1k live births)',
    ]
]

In [5]:
# Show the column labels currently held by df_master
df_master.columns

Index(['Country Name', 'Index Year', 'Overall Score', 'Property Rights',
       'Government Integrity', 'Judicial Effectiveness', 'Government Spending',
       'Tax Burden', 'Fiscal Health', 'Business Freedom', 'Monetary Freedom',
       'Labor Freedom', 'Financial Freedom', 'Investment Freedom',
       'Trade Freedom', 'GDP per capita (current USD)', 'Total population',
       'Land area (sq. km)', 'Gini', 'Inflation CPI', 'Real interest rate',
       'Labor force size', 'Trade (% of GDP)', 'Trade in services (% of GDP)',
       'Under-5 mortality rate (per 1k live births)'],
      dtype='object')

In [6]:
# Left join on the country name: every df_master row is kept and gains the
# columns of df_fromWiki wherever the country matches.
df_master = df_master.merge(
    df_fromWiki,
    on='Country Name',
    how='left',
)

# Show which columns df_fromWiki contributes
display(df_fromWiki.columns)

Index(['Country Name', 'Borders Length (in KM)', 'Neighbouring Countries',
       'isLandLocked', 'n_accessToSea', 'Rail Density',
       'Pctg of Rail Electrified', 'Warred Against', 'Area Size (km2)',
       'Expanded EconZone Area', 'Amount of Ports', 'Distance from Equator',
       'Majoritary Religions'],
      dtype='object')

In [7]:
# Left join on the country name; only 'Shape_Leng' and 'Region' are taken
# from df_countriesSize_regions.
df_master = df_master.merge(
    df_countriesSize_regions.loc[:, ['Country Name', 'Shape_Leng', 'Region']],
    on='Country Name',
    how='left',
)

# Show every column available in df_countriesSize_regions
display(df_countriesSize_regions.columns)

Index(['Country Name', 'Shape_Leng', 'Shape_Area', 'SubRegion', 'Region'], dtype='object')

In [8]:
# Left join on the country name: every df_master row is kept and gains the
# columns of df_climateData wherever the country matches.
df_master = df_master.merge(
    df_climateData,
    on='Country Name',
    how='left',
)

# Show which columns df_climateData contributes
display(df_climateData.columns)

Index(['Country Name', 'Climate Type', 'Average Temperature (C)'], dtype='object')

We'll need to reshape `df_naturalDisasters` from decade-level rows into one row per country and year before merging it.

In [9]:
# Reshape the decade-level disaster data into one row per country and year so
# that it can be merged into df_master on ['Country Name', 'Index Year'].
# NOTE: this cell rebinds df_naturalDisasters and df_master; re-running it
# without re-running the loading cells first would rescale the rates again.
df_naturalDisasters = df_naturalDisasters.rename(columns={
    "Index Year (Decade)": 'Index Year'
})

# Setting the minimum year in our data [note: 'Death rates from disasters' in 1990 was already halved, to look like half a decade]
df_naturalDisasters.loc[df_naturalDisasters['Index Year'] == 1990, 'Index Year'] = 1995

# Transforming Decade Data into Yearly data: each period's rate is divided by
# the number of years it stands for (5 for the relabelled 1995 rows, else 10).
for period_start, n_years in {1995: 5, 2000: 10, 2010: 10}.items():
    df_naturalDisasters.loc[
        df_naturalDisasters['Index Year'] == period_start,
        'Death rates from disasters'
    ] /= n_years

# List of years to add (2000 and 2010 are excluded from the placeholders)
years_to_add = [year for year in range(1996, 2022 + 1) if year not in (2000, 2010)]

# Placeholder rows (rate = NaN) for every country/year pair that is not already
# present. Skipping existing pairs avoids duplicate merge keys, which would
# otherwise duplicate df_master rows in the merge below.
# NOTE(review): if the data holds rows for years other than 1995/2000/2010
# (e.g. 2020), their rates are not rescaled to yearly values here - confirm.
existing_keys = set(zip(df_naturalDisasters['Country Name'], df_naturalDisasters['Index Year']))
countries = df_naturalDisasters['Country Name'].unique()
new_rows = [
    {
        'Country Name': country,
        'Index Year': year,
        'Death rates from disasters': np.nan
    }
    for year in years_to_add
    for country in countries
    if (country, year) not in existing_keys
]

# Combining data
df_temp = pd.DataFrame(new_rows)
df_naturalDisasters = pd.concat([df_naturalDisasters, df_temp], ignore_index=True)

# Sorting by Country, then by year, so each country's years are in chronological order
df_naturalDisasters = df_naturalDisasters.sort_values(by=['Country Name', 'Index Year']).reset_index(drop=True)

# Forward-filling 'Death rates from disasters' within each country: every
# placeholder year takes the most recent earlier value of the same country.
# groupby(...).ffill() keeps the frame's index, so the result aligns row by row.
df_naturalDisasters['Death rates from disasters'] = (
    df_naturalDisasters.groupby('Country Name')['Death rates from disasters'].ffill()
)

# Doing the actual merging
df_master = df_master.merge(df_naturalDisasters, how='left', on=['Country Name', 'Index Year'])

display(df_naturalDisasters.columns)

Index(['Country Name', 'Index Year', 'Death rates from disasters'], dtype='object')

### Cleaning Original DataSets before merging them into df_master