In [1]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process
import ast # used to convert strings to Lists

In [2]:
# Load the merged data set that this notebook finishes cleaning.
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated literal only resolved on Windows.
df_master = pd.read_csv("../Data_Sets/processed/mergedData_toClean_1995-2022.csv")

# Custom Functions

In [3]:
def interpolateMiddle(series, timeField_series, n_times=0, extrapolate=4):
    '''
    Linearly fill the gaps that lie between the first and last observed value of a Series
    ("middle data"), then optionally fill a limited number of entries beyond that.

    The input Series is never modified; a new Series is returned.

    series: the series to interpolate
    timeField_series: corresponding time data for the series (same index as `series`).
                      It is only used to locate the earliest/latest observed time; the
                      interpolation itself is pandas' 'linear' method, which treats the
                      rows as equally spaced.
    n_times: interpolation reach (maximum consecutive missing entries filled from each side
             of a gap inside the middle window). If zero or negative, int((2022-1960)/2) is used.
    extrapolate: reach of the second pass, which runs over the whole Series (so it also covers
                 the leading/trailing missing entries). If zero or negative, the pass is skipped.

    Returns: a Series with the same index as `series`.
    '''

    # If invalid, we'll pick a value that works in this DataSet.
    if n_times <= 0:
        n_times = int((2022-1960)/2)

    # Time stamps of the observed (non-NaN) entries delimit the 'middle' data
    observed_times = timeField_series.loc[series.notna()]
    earliest_time = observed_times.min()
    latest_time = observed_times.max()

    # If there's no non-NaN value there is nothing to interpolate from
    if pd.isna(earliest_time) or pd.isna(latest_time):
        return series.copy()

    # Work on a copy so the caller's Series (often a slice of a DataFrame) stays untouched
    result = series.copy()

    # Only interpolate what has been defined as "middle data"
    maskedtimes = (timeField_series >= earliest_time) & (timeField_series <= latest_time)
    result.loc[maskedtimes] = result.loc[maskedtimes].interpolate(
        method='linear', limit=n_times, limit_direction='both')

    # Second pass over the full Series, limited to `extrapolate` entries per side
    if extrapolate > 0:
        result = result.interpolate(method='linear', limit=extrapolate, limit_direction='both')

    return result

def process_group(group, column, *args, **kwargs):
    '''
    Apply interpolateMiddle to one column of a group and return the updated group.

    Meant to be used as the function of `DataFrame.groupby(...).apply(...)`.
    The group received from pandas is not mutated (pandas does not support
    functions that modify the object passed to `apply`); a copy is returned.

    group: DataFrame holding the rows of one group; must contain 'Index Year' and `column`
    column: name of the column to interpolate
    *args, **kwargs: forwarded to interpolateMiddle (e.g. n_times, extrapolate)
    '''
    timeField_series = group['Index Year']
    # Hand over a copy so that no in-place work downstream can reach back into `group`
    filled = interpolateMiddle(group[column].copy(), timeField_series, *args, **kwargs)

    result = group.copy()
    result[column] = filled
    return result

# Additional Cleaning procedures
There are still a few things left to clean in this dataset.

In [4]:
# 'Land area (sq. km)' repeats information that another column already holds, so it goes away
df_master.drop(columns=['Land area (sq. km)'], inplace=True)

In [5]:
# Summary of 'Country Name' (count / unique / top / freq) to spot duplicated countries
display(
    'Checking for duplicated countries',
    df_master['Country Name'].describe(include=['object']),
)

'Checking for duplicated countries'

count        4767
unique        183
top       Albania
freq           28
Name: Country Name, dtype: object

# Preliminary Filling
We'll extrapolate some more values, and then evaluate which rows we'll have to drop depending on the column its value is missing from.

In [6]:
# 'Borders Length (in KM)' is read as text with ',' inside the numbers; strip the commas
# and convert to float.
# - regex=False: the comma is always treated as a literal, independent of the pandas
#   version's default for `regex`.
# - no reset_index on the right-hand side: the assignment aligns on df_master's own index,
#   so renumbering the values could only misalign them.
df_master['Borders Length (in KM)'] = (
    df_master['Borders Length (in KM)']
    .str.replace(',', '', regex=False)
    .astype('float')
)

In [7]:
#df_master.info()

In [8]:
# Start from every column at position 15 and beyond (skipping Economic Freedom cols)
to_interpolate_easy = df_master.columns[15:].tolist()

excluded_columns = (
    # These fields lack 1000 or more rows; they get a closer look before interpolating
    'Gini',
    'Real interest rate',
    'Qualified Labor Force pct',
    'Natural Resources',
    # Categorical fields: interpolating them doesn't make sense
    'Neighbouring Countries',
    'Majoritary Religions',
    'Warred Against',
    'Climate Type',
    'Region',
)

for excluded in excluded_columns:
    # list.remove raises ValueError if the column is absent, which flags a schema change
    to_interpolate_easy.remove(excluded)

In [9]:
# Interpolate every "easy" column, one country at a time.
# Extra positional/keyword arguments of apply() are forwarded to process_group.
for col in to_interpolate_easy:
    df_master = (
        df_master
        .groupby(by=['Country Name'])
        .apply(process_group, col, n_times=100, extrapolate=100)
        .reset_index(drop=True)
    )

# Manual filling
We'll treat the rest on a case by case basis.

Fiscal Health, Judicial Effectiveness and Labor Freedom: these are Economic Freedom metrics whose records start later than 1995. So we'll extrapolate 4 more years, and then fill the rest with whatever the overall score is.

In [10]:
#df_master[df_master['Country Name'] == 'Jordan'].loc[:, ['Country Name','Index Year','Qualified Labor Force pct', 'Gini']]

In [11]:
# Columns at positions 2..14 are handled here
to_interpolate_EF = df_master.columns[2:15].tolist()

# Where 'Overall Score' is missing, force it to the row-wise mean of the columns at positions 3..14
average_values = df_master.loc[:, df_master.columns[3:15]].mean(axis=1)
df_master['Overall Score'] = df_master['Overall Score'].fillna(average_values)

for col in to_interpolate_EF:
    # Per-country interpolation; extra apply() arguments are forwarded to process_group
    df_master = (
        df_master
        .groupby(by=['Country Name'])
        .apply(process_group, col, n_times=100, extrapolate=4)
        .reset_index(drop=True)
    )

    # Whatever is still missing falls back to the row's 'Overall Score'
    df_master[str(col)] = df_master[str(col)].fillna(df_master['Overall Score'])


Gini, Real interest rate:

In [12]:
# We'll extrapolate 6 years, then fill the rest with the average of (self and region).
to_interpolate_giniInterest = ['Gini', 'Real interest rate']


for col in to_interpolate_giniInterest:
    df_master = df_master.groupby(by=['Country Name']).apply(lambda group: process_group(group, col, n_times=100, extrapolate=6)).reset_index(drop=True)


# Filling the remainder with the average of self values and regional values.

country_avg = df_master.groupby('Country Name')[to_interpolate_giniInterest].transform('mean')
region_avg = df_master.groupby('Region')[to_interpolate_giniInterest].transform('mean')

# A country without a single observation has a NaN country mean. Replacing that NaN
# with 0 would make the combined value (0 + region) / 2, i.e. half the regional mean.
# Falling back to the regional mean makes the combined value the regional mean itself.
country_avg = country_avg.fillna(region_avg)
combined_avg = (country_avg + region_avg) / 2

df_master[to_interpolate_giniInterest] = df_master[to_interpolate_giniInterest].fillna(combined_avg)


Qualified Labor Force pct, Natural Resources:

In [13]:
# We'll extrapolate all years, then fill the rest with the average of (self and region).
to_interpolate_laborFNatRes = ['Qualified Labor Force pct', 'Natural Resources']


for col in to_interpolate_laborFNatRes:
    df_master = df_master.groupby(by=['Country Name']).apply(lambda group: process_group(group, col, n_times=100, extrapolate=30)).reset_index(drop=True)


# Filling the remainder with the average of self values and regional values.

country_avg = df_master.groupby('Country Name')[to_interpolate_laborFNatRes].transform('mean')
region_avg = df_master.groupby('Region')[to_interpolate_laborFNatRes].transform('mean')

# A country without a single observation has a NaN country mean. Replacing that NaN
# with 0 would make the combined value (0 + region) / 2, i.e. half the regional mean.
# Falling back to the regional mean makes the combined value the regional mean itself.
country_avg = country_avg.fillna(region_avg)
combined_avg = (country_avg + region_avg) / 2

df_master[to_interpolate_laborFNatRes] = df_master[to_interpolate_laborFNatRes].fillna(combined_avg)


Manually inserting North Korea's GDP per Capita information
Source: http://data.un.org/Data.aspx?q=korea+gdp&d=SNAAMA&f=grID:101;currID:USD;pcFlag:1;crID:408,410

In [14]:
# One value per year, in order, starting at 1995 (28 values -> 1995..2022)
gdp_per_capita_values = [
    222, 479, 462, 456, 452, 462, 476, 468, 471, 473,
    548, 576, 599, 553, 497, 573, 642, 648, 672, 702,
    654, 671, 690, 692, 643, 621, 654, 666,
]

for year, gdp in zip(range(1995, 2023), gdp_per_capita_values):
    # Rows of North Korea for this particular year
    is_target_row = (df_master['Index Year'] == year) & (df_master['Country Name'] == 'North Korea')
    df_master.loc[is_target_row, ['GDP per capita (current USD)']] = gdp


From here, if a value is missing for any category, it's missing for all the years in that specific country.

So we'll fill them with the average regional values. Ideally we'd seek other databases to get this information. But I don't have enough time budget for that.

We'll only skip categorical values (which we'll process later)

In [15]:
# Numeric columns whose remaining gaps are filled with the mean of the row's region
to_interpolate_regional = [
    'Inflation CPI',
    'Labor force size',
    'Trade (% of GDP)',
    'Trade in services (% of GDP)',
    'Under-5 mortality rate (per 1k live births)',
    'Arable Land pct',
    'Human Development Index',
    'Migration Volume',
    'Harmonized Test Scores',
]

# transform('mean') yields, for every row, the mean over all rows that share its 'Region'
region_avg = df_master.groupby('Region')[to_interpolate_regional].transform('mean')
df_master[to_interpolate_regional] = df_master[to_interpolate_regional].fillna(region_avg)

In [16]:
# Print df_master's columns with their non-null counts and dtypes, to review what is still missing
df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4767 entries, 0 to 4766
Data columns (total 48 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Country Name                                 4767 non-null   object 
 1   Index Year                                   4767 non-null   int64  
 2   Overall Score                                4767 non-null   float64
 3   Property Rights                              4767 non-null   float64
 4   Government Integrity                         4767 non-null   float64
 5   Judicial Effectiveness                       4767 non-null   float64
 6   Government Spending                          4767 non-null   float64
 7   Tax Burden                                   4767 non-null   float64
 8   Fiscal Health                                4767 non-null   float64
 9   Business Freedom                             4767 non-null   float64
 10  

In [17]:
# df_countryNameAnalysis = df_master.groupby('Country Name').apply(lambda x: x.isna().mean())
# df_countryNameAnalysis.to_csv('isna_groupedBy_CountryName.csv')

# df_regionAnalysis = df_master.groupby('Region').apply(lambda x: x.isna().mean())
# df_regionAnalysis.to_csv('isna_groupedBy_Region.csv')

# Column Calculations

In [18]:
# We need to convert some of our categorical features from Strings to Lists.

def string_to_list(value):
    '''
    Convert the string representation of a list (e.g. "['A', 'B']") into a real list.

    value: the cell content to convert.

    Returns:
        - `value` itself if it already is a list (so re-running the conversion is harmless);
        - the parsed list if `value` is a string holding a list/tuple/set literal;
        - an empty list for anything else (NaN, None, unparsable text, non-sequence literals).
    '''
    # Already converted (e.g. the cell was executed twice): keep the data
    if isinstance(value, list):
        return value

    try:
        parsed = ast.literal_eval(value)
    # Exceptions literal_eval is documented to raise on malformed input;
    # anything else (e.g. KeyboardInterrupt) is deliberately not swallowed
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    # Callers take len() of the result and test isinstance(..., list)
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

# Both columns hold lists written out as strings; parse them cell by cell
for list_column in ('Neighbouring Countries', 'Warred Against'):
    df_master[list_column] = df_master[list_column].apply(string_to_list)



In [19]:
# Number of entries in each row's list columns
df_master['Amount of Country Neighbours'] = df_master['Neighbouring Countries'].apply(len)
df_master['Amount of Major Conflicts_Since 1900'] = df_master['Warred Against'].apply(len)

# Total population divided by area size
df_master['Population Density'] = df_master['Total population'].div(df_master['Area Size (km2)'])

In [20]:
# Factor that brings 'Shape_Leng' to the same unit as 'Borders Length (in KM)'.
# Empirically found by looking at both columns for 'Czech Republic'.
SHAPE_LENG_TO_BORDER_UNIT = 92.337581

# Scale into a temporary Series rather than multiplying 'Shape_Leng' in place:
# an in-place `*=` compounds each time this cell is executed again, silently
# shrinking 'Shared Borders (in pct)' by the factor on every re-run.
# 'Shape_Leng' itself is not read again in this notebook before it is dropped.
shape_length_scaled = df_master['Shape_Leng'] * SHAPE_LENG_TO_BORDER_UNIT

df_master['Shared Borders (in pct)'] = df_master['Borders Length (in KM)'] / shape_length_scaled

### Calculating Proportion of Rival Neighbours

In [21]:
# Cache shared across calls: country name -> proportion of rival neighbours
dict_neighConflictAmnt = {}

def conflictsToPercentage(row, dict_master = dict_neighConflictAmnt):
    '''
        Compute which share of a country's neighbours it has also fought against.

        row: a DataFrame row (or any mapping) providing 'Country Name',
             'Neighbouring Countries' and 'Warred Against'
        dict_master: cache of already computed proportions, keyed by country name.
                     It is filled in by this function.

        Returns a dict with the single key 'Proportion of Rival Neighbours':
            - 0 when either list is empty or is not a list;
            - the cached value when the country is already in dict_master;
            - otherwise (distinct neighbours that are also opponents) / (distinct neighbours).
    '''
    result_key = 'Proportion of Rival Neighbours'

    # Extracting basic info; anything that is not a list counts as "no entries"
    country_name = str(row['Country Name'])

    neighbour_list = row['Neighbouring Countries']
    if not isinstance(neighbour_list, list):
        neighbour_list = []

    opponent_list = row['Warred Against']
    if not isinstance(opponent_list, list):
        opponent_list = []

    # Without neighbours or without opponents there cannot be any rival neighbour
    if not (opponent_list and neighbour_list):
        return {result_key: 0}

    # Compute once per country, then serve from the cache
    if country_name not in dict_master:
        distinct_neighbours = set(neighbour_list)
        rival_neighbours = distinct_neighbours & set(opponent_list)
        dict_master[country_name] = len(rival_neighbours) / len(distinct_neighbours)

    return {result_key: dict_master[country_name]}
    

# Create the target column first: DataFrame.update only writes into columns that already exist
df_master['Proportion of Rival Neighbours'] = np.nan
# Row-wise call; result_type='expand' turns the returned dicts into the columns of df_results
df_results = df_master.apply(conflictsToPercentage, axis=1, result_type='expand')
# Copy the computed values into the placeholder column (aligned on index and column name)
df_master.update(df_results)

### Calculating Mean Regional Statistics

In [22]:
# Every statistic below is a mean over the rows sharing the same ('Region', 'Index Year')
region_year_groups = df_master.groupby(['Region', 'Index Year'])

regional_gdp_per_capita = region_year_groups['GDP per capita (current USD)'].transform('mean')
regional_population = region_year_groups['Total population'].transform('mean')
regional_gini = region_year_groups['Gini'].transform('mean')

df_master['Regional Mean GDP per Capita'] = regional_gdp_per_capita

# Product of the two regional means (mean GDP per capita x mean population)
df_master['Regional Mean GDP'] = regional_gdp_per_capita * regional_population

df_master['Regional Mean Gini'] = regional_gini

In [23]:
def neigh_meanStats(row, df=df_master):
    '''
        Blend neighbour-level and region-level statistics for one row.

        Returns a dict with calculated columns for:
            • 'Local Mean GDP per Capita'
            • 'Local Mean GDP'
            • 'Local Mean Gini'
        Each value is the mean of (a) the average over the row's neighbours that have a row
        in `df` for the same 'Index Year' and (b) the row's regional mean
        ('Regional Mean GDP per Capita', 'Regional Mean GDP', 'Regional Mean Gini').
        A neighbour's GDP is its 'GDP per capita (current USD)' times its 'Total population'.

        If the row has no neighbours, or none of them has a row in `df` for that year,
        the regional values are returned as they are.

        row: a row of the DataFrame, with 'Index Year', 'Neighbouring Countries' and the
             three regional mean columns
        df: DataFrame in which the neighbours' rows are looked up
    '''
    regional_only = {
        'Local Mean GDP per Capita': row['Regional Mean GDP per Capita'],
        'Local Mean GDP': row['Regional Mean GDP'],
        'Local Mean Gini': row['Regional Mean Gini']
    }

    neighbours = row['Neighbouring Countries']

    # If there are no neighbours, we don't even need to calculate anything
    if not isinstance(neighbours, list) or not neighbours:
        return regional_only

    # ------------------------------- Actual calculation
    # The year filter is the same for every neighbour, so it is built once
    same_year = df['Index Year'] == row['Index Year']

    neighCount = 0
    GDP_count = 0
    GDPperCapita_count = 0
    Gini_count = 0

    for neigh in neighbours:
        # One lookup per neighbour; all three statistics come from the same row
        neigh_rows = df.loc[same_year & (df['Country Name'] == neigh)]
        if neigh_rows.empty:
            continue

        neigh_GDP_perCapita = neigh_rows['GDP per capita (current USD)'].iloc[0]
        neigh_population = neigh_rows['Total population'].iloc[0]
        neigh_Gini = neigh_rows['Gini'].iloc[0]

        GDP_count += neigh_GDP_perCapita * neigh_population
        GDPperCapita_count += neigh_GDP_perCapita
        Gini_count += neigh_Gini
        neighCount += 1

    # None of the listed neighbours exists in df for this year: dividing by
    # neighCount would raise ZeroDivisionError, so fall back to regional data
    if neighCount == 0:
        return regional_only

    return {
        'Local Mean GDP per Capita': ((GDPperCapita_count/neighCount) + regional_only['Local Mean GDP per Capita'])/2,
        'Local Mean GDP': ((GDP_count/neighCount) + regional_only['Local Mean GDP'])/2,
        'Local Mean Gini': ((Gini_count/neighCount) + regional_only['Local Mean Gini'])/2
    }
    
# Create the three target columns first: DataFrame.update only writes into existing columns
df_master['Local Mean GDP per Capita'] = np.nan
df_master['Local Mean GDP'] = np.nan
df_master['Local Mean Gini'] = np.nan
# Row-wise call; result_type='expand' turns the returned dicts into the columns of df_results
df_results = df_master.apply(neigh_meanStats, axis=1, result_type='expand')
# Debugging variant that restricts the calculation to a single country:
#df_results = df_master.loc[(df_master['Country Name'] == 'Germany')].apply(neigh_meanStats, axis=1, result_type='expand')
# Copy the computed values into the placeholder columns (aligned on index and column name)
df_master.update(df_results)

# One-Hot Encoding some categorical features

In [24]:
# Each cell holds a list written out as a string; parse it and re-join with commas
df_master['Majoritary Religions'] = [
    ','.join(ast.literal_eval(raw_religions))
    for raw_religions in df_master['Majoritary Religions']
]

# One indicator column per distinct religion, named 'Religion_<name>'
religion_indicators = df_master['Majoritary Religions'].str.get_dummies(sep=',')
one_hot_encoding_religions = religion_indicators.add_prefix('Religion_')

# Append the indicator columns to the right of the existing ones
df_master = pd.concat([df_master, one_hot_encoding_religions], axis='columns')

In [25]:
# One indicator column per distinct 'Climate Type' value, prefixed with 'ClimateType'
one_hot_encoded_climate = pd.get_dummies(
    df_master['Climate Type'],
    prefix='ClimateType',
)

# Append the indicator columns to the right of the existing ones
df_master = pd.concat([df_master, one_hot_encoded_climate], axis='columns')

# Deleting unusable columns
We don't have a use for some of the columns anymore, so...

In [27]:
# Columns that were only needed as inputs for the calculated features above
columns_to_drop = [
    'Overall Score',
    'Borders Length (in KM)',
    'Neighbouring Countries',
    'Warred Against',
    'Majoritary Religions',
    'Shape_Leng',
    'Region',
    'Regional Mean GDP per Capita',
    'Regional Mean GDP',
    'Regional Mean Gini',
]
df_master.drop(columns=columns_to_drop, inplace=True)

# Exporting

In [31]:
# Write the finished data set next to the input file, without the index column.
# Forward slashes resolve on Windows, Linux and macOS alike; with the previous
# backslash-separated literal, non-Windows systems would treat the whole string
# as a single file name in the working directory.
df_master.to_csv('../Data_Sets/processed/completeData_1995-2022.csv', index=False)